ソースを参照

Fix: enable domain check for some reject data source

SukkaW 2 年 前
コミット
ca169b9db5

+ 1 - 1
Build/build-reject-domainset.ts

@@ -37,7 +37,7 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
           domainSets.add(host);
         });
       })),
-      ...DOMAIN_LISTS.map(entry => processDomainLists(entry[0], entry[1], entry[2])),
+      ...DOMAIN_LISTS.map(entry => processDomainLists(entry[0], entry[1], entry[2], entry[3])),
       ...ADGUARD_FILTERS.map(input => {
         const promise = typeof input === 'string'
           ? processFilterRules(input)

+ 2 - 1
Build/lib/cached-tld-parse.ts

@@ -3,6 +3,7 @@ import { createCache } from './cache-apply';
 import type { PublicSuffixList } from '@gorhill/publicsuffixlist';
 
 const cache = createCache('cached-tld-parse', true);
+const cache2 = createCache('cached-tld-parse2', true);
 
 const sharedConfig = { allowPrivateDomains: true };
 const sharedConfig2 = { allowPrivateDomains: true, detectIp: false };
@@ -10,7 +11,7 @@ const sharedConfig2 = { allowPrivateDomains: true, detectIp: false };
 /** { allowPrivateDomains: true } */
 export const parse = (domain: string) => cache.sync(domain, () => tldts.parse(domain, sharedConfig));
 /** { allowPrivateDomains: true, detectIp: false } */
-export const parse2 = (domain: string) => cache.sync(domain, () => tldts.parse(domain, sharedConfig2));
+export const parse2 = (domain: string) => cache2.sync(domain, () => tldts.parse(domain, sharedConfig2));
 
 let gothillGetDomainCache: ReturnType<typeof createCache> | null = null;
 export const createCachedGorhillGetDomain = (gorhill: PublicSuffixList) => {

+ 2 - 2
Build/lib/get-phishing-domains.ts

@@ -85,8 +85,8 @@ const BLACK_TLD = new Set([
 
 export const getPhishingDomains = () => traceAsync('get phishing domains', async () => {
   const [domainSet, domainSet2, gorhill] = await Promise.all([
-    processHosts('https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt', true, true, TTL.THREE_HOURS()),
-    processDomainLists('https://phishing.army/download/phishing_army_blocklist.txt', true, TTL.THREE_HOURS()),
+    processDomainLists('https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt', true, false, TTL.THREE_HOURS()),
+    processDomainLists('https://phishing.army/download/phishing_army_blocklist.txt', true, true, TTL.THREE_HOURS()),
     getGorhillPublicSuffixPromise()
   ]);
   domainSet2.forEach((domain) => domainSet.add(domain));

+ 7 - 4
Build/lib/normalize-domain.ts

@@ -5,11 +5,14 @@ export const normalizeDomain = (domain: string) => {
   if (isProbablyIpv4(domain)) return null;
 
   const parsed = tldts.parse2(domain);
-  if (parsed.isIp) return null;
+  // if (parsed.isIp) return null;
+  if (!parsed.hostname) return null;
   if (!parsed.isIcann && !parsed.isPrivate) return null;
 
-  const h = parsed.hostname;
-  if (!h) return null;
+  let h = parsed.hostname;
+  if (h[0] === '.') h = h.slice(1);
+  if (h.endsWith('.')) h = h.slice(0, -1);
 
-  return h[0] === '.' ? h.slice(1) : h;
+  if (h) return h;
+  return null;
 };

+ 12 - 3
Build/lib/parse-filter.ts

@@ -11,17 +11,23 @@ import { normalizeDomain } from './normalize-domain';
 import { fetchAssets } from './fetch-assets';
 import { deserializeSet, fsCache, serializeSet } from './cache-filesystem';
 
-const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
+const DEBUG_DOMAIN_TO_FIND: string | null = '.j3.4z0vc.chileinsumos.cl'; // example.com | null
 let foundDebugDomain = false;
 
-export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) {
+export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, ttl: number | null = null) {
   return traceAsync(`- processDomainLists: ${domainListsUrl}`, () => fsCache.apply(
     domainListsUrl,
     async () => {
       const domainSets = new Set<string>();
 
       for await (const line of await fetchRemoteTextByLine(domainListsUrl)) {
-        const domainToAdd = processLine(line);
+        let domainToAdd = processLine(line);
+        if (!domainToAdd) continue;
+
+        if (!skipDomainCheck) {
+          domainToAdd = normalizeDomain(domainToAdd);
+        }
+
         if (!domainToAdd) continue;
 
         if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
@@ -123,6 +129,9 @@ export async function processFilterRules(
 
         const flag = result[1];
         const hostname = result[0];
+        // if (hostname.endsWith('.')) {
+        //   hostname = hostname.slice(0, -1);
+        // }
 
         if (DEBUG_DOMAIN_TO_FIND) {
           if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {

+ 8 - 8
Build/lib/reject-data-source.ts

@@ -31,20 +31,20 @@ export const HOSTS = [
 export const DOMAIN_LISTS = [
   // CoinBlockerList
   // Although the hosts file is still actively maintained, the hosts_browser file has not been updated since 2021-07, so we set a 14-day cache TTL
-  ['https://zerodot1.gitlab.io/CoinBlockerLists/list_browser.txt', true, TTL.TWO_WEEKS()],
+  ['https://zerodot1.gitlab.io/CoinBlockerLists/list_browser.txt', true, true, TTL.TWO_WEEKS()],
   // BarbBlock
   // The barbblock list has not been updated since 2019-05, so we set a 14-day cache TTL
-  ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, TTL.TWO_WEEKS()],
+  ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, true, TTL.TWO_WEEKS()],
   // DigitalSide Threat-Intel - OSINT Hub
   // Update once per day
-  ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true, TTL.ONE_DAY()],
+  ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true, true, TTL.ONE_DAY()],
   // AdGuard CNAME Filter Combined
   // Updated on a 7-day basis, so we set a 3-day cache TTL
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, TTL.THREE_DAYS()],
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, TTL.THREE_DAYS()],
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, TTL.THREE_DAYS()],
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, TTL.THREE_DAYS()],
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, TTL.THREE_DAYS()]
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, true, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, true, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, true, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, true, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, true, TTL.THREE_DAYS()]
 ] as const;
 
 export const ADGUARD_FILTERS = [