Browse Source

Chore: prefer domain lists

SukkaW 1 year ago
parent
commit
e4429a62ee

+ 3 - 3
Build/build-reject-domainset.ts

@@ -36,14 +36,14 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
       let shouldStop = false;
       let shouldStop = false;
       await Promise.all([
       await Promise.all([
         // Parse from remote hosts & domain lists
         // Parse from remote hosts & domain lists
-        ...HOSTS.map(entry => processHosts(childSpan, entry[0], entry[1], entry[2], entry[3]).then(setAddFromArrayCurried(domainSets))),
+        ...HOSTS.map(entry => processHosts(childSpan, ...entry).then(setAddFromArrayCurried(domainSets))),
 
 
-        ...DOMAIN_LISTS.map(entry => processDomainLists(childSpan, entry[0], entry[1], entry[2]).then(setAddFromArrayCurried(domainSets))),
+        ...DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(setAddFromArrayCurried(domainSets))),
 
 
         ...ADGUARD_FILTERS.map(input => (
         ...ADGUARD_FILTERS.map(input => (
           typeof input === 'string'
           typeof input === 'string'
             ? processFilterRules(childSpan, input)
             ? processFilterRules(childSpan, input)
-            : processFilterRules(childSpan, input[0], input[1], input[2])
+            : processFilterRules(childSpan, ...input)
         ).then(({ white, black, foundDebugDomain }) => {
         ).then(({ white, black, foundDebugDomain }) => {
           if (foundDebugDomain) {
           if (foundDebugDomain) {
             // eslint-disable-next-line sukka/no-single-return -- not single return
             // eslint-disable-next-line sukka/no-single-return -- not single return

+ 2 - 0
Build/lib/append-array-in-place.ts

@@ -19,3 +19,5 @@ export function appendArrayInPlace<T>(dest: T[], source: T[]) {
   }
   }
   return dest;
   return dest;
 }
 }
+
+export const appendArrayInPlaceCurried = <T>(dest: T[]) => (source: T[]) => appendArrayInPlace(dest, source);

+ 5 - 6
Build/lib/get-phishing-domains.ts

@@ -4,7 +4,8 @@ import { getSubdomain, getPublicSuffix } from 'tldts-experimental';
 import { TTL } from './cache-filesystem';
 import { TTL } from './cache-filesystem';
 
 
 import type { Span } from '../trace';
 import type { Span } from '../trace';
-import { appendArrayInPlace } from './append-array-in-place';
+import { appendArrayInPlace, appendArrayInPlaceCurried } from './append-array-in-place';
+import { PHISHING_DOMAIN_LISTS } from './reject-data-source';
 
 
 const BLACK_TLD = new Set([
 const BLACK_TLD = new Set([
   'accountant',
   'accountant',
@@ -101,12 +102,10 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
   const gorhill = await getGorhillPublicSuffixPromise();
   const gorhill = await getGorhillPublicSuffixPromise();
 
 
   const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => {
   const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => {
-    const [domainSet, domainSet2] = await Promise.all([
-      processDomainLists(curSpan, 'https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt', true, TTL.THREE_HOURS()),
-      processDomainLists(curSpan, 'https://phishing.army/download/phishing_army_blocklist.txt', true, TTL.THREE_HOURS())
-    ]);
+    const domainSet: string[] = [];
 
 
-    appendArrayInPlace(domainSet, domainSet2);
+    (await Promise.all(PHISHING_DOMAIN_LISTS.map(entry => processDomainLists(curSpan, ...entry))))
+      .forEach(appendArrayInPlaceCurried(domainSet));
 
 
     return domainSet;
     return domainSet;
   });
   });

+ 29 - 12
Build/lib/parse-filter.ts

@@ -16,24 +16,41 @@ const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
 let foundDebugDomain = false;
 let foundDebugDomain = false;
 const temporaryBypass = DEBUG_DOMAIN_TO_FIND !== null;
 const temporaryBypass = DEBUG_DOMAIN_TO_FIND !== null;
 
 
-export function processDomainLists(span: Span, domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) {
-  return span.traceChild(`process domainlist: ${domainListsUrl}`).traceAsyncFn(() => fsFetchCache.apply(
+const domainListLineCb = (l: string, set: string[], includeAllSubDomain: boolean, meta: string) => {
+  let line = processLine(l);
+  if (!line) return;
+
+  line = normalizeDomain(line);
+  if (!line) return;
+
+  if (DEBUG_DOMAIN_TO_FIND && line.includes(DEBUG_DOMAIN_TO_FIND)) {
+    console.warn(picocolors.red(meta), '(black)', line.replaceAll(DEBUG_DOMAIN_TO_FIND, picocolors.bold(DEBUG_DOMAIN_TO_FIND)));
+    foundDebugDomain = true;
+  }
+
+  set.push(includeAllSubDomain ? `.${line}` : line);
+};
+
+export function processDomainLists(span: Span, domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, ttl: number | null = null) {
+  return span.traceChild(`process domainlist: ${domainListsUrl}`).traceAsyncFn((childSpan) => fsFetchCache.apply(
     domainListsUrl,
     domainListsUrl,
     async () => {
     async () => {
       const domainSets: string[] = [];
       const domainSets: string[] = [];
 
 
-      for await (const line of await fetchRemoteTextByLine(domainListsUrl)) {
-        let domainToAdd = processLine(line);
-        if (!domainToAdd) continue;
-        domainToAdd = normalizeDomain(domainToAdd);
-        if (!domainToAdd) continue;
-
-        if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
-          console.warn(picocolors.red(domainListsUrl), '(black)', domainToAdd.replaceAll(DEBUG_DOMAIN_TO_FIND, picocolors.bold(DEBUG_DOMAIN_TO_FIND)));
-          foundDebugDomain = true;
+      if (mirrors == null || mirrors.length === 0) {
+        for await (const l of await fetchRemoteTextByLine(domainListsUrl)) {
+          domainListLineCb(l, domainSets, includeAllSubDomain, domainListsUrl);
         }
         }
+      } else {
+        const filterRules = await childSpan
+          .traceChild('download domain list')
+          .traceAsyncFn(() => fetchAssets(domainListsUrl, mirrors).then(text => text.split('\n')));
 
 
-        domainSets.push(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
+        childSpan.traceChild('parse domain list').traceSyncFn(() => {
+          for (let i = 0, len = filterRules.length; i < len; i++) {
+            domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl);
+          }
+        });
       }
       }
 
 
       return domainSets;
       return domainSets;

+ 43 - 26
Build/lib/reject-data-source.ts

@@ -19,46 +19,63 @@ export const HOSTS: HostsSource[] = [
   ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-AdBlock.txt', null, false, TTL.THREE_DAYS()],
   ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-AdBlock.txt', null, false, TTL.THREE_DAYS()],
   // ad-wars is not actively maintained, so we set a 7 days cache ttl
   // ad-wars is not actively maintained, so we set a 7 days cache ttl
   ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.ONE_WEEK()],
   ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.ONE_WEEK()],
-  ['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', null, true, TTL.THREE_HOURS()],
-  // Curben's UrlHaus Malicious URL Blocklist
-  [
-    'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-hosts.txt',
-    [
-      'https://urlhaus-filter.pages.dev/urlhaus-filter-hosts.txt',
-      'https://malware-filter.gitlab.io/urlhaus-filter/urlhaus-filter-hosts.txt'
-    ],
-    true,
-    TTL.THREE_HOURS()
-  ]
-  // Curben's Phishing URL Blocklist
-  // Covered by lib/get-phishing-domains.ts
-  // 'https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt'
-  // 'https://phishing-filter.pages.dev/phishing-filter-agh.txt'
-  // ['https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt', true, true],
+  ['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', null, true, TTL.THREE_HOURS()]
 ] as const;
 ] as const;
 
 
-export const DOMAIN_LISTS = [
+export const DOMAIN_LISTS: HostsSource[] = [
   // CoinBlockerList
   // CoinBlockerList
   // Although the hosts file is still actively maintained, the hosts_browser file is not updated since 2021-07, so we set a 14 days cache ttl
   // Although the hosts file is still actively maintained, the hosts_browser file is not updated since 2021-07, so we set a 14 days cache ttl
-  ['https://zerodot1.gitlab.io/CoinBlockerLists/list_browser.txt', true, TTL.TWO_WEEKS()],
+  ['https://zerodot1.gitlab.io/CoinBlockerLists/list_browser.txt', [], true, TTL.TWO_WEEKS()],
   // BarbBlock
   // BarbBlock
   // The barbblock list has never been updated since 2019-05, so we set a 14 days cache ttl
   // The barbblock list has never been updated since 2019-05, so we set a 14 days cache ttl
-  ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, TTL.TWO_WEEKS()],
+  ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', [], true, TTL.TWO_WEEKS()],
   // DigitalSide Threat-Intel - OSINT Hub
   // DigitalSide Threat-Intel - OSINT Hub
   // Update once per day
   // Update once per day
-  ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true, TTL.ONE_DAY()],
+  ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', [], true, TTL.ONE_DAY()],
   // Curben's PUP Domains Blocklist
   // Curben's PUP Domains Blocklist
   // 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
   // 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
   // 'https://pup-filter.pages.dev/pup-filter-agh.txt'
   // 'https://pup-filter.pages.dev/pup-filter-agh.txt'
   // The PUP filter has paused the update since 2023-05, so we set a 14 days cache ttl
   // The PUP filter has paused the update since 2023-05, so we set a 14 days cache ttl
-  ['https://curbengh.github.io/pup-filter/pup-filter-domains.txt', true, TTL.TWO_WEEKS()],
+  [
+    'https://curbengh.github.io/pup-filter/pup-filter-domains.txt',
+    [
+      'https://pup-filter.pages.dev/pup-filter-domains.txt',
+      'https://malware-filter.gitlab.io/pup-filter/pup-filter-domains.txt'
+    ],
+    true, TTL.TWO_WEEKS()
+  ],
+  // Curben's UrlHaus Malicious URL Blocklist
+  [
+    'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-domains.txt',
+    [
+      'https://urlhaus-filter.pages.dev/urlhaus-filter-domains.txt',
+      'https://malware-filter.gitlab.io/malware-filter/urlhaus-filter-domains.txt'
+    ],
+    true, TTL.THREE_HOURS()
+  ],
   // AdGuard CNAME Filter Combined
   // AdGuard CNAME Filter Combined
   // Update on a 7 days basis, so we add a 3 hours cache ttl
   // Update on a 7 days basis, so we add a 3 hours cache ttl
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, TTL.THREE_DAYS()],
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, TTL.THREE_DAYS()],
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, TTL.THREE_DAYS()],
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, TTL.THREE_DAYS()],
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, TTL.THREE_DAYS()]
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', [], true, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', [], true, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', [], true, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', [], true, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', [], true, TTL.THREE_DAYS()]
+] as const;
+
+export const PHISHING_DOMAIN_LISTS: [HostsSource, HostsSource] = [
+  [
+    'https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt',
+    [
+      'https://phishing-filter.pages.dev/phishing-filter-domains.txt',
+      'https://malware-filter.gitlab.io/malware-filter/phishing-filter-domains.txt'
+    ],
+    true, TTL.THREE_HOURS()
+  ],
+  [
+    'https://phishing.army/download/phishing_army_blocklist.txt',
+    [],
+    true, TTL.THREE_HOURS()
+  ]
 ] as const;
 ] as const;
 
 
 type AdGuardFilterSource = string | [main: string, mirrors: string[] | null, ttl: number];
 type AdGuardFilterSource = string | [main: string, mirrors: string[] | null, ttl: number];