浏览代码

Refactor: split reject and reject_extra

SukkaW 1 年之前
父节点
当前提交
f129152da8
共有 3 个文件被更改，包括 139 次插入和 79 次删除
  1. Build/build-reject-domainset.ts（+74 −25）
  2. Build/constants/reject-data-source.ts（+63 −52）
  3. Build/lib/get-phishing-domains.ts（+2 −2）

+ 74 - 25
Build/build-reject-domainset.ts

@@ -4,7 +4,7 @@ import path from 'path';
 import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter';
 import { createTrie } from './lib/trie';
 
-import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS } from './constants/reject-data-source';
+import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA } from './constants/reject-data-source';
 import { createRuleset, compareAndWriteFile } from './lib/create-file';
 import { domainDeduper } from './lib/domain-deduper';
 import createKeywordFilter from './lib/aho-corasick';
@@ -29,6 +29,9 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy
   const domainSets = new Set<string>();
   const appendArrayToDomainSets = setAddFromArrayCurried(domainSets);
 
+  const domainSetsExtra = new Set<string>();
+  const appendArrayToDomainSetsExtra = setAddFromArrayCurried(domainSetsExtra);
+
   // Parse from AdGuard Filters
   const shouldStop = await span
     .traceChild('download and process hosts / adblock filter rules')
@@ -38,7 +41,11 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy
       await Promise.all([
         // Parse from remote hosts & domain lists
         HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToDomainSets)),
+        HOSTS_EXTRA.map(entry => processHosts(childSpan, ...entry).then(appendArrayToDomainSetsExtra)),
+
         DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToDomainSets)),
+        DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToDomainSetsExtra)),
+
         ADGUARD_FILTERS.map(
           input => processFilterRules(childSpan, ...input)
             .then(({ white, black, foundDebugDomain }) => {
@@ -51,6 +58,19 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy
               setAddFromArray(domainSets, black);
             })
         ),
+        ADGUARD_FILTERS_EXTRA.map(
+          input => processFilterRules(childSpan, ...input)
+            .then(({ white, black, foundDebugDomain }) => {
+              if (foundDebugDomain) {
+                // eslint-disable-next-line sukka/no-single-return -- not single return
+                shouldStop = true;
+                // we should not break here, as we want to see full matches from all data source
+              }
+              setAddFromArray(filterRuleWhitelistDomainSets, white);
+              setAddFromArray(domainSetsExtra, black);
+            })
+        ),
+
         ([
           'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exceptions.txt',
           'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exclusions.txt'
@@ -60,7 +80,7 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy
             setAddFromArray(filterRuleWhitelistDomainSets, black);
           })
         )),
-        getPhishingDomains(childSpan).then(appendArrayToDomainSets),
+        getPhishingDomains(childSpan).then(appendArrayToDomainSetsExtra),
         getRejectSukkaConfPromise.then(appendArrayToDomainSets)
       ].flat());
       // eslint-disable-next-line sukka/no-single-return -- not single return
@@ -71,7 +91,7 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy
     process.exit(1);
   }
 
-  console.log(`Import ${domainSets.size} rules from Hosts / AdBlock Filter Rules & reject_sukka.conf!`);
+  console.log(`Import ${domainSets.size} + ${domainSetsExtra.size} rules from Hosts / AdBlock Filter Rules & reject_sukka.conf!`);
 
   // Dedupe domainSets
   const domainKeywordsSet = await span.traceChildAsync('collect black keywords/suffixes', async () => {
@@ -91,25 +111,38 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy
     return domainKeywordsSet;
   });
 
-  const trie = span.traceChildSync('create smol trie while deduping black keywords', () => {
-    const trie = createTrie(null, true, true);
+  const [baseTrie, extraTrie] = span.traceChildSync('create smol trie while deduping black keywords', () => {
+    const baseTrie = createTrie(null, true, true);
+    const extraTrie = createTrie(null, true, true);
 
     const kwfilter = createKeywordFilter(domainKeywordsSet);
 
     for (const domain of domainSets) {
       // exclude keyword when creating trie
       if (!kwfilter(domain)) {
-        trie.add(domain);
+        baseTrie.add(domain);
       }
     }
 
-    return trie;
+    for (const domain of domainSetsExtra) {
+      // exclude keyword when creating trie
+      if (!kwfilter(domain)) {
+        extraTrie.add(domain);
+      }
+    }
+
+    return [baseTrie, extraTrie] as const;
   });
 
-  span.traceChildSync('dedupe from white suffixes', () => filterRuleWhitelistDomainSets.forEach(trie.whitelist));
+  span.traceChildSync('dedupe from white suffixes (base)', () => filterRuleWhitelistDomainSets.forEach(baseTrie.whitelist));
+  span.traceChildSync('dedupe from white suffixes and base (extra)', () => {
+    domainSets.forEach(extraTrie.whitelist);
+    filterRuleWhitelistDomainSets.forEach(extraTrie.whitelist);
+  });
 
   // Dedupe domainSets
-  const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain', () => domainDeduper(trie));
+  const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain (base)', () => domainDeduper(baseTrie));
+  const dudupedDominArrayExtra = span.traceChildSync('dedupe from covered subdomain (extra)', () => domainDeduper(extraTrie));
 
   console.log(`Final size ${dudupedDominArray.length}`);
 
@@ -118,7 +151,7 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy
     subdomainMap: domainArraySubdomainMap
   } = span.traceChildSync(
     'build map for stat and sort',
-    () => buildParseDomainMap(dudupedDominArray)
+    () => buildParseDomainMap(dudupedDominArray.concat(dudupedDominArrayExtra))
   );
 
   // Create reject stats
@@ -136,30 +169,46 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy
       return sort(Array.from(statMap.entries()).filter(a => a[1] > 9), (a, b) => (b[1] - a[1]) || a[0].localeCompare(b[0]));
     });
 
-  const description = [
-    ...SHARED_DESCRIPTION,
-    '',
-    'The domainset supports AD blocking, tracking protection, privacy protection, anti-phishing, anti-mining',
-    '',
-    'Build from:',
-    ...HOSTS.map(host => ` - ${host[0]}`),
-    ...DOMAIN_LISTS.map(domainList => ` - ${domainList[0]}`),
-    ...ADGUARD_FILTERS.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`),
-    ' - https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt',
-    ' - https://phishing.army/download/phishing_army_blocklist.txt'
-  ];
-
   return Promise.all([
     createRuleset(
       span,
       'Sukka\'s Ruleset - Reject Base',
-      description,
+      [
+        ...SHARED_DESCRIPTION,
+        '',
+        'The domainset supports AD blocking, tracking protection, privacy protection, anti-phishing, anti-mining',
+        '',
+        'Build from:',
+        ...HOSTS.map(host => ` - ${host[0]}`),
+        ...DOMAIN_LISTS.map(domainList => ` - ${domainList[0]}`),
+        ...ADGUARD_FILTERS.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`)
+      ],
       new Date(),
-      span.traceChildSync('sort reject domainset', () => sortDomains(dudupedDominArray, domainArrayMainDomainMap, domainArraySubdomainMap)),
+      span.traceChildSync('sort reject domainset (base)', () => sortDomains(dudupedDominArray, domainArrayMainDomainMap, domainArraySubdomainMap)),
       'domainset',
       path.resolve(import.meta.dir, '../List/domainset/reject.conf'),
       path.resolve(import.meta.dir, '../Clash/domainset/reject.txt')
     ),
+    createRuleset(
+      span,
+      'Sukka\'s Ruleset - Reject Extra',
+      [
+        ...SHARED_DESCRIPTION,
+        '',
+        'The domainset supports AD blocking, tracking protection, privacy protection, anti-phishing, anti-mining',
+        '',
+        'Build from:',
+        ...HOSTS_EXTRA.map(host => ` - ${host[0]}`),
+        ...DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`),
+        ...ADGUARD_FILTERS_EXTRA.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`),
+        ...PHISHING_DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`)
+      ],
+      new Date(),
+      span.traceChildSync('sort reject domainset (extra)', () => sortDomains(dudupedDominArrayExtra, domainArrayMainDomainMap, domainArraySubdomainMap)),
+      'domainset',
+      path.resolve(import.meta.dir, '../List/domainset/reject_extra.conf'),
+      path.resolve(import.meta.dir, '../Clash/domainset/reject_extra.txt')
+    ),
     compareAndWriteFile(
       span,
       rejectDomainsStats.map(([domain, count]) => `${domain}${' '.repeat(100 - domain.length)}${count}`),

+ 63 - 52
Build/constants/reject-data-source.ts

@@ -9,29 +9,28 @@ export const HOSTS: HostsSource[] = [
     true,
     TTL.THREE_HOURS()
   ],
-  // Dan Pollock's hosts file, 0.0.0.0 version is 30 KiB smaller
-  ['https://someonewhocares.org/hosts/zero/hosts', null, true, TTL.THREE_HOURS()],
+
   // no coin list is not actively maintained, but it updates daily when being maintained, so we set a 3 days cache ttl
   ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', null, true, TTL.THREE_DAYS()],
   // have not been updated for more than a year, so we set a 14 days cache ttl
   ['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', null, true, TTL.TWO_WEEKS()],
   ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Xiaomi-Extension.txt', null, false, TTL.THREE_DAYS()],
   ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-AdBlock.txt', null, false, TTL.THREE_DAYS()],
-  // ad-wars is not actively maintained, so we set a 7 days cache ttl
-  ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.ONE_WEEK()],
   ['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', null, true, TTL.THREE_HOURS()]
 ] as const;
 
+export const HOSTS_EXTRA: HostsSource[] = [
+  // Dan Pollock's hosts file, 0.0.0.0 version is 30 KiB smaller
+  ['https://someonewhocares.org/hosts/zero/hosts', null, true, TTL.THREE_HOURS()],
+  // ad-wars is not actively maintained, so we set a 7 days cache ttl
+  ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.ONE_WEEK()]
+];
+
 export const DOMAIN_LISTS: HostsSource[] = [
   // CoinBlockerList
   // Although the hosts file is still actively maintained, the hosts_browser file is not updated since 2021-07, so we set a 14 days cache ttl
   ['https://zerodot1.gitlab.io/CoinBlockerLists/list_browser.txt', [], true, TTL.TWO_WEEKS()],
-  // BarbBlock
-  // The barbblock list has never been updated since 2019-05, so we set a 14 days cache ttl
-  ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', [], true, TTL.TWO_WEEKS()],
-  // DigitalSide Threat-Intel - OSINT Hub
-  // Update once per day
-  ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', [], true, TTL.ONE_DAY()],
+
   // Curben's PUP Domains Blocklist
   // 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
   // 'https://pup-filter.pages.dev/pup-filter-agh.txt'
@@ -52,7 +51,16 @@ export const DOMAIN_LISTS: HostsSource[] = [
       'https://malware-filter.gitlab.io/malware-filter/urlhaus-filter-domains.txt'
     ],
     true, TTL.THREE_HOURS()
-  ],
+  ]
+] as const;
+
+export const DOMAIN_LISTS_EXTRA: HostsSource[] = [
+  // BarbBlock
+  // The barbblock list has never been updated since 2019-05, so we set a 14 days cache ttl
+  ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', [], true, TTL.TWO_WEEKS()],
+  // DigitalSide Threat-Intel - OSINT Hub
+  // Update once per day
+  ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', [], true, TTL.ONE_DAY()],
   // AdGuard CNAME Filter Combined
   // Update on a 7 days basis, so we add a 3 hours cache ttl
   ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', [], true, TTL.THREE_DAYS()],
@@ -60,9 +68,9 @@ export const DOMAIN_LISTS: HostsSource[] = [
   ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', [], true, TTL.THREE_DAYS()],
   ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', [], true, TTL.THREE_DAYS()],
   ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', [], true, TTL.THREE_DAYS()]
-] as const;
+];
 
-export const PHISHING_DOMAIN_LISTS: [HostsSource, HostsSource] = [
+export const PHISHING_DOMAIN_LISTS_EXTRA: [HostsSource, HostsSource] = [
   [
     'https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt',
     [
@@ -114,6 +122,46 @@ export const ADGUARD_FILTERS: AdGuardFilterSource[] = [
     ],
     TTL.TWLVE_HOURS()
   ],
+  // AdGuard Base Filter
+  ['https://filters.adtidy.org/extension/ublock/filters/2_without_easylist.txt', null, TTL.THREE_HOURS()],
+  // AdGuard Mobile AD
+  ['https://filters.adtidy.org/extension/ublock/filters/11_optimized.txt', null, TTL.THREE_HOURS()],
+  // AdGuard Tracking Protection
+  ['https://filters.adtidy.org/extension/ublock/filters/3_optimized.txt', null, TTL.THREE_HOURS()],
+  // AdGuard Chinese filter (EasyList China + AdGuard Chinese filter)
+  ['https://filters.adtidy.org/extension/ublock/filters/224_optimized.txt', null, TTL.THREE_HOURS()],
+  // AdGuard Annoyances filter
+  ['https://filters.adtidy.org/android/filters/14_optimized.txt', null, TTL.THREE_HOURS()],
+  // GameConsoleAdblockList
+  // Update almost once per 1 to 3 months, let's set a 10 days cache ttl
+  ['https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt', null, TTL.TEN_DAYS()],
+  // PiHoleBlocklist
+  // Update almost once per 3 months, let's set a 10 days cache ttl
+  [
+    'https://perflyst.github.io/PiHoleBlocklist/SmartTV-AGH.txt',
+    [
+      'https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt'
+    ],
+    TTL.TEN_DAYS()
+  ],
+  // Spam404
+  // Not actively maintained, let's use a 10 days cache ttl
+  ['https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt', null, TTL.TEN_DAYS()],
+  // Brave First Party & First Party CNAME
+  ['https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt', null, TTL.ONE_DAY()]
+] as const;
+
+export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [
+  // EasyList Germany filter
+  [
+    'https://easylist.to/easylistgermany/easylistgermany.txt',
+    [
+      'https://easylist-downloads.adblockplus.org/easylistgermany.txt'
+    ],
+    TTL.TWLVE_HOURS()
+  ],
+  // AdGuard Japanese filter
+  ['https://filters.adtidy.org/extension/ublock/filters/7_optimized.txt', null, TTL.THREE_HOURS()],
   // uBlock Origin Filter List
   [
     'https://ublockorigin.github.io/uAssetsCDN/filters/filters.min.txt',
@@ -152,45 +200,8 @@ export const ADGUARD_FILTERS: AdGuardFilterSource[] = [
       'https://ublockorigin.pages.dev/filters/unbreak.min.txt'
     ],
     TTL.THREE_HOURS()
-  ],
-  // AdGuard Base Filter
-  ['https://filters.adtidy.org/extension/ublock/filters/2_without_easylist.txt', null, TTL.THREE_HOURS()],
-  // AdGuard Mobile AD
-  ['https://filters.adtidy.org/extension/ublock/filters/11_optimized.txt', null, TTL.THREE_HOURS()],
-  // AdGuard Tracking Protection
-  ['https://filters.adtidy.org/extension/ublock/filters/3_optimized.txt', null, TTL.THREE_HOURS()],
-  // AdGuard Japanese filter
-  ['https://filters.adtidy.org/extension/ublock/filters/7_optimized.txt', null, TTL.THREE_HOURS()],
-  // AdGuard Chinese filter (EasyList China + AdGuard Chinese filter)
-  ['https://filters.adtidy.org/extension/ublock/filters/224_optimized.txt', null, TTL.THREE_HOURS()],
-  // AdGuard Annoyances filter
-  ['https://filters.adtidy.org/android/filters/14_optimized.txt', null, TTL.THREE_HOURS()],
-  // EasyList Germany filter
-  [
-    'https://easylist.to/easylistgermany/easylistgermany.txt',
-    [
-      'https://easylist-downloads.adblockplus.org/easylistgermany.txt'
-    ],
-    TTL.TWLVE_HOURS()
-  ],
-  // GameConsoleAdblockList
-  // Update almost once per 1 to 3 months, let's set a 10 days cache ttl
-  ['https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt', null, TTL.TEN_DAYS()],
-  // PiHoleBlocklist
-  // Update almost once per 3 months, let's set a 10 days cache ttl
-  [
-    'https://perflyst.github.io/PiHoleBlocklist/SmartTV-AGH.txt',
-    [
-      'https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt'
-    ],
-    TTL.TEN_DAYS()
-  ],
-  // Spam404
-  // Not actively maintained, let's use a 10 days cache ttl
-  ['https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt', null, TTL.TEN_DAYS()],
-  // Brave First Party & First Party CNAME
-  ['https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt', null, TTL.ONE_DAY()]
-] as const;
+  ]
+];
 
 export const PREDEFINED_WHITELIST = [
   '.localhost',

+ 2 - 2
Build/lib/get-phishing-domains.ts

@@ -3,7 +3,7 @@ import * as tldts from 'tldts-experimental';
 
 import type { Span } from '../trace';
 import { appendArrayInPlaceCurried } from './append-array-in-place';
-import { PHISHING_DOMAIN_LISTS } from '../constants/reject-data-source';
+import { PHISHING_DOMAIN_LISTS_EXTRA } from '../constants/reject-data-source';
 import { looseTldtsOpt } from '../constants/loose-tldts-opt';
 import picocolors from 'picocolors';
 import createKeywordFilter from './aho-corasick';
@@ -133,7 +133,7 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
   const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => {
     const domainArr: string[] = [];
 
-    (await Promise.all(PHISHING_DOMAIN_LISTS.map(entry => processDomainLists(curSpan, ...entry))))
+    (await Promise.all(PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(curSpan, ...entry))))
       .forEach(appendArrayInPlaceCurried(domainArr));
 
     return domainArr;