浏览代码

Update Reject Infra & Data Source

SukkaW 1 年之前
父节点
当前提交
0d1fddcb81

+ 21 - 29
Build/build-reject-domainset.ts

@@ -4,7 +4,7 @@ import process from 'node:process';
 
 import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter';
 
-import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA } from './constants/reject-data-source';
+import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA, ADGUARD_FILTERS_WHITELIST } from './constants/reject-data-source';
 import { compareAndWriteFile } from './lib/create-file';
 import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
 import { task } from './trace';
@@ -44,7 +44,8 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as
       ...HOSTS_EXTRA.map(host => ` - ${host[0]}`),
       ...DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`),
       ...ADGUARD_FILTERS_EXTRA.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`),
-      ...PHISHING_DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`)
+      ...PHISHING_DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`),
+      ...PHISHING_HOSTS_EXTRA.map(host => ` - ${host[0]}`)
     ]);
 
   const appendArrayToRejectOutput = rejectOutput.addFromDomainset.bind(rejectOutput);
@@ -91,18 +92,25 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as
               appendArrayToRejectExtraOutput(black);
             })
         ),
-
-        ([
-          'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exceptions.txt',
-          'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exclusions.txt'
-        ].map(
-          input => processFilterRules(childSpan, input).then(({ white, black }) => {
-            setAddFromArray(filterRuleWhitelistDomainSets, white);
-            setAddFromArray(filterRuleWhitelistDomainSets, black);
-          })
-        )),
+        ADGUARD_FILTERS_WHITELIST.map(entry => processFilterRules(childSpan, ...entry).then(({ white, black }) => {
+          setAddFromArray(filterRuleWhitelistDomainSets, white);
+          setAddFromArray(filterRuleWhitelistDomainSets, black);
+        })),
         getPhishingDomains(childSpan).then(appendArrayToRejectExtraOutput),
-        readFileIntoProcessedArray(path.join(SOURCE_DIR, 'domainset/reject_sukka.conf')).then(appendArrayToRejectOutput)
+        readFileIntoProcessedArray(path.join(SOURCE_DIR, 'domainset/reject_sukka.conf')).then(appendArrayToRejectOutput),
+        // Dedupe domainSets
+        span.traceChildAsync('collect black keywords/suffixes', async () => {
+          /** Collect DOMAIN-KEYWORD and DOMAIN-SUFFIX entries from non_ip/reject.conf for deduplication */
+          for await (const line of readFileByLine(path.resolve(__dirname, '../Source/non_ip/reject.conf'))) {
+            const [type, value] = line.split(',');
+            if (type === 'DOMAIN-KEYWORD') {
+              rejectOutput.addDomainKeyword(value); // Add for later deduplication
+              rejectExtraOutput.addDomainKeyword(value); // Add for later deduplication
+            } else if (type === 'DOMAIN-SUFFIX') {
+              filterRuleWhitelistDomainSets.add('.' + value);
+            }
+          }
+        })
       ].flat());
       // eslint-disable-next-line sukka/no-single-return -- not single return
       return shouldStop;
@@ -112,22 +120,6 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as
     process.exit(1);
   }
 
-  // Dedupe domainSets
-  await span.traceChildAsync('collect black keywords/suffixes', async () => {
-    /** Collect DOMAIN-KEYWORD from non_ip/reject.conf for deduplication */
-    for await (const line of readFileByLine(path.resolve(__dirname, '../Source/non_ip/reject.conf'))) {
-      const [type, value] = line.split(',');
-
-      if (type === 'DOMAIN-KEYWORD') {
-        rejectOutput.addDomainKeyword(value); // Add for later deduplication
-        rejectExtraOutput.addDomainKeyword(value); // Add for later deduplication
-      } else if (type === 'DOMAIN-SUFFIX') {
-        rejectOutput.whitelistDomain('.' + value); // Add for later deduplication
-        rejectExtraOutput.whitelistDomain('.' + value); // Add for later deduplication
-      }
-    }
-  });
-
   await Promise.all([
     rejectOutput.done(),
     rejectExtraOutput.done()

+ 59 - 32
Build/constants/reject-data-source.ts

@@ -3,14 +3,12 @@ import { TTL } from '../lib/cache-filesystem';
 type HostsSource = [main: string, mirrors: string[] | null, includeAllSubDomain: boolean, ttl: number];
 
 export const HOSTS: HostsSource[] = [
-  // no coin list is not actively maintained, but it updates daily when being maintained, so we set a 3 days cache ttl
-  ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', null, true, TTL.THREE_DAYS()],
   // have not been updated for more than a year, so we set a 14 days cache ttl
   ['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', null, true, TTL.TWO_WEEKS()],
   ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Xiaomi-Extension.txt', null, false, TTL.ONE_WEEK()],
   ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-AdBlock.txt', null, false, TTL.ONE_WEEK()],
   ['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', null, true, TTL.THREE_HOURS()],
-  ['https://raw.githubusercontent.com/durablenapkin/block/refs/heads/master/tvstream.txt', null, true, TTL.THREE_HOURS()]
+  ['https://raw.githubusercontent.com/durablenapkin/block/master/tvstream.txt', null, true, TTL.THREE_HOURS()]
 ];
 
 export const HOSTS_EXTRA: HostsSource[] = [
@@ -24,7 +22,7 @@ export const HOSTS_EXTRA: HostsSource[] = [
   // Dan Pollock's hosts file, 0.0.0.0 version is 30 KiB smaller
   ['https://someonewhocares.org/hosts/zero/hosts', null, true, TTL.THREE_HOURS()],
   // ad-wars is not actively maintained, so we set a 7 days cache ttl
-  ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.ONE_WEEK()]
+  ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.TWO_WEEKS()]
 ];
 
 export const DOMAIN_LISTS: HostsSource[] = [
@@ -86,9 +84,12 @@ export const PHISHING_DOMAIN_LISTS_EXTRA: HostsSource[] = [
     'https://phishing.army/download/phishing_army_blocklist.txt',
     [],
     true, TTL.THREE_HOURS()
-  ],
+  ]
+];
+
+export const PHISHING_HOSTS_EXTRA: HostsSource[] = [
   [
-    'https://raw.githubusercontent.com/durablenapkin/scamblocklist/refs/heads/master/hosts.txt',
+    'https://raw.githubusercontent.com/durablenapkin/scamblocklist/master/hosts.txt',
     [],
     true, TTL.TWLVE_HOURS()
   ]
@@ -97,14 +98,16 @@ export const PHISHING_DOMAIN_LISTS_EXTRA: HostsSource[] = [
 type AdGuardFilterSource = [main: string, mirrors: string[] | null, ttl: number, allowThirdParty?: boolean];
 
 export const ADGUARD_FILTERS: AdGuardFilterSource[] = [
+  // The nocoin list's AdGuard-format filter is better maintained than its hosts-format file
+  ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/nocoin.txt', [], TTL.TWO_WEEKS()],
   // EasyList
   [
     'https://easylist.to/easylist/easylist.txt',
     [
-      'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easylist.txt',
-      'https://ublockorigin.pages.dev/thirdparties/easylist.txt',
       'https://easylist-downloads.adblockplus.org/easylist.txt',
       'https://secure.fanboy.co.nz/easylist.txt',
+      'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easylist.txt',
+      'https://ublockorigin.pages.dev/thirdparties/easylist.txt',
       'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easylist.txt'
     ],
     TTL.TWLVE_HOURS()
@@ -113,11 +116,11 @@ export const ADGUARD_FILTERS: AdGuardFilterSource[] = [
   [
     'https://easylist.to/easylist/easyprivacy.txt',
     [
-      'https://secure.fanboy.co.nz/easyprivacy.txt',
-      'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easyprivacy.txt',
       'https://easylist-downloads.adblockplus.org/easyprivacy.txt',
+      'https://secure.fanboy.co.nz/easyprivacy.txt',
       'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easyprivacy.txt',
-      'https://ublockorigin.pages.dev/thirdparties/easyprivacy.txt'
+      'https://ublockorigin.pages.dev/thirdparties/easyprivacy.txt',
+      'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easyprivacy.txt'
     ],
     TTL.TWLVE_HOURS()
   ],
@@ -150,23 +153,46 @@ export const ADGUARD_FILTERS: AdGuardFilterSource[] = [
     ],
     TTL.TEN_DAYS()
   ],
-  // Brave First Party & First Party CNAME
-  ['https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt', null, TTL.ONE_DAY()]
+  // uBlock Origin Unbreak
+  [
+    'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.min.txt',
+    [
+      'https://ublockorigin.pages.dev/filters/unbreak.min.txt'
+    ],
+    TTL.THREE_HOURS()
+  ]
 ];
 
-export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [
-  // AdGuard Annoyances filter
-  ['https://filters.adtidy.org/android/filters/14_optimized.txt', null, TTL.THREE_HOURS(), true],
-  // AdGuard Cookie Notices
-  ['https://filters.adtidy.org/extension/ublock/filters/18_optimized.txt', null, TTL.THREE_HOURS(), true],
-  // EasyList Germany filter
+export const ADGUARD_FILTERS_WHITELIST: AdGuardFilterSource[] = [
   [
-    'https://easylist.to/easylistgermany/easylistgermany.txt',
+    'https://adguardteam.github.io/AdGuardSDNSFilter/Filters/exceptions.txt',
     [
-      'https://easylist-downloads.adblockplus.org/easylistgermany.txt'
+      'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exceptions.txt'
     ],
-    TTL.TWLVE_HOURS()
+    TTL.THREE_HOURS()
   ],
+  [
+    'https://adguardteam.github.io/AdGuardSDNSFilter/Filters/exclusions.txt',
+    [
+      'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exclusions.txt'
+    ],
+    TTL.THREE_HOURS()
+  ]
+];
+
+export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [
+  // AdGuard Annoyances filter
+  ['https://filters.adtidy.org/extension/ublock/filters/14_optimized.txt', null, TTL.THREE_HOURS(), true],
+  // AdGuard Cookie Notices, included in Annoyances filter
+  // ['https://filters.adtidy.org/extension/ublock/filters/18_optimized.txt', null, TTL.THREE_HOURS(), true],
+  // EasyList Germany filter, not even included in extra for now
+  // [
+  //   'https://easylist.to/easylistgermany/easylistgermany.txt',
+  //   [
+  //     'https://easylist-downloads.adblockplus.org/easylistgermany.txt'
+  //   ],
+  //   TTL.TWLVE_HOURS()
+  // ],
   // AdGuard Japanese filter
   ['https://filters.adtidy.org/extension/ublock/filters/7_optimized.txt', null, TTL.THREE_HOURS()],
   // uBlock Origin Filter List
@@ -177,8 +203,8 @@ export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [
     ],
     TTL.THREE_HOURS()
   ],
-  // AdGuard Popup Overlay
-  ['https://filters.adtidy.org/extension/ublock/filters/19_optimized.txt', null, TTL.THREE_HOURS(), true],
+  // AdGuard Popup Overlay - included in Annoyances filter
+  // ['https://filters.adtidy.org/extension/ublock/filters/19_optimized.txt', null, TTL.THREE_HOURS(), true],
   // AdGuard Mobile Banner
   // almost all generic rule
   // ['https://filters.adtidy.org/extension/ublock/filters/20_optimized.txt', null, TTL.THREE_HOURS()],
@@ -205,14 +231,6 @@ export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [
   //     'https://ublockorigin.pages.dev/filters/resource-abuse.txt'
   //   ]
   // ],
-  // uBlock Origin Unbreak
-  [
-    'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.min.txt',
-    [
-      'https://ublockorigin.pages.dev/filters/unbreak.min.txt'
-    ],
-    TTL.THREE_HOURS()
-  ],
   // uBlock Origin Annoyances
   [
     'https://ublockorigin.github.io/uAssetsCDN/filters/annoyances.min.txt',
@@ -229,6 +247,8 @@ export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [
     ],
     TTL.THREE_HOURS()
   ],
+  // Dandelion Sprout's Annoyances
+  ['https://filters.adtidy.org/extension/ublock/filters/250_optimized.txt', null, TTL.THREE_HOURS(), true],
   // EasyList - Newsletters
   [
     'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easylist-newsletters.txt',
@@ -253,6 +273,12 @@ export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [
       'https://secure.fanboy.co.nz/fanboy-cookiemonster_ubo.txt'
     ],
     TTL.TWLVE_HOURS()
+  ],
+  // Bypass Paywall Cleaner
+  [
+    'https://gitflic.ru/project/magnolia1234/bypass-paywalls-clean-filters/blob/raw?file=bpc-paywall-filter.txt',
+    [],
+    TTL.ONE_DAY()
   ]
 ];
 
@@ -269,6 +295,7 @@ export const PREDEFINED_WHITELIST = [
   '.ip6-allhosts',
   '.mcastprefix',
   '.skk.moe',
+  '.cdn.cloudflare.net', // Surge/Clash doesn't support CNAME
   'analytics.google.com',
   '.cloud.answerhub.com',
   'ae01.alicdn.com',

+ 4 - 2
Build/lib/get-phishing-domains.ts

@@ -1,10 +1,10 @@
-import { processDomainLists } from './parse-filter';
+import { processDomainLists, processHosts } from './parse-filter';
 import * as tldts from 'tldts-experimental';
 
 import { dummySpan } from '../trace';
 import type { Span } from '../trace';
 import { appendArrayInPlaceCurried } from './append-array-in-place';
-import { PHISHING_DOMAIN_LISTS_EXTRA } from '../constants/reject-data-source';
+import { PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } from '../constants/reject-data-source';
 import { loosTldOptWithPrivateDomains } from '../constants/loose-tldts-opt';
 import picocolors from 'picocolors';
 import createKeywordFilter from './aho-corasick';
@@ -162,6 +162,8 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
 
     (await Promise.all(PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(curSpan, ...entry, cacheKey))))
       .forEach(appendArrayInPlaceCurried(domainArr));
+    (await Promise.all(PHISHING_HOSTS_EXTRA.map(entry => processHosts(curSpan, ...entry, cacheKey))))
+      .forEach(appendArrayInPlaceCurried(domainArr));
 
     return domainArr;
   });

+ 11 - 3
Build/lib/parse-filter.ts

@@ -45,7 +45,11 @@ const domainListLineCb = (l: string, set: string[], includeAllSubDomain: boolean
 
 const cacheKey = createCacheKey(__filename);
 
-export function processDomainLists(span: Span, domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, ttl: number | null = null, extraCacheKey: (input: string) => string = identity) {
+export function processDomainLists(
+  span: Span,
+  domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false,
+  ttl: number | null = null, extraCacheKey: (input: string) => string = identity
+) {
   return span.traceChild(`process domainlist: ${domainListsUrl}`).traceAsyncFn((childSpan) => fsFetchCache.apply(
     extraCacheKey(cacheKey(domainListsUrl)),
     async () => {
@@ -100,9 +104,13 @@ const hostsLineCb = (l: string, set: string[], includeAllSubDomain: boolean, met
   set.push(includeAllSubDomain ? `.${domain}` : domain);
 };
 
-export function processHosts(span: Span, hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, ttl: number | null = null) {
+export function processHosts(
+  span: Span,
+  hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false,
+  ttl: number | null = null, extraCacheKey: (input: string) => string = identity
+) {
   return span.traceChild(`processhosts: ${hostsUrl}`).traceAsyncFn((childSpan) => fsFetchCache.apply(
-    cacheKey(hostsUrl),
+    extraCacheKey(cacheKey(hostsUrl)),
     async () => {
       const domainSets: string[] = [];
 

+ 1 - 1
Build/lib/rules/ruleset.ts

@@ -214,8 +214,8 @@ export class RulesetOutput extends RuleOutput<Preprocessed> {
       }
     }
 
-    console.error(picocolors.bold('Parsed Failed'));
     if (parsedFailures.length > 0) {
+      console.error(picocolors.bold('Parsed Failed'));
       console.table(parsedFailures);
     }