浏览代码

Fix: force normalize reject domains

SukkaW 2 年之前
父节点
当前提交
53fc370774
共有 6 个文件被更改，包括 38 次插入和 36 次删除
  1. 3 3
      Build/build-reject-domainset.ts
  2. 6 3
      Build/lib/get-phishing-domains.ts
  3. 9 11
      Build/lib/parse-filter.ts
  4. 18 18
      Build/lib/reject-data-source.ts
  5. 二进制
      bun.lockb
  6. 2 1
      package.json

+ 3 - 3
Build/build-reject-domainset.ts

@@ -32,12 +32,12 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
     const [gorhill] = await Promise.all([
       getGorhillPublicSuffixPromise(),
       // Parse from remote hosts & domain lists
-      ...HOSTS.map(entry => processHosts(entry[0], entry[1], entry[2], entry[3]).then(hosts => {
+      ...HOSTS.map(entry => processHosts(entry[0], entry[1], entry[2]).then(hosts => {
         hosts.forEach(host => {
           domainSets.add(host);
         });
       })),
-      ...DOMAIN_LISTS.map(entry => processDomainLists(entry[0], entry[1], entry[2], entry[3])),
+      ...DOMAIN_LISTS.map(entry => processDomainLists(entry[0], entry[1], entry[2])),
       ...ADGUARD_FILTERS.map(input => {
         const promise = typeof input === 'string'
           ? processFilterRules(input)
@@ -154,7 +154,7 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
       dudupedDominArray.reduce<Record<string, number>>((acc, cur) => {
         const suffix = tldts.getDomain(cur, { allowPrivateDomains: false, detectIp: false, validateHostname: false });
         if (suffix) {
-          acc[suffix] = (acc[suffix] ?? 0) + 1;
+          acc[suffix] = (acc[suffix] || 0) + 1;
         }
         return acc;
       }, {})

+ 6 - 3
Build/lib/get-phishing-domains.ts

@@ -6,6 +6,7 @@ import { createTrie } from './trie';
 import { createCachedGorhillGetDomain } from './cached-tld-parse';
 import { processLine } from './process-line';
 import { TTL } from './cache-filesystem';
+import { isCI } from 'ci-info';
 
 const WHITELIST_DOMAIN = new Set([
   'w3s.link',
@@ -85,11 +86,13 @@ const BLACK_TLD = new Set([
 
 export const getPhishingDomains = () => traceAsync('get phishing domains', async () => {
   const [domainSet, domainSet2, gorhill] = await Promise.all([
-    processDomainLists('https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt', true, false, TTL.THREE_HOURS()),
-    processDomainLists('https://phishing.army/download/phishing_army_blocklist.txt', true, true, TTL.THREE_HOURS()),
+    processDomainLists('https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt', true, TTL.THREE_HOURS()),
+    isCI
+      ? processDomainLists('https://phishing.army/download/phishing_army_blocklist.txt', true, TTL.THREE_HOURS())
+      : null,
     getGorhillPublicSuffixPromise()
   ]);
-  domainSet2.forEach((domain) => domainSet.add(domain));
+  domainSet2?.forEach((domain) => domainSet.add(domain));
 
   traceSync.skip('* whitelisting phishing domains', () => {
     const trieForRemovingWhiteListed = createTrie(domainSet);

+ 9 - 11
Build/lib/parse-filter.ts

@@ -11,10 +11,10 @@ import { normalizeDomain } from './normalize-domain';
 import { fetchAssets } from './fetch-assets';
 import { deserializeSet, fsCache, serializeSet } from './cache-filesystem';
 
-const DEBUG_DOMAIN_TO_FIND: string | null = '.j3.4z0vc.chileinsumos.cl'; // example.com | null
+const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
 let foundDebugDomain = false;
 
-export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, ttl: number | null = null) {
+export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) {
   return traceAsync(`- processDomainLists: ${domainListsUrl}`, () => fsCache.apply(
     domainListsUrl,
     async () => {
@@ -23,11 +23,7 @@ export function processDomainLists(domainListsUrl: string, includeAllSubDomain =
       for await (const line of await fetchRemoteTextByLine(domainListsUrl)) {
         let domainToAdd = processLine(line);
         if (!domainToAdd) continue;
-
-        if (!skipDomainCheck) {
-          domainToAdd = normalizeDomain(domainToAdd);
-        }
-
+        domainToAdd = normalizeDomain(domainToAdd);
         if (!domainToAdd) continue;
 
         if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
@@ -48,7 +44,7 @@ export function processDomainLists(domainListsUrl: string, includeAllSubDomain =
     }
   ));
 }
-export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, ttl: number | null = null) {
+export function processHosts(hostsUrl: string, includeAllSubDomain = false, ttl: number | null = null) {
   return traceAsync(`- processHosts: ${hostsUrl}`, () => fsCache.apply(
     hostsUrl,
     async () => {
@@ -71,10 +67,12 @@ export function processHosts(hostsUrl: string, includeAllSubDomain = false, skip
           foundDebugDomain = true;
         }
 
-        const domainToAdd = skipDomainCheck ? _domain : normalizeDomain(_domain);
-        if (domainToAdd) {
-          domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
+        const domainToAdd = normalizeDomain(_domain);
+        if (!domainToAdd) {
+          continue;
         }
+
+        domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
       }
 
       console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size));

+ 18 - 18
Build/lib/reject-data-source.ts

@@ -1,21 +1,21 @@
 import { TTL } from './cache-filesystem';
 
 export const HOSTS = [
-  ['https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext', true, false, TTL.THREE_HOURS()],
-  ['https://someonewhocares.org/hosts/hosts', true, false, TTL.THREE_HOURS()],
+  ['https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext', true, TTL.THREE_HOURS()],
+  ['https://someonewhocares.org/hosts/hosts', true, TTL.THREE_HOURS()],
   // no coin list is not actively maintained, but it updates daily when being maintained, so we set a 3 days cache ttl
-  ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', false, false, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', false, TTL.THREE_DAYS()],
   // this list has not been updated for more than a year, so we set a 14 days cache ttl
-  ['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', true, false, TTL.TWO_WEEKS()],
-  ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Xiaomi-Extension.txt', false, false, TTL.THREE_DAYS()],
-  ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-AdBlock.txt', false, false, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', true, TTL.TWO_WEEKS()],
+  ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Xiaomi-Extension.txt', false, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-AdBlock.txt', false, TTL.THREE_DAYS()],
   // ad-wars is not actively maintained, so we set a 7 days cache ttl
-  ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', false, false, TTL.ONE_WEEK()],
-  ['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true, false, TTL.THREE_HOURS()],
+  ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', false, TTL.ONE_WEEK()],
+  ['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true, TTL.THREE_HOURS()],
   // Curben's UrlHaus Malicious URL Blocklist
   // 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-agh-online.txt',
   // 'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt',
-  ['https://curbengh.github.io/urlhaus-filter/urlhaus-filter-hosts.txt', true, true, TTL.THREE_HOURS()],
+  ['https://curbengh.github.io/urlhaus-filter/urlhaus-filter-hosts.txt', true, TTL.THREE_HOURS()],
   // Curben's Phishing URL Blocklist
   // Covered by lib/get-phishing-domains.ts
   // 'https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt'
@@ -25,26 +25,26 @@ export const HOSTS = [
   // 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
   // 'https://pup-filter.pages.dev/pup-filter-agh.txt'
   // The PUP filter has paused the update since 2023-05, so we set a 14 days cache ttl
-  ['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true, TTL.TWO_WEEKS()]
+  ['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, TTL.TWO_WEEKS()]
 ] as const;
 
 export const DOMAIN_LISTS = [
   // CoinBlockerList
   // Although the hosts file is still actively maintained, the hosts_browser file has not been updated since 2021-07, so we set a 14 days cache ttl
-  ['https://zerodot1.gitlab.io/CoinBlockerLists/list_browser.txt', true, true, TTL.TWO_WEEKS()],
+  ['https://zerodot1.gitlab.io/CoinBlockerLists/list_browser.txt', true, TTL.TWO_WEEKS()],
   // BarbBlock
   // The barbblock list has never been updated since 2019-05, so we set a 14 days cache ttl
-  ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, true, TTL.TWO_WEEKS()],
+  ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, TTL.TWO_WEEKS()],
   // DigitalSide Threat-Intel - OSINT Hub
   // Update once per day
-  ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true, true, TTL.ONE_DAY()],
+  ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true, TTL.ONE_DAY()],
   // AdGuard CNAME Filter Combined
   // Updated on a 7 days basis, so we set a 3 days cache ttl
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, true, TTL.THREE_DAYS()],
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, true, TTL.THREE_DAYS()],
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, true, TTL.THREE_DAYS()],
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, true, TTL.THREE_DAYS()],
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, true, TTL.THREE_DAYS()]
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, TTL.THREE_DAYS()]
 ] as const;
 
 export const ADGUARD_FILTERS = [

二进制
bun.lockb


+ 2 - 1
package.json

@@ -34,8 +34,9 @@
     "@eslint-sukka/node": "4.1.10-beta.2",
     "@eslint-sukka/ts": "4.1.10-beta.2",
     "@types/async-retry": "^1.4.8",
+    "@types/bun": "^1.0.0",
     "@types/tar-stream": "^3.1.3",
-    "bun-types": "^1.0.18-1",
+    "bun-types": "^1.0.21",
     "eslint": "^8.56.0",
     "eslint-config-sukka": "4.1.10-beta.2",
     "eslint-formatter-sukka": "4.1.9",