ソースを参照

Housekeeping [skip ci]

SukkaW 8 ヶ月 前
コミット
3d1514f0d1
5 ファイル変更、459 行追加、438 行削除
  1. 29 26
      Build/build-microsoft-cdn.ts
  2. 37 40
      Build/lib/create-file.worker.ts
  3. 147 143
      Build/lib/get-phishing-domains.ts
  4. 8 8
      package.json
  5. 238 221
      pnpm-lock.yaml

+ 29 - 26
Build/build-microsoft-cdn.ts

@@ -5,41 +5,44 @@ import { RulesetOutput } from './lib/rules/ruleset';
 import Worktank from 'worktank';
 
 const pool = new Worktank({
-  name: 'build-internal-reverse-chn-cidr',
-  size: 1,
-  timeout: 10000, // The maximum number of milliseconds to wait for the result from the worker, if exceeded the worker is terminated and the execution promise rejects
-  warmup: true,
-  autoterminate: 30000, // The interval of milliseconds at which to check if the pool can be automatically terminated, to free up resources, workers will be spawned up again if needed
-  env: {},
-  methods: {
+  pool: {
+    name: 'build-internal-reverse-chn-cidr',
+    size: 1 // The number of workers to keep in the pool, if more workers are needed they will be spawned up to this limit
+  },
+  worker: {
+    autoAbort: 10000,
+    autoTerminate: 30000, // The interval of milliseconds at which to check if the pool can be automatically terminated, to free up resources, workers will be spawned up again if needed
+    autoInstantiate: true,
+    methods: {
     // eslint-disable-next-line object-shorthand -- workertank
-    getMicrosoftCdnRuleset: async function (importMetaUrl: string): Promise<[domains: string[], domainSuffixes: string[]]> {
+      getMicrosoftCdnRuleset: async function (importMetaUrl: string): Promise<[domains: string[], domainSuffixes: string[]]> {
       // TODO: createRequire is a temporary workaround for https://github.com/nodejs/node/issues/51956
-      const { default: module } = await import('node:module');
-      const __require = module.createRequire(importMetaUrl);
+        const { default: module } = await import('node:module');
+        const __require = module.createRequire(importMetaUrl);
 
-      const { HostnameSmolTrie } = __require('./lib/trie');
-      const { PROBE_DOMAINS, DOMAINS, DOMAIN_SUFFIXES, BLACKLIST } = __require('./constants/microsoft-cdn') as typeof import('./constants/microsoft-cdn');
-      const { fetchRemoteTextByLine } = __require('./lib/fetch-text-by-line') as typeof import('./lib/fetch-text-by-line');
-      const { appendArrayInPlace } = __require('foxts/append-array-in-place') as typeof import('foxts/append-array-in-place');
-      const { extractDomainsFromFelixDnsmasq } = __require('./lib/parse-dnsmasq') as typeof import('./lib/parse-dnsmasq');
+        const { HostnameSmolTrie } = __require('./lib/trie');
+        const { PROBE_DOMAINS, DOMAINS, DOMAIN_SUFFIXES, BLACKLIST } = __require('./constants/microsoft-cdn') as typeof import('./constants/microsoft-cdn');
+        const { fetchRemoteTextByLine } = __require('./lib/fetch-text-by-line') as typeof import('./lib/fetch-text-by-line');
+        const { appendArrayInPlace } = __require('foxts/append-array-in-place') as typeof import('foxts/append-array-in-place');
+        const { extractDomainsFromFelixDnsmasq } = __require('./lib/parse-dnsmasq') as typeof import('./lib/parse-dnsmasq');
 
-      const trie = new HostnameSmolTrie();
+        const trie = new HostnameSmolTrie();
 
-      for await (const line of await fetchRemoteTextByLine('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')) {
-        const domain = extractDomainsFromFelixDnsmasq(line);
-        if (domain) {
-          trie.add(domain);
+        for await (const line of await fetchRemoteTextByLine('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')) {
+          const domain = extractDomainsFromFelixDnsmasq(line);
+          if (domain) {
+            trie.add(domain);
+          }
         }
-      }
 
-      // remove blacklist domain from trie, to prevent them from being included in the later dump
-      BLACKLIST.forEach(black => trie.whitelist(black));
+        // remove blacklist domain from trie, to prevent them from being included in the later dump
+        BLACKLIST.forEach(black => trie.whitelist(black));
 
-      const domains: string[] = DOMAINS;
-      const domainSuffixes = appendArrayInPlace(PROBE_DOMAINS.flatMap(domain => trie.find(domain)), DOMAIN_SUFFIXES);
+        const domains: string[] = DOMAINS;
+        const domainSuffixes = appendArrayInPlace(PROBE_DOMAINS.flatMap(domain => trie.find(domain)), DOMAIN_SUFFIXES);
 
-      return [domains, domainSuffixes] as const;
+        return [domains, domainSuffixes] as const;
+      }
     }
   }
 });

+ 37 - 40
Build/lib/create-file.worker.ts

@@ -2,55 +2,52 @@ import Worktank from 'worktank';
 import os from 'node:os';
 import process from 'node:process';
 import type { Span } from '../trace';
+import { availableParallelism } from 'foxts/available-parallelism';
 
 const pool = new Worktank({
-  name: 'process-phishing-domains',
-  size: Math.max(
-    1,
-    (
-      'availableParallelism' in os
-        ? os.availableParallelism()
-        : (os as typeof import('node:os')).cpus().length
-    ) - 1
-  ),
-  timeout: 10000, // The maximum number of milliseconds to wait for the result from the worker, if exceeded the worker is terminated and the execution promise rejects
-  warmup: true,
-  autoterminate: 30000, // The interval of milliseconds at which to check if the pool can be automatically terminated, to free up resources, workers will be spawned up again if needed
-  env: {},
-  methods: {
+  pool: {
+    name: 'process-phishing-domains',
+    size: (availableParallelism(os) - 1) || 1
+  },
+  worker: {
+    autoAbort: 10000, // The maximum number of milliseconds to wait for the result from the worker, if exceeded the worker is terminated and the execution promise rejects
+    autoTerminate: 30000, // The interval of milliseconds at which to check if the pool can be automatically terminated, to free up resources, workers will be spawned up again if needed
+    env: {},
+    methods: {
     // eslint-disable-next-line object-shorthand -- workertank
-    compareAndWriteFile: async function (
-      linesA: string[], filePath: string,
-      importMetaUrl: string
-    ): Promise<void> {
-      const { default: module } = await import('node:module');
-      const __require = module.createRequire(importMetaUrl);
+      compareAndWriteFile: async function (
+        linesA: string[], filePath: string,
+        importMetaUrl: string
+      ): Promise<void> {
+        const { default: module } = await import('node:module');
+        const __require = module.createRequire(importMetaUrl);
 
-      const fs = __require('fs') as typeof import('fs');
-      const { readFileByLine } = __require('./fetch-text-by-line') as typeof import('./fetch-text-by-line');
-      const { fileEqual } = __require('./create-file') as typeof import('./create-file');
-      const path = __require('node:path') as typeof import('node:path');
-      const { fastStringArrayJoin } = __require('foxts/fast-string-array-join') as typeof import('foxts/fast-string-array-join');
-      const picocolors = __require('picocolors') as typeof import('picocolors');
+        const fs = __require('fs') as typeof import('fs');
+        const { readFileByLine } = __require('./fetch-text-by-line') as typeof import('./fetch-text-by-line');
+        const { fileEqual } = __require('./create-file') as typeof import('./create-file');
+        const path = __require('node:path') as typeof import('node:path');
+        const { fastStringArrayJoin } = __require('foxts/fast-string-array-join') as typeof import('foxts/fast-string-array-join');
+        const picocolors = __require('picocolors') as typeof import('picocolors');
 
-      let isEqual = false;
-      if (fs.existsSync(filePath)) {
-        isEqual = await fileEqual(linesA, readFileByLine(filePath));
-      } else {
-        console.log(`${filePath} does not exists, writing...`);
+        let isEqual = false;
+        if (fs.existsSync(filePath)) {
+          isEqual = await fileEqual(linesA, readFileByLine(filePath));
+        } else {
+          console.log(`${filePath} does not exists, writing...`);
         // isEqual = false; // isEqual is false by default anyway
-      }
+        }
 
-      if (isEqual) {
-        console.log(picocolors.gray(picocolors.dim(`same content, bail out writing: ${filePath}`)));
-        return;
-      }
+        if (isEqual) {
+          console.log(picocolors.gray(picocolors.dim(`same content, bail out writing: ${filePath}`)));
+          return;
+        }
 
-      const dir = path.dirname(filePath);
-      if (!fs.existsSync(dir)) {
-        fs.mkdirSync(dir, { recursive: true });
+        const dir = path.dirname(filePath);
+        if (!fs.existsSync(dir)) {
+          fs.mkdirSync(dir, { recursive: true });
+        }
+        fs.writeFileSync(filePath, fastStringArrayJoin(linesA, '\n') + '\n', { encoding: 'utf-8' });
       }
-      fs.writeFileSync(filePath, fastStringArrayJoin(linesA, '\n') + '\n', { encoding: 'utf-8' });
     }
   }
 });

+ 147 - 143
Build/lib/get-phishing-domains.ts

@@ -5,189 +5,193 @@ import type { Span } from '../trace';
 import type { TldTsParsed } from './normalize-domain';
 
 const pool = new Worktank({
-  name: 'process-phishing-domains',
-  size: 1,
-  timeout: 20000, // The maximum number of milliseconds to wait for the result from the worker, if exceeded the worker is terminated and the execution promise rejects
-  warmup: true,
-  autoterminate: 30000, // The interval of milliseconds at which to check if the pool can be automatically terminated, to free up resources, workers will be spawned up again if needed
-  env: {},
-  methods: {
-    // eslint-disable-next-line object-shorthand -- workertank
-    getPhishingDomains: async function (
-      importMetaUrl: string,
-      /** require.main === module */ isDebug = false
-    ): Promise<string[]> {
+  pool: {
+    name: 'process-phishing-domains',
+    size: 1
+  },
+  worker: {
+    autoAbort: 20000, // The maximum number of milliseconds to wait for the result from the worker, if exceeded the worker is terminated and the execution promise rejects
+    autoInstantiate: true,
+    autoTerminate: 30000, // The interval of milliseconds at which to check if the pool can be automatically terminated, to free up resources, workers will be spawned up again if needed
+    env: {},
+    methods: {
+      // eslint-disable-next-line object-shorthand -- workertank
+      getPhishingDomains: async function (
+        importMetaUrl: string,
+        /** require.main === module */ isDebug = false
+      ): Promise<string[]> {
       // TODO: createRequire is a temporary workaround for https://github.com/nodejs/node/issues/51956
-      const { default: module } = await import('node:module');
-      const __require = module.createRequire(importMetaUrl);
+        const { default: module } = await import('node:module');
+        const __require = module.createRequire(importMetaUrl);
 
-      const picocolors = __require('picocolors') as typeof import('picocolors');
-      const tldts = __require('tldts-experimental') as typeof import('tldts-experimental');
+        const picocolors = __require('picocolors') as typeof import('picocolors');
+        const tldts = __require('tldts-experimental') as typeof import('tldts-experimental');
 
-      const { appendArrayInPlaceCurried } = __require('foxts/append-array-in-place') as typeof import('foxts/append-array-in-place');
+        const { appendArrayInPlaceCurried } = __require('foxts/append-array-in-place') as typeof import('foxts/append-array-in-place');
 
-      const { loosTldOptWithPrivateDomains } = __require('../constants/loose-tldts-opt') as typeof import('../constants/loose-tldts-opt');
-      const { BLACK_TLD, WHITELIST_MAIN_DOMAINS, leathalKeywords, lowKeywords, sensitiveKeywords } = __require('../constants/phishing-score-source') as typeof import('../constants/phishing-score-source');
-      const { PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } = __require('../constants/reject-data-source') as typeof import('../constants/reject-data-source');
-      const { dummySpan } = __require('../trace') as typeof import('../trace');
-      const NullPrototypeObject = __require('null-prototype-object') as typeof import('null-prototype-object');
+        const { loosTldOptWithPrivateDomains } = __require('../constants/loose-tldts-opt') as typeof import('../constants/loose-tldts-opt');
+        const { BLACK_TLD, WHITELIST_MAIN_DOMAINS, leathalKeywords, lowKeywords, sensitiveKeywords } = __require('../constants/phishing-score-source') as typeof import('../constants/phishing-score-source');
+        const { PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } = __require('../constants/reject-data-source') as typeof import('../constants/reject-data-source');
+        const { dummySpan } = __require('../trace') as typeof import('../trace');
+        const NullPrototypeObject = __require('null-prototype-object') as typeof import('null-prototype-object');
 
-      const { processHostsWithPreload } = __require('./parse-filter/hosts') as typeof import('./parse-filter/hosts');
-      const { processDomainListsWithPreload } = __require('./parse-filter/domainlists') as typeof import('./parse-filter/domainlists');
+        const { processHostsWithPreload } = __require('./parse-filter/hosts') as typeof import('./parse-filter/hosts');
+        const { processDomainListsWithPreload } = __require('./parse-filter/domainlists') as typeof import('./parse-filter/domainlists');
 
-      const downloads = [
-        ...PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainListsWithPreload(...entry)),
-        ...PHISHING_HOSTS_EXTRA.map(entry => processHostsWithPreload(...entry))
-      ];
+        const downloads = [
+          ...PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainListsWithPreload(...entry)),
+          ...PHISHING_HOSTS_EXTRA.map(entry => processHostsWithPreload(...entry))
+        ];
 
-      const domainArr: string[] = [];
+        const domainArr: string[] = [];
 
-      const domainGroups = await Promise.all(downloads.map(task => task(dummySpan)));
-      domainGroups.forEach(appendArrayInPlaceCurried(domainArr));
+        const domainGroups = await Promise.all(downloads.map(task => task(dummySpan)));
+        domainGroups.forEach(appendArrayInPlaceCurried(domainArr));
 
-      // return domainArr;
+        // return domainArr;
 
-      const domainCountMap = new Map<string, number>();
-      const domainScoreMap: Record<string, number> = new NullPrototypeObject();
+        const domainCountMap = new Map<string, number>();
+        const domainScoreMap: Record<string, number> = new NullPrototypeObject();
 
-      let line = '';
-      let tld: string | null = '';
-      let apexDomain: string | null = '';
-      let subdomain: string | null = '';
-      let parsed: TldTsParsed;
+        let line = '';
+        let tld: string | null = '';
+        let apexDomain: string | null = '';
+        let subdomain: string | null = '';
+        let parsed: TldTsParsed;
 
-      // const set = new Set<string>();
-      // let duplicateCount = 0;
+        // const set = new Set<string>();
+        // let duplicateCount = 0;
 
-      for (let i = 0, len = domainArr.length; i < len; i++) {
-        line = domainArr[i];
+        for (let i = 0, len = domainArr.length; i < len; i++) {
+          line = domainArr[i];
 
-        // if (set.has(line)) {
-        //   duplicateCount++;
-        // } else {
-        //   set.add(line);
-        // }
+          // if (set.has(line)) {
+          //   duplicateCount++;
+          // } else {
+          //   set.add(line);
+          // }
 
-        parsed = tldts.parse(line, loosTldOptWithPrivateDomains);
-        if (parsed.isPrivate) {
-          continue;
-        }
+          parsed = tldts.parse(line, loosTldOptWithPrivateDomains);
+          if (parsed.isPrivate) {
+            continue;
+          }
 
-        tld = parsed.publicSuffix;
-        apexDomain = parsed.domain;
+          tld = parsed.publicSuffix;
+          apexDomain = parsed.domain;
 
-        if (!tld) {
-          console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld });
-          continue;
-        }
-        if (!apexDomain) {
-          console.log(picocolors.yellow('[phishing domains] E0002'), 'missing domain', { line, apexDomain });
-          continue;
-        }
-        if (WHITELIST_MAIN_DOMAINS.has(apexDomain)) {
-          continue;
-        }
-
-        domainCountMap.set(
-          apexDomain,
-          domainCountMap.has(apexDomain)
-            ? domainCountMap.get(apexDomain)! + 1
-            : 1
-        );
-
-        let score = 0;
-
-        if (apexDomain in domainScoreMap) {
-          score = domainScoreMap[apexDomain];
-        } else {
-          if (BLACK_TLD.has(tld)) {
-            score += 3;
-          } else if (tld.length > 6) {
-            score += 2;
+          if (!tld) {
+            console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld });
+            continue;
           }
-          if (apexDomain.length >= 18) {
-            score += 0.5;
+          if (!apexDomain) {
+            console.log(picocolors.yellow('[phishing domains] E0002'), 'missing domain', { line, apexDomain });
+            continue;
+          }
+          if (WHITELIST_MAIN_DOMAINS.has(apexDomain)) {
+            continue;
           }
-        }
 
-        subdomain = parsed.subdomain;
+          domainCountMap.set(
+            apexDomain,
+            domainCountMap.has(apexDomain)
+              ? domainCountMap.get(apexDomain)! + 1
+              : 1
+          );
+
+          let score = 0;
+
+          if (apexDomain in domainScoreMap) {
+            score = domainScoreMap[apexDomain];
+          } else {
+            if (BLACK_TLD.has(tld)) {
+              score += 3;
+            } else if (tld.length > 6) {
+              score += 2;
+            }
+            if (apexDomain.length >= 18) {
+              score += 0.5;
+            }
+          }
 
-        if (subdomain) {
-          score += calcDomainAbuseScore(subdomain, line);
-        }
+          subdomain = parsed.subdomain;
 
-        domainScoreMap[apexDomain] = score;
-      }
+          if (subdomain) {
+            score += calcDomainAbuseScore(subdomain, line);
+          }
 
-      domainCountMap.forEach((count, apexDomain) => {
-        const score = domainScoreMap[apexDomain];
-        if (
-        // !WHITELIST_MAIN_DOMAINS.has(apexDomain)
-          (score >= 24)
-          || (score >= 16 && count >= 7)
-          || (score >= 13 && count >= 11)
-          || (score >= 5 && count >= 14)
-          || (score >= 3 && count >= 21)
-          || (score >= 1 && count >= 60)
-        ) {
-          domainArr.push('.' + apexDomain);
+          domainScoreMap[apexDomain] = score;
         }
-      });
-
-      if (isDebug) {
-        console.log({
-          v: 1,
-          score: domainScoreMap['com-ticketry.world'],
-          count: domainCountMap.get('com-ticketry.world'),
-          domainArrLen: domainArr.length
-        });
-      }
 
-      return domainArr;
+        domainCountMap.forEach((count, apexDomain) => {
+          const score = domainScoreMap[apexDomain];
+          if (
+          // !WHITELIST_MAIN_DOMAINS.has(apexDomain)
+            (score >= 24)
+            || (score >= 16 && count >= 7)
+            || (score >= 13 && count >= 11)
+            || (score >= 5 && count >= 14)
+            || (score >= 3 && count >= 21)
+            || (score >= 1 && count >= 60)
+          ) {
+            domainArr.push('.' + apexDomain);
+          }
+        });
 
-      function calcDomainAbuseScore(subdomain: string, fullDomain: string = subdomain) {
-        if (leathalKeywords(fullDomain)) {
-          return 100;
+        if (isDebug) {
+          console.log({
+            v: 1,
+            score: domainScoreMap['com-ticketry.world'],
+            count: domainCountMap.get('com-ticketry.world'),
+            domainArrLen: domainArr.length
+          });
         }
 
-        let weight = 0;
+        return domainArr;
+
+        function calcDomainAbuseScore(subdomain: string, fullDomain: string = subdomain) {
+          if (leathalKeywords(fullDomain)) {
+            return 100;
+          }
+
+          let weight = 0;
 
-        const hitLowKeywords = lowKeywords(fullDomain);
-        const sensitiveKeywordsHit = sensitiveKeywords(fullDomain);
+          const hitLowKeywords = lowKeywords(fullDomain);
+          const sensitiveKeywordsHit = sensitiveKeywords(fullDomain);
 
-        if (sensitiveKeywordsHit) {
-          weight += 15;
-          if (hitLowKeywords) {
-            weight += 10;
+          if (sensitiveKeywordsHit) {
+            weight += 15;
+            if (hitLowKeywords) {
+              weight += 10;
+            }
+          } else if (hitLowKeywords) {
+            weight += 2;
           }
-        } else if (hitLowKeywords) {
-          weight += 2;
-        }
 
-        const subdomainLength = subdomain.length;
+          const subdomainLength = subdomain.length;
 
-        if (subdomainLength > 6) {
-          weight += 0.015;
+          if (subdomainLength > 6) {
+            weight += 0.015;
 
-          if (subdomainLength > 13) {
-            weight += 0.2;
-            if (subdomainLength > 20) {
-              weight += 1;
-              if (subdomainLength > 30) {
-                weight += 5;
-                if (subdomainLength > 40) {
-                  weight += 10;
+            if (subdomainLength > 13) {
+              weight += 0.2;
+              if (subdomainLength > 20) {
+                weight += 1;
+                if (subdomainLength > 30) {
+                  weight += 5;
+                  if (subdomainLength > 40) {
+                    weight += 10;
+                  }
                 }
               }
-            }
 
-            if (subdomain.indexOf('.', 1) > 1) {
-              weight += 1;
+              if (subdomain.indexOf('.', 1) > 1) {
+                weight += 1;
+              }
             }
           }
-        }
 
-        return weight;
+          return weight;
+        }
       }
     }
   }

+ 8 - 8
package.json

@@ -20,20 +20,20 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
-    "@ghostery/adblocker": "^2.11.2",
+    "@ghostery/adblocker": "^2.11.3",
     "@henrygd/queue": "^1.0.7",
     "@mitata/counters": "^0.0.8",
     "async-retry": "^1.3.3",
     "better-sqlite3": "^12.2.0",
     "ci-info": "^4.3.0",
     "cli-progress": "^3.12.0",
-    "csv-parse": "^5.6.0",
+    "csv-parse": "^6.1.0",
     "dns2": "^2.1.0",
     "fast-cidr-tools": "^0.3.2",
     "fast-fifo": "^1.3.2",
     "fast-uri": "^3.0.6",
     "fdir": "^6.4.6",
-    "foxts": "^3.9.0",
+    "foxts": "^3.10.0",
     "hash-wasm": "^4.12.0",
     "json-stringify-pretty-compact": "3.0.0",
     "null-prototype-object": "^1.2.0",
@@ -46,7 +46,7 @@
     "undici-cache-store-better-sqlite3": "^1.0.0",
     "whoiser": "^1.18.0",
     "why-is-node-running": "^3.2.2",
-    "worktank": "^2.7.3",
+    "worktank": "^3.0.2",
     "xbits": "^0.2.0",
     "yaml": "^2.8.0",
     "yauzl-promise": "^4.0.0"
@@ -54,21 +54,21 @@
   "devDependencies": {
     "@eslint-sukka/node": "^6.22.1",
     "@swc-node/register": "^1.10.10",
-    "@swc/core": "^1.13.1",
+    "@swc/core": "^1.13.2",
     "@types/async-retry": "^1.4.9",
     "@types/better-sqlite3": "^7.6.13",
     "@types/cli-progress": "^3.11.6",
     "@types/dns2": "^2.0.9",
     "@types/fast-fifo": "^1.3.0",
     "@types/mocha": "^10.0.10",
-    "@types/node": "^24.0.15",
+    "@types/node": "^24.1.0",
     "@types/punycode": "^2.1.4",
     "@types/tar-fs": "^2.0.4",
     "@types/yauzl-promise": "^4.0.1",
-    "eslint": "^9.31.0",
+    "eslint": "^9.32.0",
     "eslint-config-sukka": "^6.22.1",
     "eslint-formatter-sukka": "^6.22.1",
-    "expect": "^30.0.4",
+    "expect": "^30.0.5",
     "mitata": "^1.0.34",
     "mocha": "^11.7.1",
     "tinyexec": "^1.0.1",

ファイルの差分が大きいため隠しています
+ 238 - 221
pnpm-lock.yaml


この差分においてかなりの量のファイルが変更されているため、一部のファイルを表示していません