浏览代码

Perf: speed-up tldts

SukkaW 1 年之前
父节点
当前提交
aa3cb9e586

+ 5 - 4
Build/build-domestic-ruleset.ts

@@ -56,10 +56,11 @@ export const buildDomesticRuleset = task(import.meta.path, async (span) => {
                   : []
               ),
               ...domains.flatMap((domain) => [
-              `${domain} = server:${dns}`,
-              `*.${domain} = server:${dns}`
-            ])
-      ])
+                `${domain} = server:${dns}`,
+                `*.${domain} = server:${dns}`
+              ])
+            ]
+          )
       ],
       path.resolve(import.meta.dir, '../Modules/sukka_local_dns_mapping.sgmodule')
     )

+ 0 - 1
Build/build-internal-cdn-rules.ts

@@ -4,7 +4,6 @@ import { readFileByLine } from './lib/fetch-text-by-line';
 import { sortDomains } from './lib/stable-sort-domain';
 import { task } from './trace';
 import { compareAndWriteFile } from './lib/create-file';
-import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix';
 import { domainDeduper } from './lib/domain-deduper';
 import { sort } from './lib/timsort';
 

+ 1 - 1
Build/lib/get-phishing-domains.test.ts

@@ -1,5 +1,5 @@
 // eslint-disable-next-line import-x/no-unresolved -- bun
-import { describe, expect, it } from 'bun:test';
+import { describe, it } from 'bun:test';
 
 import { calcDomainAbuseScore } from './get-phishing-domains';
 

+ 11 - 4
Build/lib/get-phishing-domains.ts

@@ -1,9 +1,8 @@
 import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
 import { processDomainLists } from './parse-filter';
-import { getSubdomain } from 'tldts';
+import { getSubdomain, getPublicSuffix } from 'tldts-experimental';
 import { TTL } from './cache-filesystem';
 
-import { add as SetAdd } from 'mnemonist/set';
 import type { Span } from '../trace';
 import { appendArrayInPlace } from './append-array-in-place';
 
@@ -90,6 +89,14 @@ const BLACK_TLD = new Set([
   'design'
 ]);
 
+const tldtsOpt: Parameters<typeof getSubdomain>[1] = {
+  allowPrivateDomains: false,
+  extractHostname: false,
+  validateHostname: false,
+  detectIp: false,
+  mixedInputs: false
+};
+
 export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => {
   const gorhill = await getGorhillPublicSuffixPromise();
 
@@ -117,7 +124,7 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
         continue;
       }
 
-      const tld = gorhill.getPublicSuffix(safeGorhillLine);
+      const tld = getPublicSuffix(safeGorhillLine, tldtsOpt);
       if (!tld || !BLACK_TLD.has(tld)) continue;
 
       domainCountMap[apexDomain] ||= 0;
@@ -174,7 +181,7 @@ export function calcDomainAbuseScore(line: string) {
     }
   }
 
-  const subdomain = getSubdomain(line, { detectIp: false });
+  const subdomain = getSubdomain(line, tldtsOpt);
 
   if (subdomain) {
     if (subdomain.slice(1).includes('.')) {

+ 4 - 4
Build/lib/stable-sort-domain.ts

@@ -1,7 +1,7 @@
 // tldts-experimental is way faster than tldts, but slightly less accurate
 // (since it is hash-based). But the result is still deterministic, which is
 // enough when sorting.
-import * as tldts from 'tldts-experimental';
+import { getDomain, getSubdomain } from 'tldts-experimental';
 import { sort } from './timsort';
 
 export const compare = (a: string, b: string) => {
@@ -9,7 +9,7 @@ export const compare = (a: string, b: string) => {
   return (a.length - b.length) || a.localeCompare(b);
 };
 
-const tldtsOpt: Parameters<typeof tldts.getDomain>[1] = {
+const tldtsOpt: Parameters<typeof getDomain>[1] = {
   allowPrivateDomains: false,
   extractHostname: false,
   validateHostname: false,
@@ -24,11 +24,11 @@ export const sortDomains = (inputs: string[]) => {
   for (let i = 0, len = inputs.length; i < len; i++) {
     const cur = inputs[i];
     if (!domainMap.has(cur)) {
-      const topD = tldts.getDomain(cur, tldtsOpt);
+      const topD = getDomain(cur, tldtsOpt);
       domainMap.set(cur, topD ?? cur);
     }
     if (!subdomainMap.has(cur)) {
-      const subD = tldts.getSubdomain(cur, tldtsOpt);
+      const subD = getSubdomain(cur, tldtsOpt);
       subdomainMap.set(cur, subD ?? cur);
     }
   }

+ 53 - 0
Build/lib/tldts.bench.ts

@@ -0,0 +1,53 @@
+import { fetchRemoteTextByLine } from './fetch-text-by-line';
+import { processLineFromReadline } from './process-line';
+
+import { bench, group, run } from 'mitata';
+
+import * as tldts from 'tldts';
+import * as tldtsExperimental from 'tldts-experimental';
+import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
+
+(async () => {
+  const data = await processLineFromReadline(await fetchRemoteTextByLine('https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt'));
+
+  const gorhill = await getGorhillPublicSuffixPromise();
+  const tldtsOpt: Parameters<typeof tldts.getDomain>[1] = {
+    allowPrivateDomains: false,
+    extractHostname: false,
+    validateHostname: false,
+    detectIp: false,
+    mixedInputs: false
+  };
+
+  (['getDomain', 'getPublicSuffix', 'getSubdomain'] as const).forEach(methodName => {
+    group(methodName, () => {
+      if (methodName in gorhill) {
+        bench('gorhill', () => {
+          for (let i = 0, len = data.length; i < len; i++) {
+            const line = data[i];
+            const safeGorhillLine = line[0] === '.' ? line.slice(1) : line;
+
+            // @ts-expect-error -- type guarded
+            gorhill[methodName](safeGorhillLine);
+          }
+        });
+      }
+
+      bench('tldts', () => {
+        for (let i = 0, len = data.length; i < len; i++) {
+          // eslint-disable-next-line import-x/namespace -- safe
+          tldts[methodName](data[i], tldtsOpt);
+        }
+      });
+
+      bench('tldts-experimental', () => {
+        for (let i = 0, len = data.length; i < len; i++) {
+          // eslint-disable-next-line import-x/namespace -- safe
+          tldtsExperimental[methodName](data[i], tldtsOpt);
+        }
+      });
+    });
+  });
+
+  run();
+})();

+ 0 - 1
Build/validate-domestic.ts

@@ -3,7 +3,6 @@ import { parse } from 'csv-parse/sync';
 import { createTrie } from './lib/trie';
 import path from 'path';
 import { processLine } from './lib/process-line';
-import { extract } from 'tar-stream';
 import { extractDomainsFromFelixDnsmasq } from './lib/parse-dnsmasq';
 
 export const parseDomesticList = async () => {