瀏覽代碼

Chore: minor changes

SukkaW 1 年之前
父節點
當前提交
efa34399b0

+ 26 - 24
Build/build-reject-domainset.ts

@@ -9,7 +9,7 @@ import { createRuleset, compareAndWriteFile } from './lib/create-file';
 import { domainDeduper } from './lib/domain-deduper';
 import createKeywordFilter from './lib/aho-corasick';
 import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
-import { sortDomains } from './lib/stable-sort-domain';
+import { buildParseDomainMap, sortDomains } from './lib/stable-sort-domain';
 import { task } from './trace';
 // tldts-experimental is way faster than tldts, but slightly less accurate
 // (since it is hash-based). But the result is still deterministic, which is
@@ -21,6 +21,10 @@ import { getPhishingDomains } from './lib/get-phishing-domains';
 import { subtract as SetSubstract } from 'mnemonist/set';
 import { setAddFromArray, setAddFromArrayCurried } from './lib/set-add-from-array';
 import { sort } from './lib/timsort';
+import { looseTldtsOpt } from './constants/loose-tldts-opt';
+import { build } from 'bun';
+
+const getRejectSukkaConfPromise = readFileIntoProcessedArray(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf')); // Called once when this module is evaluated (not per task run); the result is consumed via .then() inside buildRejectDomainSet
 
 export const buildRejectDomainSet = task(import.meta.path, async (span) => {
   /** Whitelists */
@@ -37,11 +41,9 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
       let shouldStop = false;
       await Promise.all([
         // Parse from remote hosts & domain lists
-        ...HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToDomainSets)),
-
-        ...DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToDomainSets)),
-
-        ...ADGUARD_FILTERS.map(
+        HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToDomainSets)),
+        DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToDomainSets)),
+        ADGUARD_FILTERS.map(
           input => processFilterRules(childSpan, ...input)
             .then(({ white, black, foundDebugDomain }) => {
               if (foundDebugDomain) {
@@ -53,7 +55,7 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
               setAddFromArray(domainSets, black);
             })
         ),
-        ...([
+        ([
           'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exceptions.txt',
           'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exclusions.txt'
         ].map(
@@ -64,9 +66,8 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
             })
         )),
         getPhishingDomains(childSpan).then(appendArrayToDomainSets),
-        childSpan.traceChildAsync('process reject_sukka.conf', () => readFileIntoProcessedArray(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf'))
-          .then(appendArrayToDomainSets))
-      ]);
+        getRejectSukkaConfPromise.then(appendArrayToDomainSets)
+      ].flat());
       // eslint-disable-next-line sukka/no-single-return -- not single return
       return shouldStop;
     });
@@ -107,30 +108,31 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
     });
   });
 
-  const trie = span.traceChildSync('dedupe from white suffixes', () => {
-    const trie = createTrie(domainSets, true, true);
-    filterRuleWhitelistDomainSets.forEach(trie.whitelist);
-    return trie;
-  });
+  const trie = span.traceChildSync('create smol trie', () => createTrie(domainSets, true, true));
+
+  span.traceChildSync('dedupe from white suffixes', () => filterRuleWhitelistDomainSets.forEach(trie.whitelist));
 
   // Dedupe domainSets
   const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain', () => domainDeduper(trie));
 
   console.log(`Final size ${dudupedDominArray.length}`);
 
+  const {
+    domainMap: domainArrayMainDomainMap,
+    subdomainMap: domainArraySubdomainMap
+  } = span.traceChildSync(
+    'build map for stat and sort',
+    () => buildParseDomainMap(dudupedDominArray)
+  );
+
   // Create reject stats
   const rejectDomainsStats: Array<[string, number]> = span
     .traceChild('create reject stats')
     .traceSyncFn(() => {
-      const tldtsOpt = { allowPrivateDomains: false, detectIp: false, validateHostname: false };
       const statMap = dudupedDominArray.reduce<Map<string, number>>((acc, cur) => {
-        const suffix = tldts.getDomain(cur, tldtsOpt);
-        if (!suffix) return acc;
-
-        if (acc.has(suffix)) {
-          acc.set(suffix, acc.get(suffix)! + 1);
-        } else {
-          acc.set(suffix, 1);
+        const suffix = domainArrayMainDomainMap.get(cur);
+        if (suffix) {
+          acc.set(suffix, (acc.get(suffix) ?? 0) + 1);
         }
         return acc;
       }, new Map());
@@ -157,7 +159,7 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
       'Sukka\'s Ruleset - Reject Base',
       description,
       new Date(),
-      span.traceChildSync('sort reject domainset', () => sortDomains(dudupedDominArray)),
+      span.traceChildSync('sort reject domainset', () => sortDomains(dudupedDominArray, domainArrayMainDomainMap, domainArraySubdomainMap)),
       'domainset',
       path.resolve(import.meta.dir, '../List/domainset/reject.conf'),
       path.resolve(import.meta.dir, '../Clash/domainset/reject.txt')

+ 9 - 0
Build/constants/loose-tldts-opt.ts

@@ -0,0 +1,9 @@
+import type * as tldts from 'tldts';
+
+export const looseTldtsOpt: Parameters<typeof tldts.getSubdomain>[1] = { // Shared tldts options, typed as the 2nd parameter of tldts.getSubdomain; every flag is switched off
+  allowPrivateDomains: false, // per tldts docs: do not consult the private section of the public suffix list
+  extractHostname: false, // per tldts docs: skip hostname extraction — assumes inputs are already bare hostnames (confirm at call sites)
+  validateHostname: false, // per tldts docs: skip hostname validation
+  detectIp: false, // per tldts docs: skip IP address detection
+  mixedInputs: false // per tldts docs: inputs are not a mix of URLs and hostnames
+};

+ 2 - 2
Build/lib/aho-corasick.ts

@@ -1,8 +1,8 @@
 interface Node { // Node shape for the structure built in aho-corasick.ts; this change makes wordEnd and fail required keys
   /** @default false */
-  wordEnd?: boolean,
+  wordEnd: boolean, // required key now (was optional); presumably marks that a keyword ends at this node — confirm against createNode
   children: Map<string, Node | undefined>, // child nodes keyed by string
-  fail?: Node
+  fail: Node | undefined // key must always be present but may hold undefined; presumably the Aho-Corasick failure link — confirm
 }
 
 const createNode = (): Node => ({

+ 3 - 10
Build/lib/get-phishing-domains.ts

@@ -5,6 +5,7 @@ import { getSubdomain, getPublicSuffix } from 'tldts-experimental';
 import type { Span } from '../trace';
 import { appendArrayInPlaceCurried } from './append-array-in-place';
 import { PHISHING_DOMAIN_LISTS } from './reject-data-source';
+import { looseTldtsOpt } from '../constants/loose-tldts-opt';
 
 const BLACK_TLD = new Set([
   'accountant',
@@ -99,14 +100,6 @@ export const WHITELIST_MAIN_DOMAINS = new Set([
   'notion.site'
 ]);
 
-const tldtsOpt: Parameters<typeof getSubdomain>[1] = {
-  allowPrivateDomains: false,
-  extractHostname: false,
-  validateHostname: false,
-  detectIp: false,
-  mixedInputs: false
-};
-
 export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => {
   const gorhill = await getGorhillPublicSuffixPromise();
 
@@ -132,7 +125,7 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
         continue;
       }
 
-      const tld = getPublicSuffix(safeGorhillLine, tldtsOpt);
+      const tld = getPublicSuffix(safeGorhillLine, looseTldtsOpt);
       if (!tld || !BLACK_TLD.has(tld)) continue;
 
       domainCountMap[apexDomain] ||= 0;
@@ -187,7 +180,7 @@ export function calcDomainAbuseScore(line: string) {
     }
   }
 
-  const subdomain = getSubdomain(line, tldtsOpt);
+  const subdomain = getSubdomain(line, looseTldtsOpt);
 
   if (subdomain) {
     if (subdomain.slice(1).includes('.')) {

+ 1 - 0
Build/lib/normalize-domain.ts

@@ -7,6 +7,7 @@ export const normalizeDomain = (domain: string) => {
   const parsed = tldtsParse(domain, { allowPrivateDomains: true, detectIp: false });
   // if (parsed.isIp) return null;
   if (!parsed.hostname) return null;
+  // Suffix is neither ICANN nor private, i.e. not a valid public suffix (things like .tor, .dn42, etc.)
   if (!parsed.isIcann && !parsed.isPrivate) return null;
 
   let h = parsed.hostname;

+ 30 - 11
Build/lib/stable-sort-domain.ts

@@ -3,32 +3,51 @@
 // enough when sorting.
 import { getDomain, getSubdomain } from 'tldts-experimental';
 import { sort } from './timsort';
+import { looseTldtsOpt } from '../constants/loose-tldts-opt';
 
 export const compare = (a: string, b: string) => { // Comparator: orders strings by length first, then by localeCompare
   if (a === b) return 0; // identical strings tie without any further work
   return (a.length - b.length) || a.localeCompare(b); // a non-zero length difference wins (shorter first); equal lengths fall through to localeCompare
 };
 
-const tldtsOpt: Parameters<typeof getDomain>[1] = {
-  allowPrivateDomains: false,
-  extractHostname: false,
-  validateHostname: false,
-  detectIp: false,
-  mixedInputs: false
-};
-
-export const sortDomains = (inputs: string[]) => {
+export const buildParseDomainMap = (inputs: string[]) => { // Builds two lookup maps keyed by each input string: its getDomain() result and its getSubdomain() result
   const domainMap = new Map<string, string>(); // input -> getDomain(input), or the input itself when getDomain returns null/undefined
   const subdomainMap = new Map<string, string>(); // input -> getSubdomain(input), or the input itself when getSubdomain returns null/undefined
 
   for (let i = 0, len = inputs.length; i < len; i++) {
     const cur = inputs[i];
     if (!domainMap.has(cur)) { // a duplicate input is only parsed once
-      const topD = getDomain(cur, tldtsOpt);
+      const topD = getDomain(cur, looseTldtsOpt);
+      domainMap.set(cur, topD ?? cur); // `??` only replaces null/undefined, so every input ends up with an entry
+    }
+    if (!subdomainMap.has(cur)) {
+      const subD = getSubdomain(cur, looseTldtsOpt);
+      subdomainMap.set(cur, subD ?? cur); // an empty-string result is kept as-is because `??` does not treat '' as missing
+    }
+  }
+
+  return { domainMap, subdomainMap }; // both maps are returned so callers can hand them to sortDomains or reuse them elsewhere
+};
+
+export const sortDomains = (
+  inputs: string[],
+  domainMap?: Map<string, string>,
+  subdomainMap?: Map<string, string>
+) => {
+  if (!domainMap || !subdomainMap) {
+    const { domainMap: dm, subdomainMap: sm } = buildParseDomainMap(inputs);
+    domainMap = dm;
+    subdomainMap = sm;
+  }
+
+  for (let i = 0, len = inputs.length; i < len; i++) {
+    const cur = inputs[i];
+    if (!domainMap.has(cur)) {
+      const topD = getDomain(cur, looseTldtsOpt);
       domainMap.set(cur, topD ?? cur);
     }
     if (!subdomainMap.has(cur)) {
-      const subD = getSubdomain(cur, tldtsOpt);
+      const subD = getSubdomain(cur, looseTldtsOpt);
       subdomainMap.set(cur, subD ?? cur);
     }
   }

+ 23 - 21
Build/lib/trie.ts

@@ -36,32 +36,34 @@ const createNode = (parent: TrieNode | null = null): TrieNode => {
   return node;
 };
 
+const hostnameToTokens = (hostname: string): string[] => { // Splits a hostname into tokens where every '.' is its own token, e.g. 'a.b' -> ['a', '.', 'b']
+  let buf = ''; // characters of the label currently being accumulated
+  const tokens: string[] = [];
+  for (let i = 0, l = hostname.length; i < l; i++) {
+    const c = hostname[i];
+    if (c === '.') {
+      if (buf) { // a label precedes this dot: emit the label, then the dot
+        tokens.push(buf, /* . */ c);
+        buf = '';
+      } else { // leading or consecutive dot: emit only the dot, never an empty label
+        tokens.push(/* . */ c);
+      }
+    } else {
+      buf += c;
+    }
+  }
+  if (buf) { // flush the trailing label; nothing is flushed when the input ends with '.'
+    tokens.push(buf);
+  }
+  return tokens; // an empty input yields an empty array
+};
+
 export const createTrie = (from?: string[] | Set<string> | null, hostnameMode = false, smolTree = false) => {
   let size = 0;
   const root: TrieNode = createNode();
 
   const suffixToTokens = hostnameMode
-    ? (suffix: string) => {
-      let buf = '';
-      const tokens: string[] = [];
-      for (let i = 0, l = suffix.length; i < l; i++) {
-        const c = suffix[i];
-        if (c === '.') {
-          if (buf) {
-            tokens.push(buf, /* . */ c);
-            buf = '';
-          } else {
-            tokens.push(/* . */ c);
-          }
-        } else {
-          buf += c;
-        }
-      }
-      if (buf) {
-        tokens.push(buf);
-      }
-      return tokens;
-    }
+    ? hostnameToTokens
     : (suffix: string) => suffix;
 
   /**