瀏覽代碼

Chore: dedupe and sort other rulesets

SukkaW 1 年之前
父節點
當前提交
90079b9987

+ 3 - 3
Build/build-cdn-download-conf.ts

@@ -5,7 +5,7 @@ import { createTrie } from './lib/trie';
 import { task } from './trace';
 import { task } from './trace';
 import { SHARED_DESCRIPTION } from './lib/constants';
 import { SHARED_DESCRIPTION } from './lib/constants';
 import { getPublicSuffixListTextPromise } from './lib/download-publicsuffixlist';
 import { getPublicSuffixListTextPromise } from './lib/download-publicsuffixlist';
-import { domainDeduper } from './lib/domain-deduper';
+import { domainsetDeduper } from './lib/domain-deduper';
 import { appendArrayInPlace } from './lib/append-array-in-place';
 import { appendArrayInPlace } from './lib/append-array-in-place';
 import { sortDomains } from './lib/stable-sort-domain';
 import { sortDomains } from './lib/stable-sort-domain';
 import { output } from './lib/misc';
 import { output } from './lib/misc';
@@ -76,7 +76,7 @@ export const buildCdnDownloadConf = task(require.main === module, __filename)(as
         'This file contains object storage and static assets CDN domains.'
         'This file contains object storage and static assets CDN domains.'
       ],
       ],
       new Date(),
       new Date(),
-      sortDomains(domainDeduper(cdnDomainsList)),
+      sortDomains(domainsetDeduper(cdnDomainsList)),
       'domainset',
       'domainset',
       output('cdn', 'domainset')
       output('cdn', 'domainset')
     ),
     ),
@@ -89,7 +89,7 @@ export const buildCdnDownloadConf = task(require.main === module, __filename)(as
         'This file contains domains for software updating & large file hosting.'
         'This file contains domains for software updating & large file hosting.'
       ],
       ],
       new Date(),
       new Date(),
-      sortDomains(domainDeduper(downloadDomainSet)),
+      sortDomains(domainsetDeduper(downloadDomainSet)),
       'domainset',
       'domainset',
       output('download', 'domainset')
       output('download', 'domainset')
     )
     )

+ 2 - 2
Build/build-common.ts

@@ -4,7 +4,7 @@ import * as path from 'node:path';
 import { readFileByLine } from './lib/fetch-text-by-line';
 import { readFileByLine } from './lib/fetch-text-by-line';
 import { processLine } from './lib/process-line';
 import { processLine } from './lib/process-line';
 import { createRuleset } from './lib/create-file';
 import { createRuleset } from './lib/create-file';
-import { domainDeduper } from './lib/domain-deduper';
+import { domainsetDeduper } from './lib/domain-deduper';
 import type { Span } from './trace';
 import type { Span } from './trace';
 import { task } from './trace';
 import { task } from './trace';
 import { SHARED_DESCRIPTION } from './lib/constants';
 import { SHARED_DESCRIPTION } from './lib/constants';
@@ -116,7 +116,7 @@ function transformDomainset(parentSpan: Span, sourcePath: string, relativePath:
         const clashFileBasename = relativePath.slice(0, -path.extname(relativePath).length);
         const clashFileBasename = relativePath.slice(0, -path.extname(relativePath).length);
 
 
         const [title, descriptions, lines] = res;
         const [title, descriptions, lines] = res;
-        const deduped = domainDeduper(lines);
+        const deduped = domainsetDeduper(lines);
 
 
         let description: string[];
         let description: string[];
         if (descriptions.length) {
         if (descriptions.length) {

+ 3 - 3
Build/build-reject-domainset.ts

@@ -7,7 +7,7 @@ import { createTrie } from './lib/trie';
 
 
 import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA } from './constants/reject-data-source';
 import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA } from './constants/reject-data-source';
 import { createRuleset, compareAndWriteFile } from './lib/create-file';
 import { createRuleset, compareAndWriteFile } from './lib/create-file';
-import { domainDeduper } from './lib/domain-deduper';
+import { domainsetDeduper } from './lib/domain-deduper';
 import createKeywordFilter from './lib/aho-corasick';
 import createKeywordFilter from './lib/aho-corasick';
 import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
 import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
 import { buildParseDomainMap, sortDomains } from './lib/stable-sort-domain';
 import { buildParseDomainMap, sortDomains } from './lib/stable-sort-domain';
@@ -149,8 +149,8 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as
   });
   });
 
 
   // Dedupe domainSets
   // Dedupe domainSets
-  const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain (base)', () => domainDeduper(baseTrie));
-  const dudupedDominArrayExtra = span.traceChildSync('dedupe from covered subdomain (extra)', () => domainDeduper(extraTrie));
+  const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain (base)', () => domainsetDeduper(baseTrie));
+  const dudupedDominArrayExtra = span.traceChildSync('dedupe from covered subdomain (extra)', () => domainsetDeduper(extraTrie));
 
 
   console.log(`Final size ${dudupedDominArray.length} + ${dudupedDominArrayExtra.length}`);
   console.log(`Final size ${dudupedDominArray.length} + ${dudupedDominArrayExtra.length}`);
 
 

+ 2 - 2
Build/build-speedtest-domainset.ts

@@ -1,4 +1,4 @@
-import { domainDeduper } from './lib/domain-deduper';
+import { domainsetDeduper } from './lib/domain-deduper';
 import path from 'node:path';
 import path from 'node:path';
 import { createRuleset } from './lib/create-file';
 import { createRuleset } from './lib/create-file';
 import { sortDomains } from './lib/stable-sort-domain';
 import { sortDomains } from './lib/stable-sort-domain';
@@ -235,7 +235,7 @@ export const buildSpeedtestDomainSet = task(require.main === module, __filename)
     }
     }
   }))));
   }))));
 
 
-  const deduped = span.traceChildSync('sort result', () => sortDomains(domainDeduper(domainTrie)));
+  const deduped = span.traceChildSync('sort result', () => sortDomains(domainsetDeduper(domainTrie)));
 
 
   const description = [
   const description = [
     ...SHARED_DESCRIPTION,
     ...SHARED_DESCRIPTION,

+ 9 - 0
Build/lib/bitwise.ts

@@ -0,0 +1,9 @@
+/** Packs two 16-bit integers into one 32-bit integer */
+export const pack = (a: number, b: number): number => {
+  return (a << 16) | b;
+};
+
+/** Unpacks two 16-bit integers from one 32-bit integer */
+export const unpack = (value: number): [a: number, b: number] => {
+  return [(value >> 16) & 0xFFFF, value & 0xFFFF];
+};

+ 57 - 37
Build/lib/create-file.ts

@@ -8,6 +8,8 @@ import { fastStringArrayJoin, writeFile } from './misc';
 import { readFileByLine } from './fetch-text-by-line';
 import { readFileByLine } from './fetch-text-by-line';
 import stringify from 'json-stringify-pretty-compact';
 import stringify from 'json-stringify-pretty-compact';
 import { ipCidrListToSingbox, surgeDomainsetToSingbox, surgeRulesetToSingbox } from './singbox';
 import { ipCidrListToSingbox, surgeDomainsetToSingbox, surgeRulesetToSingbox } from './singbox';
+import { createTrie } from './trie';
+import { pack, unpack } from './bitwise';
 
 
 export async function compareAndWriteFile(span: Span, linesA: string[], filePath: string) {
 export async function compareAndWriteFile(span: Span, linesA: string[], filePath: string) {
   let isEqual = true;
   let isEqual = true;
@@ -92,17 +94,6 @@ const withBannerArray = (title: string, description: string[] | readonly string[
   ];
   ];
 };
 };
 
 
-const collectType = (rule: string) => {
-  let buf = '';
-  for (let i = 0, len = rule.length; i < len; i++) {
-    if (rule[i] === ',') {
-      return buf;
-    }
-    buf += rule[i];
-  }
-  return null;
-};
-
 const defaultSortTypeOrder = Symbol('defaultSortTypeOrder');
 const defaultSortTypeOrder = Symbol('defaultSortTypeOrder');
 const sortTypeOrder: Record<string | typeof defaultSortTypeOrder, number> = {
 const sortTypeOrder: Record<string | typeof defaultSortTypeOrder, number> = {
   DOMAIN: 1,
   DOMAIN: 1,
@@ -120,33 +111,62 @@ const sortTypeOrder: Record<string | typeof defaultSortTypeOrder, number> = {
   'IP-CIDR': 400,
   'IP-CIDR': 400,
   'IP-CIDR6': 400
   'IP-CIDR6': 400
 };
 };
-// sort DOMAIN-SUFFIX and DOMAIN first, then DOMAIN-KEYWORD, then IP-CIDR and IP-CIDR6 if any
-export const sortRuleSet = (ruleSet: string[]) => {
-  return ruleSet.map((rule) => {
-    const type = collectType(rule);
-    if (!type) {
-      return [10, rule] as const;
-    }
-    if (!(type in sortTypeOrder)) {
-      return [sortTypeOrder[defaultSortTypeOrder], rule] as const;
-    }
-    if (type === 'URL-REGEX') {
-      let extraWeight = 0;
-      if (rule.includes('.+') || rule.includes('.*')) {
-        extraWeight += 10;
-      }
-      if (rule.includes('|')) {
-        extraWeight += 1;
-      }
 
 
-      return [
-        sortTypeOrder[type] + extraWeight,
-        rule
-      ] as const;
+const flagDomain = 1 << 2;
+const flagDomainSuffix = 1 << 3;
+
+// dedupe and sort based on rule type
+const processRuleSet = (ruleSet: string[]) => {
+  const trie = createTrie<number>(null, true);
+
+  const sortMap: Array<[value: number, weight: number]> = [];
+  for (let i = 0, len = ruleSet.length; i < len; i++) {
+    const line = ruleSet[i];
+    const [type, value] = line.split(',');
+
+    let extraWeight = 0;
+
+    switch (type) {
+      case 'DOMAIN':
+        trie.add(value, pack(i, flagDomain));
+        break;
+      case 'DOMAIN-SUFFIX':
+        trie.add('.' + value, pack(i, flagDomainSuffix));
+        break;
+      case 'URL-REGEX':
+        if (value.includes('.+') || value.includes('.*')) {
+          extraWeight += 10;
+        }
+        if (value.includes('|')) {
+          extraWeight += 1;
+        }
+        sortMap.push([i, sortTypeOrder[type] + extraWeight]);
+        break;
+      case null:
+        sortMap.push([i, 10]);
+        break;
+      default:
+        if (type in sortTypeOrder) {
+          sortMap.push([i, sortTypeOrder[type]]);
+        } else {
+          sortMap.push([i, sortTypeOrder[defaultSortTypeOrder]]);
+        }
     }
     }
-    return [sortTypeOrder[type], rule] as const;
-  }).sort((a, b) => a[0] - b[0])
-    .map(c => c[1]);
+  }
+
+  const dumped = trie.dumpWithMeta();
+  for (let i = 0, len = dumped.length; i < len; i++) {
+    const [originalIndex, flag] = unpack(dumped[i][1]);
+    console.log(dumped[i][0], ruleSet[originalIndex]);
+
+    const type = flag === flagDomain ? 'DOMAIN' : 'DOMAIN-SUFFIX';
+
+    sortMap.push([originalIndex, sortTypeOrder[type]]);
+  }
+
+  return sortMap
+    .sort((a, b) => a[1] - b[1])
+    .map(c => ruleSet[c[0]]);
 };
 };
 
 
 const MARK = 'this_ruleset_is_made_by_sukkaw.ruleset.skk.moe';
 const MARK = 'this_ruleset_is_made_by_sukkaw.ruleset.skk.moe';
@@ -162,7 +182,7 @@ export const createRuleset = (
     _clashMrsPath?: string
     _clashMrsPath?: string
   ]
   ]
 ) => parentSpan.traceChild(`create ruleset: ${path.basename(surgePath, path.extname(surgePath))}`).traceAsyncFn(async (childSpan) => {
 ) => parentSpan.traceChild(`create ruleset: ${path.basename(surgePath, path.extname(surgePath))}`).traceAsyncFn(async (childSpan) => {
-  content = sortRuleSet(content);
+  content = processRuleSet(content);
   const surgeContent = childSpan.traceChildSync('process surge ruleset', () => {
   const surgeContent = childSpan.traceChildSync('process surge ruleset', () => {
     let _surgeContent;
     let _surgeContent;
     switch (type) {
     switch (type) {

+ 2 - 27
Build/lib/domain-deduper.ts

@@ -1,8 +1,6 @@
 import { createTrie, type Trie } from './trie';
 import { createTrie, type Trie } from './trie';
 
 
-export function domainDeduper(inputDomains: string[] | Trie, toArray?: true): string[];
-export function domainDeduper(inputDomains: string[] | Trie, toArray: false): Set<string>;
-export function domainDeduper(inputDomains: string[] | Trie, toArray = true): string[] | Set<string> {
+export function domainsetDeduper(inputDomains: string[] | Trie): string[] {
   let trie: Trie;
   let trie: Trie;
   if (Array.isArray(inputDomains)) {
   if (Array.isArray(inputDomains)) {
     trie = createTrie(inputDomains, true);
     trie = createTrie(inputDomains, true);
@@ -12,28 +10,5 @@ export function domainDeduper(inputDomains: string[] | Trie, toArray = true): st
     throw new Error('Invalid trie');
     throw new Error('Invalid trie');
   }
   }
 
 
-  const dumped = trie.dump();
-  if (toArray) {
-    return dumped;
-  }
-  return new Set(dumped);
-
-  // const trie = createTrie(inputDomains, true);
-  // const sets = new Set(inputDomains);
-
-  // for (let i = 0, len1 = inputDomains.length; i < len1; i++) {
-  //   const d = inputDomains[i];
-  //   if (d[0] !== '.') {
-  //     continue;
-  //   }
-
-  //   trie.substractSetInPlaceFromFound(d, sets);
-  //   sets.delete(d.slice(1));
-  // }
-
-  // if (toArray) {
-  //   return Array.from(sets);
-  // }
-
-  // return sets;
+  return trie.dump();
 }
 }

+ 32 - 17
Build/lib/trie.ts

@@ -7,10 +7,11 @@ import { inspect } from 'node:util';
 
 
 const noop = () => { /** noop */ };
 const noop = () => { /** noop */ };
 
 
-type TrieNode = [
+type TrieNode<Meta = any> = [
   boolean, /** sentinel */
   boolean, /** sentinel */
   TrieNode | null, /** parent */
   TrieNode | null, /** parent */
-  Map<string, TrieNode> /** children */
+  Map<string, TrieNode>, /** children */
+  Meta /** meta */
 ];
 ];
 
 
 const deepTrieNodeToJSON = (node: TrieNode) => {
 const deepTrieNodeToJSON = (node: TrieNode) => {
@@ -18,14 +19,17 @@ const deepTrieNodeToJSON = (node: TrieNode) => {
   if (node[0]) {
   if (node[0]) {
     obj['[start]'] = node[0];
     obj['[start]'] = node[0];
   }
   }
+  if (node[3] !== undefined) {
+    obj['[meta]'] = node[3];
+  }
   node[2].forEach((value, key) => {
   node[2].forEach((value, key) => {
     obj[key] = deepTrieNodeToJSON(value);
     obj[key] = deepTrieNodeToJSON(value);
   });
   });
   return obj;
   return obj;
 };
 };
 
 
-const createNode = (parent: TrieNode | null = null): TrieNode => {
-  return [false, parent, new Map<string, TrieNode>()] as TrieNode;
+const createNode = <Meta = any>(parent: TrieNode | null = null, meta: Meta | null = null): TrieNode => {
+  return [false, parent, new Map<string, TrieNode>(), meta] as TrieNode<Meta>;
 };
 };
 
 
 export const hostnameToTokens = (hostname: string): string[] => {
 export const hostnameToTokens = (hostname: string): string[] => {
@@ -72,16 +76,16 @@ const walkHostnameTokens = (hostname: string, onToken: (token: string) => boolea
   return false;
   return false;
 };
 };
 
 
-export const createTrie = (from?: string[] | Set<string> | null, smolTree = false) => {
+export const createTrie = <Meta = any>(from?: string[] | Set<string> | null, smolTree = false) => {
   let size = 0;
   let size = 0;
-  const root: TrieNode = createNode();
+  const root: TrieNode<Meta> = createNode();
 
 
   /**
   /**
    * Method used to add the given suffix to the trie.
    * Method used to add the given suffix to the trie.
    */
    */
   const add = smolTree
   const add = smolTree
-    ? (suffix: string): void => {
-      let node: TrieNode = root;
+    ? (suffix: string, meta?: Meta): void => {
+      let node: TrieNode<Meta> = root;
 
 
       const onToken = (token: string) => {
       const onToken = (token: string) => {
         if (node[2].has(token)) {
         if (node[2].has(token)) {
@@ -98,6 +102,7 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
           node = newNode;
           node = newNode;
         }
         }
 
 
+        node[3] = meta!;
         return false;
         return false;
       };
       };
 
 
@@ -128,8 +133,8 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
 
 
       node[0] = true;
       node[0] = true;
     }
     }
-    : (suffix: string): void => {
-      let node: TrieNode = root;
+    : (suffix: string, meta?: Meta): void => {
+      let node: TrieNode<Meta> = root;
 
 
       const onToken = (token: string) => {
       const onToken = (token: string) => {
         if (node[2].has(token)) {
         if (node[2].has(token)) {
@@ -140,6 +145,7 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
           node = newNode;
           node = newNode;
         }
         }
 
 
+        node[3] = meta!;
         return false;
         return false;
       };
       };
 
 
@@ -221,15 +227,15 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
   };
   };
 
 
   const walk = (
   const walk = (
-    onMatches: (suffix: string[]) => void,
+    onMatches: (suffix: string[], meta: Meta) => void,
     initialNode = root,
     initialNode = root,
     initialSuffix: string[] = []
     initialSuffix: string[] = []
   ) => {
   ) => {
-    const nodeStack: TrieNode[] = [initialNode];
+    const nodeStack: Array<TrieNode<Meta>> = [initialNode];
     // Resolving initial string (begin the start of the stack)
     // Resolving initial string (begin the start of the stack)
     const suffixStack: string[][] = [initialSuffix];
     const suffixStack: string[][] = [initialSuffix];
 
 
-    let node: TrieNode = root;
+    let node: TrieNode<Meta> = root;
 
 
     do {
     do {
       node = nodeStack.pop()!;
       node = nodeStack.pop()!;
@@ -244,7 +250,7 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
 
 
       // If the node is a sentinel, we push the suffix to the results
       // If the node is a sentinel, we push the suffix to the results
       if (node[0]) {
       if (node[0]) {
-        onMatches(suffix);
+        onMatches(suffix, node[3]);
       }
       }
     } while (nodeStack.length);
     } while (nodeStack.length);
   };
   };
@@ -383,6 +389,16 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
     return results;
     return results;
   };
   };
 
 
+  const dumpWithMeta = () => {
+    const results: Array<[string, Meta]> = [];
+
+    walk((suffix, meta) => {
+      results.push([fastStringArrayJoin(suffix, ''), meta]);
+    });
+
+    return results;
+  };
+
   const whitelist = (suffix: string) => {
   const whitelist = (suffix: string) => {
     if (!smolTree) {
     if (!smolTree) {
       throw new Error('whitelist method is only available in smolTree mode.');
       throw new Error('whitelist method is only available in smolTree mode.');
@@ -428,7 +444,7 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
       add(from[i]);
       add(from[i]);
     }
     }
   } else if (from) {
   } else if (from) {
-    from.forEach(add);
+    from.forEach((value) => add(value));
   }
   }
 
 
   return {
   return {
@@ -440,6 +456,7 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
     delete: remove,
     delete: remove,
     has,
     has,
     dump,
     dump,
+    dumpWithMeta,
     get size() {
     get size() {
       if (smolTree) {
       if (smolTree) {
         throw new Error('A Trie with smolTree enabled cannot have correct size!');
         throw new Error('A Trie with smolTree enabled cannot have correct size!');
@@ -460,5 +477,3 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
 };
 };
 
 
 export type Trie = ReturnType<typeof createTrie>;
 export type Trie = ReturnType<typeof createTrie>;
-
-export default createTrie;