浏览代码

Perf: domain deduper using only trie + DFS

SukkaW 1 年之前
父节点
当前提交
160e7bfab7
共有 3 个文件被更改，包括 163 次插入和 41 次删除
  1. + 21 - 14
      Build/lib/domain-deduper.ts
  2. + 73 - 1
      Build/lib/trie.test.ts
  3. + 69 - 26
      Build/lib/trie.ts

+ 21 - 14
Build/lib/domain-deduper.ts

@@ -3,22 +3,29 @@ import { createTrie } from './trie';
 export function domainDeduper(inputDomains: string[], toArray?: true): string[];
 export function domainDeduper(inputDomains: string[], toArray: false): Set<string>;
 export function domainDeduper(inputDomains: string[], toArray = true): string[] | Set<string> {
-  const trie = createTrie(inputDomains, true);
-  const sets = new Set(inputDomains);
+  const trie = createTrie(inputDomains, true, true);
+  const dumped = trie.dump();
+  if (toArray) {
+    return dumped;
+  }
+  return new Set(dumped);
 
-  for (let i = 0, len1 = inputDomains.length; i < len1; i++) {
-    const d = inputDomains[i];
-    if (d[0] !== '.') {
-      continue;
-    }
+  // const trie = createTrie(inputDomains, true);
+  // const sets = new Set(inputDomains);
 
-    trie.substractSetInPlaceFromFound(d, sets);
-    sets.delete(d.slice(1));
-  }
+  // for (let i = 0, len1 = inputDomains.length; i < len1; i++) {
+  //   const d = inputDomains[i];
+  //   if (d[0] !== '.') {
+  //     continue;
+  //   }
 
-  if (toArray) {
-    return Array.from(sets);
-  }
+  //   trie.substractSetInPlaceFromFound(d, sets);
+  //   sets.delete(d.slice(1));
+  // }
+
+  // if (toArray) {
+  //   return Array.from(sets);
+  // }
 
-  return sets;
+  // return sets;
 }

+ 73 - 1
Build/lib/trie.test.ts

@@ -112,7 +112,7 @@ describe.each([
     expect(trie.find('noc.one')).toStrictEqual(['noc.one']);
   });
 
-  it('should remove subdomain', () => {
+  it('should match subdomain - 1', () => {
     const trie = createTrie(['www.noc.one', 'www.sukkaw.com', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net'], hostnameMode);
 
     console.log(trie);
@@ -121,8 +121,80 @@ describe.each([
     expect(trie.find('.sukkaw.com')).toStrictEqual(['www.sukkaw.com']);
   });
 
+  it('should match subdomain - 2', () => {
+    const trie = createTrie(['www.noc.one', 'www.sukkaw.com', '.skk.moe', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net'], hostnameMode);
+
+    console.log(trie);
+
+    expect(trie.find('.skk.moe')).toStrictEqual(['.skk.moe', 'image.cdn.skk.moe', 'blog.skk.moe']);
+    expect(trie.find('.sukkaw.com')).toStrictEqual(['www.sukkaw.com']);
+  });
+
   it('should not remove non-subdomain', () => {
     const trie = createTrie(['skk.moe', 'sukkaskk.moe'], hostnameMode);
     expect(trie.find('.skk.moe')).toStrictEqual([]);
   });
 });
+
+describe('smol tree', () => {
+  it('should create simple tree - 1', () => {
+    const trie = createTrie([
+      '.skk.moe', 'blog.skk.moe', '.cdn.skk.moe', 'skk.moe',
+      'www.noc.one', 'cdn.noc.one',
+      '.blog.sub.example.com', 'sub.example.com', 'cdn.sub.example.com', '.sub.example.com'
+    ], true, true);
+
+    console.log(trie);
+
+    expect(trie.dump()).toStrictEqual([
+      '.sub.example.com',
+      'cdn.noc.one', 'www.noc.one',
+      '.skk.moe'
+    ]);
+  });
+
+  it.only('should create simple tree - 2', () => {
+    const trie = createTrie([
+      '.skk.moe', 'blog.skk.moe', '.cdn.skk.moe', 'skk.moe'
+    ], true, true);
+
+    console.log({ trie });
+
+    expect(trie.dump()).toStrictEqual([
+      '.skk.moe'
+    ]);
+  });
+
+  it('should create simple tree - 2', () => {
+    const trie = createTrie([
+      '.blog.sub.example.com', 'cdn.sub.example.com', '.sub.example.com'
+    ], true, true);
+
+    console.log(trie);
+
+    expect(trie.dump()).toStrictEqual([
+      '.sub.example.com'
+    ]);
+
+    trie.add('.sub.example.com');
+    expect(trie.dump()).toStrictEqual([
+      '.sub.example.com'
+    ]);
+  });
+
+  it('should create simple tree - 3', () => {
+    const trie = createTrie([
+      'commercial.shouji.360.cn',
+      'act.commercial.shouji.360.cn',
+      'cdn.creative.medialytics.com',
+      'px.cdn.creative.medialytics.com'
+    ], true, true);
+
+    expect(trie.dump()).toStrictEqual([
+      'cdn.creative.medialytics.com',
+      'px.cdn.creative.medialytics.com',
+      'commercial.shouji.360.cn',
+      'act.commercial.shouji.360.cn'
+    ]);
+  });
+});

+ 69 - 26
Build/lib/trie.ts

@@ -5,9 +5,11 @@
 // import { Trie } from 'mnemonist';
 
 export const SENTINEL = Symbol('SENTINEL');
+const PARENT = Symbol('Parent Node');
 
 type TrieNode = {
   [SENTINEL]: boolean,
+  [PARENT]: TrieNode | null,
   [Bun.inspect.custom]: () => string
 } & Map<string, TrieNode>;
 
@@ -26,14 +28,15 @@ function trieNodeInspectCustom(this: TrieNode) {
   return JSON.stringify(deepTrieNodeToJSON(this), null, 2);
 }
 
-const createNode = (): TrieNode => {
+const createNode = (parent: TrieNode | null = null): TrieNode => {
   const node = new Map<string, TrieNode>() as TrieNode;
   node[SENTINEL] = false;
+  node[PARENT] = parent;
   node[Bun.inspect.custom] = trieNodeInspectCustom;
   return node;
 };
 
-export const createTrie = (from?: string[] | Set<string> | null, hostnameMode = false) => {
+export const createTrie = (from?: string[] | Set<string> | null, hostnameMode = false, smolTree = false) => {
   let size = 0;
   const root: TrieNode = createNode();
 
@@ -75,11 +78,35 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
 
       if (node.has(token)) {
         node = node.get(token)!;
+
+        if (smolTree) {
+          if (node.get('.')?.[SENTINEL] === true) {
+            return;
+          }
+          // return;
+        }
       } else {
-        const newNode = createNode();
+        const newNode = createNode(node);
         node.set(token, newNode);
         node = newNode;
       }
+
+      if (smolTree) {
+        if (i === 1 && tokens[0] === '.') {
+          node[SENTINEL] = false;
+          // Trying to add `.sub.example.com` where there is already a `blog.sub.example.com` in the trie
+          const newNode = createNode(node);
+          node.set('.', newNode);
+          node = newNode;
+          break;
+        }
+        if (i === 0) {
+          // Trying to add `example.com` when there is already a `.example.com` in the trie
+          if (node.get('.')?.[SENTINEL] === true) {
+            return;
+          }
+        }
+      }
     }
 
     // Do we need to increase size?
@@ -107,10 +134,15 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
 
     return true;
   };
+
   /**
    * Method used to retrieve every item in the trie with the given prefix.
    */
   const find = (inputSuffix: string, /** @default true */ includeEqualWithSuffix = true): string[] => {
+    if (smolTree) {
+      throw new Error('A Trie with smolTree enabled cannot perform find!');
+    }
+
     let node: TrieNode | undefined = root;
     let token: string;
 
@@ -153,10 +185,7 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
         nodeStack.push(childNode);
 
         if (hostnameMode) {
-          const stack = (suffix as string[]).slice();
-          stack.unshift(k);
-
-          suffixStack.push(stack);
+          suffixStack.push([k, ...suffix]);
         } else {
           suffixStack.push(k + (suffix as string));
         }
@@ -167,9 +196,13 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
   };
 
   /**
- * Works like trie.find, but instead of returning the matches as an array, it removes them from the given set in-place.
- */
+   * Works like trie.find, but instead of returning the matches as an array, it removes them from the given set in-place.
+   */
   const substractSetInPlaceFromFound = (inputSuffix: string, set: Set<string>) => {
+    if (smolTree) {
+      throw new Error('A Trie with smolTree enabled cannot perform substractSetInPlaceFromFound!');
+    }
+
     let node: TrieNode | undefined = root;
     let token: string;
 
@@ -193,7 +226,7 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
 
       if (node[SENTINEL]) {
         if (suffix !== inputTokens) {
-        // found match, delete it from set
+          // found match, delete it from set
           if (hostnameMode) {
             set.delete((suffix as string[]).join(''));
           } else {
@@ -205,8 +238,7 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
       node.forEach((childNode, k) => {
         nodeStack.push(childNode);
         if (hostnameMode) {
-          const stack = (suffix as string[]).slice();
-          stack.unshift(k);
+          const stack = [k, ...suffix];
           suffixStack.push(stack);
         } else {
           suffixStack.push(k + (suffix as string));
@@ -216,8 +248,8 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
   };
 
   /**
- * Method used to delete a prefix from the trie.
- */
+   * Method used to delete a prefix from the trie.
+   */
   const remove = (suffix: string): boolean => {
     let node: TrieNode | undefined = root;
     let toPrune: TrieNode | null = null;
@@ -294,35 +326,43 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
   }
 
   const dump = () => {
-    const node = root;
     const nodeStack: TrieNode[] = [];
-    const suffixStack: string[] = [];
+    const suffixStack: Array<string | string[]> = [];
     // Resolving initial string
-    const suffix = '';
+    const suffix = hostnameMode ? [] : '';
 
-    nodeStack.push(node);
+    nodeStack.push(root);
     suffixStack.push(suffix);
 
     const results: string[] = [];
 
-    let currentNode: TrieNode;
-    let currentPrefix: string;
-    let hasValue = false;
+    let node: TrieNode;
 
     do {
-      currentNode = nodeStack.pop()!;
-      currentPrefix = suffixStack.pop()!;
+      let hasValue = false;
 
-      if (currentNode[SENTINEL]) {
+      node = nodeStack.pop()!;
+      const suffix = suffixStack.pop()!;
+
+      if (node[SENTINEL]) {
         hasValue = true;
       }
 
       node.forEach((childNode, k) => {
         nodeStack.push(childNode);
-        suffixStack.push(k + suffix);
+
+        if (hostnameMode) {
+          suffixStack.push([k, ...suffix]);
+        } else {
+          suffixStack.push(k + (suffix as string));
+        }
       });
 
-      if (hasValue) results.push(currentPrefix);
+      if (hasValue) {
+        results.push(
+          hostnameMode ? (suffix as string[]).join('') : (suffix as string)
+        );
+      }
     } while (nodeStack.length);
 
     return results;
@@ -338,6 +378,9 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
     has,
     dump,
     get size() {
+      if (smolTree) {
+        throw new Error('A Trie with smolTree enabled cannot have correct size!');
+      }
       return size;
     },
     get root() {