浏览代码

Chore: update source deduping tool

SukkaW 4 个月前
父节点
当前提交
bb6c7cb3fa
共有 3 个文件被更改，包括 93 次插入和 8 次删除
  1. 43 0
      Build/lib/trie.test.ts
  2. 38 4
      Build/lib/trie.ts
  3. 12 4
      Build/tools-dedupe-src.ts

+ 43 - 0
Build/lib/trie.test.ts

@@ -365,4 +365,47 @@ describe('smol tree', () => {
     trie.whitelist('cdn.example.com');
     expect(trie.dump()).toStrictEqual(['blog.cdn.example.com']);
   });
+
+  it('contains - normal', () => {
+    const trie = createTrie([
+      'skk.moe',
+      'anotherskk.moe',
+      'blog.anotherskk.moe',
+      'blog.skk.moe'
+    ], true);
+
+    expect(trie.contains('skk.moe')).toBe(true);
+    expect(trie.contains('blog.skk.moe')).toBe(true);
+    expect(trie.contains('anotherskk.moe')).toBe(true);
+    expect(trie.contains('blog.anotherskk.moe')).toBe(true);
+
+    expect(trie.contains('example.com')).toBe(false);
+    expect(trie.contains('blog.example.com')).toBe(false);
+    expect(trie.contains('skk.mo')).toBe(false);
+    expect(trie.contains('cdn.skk.moe')).toBe(false);
+  });
+
+  it('contains - subdomain', () => {
+    const trie = createTrie([
+      'index.rubygems.org'
+    ], true);
+
+    expect(trie.contains('rubygems.org')).toBe(false);
+    expect(trie.contains('index.rubygems.org')).toBe(true);
+    expect(trie.contains('sub.index.rubygems.org')).toBe(false);
+  });
+
+  it('contains - include subdomains', () => {
+    const trie = createTrie([
+      '.skk.moe'
+    ], true);
+
+    expect(trie.contains('skk.moe')).toBe(true);
+    expect(trie.contains('blog.skk.moe')).toBe(true);
+    expect(trie.contains('image.cdn.skk.moe')).toBe(true);
+
+    expect(trie.contains('example.com')).toBe(false);
+    expect(trie.contains('blog.example.com')).toBe(false);
+    expect(trie.contains('skk.mo')).toBe(false);
+  });
 });

+ 38 - 4
Build/lib/trie.ts

@@ -186,10 +186,44 @@ abstract class Triebase<Meta = unknown> {
   public contains(suffix: string, includeAllSubdomain = suffix[0] === '.'): boolean {
     const hostnameFromIndex = suffix[0] === '.' ? 1 : 0;
 
-    const res = this.walkIntoLeafWithSuffix(suffix, hostnameFromIndex);
-    if (!res) return false;
-    if (includeAllSubdomain) return getBit(res.node[0], INCLUDE_ALL_SUBDOMAIN);
-    return true;
+    let node: TrieNode = this.$root;
+    // let parent: TrieNode = node;
+
+    let child: Map<string, TrieNode<Meta>> = node[2];
+
+    let result = false;
+
+    const onToken = (token: string) => {
+      // if (token === '') {
+      //   return true;
+      // }
+
+      // parent = node;
+
+      child = node[2];
+
+      if (child.has(token)) {
+        node = child.get(token)!;
+      } else {
+        if (getBit(node[0], INCLUDE_ALL_SUBDOMAIN)) {
+          result = true;
+        }
+        return null;
+      }
+
+      return false;
+    };
+
+    if (walkHostnameTokens(suffix, onToken, hostnameFromIndex) === null) {
+      return result;
+    }
+
+    if (includeAllSubdomain) return getBit(node[0], INCLUDE_ALL_SUBDOMAIN);
+    return getBit(node[0], START);
+
+    // if (res === null) return false;
+    // if (includeAllSubdomain) return getBit(res.node[0], INCLUDE_ALL_SUBDOMAIN);
+    // return true;
   };
 
   private static bfsResults: [node: TrieNode | null, suffix: string[]] = [null, []];

+ 12 - 4
Build/tools-dedupe-src.ts

@@ -4,7 +4,7 @@ import fsp from 'node:fs/promises';
 import { SOURCE_DIR } from './constants/dir';
 import { readFileByLine } from './lib/fetch-text-by-line';
 import { processLine } from './lib/process-line';
-import { HostnameSmolTrie, HostnameTrie } from './lib/trie';
+import { HostnameSmolTrie } from './lib/trie';
 import { task } from './trace';
 
 const ENFORCED_WHITELIST = [
@@ -21,7 +21,8 @@ const ENFORCED_WHITELIST = [
   'samsungqbe.com',
   'ntp.api.bz',
   'cdn.tuk.dev',
-  'vocadb-analytics.fly.dev'
+  'vocadb-analytics.fly.dev',
+  'img.vim-cn.com'
 ];
 
 const WHITELIST: string[] = ['httpdns.bilivideo.com', 'ntp.api.bz', 'httpdns-v6.gslb.yy.com', 'img.vim-cn.com', 'img.jjbb.me', 'thingproxy.freeboard.io', 'assets.chess24.com', 'cdn.chess24.com', 'static-assets.freeanimehentai.net', 'static.javcdn.info', 'cdn.vidible.tv', 'it.apache.contactlab.it', 'mirror.netinch.com', 'de.freedif.org', 'league1.maoyuncloud.cn', 'spl.ztvx8.com', 'zls.xz6d.com', 'iadmatapk.nosdn.127.net', 'show.buzzcity.net', 'click.buzzcity.net', 'apps.buzzcity.net', 'content-cdn.y2mate.com', 'images.voguehk.com', 'cdn.amh.moe', 'statics.mnnews.tw'];
@@ -51,10 +52,13 @@ task(require.main === module, __filename)(async (span) => {
 async function dedupeFile(file: string, whitelist: HostnameSmolTrie) {
   const result: string[] = [];
 
-  const trie = new HostnameTrie();
+  const trie = new HostnameSmolTrie();
 
   let line: string | null = '';
 
+  // eslint-disable-next-line @typescript-eslint/unbound-method -- .call
+  let trieHasOrContains = HostnameSmolTrie.prototype.has;
+
   for await (const l of readFileByLine(file)) {
     line = processLine(l);
 
@@ -62,12 +66,16 @@ async function dedupeFile(file: string, whitelist: HostnameSmolTrie) {
       if (l.startsWith('# $ skip_dedupe_src')) {
         return;
       }
+      if (l.startsWith('# $ dedupe_use_trie_contains')) {
+        // eslint-disable-next-line @typescript-eslint/unbound-method -- .call
+        trieHasOrContains = HostnameSmolTrie.prototype.contains;
+      }
 
       result.push(l); // keep all comments and blank lines
       continue;
     }
 
-    if (trie.has(line)) {
+    if (trieHasOrContains.call(trie, line)) {
       continue; // drop duplicate
     }