浏览代码

Chore: refine reject domainset building

SukkaW 2 年之前
父节点
当前提交
c95e96fc61

+ 1 - 1
Build/build-microsoft-cdn.ts

@@ -33,7 +33,7 @@ export const getMicrosoftCdnRulesetPromise = createMemoizedPromise(async () => {
         trie.add(domain);
       }
     }
-    return new Set(PROBE_DOMAINS.flatMap(domain => trie.find(domain, false)));
+    return new Set(PROBE_DOMAINS.flatMap(domain => trie.find(domain)));
   });
 
   // Second trie is to remove blacklisted domains

+ 40 - 30
Build/build-reject-domainset.ts

@@ -79,48 +79,58 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
   console.log(`Import ${previousSize} rules from Hosts / AdBlock Filter Rules & reject_sukka.conf!`);
 
   // Dedupe domainSets
-  await span.traceChild('dedupe from black keywords/suffixes').traceAsyncFn(async () => {
-  /** Collect DOMAIN-SUFFIX from non_ip/reject.conf for deduplication */
+  await span.traceChild('dedupe from black keywords/suffixes').traceAsyncFn(async (childSpan) => {
+    /** Collect DOMAIN-SUFFIX from non_ip/reject.conf for deduplication */
     const domainSuffixSet = new Set<string>();
     /** Collect DOMAIN-KEYWORD from non_ip/reject.conf for deduplication */
     const domainKeywordsSet = new Set<string>();
 
-    for await (const line of readFileByLine(path.resolve(import.meta.dir, '../Source/non_ip/reject.conf'))) {
-      const [type, keyword] = line.split(',');
+    await childSpan.traceChild('collect keywords/suffixes').traceAsyncFn(async () => {
+      for await (const line of readFileByLine(path.resolve(import.meta.dir, '../Source/non_ip/reject.conf'))) {
+        const [type, value] = line.split(',');
 
-      if (type === 'DOMAIN-KEYWORD') {
-        domainKeywordsSet.add(keyword.trim());
-      } else if (type === 'DOMAIN-SUFFIX') {
-        domainSuffixSet.add(keyword.trim());
+        if (type === 'DOMAIN-KEYWORD') {
+          domainKeywordsSet.add(value.trim());
+        } else if (type === 'DOMAIN-SUFFIX') {
+          domainSuffixSet.add(value.trim());
+        }
       }
-    }
-
-    const trie1 = createTrie(domainSets);
-
-    domainSuffixSet.forEach(suffix => {
-      trie1.find(suffix, true).forEach(f => domainSets.delete(f));
     });
-    filterRuleWhitelistDomainSets.forEach(suffix => {
-      trie1.find(suffix, true).forEach(f => domainSets.delete(f));
-
-      if (suffix[0] === '.') {
-        // handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`)
-        domainSets.delete(suffix.slice(1));
-      } else {
-        // If `g.msn.com` is whitelisted, then `.g.msn.com` should be removed from domain set
-        domainSets.delete(`.${suffix}`);
-      }
+
+    // Remove as many domains as possible from domainSets before creating trie
+    SetHelpers.subtract(domainSets, domainSuffixSet);
+    SetHelpers.subtract(domainSets, filterRuleWhitelistDomainSets);
+
+    childSpan.traceChild('dedupe from white/suffixes').traceSyncFn(() => {
+      const trie = createTrie(domainSets);
+
+      domainSuffixSet.forEach(suffix => {
+        trie.remove(suffix);
+        trie.substractSetInPlaceFromFound(suffix, domainSets);
+      });
+      filterRuleWhitelistDomainSets.forEach(suffix => {
+        trie.substractSetInPlaceFromFound(suffix, domainSets);
+
+        if (suffix[0] === '.') {
+          // handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`)
+          domainSets.delete(suffix.slice(1));
+        } else {
+          // If `g.msn.com` is whitelisted, then `.g.msn.com` should be removed from domain set
+          domainSets.delete(`.${suffix}`);
+        }
+      });
     });
 
-    // remove pre-defined enforced blacklist from whitelist
-    const kwfilter = createKeywordFilter(domainKeywordsSet);
+    childSpan.traceChild('dedupe from black keywords').traceSyncFn(() => {
+      const kwfilter = createKeywordFilter(domainKeywordsSet);
 
-    for (const domain of domainSets) {
+      for (const domain of domainSets) {
       // Remove keyword
-      if (kwfilter(domain)) {
-        domainSets.delete(domain);
+        if (kwfilter(domain)) {
+          domainSets.delete(domain);
+        }
       }
-    }
+    });
 
     console.log(`Deduped ${previousSize} - ${domainSets.size} = ${previousSize - domainSets.size} from black keywords and suffixes!`);
   });

+ 21 - 12
Build/lib/cache-filesystem.ts

@@ -16,8 +16,12 @@ const enum CacheStatus {
 }
 
 export interface CacheOptions {
+  /** Path to sqlite file dir */
   cachePath?: string,
-  tbd?: number
+  /** Time before deletion */
+  tbd?: number,
+  /** Cache table name */
+  tableName?: string
 }
 
 interface CacheApplyNonStringOption<T> {
@@ -60,13 +64,18 @@ export const TTL = {
 
 export class Cache {
   db: Database;
-  tbd = 60 * 1000; // time before deletion
+  /** Time before deletion */
+  tbd = 60 * 1000;
+  /** SQLite file path */
   cachePath: string;
+  /** Table name */
+  tableName: string;
 
-  constructor({ cachePath = path.join(os.tmpdir() || '/tmp', 'hdc'), tbd }: CacheOptions = {}) {
+  constructor({ cachePath = path.join(os.tmpdir() || '/tmp', 'hdc'), tbd, tableName = 'cache' }: CacheOptions = {}) {
     this.cachePath = cachePath;
     mkdirSync(this.cachePath, { recursive: true });
     if (tbd != null) this.tbd = tbd;
+    this.tableName = tableName;
 
     const db = new Database(path.join(this.cachePath, 'cache.db'));
 
@@ -75,8 +84,8 @@ export class Cache {
     db.exec('PRAGMA temp_store = memory;');
     db.exec('PRAGMA optimize;');
 
-    db.prepare('CREATE TABLE IF NOT EXISTS cache (key TEXT PRIMARY KEY, value TEXT, ttl REAL NOT NULL);').run();
-    db.prepare('CREATE INDEX IF NOT EXISTS cache_ttl ON cache (ttl);').run();
+    db.prepare(`CREATE TABLE IF NOT EXISTS ${this.tableName} (key TEXT PRIMARY KEY, value TEXT, ttl REAL NOT NULL);`).run();
+    db.prepare(`CREATE INDEX IF NOT EXISTS cache_ttl ON ${this.tableName} (ttl);`).run();
 
     const date = new Date();
 
@@ -84,7 +93,7 @@ export class Cache {
 
     // ttl + tbd < now => ttl < now - tbd
     const now = date.getTime() - this.tbd;
-    db.prepare('DELETE FROM cache WHERE ttl < ?').run(now);
+    db.prepare(`DELETE FROM ${this.tableName} WHERE ttl < ?`).run(now);
 
     this.db = db;
 
@@ -100,7 +109,7 @@ export class Cache {
 
   set(key: string, value: string, ttl = 60 * 1000): void {
     const insert = this.db.prepare(
-      'INSERT INTO cache (key, value, ttl) VALUES ($key, $value, $valid) ON CONFLICT(key) DO UPDATE SET value = $value, ttl = $valid'
+      `INSERT INTO ${this.tableName} (key, value, ttl) VALUES ($key, $value, $valid) ON CONFLICT(key) DO UPDATE SET value = $value, ttl = $valid`
     );
 
     insert.run({
@@ -112,7 +121,7 @@ export class Cache {
 
   get(key: string, defaultValue?: string): string | undefined {
     const rv = this.db.prepare<{ value: string }, string>(
-      'SELECT value FROM cache WHERE key = ?'
+      `SELECT value FROM ${this.tableName} WHERE key = ?`
     ).get(key);
 
     if (!rv) return defaultValue;
@@ -121,13 +130,13 @@ export class Cache {
 
   has(key: string): CacheStatus {
     const now = Date.now();
-    const rv = this.db.prepare<{ ttl: number }, string>('SELECT ttl FROM cache WHERE key = ?').get(key);
+    const rv = this.db.prepare<{ ttl: number }, string>(`SELECT ttl FROM ${this.tableName} WHERE key = ?`).get(key);
 
     return !rv ? CacheStatus.Miss : (rv.ttl > now ? CacheStatus.Hit : CacheStatus.Stale);
   }
 
   del(key: string): void {
-    this.db.prepare('DELETE FROM cache WHERE key = ?').run(key);
+    this.db.prepare(`DELETE FROM ${this.tableName} WHERE key = ?`).run(key);
   }
 
   async apply<T>(
@@ -167,9 +176,9 @@ export class Cache {
   }
 }
 
-export const fsCache = traceSync('initializing filesystem cache', () => new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') }));
+export const fsFetchCache = traceSync('initializing filesystem cache for fetch', () => new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') }));
 // process.on('exit', () => {
-//   fsCache.destroy();
+//   fsFetchCache.destroy();
 // });
 
 const separator = '\u0000';

+ 5 - 5
Build/lib/create-file.ts

@@ -35,11 +35,11 @@ export async function compareAndWriteFile(span: Span, linesA: string[], filePath
         }
         if (
           lineA[0] === '/'
-            && lineA[1] === '/'
-            && lineA[3] === '#'
-            && lineB[0] === '/'
-            && lineB[1] === '/'
-            && lineB[3] === '#'
+          && lineA[1] === '/'
+          && lineB[0] === '/'
+          && lineB[1] === '/'
+          && lineA[3] === '#'
+          && lineB[3] === '#'
         ) {
           continue;
         }

+ 1 - 6
Build/lib/domain-deduper.ts

@@ -12,12 +12,7 @@ export function domainDeduper(inputDomains: string[], toArray = true): string[]
       continue;
     }
 
-    const found = trie.find(d, false);
-
-    for (let j = 0, len2 = found.length; j < len2; j++) {
-      sets.delete(found[j]);
-    }
-
+    trie.substractSetInPlaceFromFound(d, sets);
     sets.delete(d.slice(1));
   }
 

+ 3 - 4
Build/lib/get-phishing-domains.ts

@@ -104,11 +104,10 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
     return parentSpan.traceChild('delete whitelisted from domainset').traceSyncFn(() => {
       for (let i = 0, len = WHITELIST_DOMAIN.length; i < len; i++) {
         const white = WHITELIST_DOMAIN[i];
-        const found = trieForRemovingWhiteListed.find(`.${white}`, true);
-        for (let j = 0, len2 = found.length; j < len2; j++) {
-          domainSet.delete(found[j]);
-        }
         domainSet.delete(white);
+        domainSet.delete(`.${white}`);
+
+        trieForRemovingWhiteListed.substractSetInPlaceFromFound(`.${white}`, domainSet);
       }
     });
   });

+ 4 - 5
Build/lib/parse-filter.ts

@@ -8,14 +8,14 @@ import type { PublicSuffixList } from '@gorhill/publicsuffixlist';
 import picocolors from 'picocolors';
 import { normalizeDomain } from './normalize-domain';
 import { fetchAssets } from './fetch-assets';
-import { deserializeSet, fsCache, serializeSet } from './cache-filesystem';
+import { deserializeSet, fsFetchCache, serializeSet } from './cache-filesystem';
 import type { Span } from '../trace';
 
 const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
 let foundDebugDomain = false;
 
 export function processDomainLists(span: Span, domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) {
-  return span.traceChild(`process domainlist: ${domainListsUrl}`).traceAsyncFn(() => fsCache.apply(
+  return span.traceChild(`process domainlist: ${domainListsUrl}`).traceAsyncFn(() => fsFetchCache.apply(
     domainListsUrl,
     async () => {
       const domainSets = new Set<string>();
@@ -45,7 +45,7 @@ export function processDomainLists(span: Span, domainListsUrl: string, includeAl
   ));
 }
 export function processHosts(span: Span, hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, ttl: number | null = null) {
-  return span.traceChild(`processhosts: ${hostsUrl}`).traceAsyncFn((childSpan) => fsCache.apply(
+  return span.traceChild(`processhosts: ${hostsUrl}`).traceAsyncFn((childSpan) => fsFetchCache.apply(
     hostsUrl,
     async () => {
       const domainSets = new Set<string>();
@@ -119,7 +119,7 @@ export async function processFilterRules(
   fallbackUrls?: readonly string[] | undefined | null,
   ttl: number | null = null
 ): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
-  const [white, black, warningMessages] = await parentSpan.traceChild(`process filter rules: ${filterRulesUrl}`).traceAsyncFn((span) => fsCache.apply<Readonly<[
+  const [white, black, warningMessages] = await parentSpan.traceChild(`process filter rules: ${filterRulesUrl}`).traceAsyncFn((span) => fsFetchCache.apply<Readonly<[
     white: string[],
     black: string[],
     warningMessages: string[]
@@ -187,7 +187,6 @@ export async function processFilterRules(
         }
       };
 
-      // TODO-SUKKA: add cache here
       if (!fallbackUrls || fallbackUrls.length === 0) {
         for await (const line of await fetchRemoteTextByLine(filterRulesUrl)) {
           // don't trim here

+ 0 - 16
Build/lib/random-int.bench.ts

@@ -1,16 +0,0 @@
-import { bench, group, run } from 'mitata';
-import { randomInt as nativeRandomInt } from 'crypto';
-
-const randomInt = (min: number, max: number) => Math.floor(Math.random() * (max - min + 1)) + min;
-
-group('random-int', () => {
-  bench('crypto.randomInt', () => {
-    nativeRandomInt(3, 7);
-  });
-
-  bench('Math.random', () => {
-    randomInt(3, 7);
-  });
-});
-
-run();

+ 4 - 55
Build/lib/trie.test.ts

@@ -11,6 +11,7 @@ describe('Trie', () => {
     trie.add('akku');
 
     expect(trie.size).toBe(3);
+
     expect(trie.has('sukka')).toBeTrue();
     expect(trie.has('ukka')).toBeTrue();
     expect(trie.has('akku')).toBeTrue();
@@ -86,59 +87,6 @@ describe('Trie', () => {
     expect(trie.find('')).toEqual(['greek', 'roman', 'esqueroman', 'sesqueroman']);
   });
 
-  // it('should work with custom tokens.', () => {
-  //   const trie = new Trie(Array);
-
-  //   trie.add(['the', 'cat', 'eats', 'the', 'mouse']);
-  //   trie.add(['the', 'mouse', 'eats', 'cheese']);
-  //   trie.add(['hello', 'world']);
-
-  //   assert.strictEqual(trie.size, 3);
-
-  //   assert.strictEqual(trie.has(['the', 'mouse', 'eats', 'cheese']), true);
-  //   assert.strictEqual(trie.has(['the', 'mouse', 'eats']), false);
-
-  //   assert.strictEqual(trie.delete(['hello']), false);
-  //   assert.strictEqual(trie.delete(['hello', 'world']), true);
-
-  //   assert.strictEqual(trie.size, 2);
-  // });
-
-  // it('should be possible to iterate over the trie\'s prefixes.', () => {
-  //   const trie = new Trie();
-
-  //   trie.add('rat');
-  //   trie.add('rate');
-
-  //   let prefixes = take(trie.prefixes());
-
-  //   assert.deepStrictEqual(prefixes, ['rat', 'rate']);
-
-  //   trie.add('rater');
-  //   trie.add('rates');
-
-  //   prefixes = take(trie.keys('rate'));
-
-  //   assert.deepStrictEqual(prefixes, ['rate', 'rates', 'rater']);
-  // });
-
-  // it('should be possible to iterate over the trie\'s prefixes using for...of.', () => {
-  //   const trie = new Trie();
-
-  //   trie.add('rat');
-  //   trie.add('rate');
-
-  //   const tests = [
-  //     'rat',
-  //     'rate'
-  //   ];
-
-  //   let i = 0;
-
-  //   for (const prefix of trie)
-  //     assert.deepStrictEqual(prefix, tests[i++]);
-  // });
-
   it('should be possible to create a trie from an arbitrary iterable.', () => {
     const words = ['roman', 'esqueroman'];
 
@@ -159,9 +107,10 @@ describe('surge domainset dedupe', () => {
 
   it('should remove subdomain', () => {
     const trie = createTrie(['www.noc.one', 'www.sukkaw.com', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net']);
-    // trie.find('noc.one').toBe(['www.noc.one']);
+
+    console.log(trie);
+
     expect(trie.find('.skk.moe')).toStrictEqual(['image.cdn.skk.moe', 'blog.skk.moe']);
-    // trie.find('sukkaw.net').toBe(['cdn.sukkaw.net']);
     expect(trie.find('.sukkaw.com')).toStrictEqual(['www.sukkaw.com']);
   });
 

+ 106 - 10
Build/lib/trie.ts

@@ -2,20 +2,34 @@
  * Suffix Trie based on Mnemonist Trie
  */
 
+// import { Trie } from 'mnemonist';
+
 export const SENTINEL = Symbol('SENTINEL');
 
 type TrieNode = {
-  [SENTINEL]: boolean
+  [SENTINEL]: boolean,
+  [Bun.inspect.custom]: () => string
 } & Map<string, TrieNode>;
 
+const deepTrieNodeToJSON = (node: TrieNode) => {
+  const obj: Record<string, any> = {};
+  if (node[SENTINEL]) {
+    obj['[start]'] = node[SENTINEL];
+  }
+  node.forEach((value, key) => {
+    obj[key] = deepTrieNodeToJSON(value);
+  });
+  return obj;
+};
+
 const createNode = (): TrieNode => {
-  const map = new Map<string, TrieNode>();
-  const node = map as TrieNode;
+  const node = new Map<string, TrieNode>() as TrieNode;
   node[SENTINEL] = false;
+  node[Bun.inspect.custom] = () => JSON.stringify(deepTrieNodeToJSON(node), null, 2);
   return node;
 };
 
-export const createTrie = (from?: string[] | Set<string>) => {
+export const createTrie = (from?: string[] | Set<string> | null) => {
   let size = 0;
   const root: TrieNode = createNode();
 
@@ -25,6 +39,7 @@ export const createTrie = (from?: string[] | Set<string>) => {
   const add = (suffix: string): void => {
     let node: TrieNode = root;
     let token: string;
+
     for (let i = suffix.length - 1; i >= 0; i--) {
       token = suffix[i];
 
@@ -40,8 +55,8 @@ export const createTrie = (from?: string[] | Set<string>) => {
     // Do we need to increase size?
     if (!node[SENTINEL]) {
       size++;
-      node[SENTINEL] = true;
     }
+    node[SENTINEL] = true;
   };
 
   /**
@@ -84,8 +99,8 @@ export const createTrie = (from?: string[] | Set<string>) => {
     const nodeStack: TrieNode[] = [node];
     const suffixStack: string[] = [inputSuffix];
 
-    while (nodeStack.length) {
-      const suffix = suffixStack.pop()!;
+    do {
+      const suffix: string = suffixStack.pop()!;
       node = nodeStack.pop()!;
 
       if (node[SENTINEL]) {
@@ -98,11 +113,50 @@ export const createTrie = (from?: string[] | Set<string>) => {
         nodeStack.push(childNode);
         suffixStack.push(k + suffix);
       });
-    }
+    } while (nodeStack.length);
 
     return matches;
   };
 
+  /**
+   * Works like trie.find, but instead of returning the matches as an array, it removes them from the given set in-place.
+   *
+   * The trie is first walked along `inputSuffix` from its last character to its first.
+   * If that path does not exist, nothing is removed. Otherwise the whole subtree below
+   * that node is traversed, and every stored string found there is deleted from `set`.
+   * `inputSuffix` itself is never deleted by this method, even if it is stored in the trie.
+   *
+   * @param inputSuffix - suffix whose stored extensions should be removed
+   * @param set - the set to delete matches from; mutated in place
+   */
+  const substractSetInPlaceFromFound = (inputSuffix: string, set: Set<string>) => {
+    let node: TrieNode | undefined = root;
+    let token: string;
+
+    // Walk down to the node that spells inputSuffix (last character first); return early if the path is missing
+    for (let i = inputSuffix.length - 1; i >= 0; i--) {
+      token = inputSuffix[i];
+
+      node = node.get(token);
+      if (!node) {
+        return;
+      }
+    }
+
+    // Depth-first traversal of the subtree below inputSuffix, using two parallel stacks (node + string built so far)
+    const nodeStack: TrieNode[] = [node];
+    const suffixStack: string[] = [inputSuffix];
+
+    do {
+      const suffix = suffixStack.pop()!;
+      node = nodeStack.pop()!;
+
+      if (node[SENTINEL]) {
+        if (suffix !== inputSuffix) {
+          // found a stored string that extends inputSuffix, delete it from set
+          set.delete(suffix);
+        }
+      }
+
+      node.forEach((childNode, k) => {
+        nodeStack.push(childNode);
+        suffixStack.push(k + suffix);
+      });
+    } while (nodeStack.length);
+  };
+
   /**
    * Method used to delete a suffix from the trie.
    */
@@ -169,23 +223,65 @@ export const createTrie = (from?: string[] | Set<string>) => {
     return node[SENTINEL];
   };
 
-  if (from) {
+  if (Array.isArray(from)) {
+    for (let i = 0, l = from.length; i < l; i++) {
+      add(from[i]);
+    }
+  } else if (from) {
     from.forEach(add);
   }
 
+  /**
+   * Collect every string stored in the trie.
+   *
+   * Walks the trie depth-first from the root using two parallel stacks (the node
+   * and the string accumulated on the way to it). `add` inserts strings from
+   * their last character to their first, so each stored string is rebuilt by
+   * prepending the edge token to the string accumulated so far.
+   *
+   * @returns all strings whose terminal node is flagged with SENTINEL
+   */
+  const dump = () => {
+    const nodeStack: TrieNode[] = [root];
+    const suffixStack: string[] = [''];
+
+    const results: string[] = [];
+
+    do {
+      const currentNode = nodeStack.pop()!;
+      const currentSuffix = suffixStack.pop()!;
+
+      // Only nodes that terminate an inserted string are reported
+      if (currentNode[SENTINEL]) {
+        results.push(currentSuffix);
+      }
+
+      // Descend from the node just popped (not from the root), extending
+      // that node's own accumulated string; otherwise the stack never drains.
+      currentNode.forEach((childNode, k) => {
+        nodeStack.push(childNode);
+        suffixStack.push(k + currentSuffix);
+      });
+    } while (nodeStack.length);
+
+    return results;
+  };
+
   return {
     add,
     contains,
     find,
+    substractSetInPlaceFromFound,
     remove,
     delete: remove,
     has,
+    dump,
     get size() {
       return size;
     },
     get root() {
       return root;
-    }
+    },
+    [Bun.inspect.custom]: () => JSON.stringify(deepTrieNodeToJSON(root), null, 2)
   };
 };