Browse Source

Implement keyword deduper using AhoCorasick

SukkaW 2 years ago
parent
commit
dcf565fb6b
3 changed files with 150 additions and 37 deletions
  1. 9 36
      Build/build-reject-domainset.js
  2. 139 0
      Build/lib/aho-corasick.js
  3. 2 1
      Build/lib/reject-data-source.js

+ 9 - 36
Build/build-reject-domainset.js

@@ -4,7 +4,7 @@ const fse = require('fs-extra');
 const readline = require('readline');
 
 const { resolve: pathResolve } = require('path');
-const { processHosts, processFilterRules, preprocessFullDomainSetBeforeUsedAsWorkerData } = require('./lib/parse-filter');
+const { processHosts, processFilterRules } = require('./lib/parse-filter');
 const { getDomain } = require('tldts');
 const Trie = require('./lib/trie');
 
@@ -12,6 +12,8 @@ const { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, PREDEFINED_ENFORCED_BACKLI
 const { withBannerArray } = require('./lib/with-banner');
 const { compareAndWriteFile } = require('./lib/string-array-compare');
 const { processLine } = require('./lib/process-line');
+const { domainDeduper } = require('./lib/domain-deduper');
+const createKeywordFilter = require('./lib/aho-corasick');
 
 /** Whitelists */
 const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST);
@@ -151,6 +153,8 @@ const domainSuffixSet = new Set();
   console.log(`Start deduping from black keywords/suffixes! (${previousSize})`);
   console.time('* Dedupe from black keywords/suffixes');
 
+  const kwfilter = createKeywordFilter(Array.from(domainKeywordsSet));
+
   const trie1 = Trie.from(Array.from(domainSets));
   domainSuffixSet.forEach(suffix => {
     trie1.find(suffix, true).forEach(f => domainSets.delete(f));
@@ -173,7 +177,7 @@ const domainSuffixSet = new Set();
     }
 
     // Remove keyword
-    if (isMatchKeyword(domain)) {
+    if (kwfilter.search(domain)) {
       domainSets.delete(domain);
     }
   }
@@ -187,28 +191,10 @@ const domainSuffixSet = new Set();
 
   const START_TIME = Date.now();
 
-  const domainSetsArray = Array.from(domainSets);
-  const trie2 = Trie.from(domainSetsArray);
-  const fullsetDomainStartsWithADot = preprocessFullDomainSetBeforeUsedAsWorkerData(domainSetsArray);
-  console.log(fullsetDomainStartsWithADot.length);
-
-  for (let j = 0, len = fullsetDomainStartsWithADot.length; j < len; j++) {
-    const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j];
-    const found = trie2.find(domainStartsWithADotAndFromFullSet, false);
-    if (found.length) {
-      found.forEach(f => {
-        domainSets.delete(f);
-      });
-    }
-
-    const a = domainStartsWithADotAndFromFullSet.slice(1);
-    if (trie2.has(a)) {
-      domainSets.delete(a);
-    }
-  }
+  const dudupedDominArray = domainDeduper(Array.from(domainSets));
 
   console.log(`* Dedupe from covered subdomain - ${(Date.now() - START_TIME) / 1000}s`);
-  console.log(`Deduped ${previousSize - domainSets.size} rules!`);
+  console.log(`Deduped ${previousSize - dudupedDominArray.length} rules!`);
 
   console.time('* Write reject.conf');
 
@@ -221,7 +207,7 @@ const domainSuffixSet = new Set();
     }
     return 0;
   };
-  const sortedDomainSets = Array.from(domainSets)
+  const sortedDomainSets = dudupedDominArray
     .map((v) => {
       return { v, domain: getDomain(v.charCodeAt(0) === 46 ? v.slice(1) : v) || v };
     })
@@ -255,16 +241,3 @@ const domainSuffixSet = new Set();
 
   console.timeEnd('Total Time - build-reject-domain-set');
 })();
-
-/**
- * @param {string} domain
- */
-function isMatchKeyword(domain) {
-  for (const keyword of domainKeywordsSet) {
-    if (domain.includes(keyword)) {
-      return true;
-    }
-  }
-
-  return false;
-}

+ 139 - 0
Build/lib/aho-corasick.js

@@ -0,0 +1,139 @@
/**
 * A node of the keyword trie used by the Aho-Corasick matcher.
 *
 * @typedef {Object} Node
 * @prop {number} [depth = 0] number of characters on the path from the root (0 for the root)
 * @prop {string} key character on the edge leading into this node ('root' for the root node)
 * @prop {boolean} [word = false] true when a keyword ends at this node, or (after the
 * automaton is built) at any node reachable through its `fail` chain
 * @prop {Record<string, Node>} [children={}] child nodes indexed by their character
 * @prop {Node} [fail] node to fall back to when no child matches the next character
 * @prop {number} [count=0] number of inserted keywords whose path goes through this node
 */

/**
 * Create a fresh trie node with no children and no fail link.
 *
 * @param {string} key
 * @param {number} depth
 * @returns {Node}
 */
const createNode = (key, depth = 0) => ({
  depth,
  key,
  word: false,
  children: {},
  fail: undefined,
  count: 0
});

/**
 * Build an Aho-Corasick automaton from `keys` and return a matcher that tells
 * whether a text contains at least one of the keys as a substring.
 *
 * Empty-string keys are ignored (they never produce a match).
 *
 * @param {string[]} keys
 * @returns {{ search: (text: string) => boolean }}
 */
const createKeywordFilter = (keys) => {
  const root = createNode('root');

  /**
   * Insert one keyword into the trie (fail links are computed later by `build`).
   *
   * @param {string} key
   */
  const put = (key) => {
    const len = key.length;
    let node = root;
    node.count++;

    for (let idx = 0; idx < len; idx++) {
      const val = key[idx];
      let nextNode = node.children[val];

      if (!nextNode) {
        nextNode = createNode(val, idx + 1);
        node.children[val] = nextNode;
      }

      nextNode.count++;
      node = nextNode;
    }

    // Never flag the root: an empty keyword must not turn every text into a match.
    if (len > 0) {
      node.word = true;
    }
  };

  /**
   * Compute the fail link of every node with a breadth-first walk, and
   * propagate the `word` flag along those links.
   */
  const build = () => {
    /** @type {Node[]} */
    const queue = [root];

    // `queue` grows while being walked; `head` is the read cursor.
    for (let head = 0; head < queue.length; head++) {
      const parent = queue[head];

      // eslint-disable-next-line guard-for-in -- plain object
      for (const key in parent.children) {
        const node = parent.children[key];

        // Longest proper suffix of this node's path that is also a path in the trie.
        let failNode = parent.fail;
        while (failNode && !failNode.children[key]) {
          failNode = failNode.fail;
        }
        node.fail = failNode?.children[key] || root;

        // A keyword ending at the fail target is a suffix of this node's path,
        // so reaching this node means that keyword has been seen as well.
        // The fail target is always shallower, hence already finalized by the BFS.
        if (node.fail.word) {
          node.word = true;
        }

        queue.push(node);
      }
    }
  };

  for (let idx = 0; idx < keys.length; idx++) {
    put(keys[idx]);
  }

  // Fail links only depend on the finished trie, so build them once.
  build();

  /**
   * @param {string} text
   * @returns {boolean} true as soon as any keyword is found inside `text`
   */
  const search = (text) => {
    let node = root;

    for (let i = 0, textLen = text.length; i < textLen; i++) {
      const key = text[i];

      // Fall back through fail links until some node accepts `key`;
      // `node` becomes undefined once even the root rejects it.
      while (node && !node.children[key]) {
        node = node.fail;
      }
      node = node?.children[key] || root;

      if (node.word) {
        return true;
      }
    }

    return false;
  };

  return {
    search
  };
};
+
+module.exports = createKeywordFilter;

+ 2 - 1
Build/lib/reject-data-source.js

@@ -240,7 +240,8 @@ const PREDEFINED_ENFORCED_WHITELIST = [
   'ipfs.fleek.cool',
   'repl.co',
   'w3s.link',
-  'translate.goog'
+  'translate.goog',
+  'backblazeb2.com'
 ];
 
 module.exports.HOSTS = HOSTS;