Browse Source

Perf: make reject list build faster

SukkaW 2 years ago
parent
commit
4d0a5260ca

+ 49 - 80
Build/build-reject-domainset.js

@@ -2,12 +2,9 @@
 const { promises: fsPromises } = require('fs');
 const fse = require('fs-extra');
 const { resolve: pathResolve } = require('path');
-const Piscina = require('piscina');
 const { processHosts, processFilterRules, preprocessFullDomainSetBeforeUsedAsWorkerData } = require('./lib/parse-filter');
-const cpuCount = require('os').cpus().length;
-const { isCI } = require('ci-info');
-const threads = isCI ? cpuCount : cpuCount / 2;
 const { getDomain } = require('tldts');
+const Trie = require('./lib/trie');
 
 const { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, PREDEFINED_ENFORCED_BACKLIST } = require('./lib/reject-data-source');
 const { withBannerArray } = require('./lib/with-banner');
@@ -30,15 +27,14 @@ const domainSuffixSet = new Set();
   console.time('* Download and process Hosts');
 
   // Parse from remote hosts & domain lists
-  (await Promise.all(
-    HOSTS.map(entry => processHosts(entry[0], entry[1]))
-  )).forEach(hosts => {
-    hosts.forEach(host => {
-      if (host) {
-        domainSets.add(host);
-      }
+  (await Promise.all(HOSTS.map(entry => processHosts(entry[0], entry[1]))))
+    .forEach(hosts => {
+      hosts.forEach(host => {
+        if (host) {
+          domainSets.add(host);
+        }
+      });
     });
-  });
 
   console.timeEnd('* Download and process Hosts');
 
@@ -167,8 +163,31 @@ const domainSuffixSet = new Set();
   console.log(`Start deduping from black keywords/suffixes! (${previousSize})`);
   console.time(`* Dedupe from black keywords/suffixes`);
 
+  const trie1 = Trie.from(Array.from(domainSets));
+  domainSuffixSet.forEach(suffix => {
+    trie1.find(suffix, true).forEach(f => domainSets.delete(f));
+  });
+  filterRuleWhitelistDomainSets.forEach(suffix => {
+    trie1.find(suffix, true).forEach(f => domainSets.delete(f));
+  });
+
+  // Build whitelist trie, to handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`)
+  const trieWhite = Trie.from(Array.from(filterRuleWhitelistDomainSets));
   for (const domain of domainSets) {
-    if (isMatchKeyword(domain) || isMatchSuffix(domain) || isInWhiteList(domain)) {
+    if (domain[0] !== '.' && trieWhite.has(`.${domain}`)) {
+      domainSets.delete(domain);
+      continue;
+    }
+    if (domain[0] === '.') {
+      const found = trieWhite.find(domain);
+      if (found.length > 0) {
+        domainSets.delete(domain);
+        continue;
+      }
+    }
+
+    // Remove keyword
+    if (isMatchKeyword(domain)) {
       domainSets.delete(domain);
     }
   }
@@ -183,44 +202,28 @@ const domainSuffixSet = new Set();
   const START_TIME = Date.now();
 
   const domainSetsArray = Array.from(domainSets);
-  const workerData = preprocessFullDomainSetBeforeUsedAsWorkerData(domainSetsArray);
-
-  const piscina = new Piscina({
-    filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js'),
-    workerData,
-    idleTimeout: 50,
-    minThreads: threads,
-    maxThreads: threads
-  });
-
-  console.log(workerData.length);
-
-  console.log(`Launching ${threads} threads...`);
-
-  const tasksArray = domainSetsArray.reduce((result, element, index) => {
-    const chunk = index % threads;
-    result[chunk] ??= [];
-
-    result[chunk].push(element);
-    return result;
-  }, /** @type {string[][]} */([]));
+  const trie2 = Trie.from(domainSetsArray);
+  const fullsetDomainStartsWithADot = preprocessFullDomainSetBeforeUsedAsWorkerData(domainSetsArray);
+  console.log(fullsetDomainStartsWithADot.length);
+
+  for (let j = 0, len = fullsetDomainStartsWithADot.length; j < len; j++) {
+    const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j];
+    const found = trie2.find(domainStartsWithADotAndFromFullSet, false);
+    if (found.length) {
+      found.forEach(f => {
+        domainSets.delete(f);
+      })
+    }
 
-  (await Promise.all(
-    tasksArray.map(chunk => piscina.run({ chunk }))
-  )).forEach((result, taskIndex) => {
-      const chunk = tasksArray[taskIndex];
-      for (let i = 0, len = result.length; i < len; i++) {
-        if (result[i]) {
-          domainSets.delete(chunk[i]);
-        }
-      }
-    });
+    const a = domainStartsWithADotAndFromFullSet.slice(1);
+    if (trie2.has(a)) {
+      domainSets.delete(a);
+    }
+  }
 
   console.log(`* Dedupe from covered subdomain - ${(Date.now() - START_TIME) / 1000}s`);
   console.log(`Deduped ${previousSize - domainSets.size} rules!`);
 
-  await piscina.destroy();
-
   console.time('* Write reject.conf');
 
   const sorter = (a, b) => {
@@ -264,9 +267,6 @@ const domainSuffixSet = new Set();
   console.timeEnd('* Write reject.conf');
 
   console.timeEnd('Total Time - build-reject-domain-set');
-  if (piscina.queueSize === 0) {
-    process.exit(0);
-  }
 })();
 
 /**
@@ -281,34 +281,3 @@ function isMatchKeyword(domain) {
 
   return false;
 }
-
-/**
- * @param {string} domain
- */
-function isMatchSuffix(domain) {
-  for (const suffix of domainSuffixSet) {
-    if (domain.endsWith(suffix)) {
-      return true;
-    }
-  }
-
-  return false;
-}
-
-/**
- * @param {string} domain
- */
-function isInWhiteList(domain) {
-  for (const white of filterRuleWhitelistDomainSets) {
-    if (domain === white || domain.endsWith(white)) {
-      return true;
-    }
-    if (white.endsWith(domain)) {
-      // If a whole domain is in blacklist but a subdomain is in whitelist
-      // We have no choice but to remove the whole domain from blacklist
-      return true;
-    }
-  }
-
-  return false;
-}

+ 1 - 1
Build/lib/parse-filter.js

@@ -397,7 +397,7 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
  */
 function preprocessFullDomainSetBeforeUsedAsWorkerData(data) {
   // Keep only the entries whose first character is a dot, then order them
   // from shortest to longest. filter() returns a new array, so the sort
   // below does not reorder `data` itself.
   return data
-    .filter(domain => domain.charCodeAt(0) === 46)
+    .filter(domain => domain[0] === '.')
     .sort((a, b) => a.length - b.length);
 }
 

+ 31 - 52
Build/lib/trie.js

@@ -129,7 +129,7 @@ class Trie {
   /**
    * Method used to delete a prefix from the trie.
    *
-   * @param  {string|array} suffix - Prefix to delete.
+   * @param  {string} suffix - Prefix to delete.
    * @return {boolean}
    */
   delete(suffix) {
@@ -198,66 +198,45 @@ class Trie {
   }
 
   /**
-   * Method returning an iterator over the trie's prefixes.
-   *
-   * @param  {string|array} [prefix] - Optional starting prefix.
-   * @return {Iterator}
+   * @return {string[]}
    */
-  // prefixes(prefix) {
-  //   let node = this.root;
-  //   const nodeStack = [];
-  //   const prefixStack = [];
-  //   let token;
-  //   let i;
-  //   let l;
-
-  //   const isString = this.mode === 'string';
+  dump() {
+    let node = this.root;
+    const nodeStack = [];
+    const prefixStack = [];
+    // Resolving initial prefix
+    const prefix = '';
 
-  //   // Resolving initial prefix
-  //   if (prefix) {
-  //     for (i = 0, l = prefix.length; i < l; i++) {
-  //       token = prefix[i];
-  //       node = node[token];
+    nodeStack.push(node);
+    prefixStack.push(prefix);
 
-  //       // If the prefix does not exist, we return an empty iterator
-  //       if (typeof node === 'undefined')
-  //         return Iterator.empty();
-  //     }
-  //   }
-  //   else {
-  //     prefix = isString ? '' : [];
-  //   }
+    /** @type {string[]} */
+    const results = [];
 
-  //   nodeStack.push(node);
-  //   prefixStack.push(prefix);
-
-  //   return new Iterator(() => {
-  //     let currentNode;
-  //     let currentPrefix;
-  //     let hasValue = false;
-  //     let k;
+    let currentNode;
+    let currentPrefix;
+    let hasValue = false;
+    let k;
 
-  //     while (nodeStack.length) {
-  //       currentNode = nodeStack.pop();
-  //       currentPrefix = prefixStack.pop();
+    while (nodeStack.length) {
+      currentNode = nodeStack.pop();
+      currentPrefix = prefixStack.pop();
 
-  //       for (k in currentNode) {
-  //         if (k === SENTINEL) {
-  //           hasValue = true;
-  //           continue;
-  //         }
+      for (k in currentNode) {
+        if (k === SENTINEL) {
+          hasValue = true;
+          continue;
+        }
 
-  //         nodeStack.push(currentNode[k]);
-  //         prefixStack.push(isString ? currentPrefix + k : currentPrefix.concat(k));
-  //       }
+        nodeStack.push(currentNode[k]);
+        prefixStack.push(k + currentPrefix);
+      }
 
-  //       if (hasValue)
-  //         return { done: false, value: currentPrefix };
-  //     }
+      if (hasValue) results.push(currentPrefix);
+    }
 
-  //     return { done: true };
-  //   });
-  // }
+    return results;
+  }
 
   /**
    * Convenience known methods.

+ 0 - 45
Build/worker/build-reject-domainset-worker.js

@@ -1,45 +0,0 @@
-// @ts-check
-const Piscina = require('piscina');
-const Trie = require('../lib/trie');
-// const { isCI } = require('ci-info');
-/** @type {string[]} */
-const fullsetDomainStartsWithADot = Piscina.workerData;
-const totalLen = fullsetDomainStartsWithADot.length;
-
-const DOT = '.';
-
-// const log = isCI ? () => { } : console.log.bind(console);
-/**
- * @param {{ chunk: string[] }} param0
- */
-module.exports = ({ chunk }) => {
-  const chunkLength = chunk.length;
-  const outputToBeRemoved = new Int8Array(chunkLength);
-
-  const trie = Trie.from(chunk);
-
-  for (let j = 0; j < totalLen; j++) {
-    const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j];
-
-    const found = trie.find(domainStartsWithADotAndFromFullSet, false)
-
-    if (found.length) {
-      found.forEach(f => {
-        const index = chunk.indexOf(f);
-        if (index !== -1) {
-          outputToBeRemoved[index] = 1;
-        }
-      })
-    }
-
-    const a = domainStartsWithADotAndFromFullSet.slice(1);
-    if (trie.has(a)) {
-      const index = chunk.indexOf(a);
-      if (index !== -1) {
-        outputToBeRemoved[index] = 1;
-      }
-    }
-  }
-
-  return Piscina.move(outputToBeRemoved);
-};