Browse Source

Perf: fastest dedupe

SukkaW 2 years ago
parent
commit
b659bff079
2 changed files with 22 additions and 30 deletions
  1. 4 2
      Build/build-reject-domainset.js
  2. 18 28
      Build/worker/build-reject-domainset-worker.js

+ 4 - 2
Build/build-reject-domainset.js

@@ -183,15 +183,17 @@ const domainSuffixSet = new Set();
   const START_TIME = Date.now();
   const START_TIME = Date.now();
 
 
   const domainSetsArray = Array.from(domainSets);
   const domainSetsArray = Array.from(domainSets);
+  const workerData = preprocessFullDomainSetBeforeUsedAsWorkerData(domainSetsArray);
+
   const piscina = new Piscina({
   const piscina = new Piscina({
     filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js'),
     filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js'),
-    workerData: preprocessFullDomainSetBeforeUsedAsWorkerData(Array.from(domainSetsArray)),
+    workerData,
     idleTimeout: 50,
     idleTimeout: 50,
     minThreads: threads,
     minThreads: threads,
     maxThreads: threads
     maxThreads: threads
   });
   });
 
 
-  console.log(preprocessFullDomainSetBeforeUsedAsWorkerData(Array.from(domainSetsArray)).length);
+  console.log(workerData.length);
 
 
   console.log(`Launching ${threads} threads...`);
   console.log(`Launching ${threads} threads...`);
 
 

+ 18 - 28
Build/worker/build-reject-domainset-worker.js

@@ -1,8 +1,9 @@
 // @ts-check
 // @ts-check
 const Piscina = require('piscina');
 const Piscina = require('piscina');
+const Trie = require('../lib/trie');
 // const { isCI } = require('ci-info');
 // const { isCI } = require('ci-info');
 /** @type {string[]} */
 /** @type {string[]} */
-const fullsetDomainStartsWithADot = Piscina.workerData
+const fullsetDomainStartsWithADot = Piscina.workerData;
 const totalLen = fullsetDomainStartsWithADot.length;
 const totalLen = fullsetDomainStartsWithADot.length;
 
 
 const DOT = '.';
 const DOT = '.';
@@ -15,38 +16,27 @@ module.exports = ({ chunk }) => {
   const chunkLength = chunk.length;
   const chunkLength = chunk.length;
   const outputToBeRemoved = new Int8Array(chunkLength);
   const outputToBeRemoved = new Int8Array(chunkLength);
 
 
-  for (let i = 0; i < chunkLength; i++) {
-    const domainFromInputChunk = chunk[i];
-    const domainFromInputLen = domainFromInputChunk.length;
+  const trie = Trie.from(chunk);
 
 
-    for (let j = 0; j < totalLen; j++) {
-      const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j];
-      // domainFromFullSet is always startsWith "."
-      if (domainStartsWithADotAndFromFullSet === domainFromInputChunk) continue;
+  for (let j = 0; j < totalLen; j++) {
+    const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j];
 
 
-      const domainFromFullSetLen = domainStartsWithADotAndFromFullSet.length;
+    const found = trie.find(domainStartsWithADotAndFromFullSet, false)
 
 
-      if (domainFromInputLen < domainFromFullSetLen) {
-        if (domainFromInputLen + 1 !== domainFromFullSetLen) {
-          continue;
+    if (found.length) {
+      found.forEach(f => {
+        const index = chunk.indexOf(f);
+        if (index !== -1) {
+          outputToBeRemoved[index] = 1;
         }
         }
+      })
+    }
 
 
-        // !domainFromInput.starsWith('.') && `.${domainFromInput}` === domainFromFullSet
-        if (
-          domainFromInputChunk[0] !== DOT
-          && domainStartsWithADotAndFromFullSet.endsWith(domainFromInputChunk)
-        ) {
-          outputToBeRemoved[i] = 1;
-          // log(domainFromInputChunk, domainStartsWithADotAndFromFullSet)
-          break;
-        }
-      } else if (
-        domainFromInputLen > domainFromFullSetLen
-        && domainFromInputChunk.endsWith(domainStartsWithADotAndFromFullSet)
-      ) {
-        outputToBeRemoved[i] = 1;
-        // log(domainFromInputChunk, domainStartsWithADotAndFromFullSet)
-        break;
+    const a = domainStartsWithADotAndFromFullSet.slice(1);
+    if (trie.has(a)) {
+      const index = chunk.indexOf(a);
+      if (index !== -1) {
+        outputToBeRemoved[index] = 1;
       }
       }
     }
     }
   }
   }