Browse Source

Perf: improve performance of reject suffix/keyword deduping

SukkaW 2 years ago
parent
commit
725f26b428

+ 2 - 2
Build/build-common.ts

@@ -109,7 +109,7 @@ async function transformDomainset(parentSpan: Span, sourcePath: string, relative
     )
   ];
 
-  return createRuleset(
+  return span.traceAsyncFn(() => createRuleset(
     span,
     title,
     description,
@@ -118,7 +118,7 @@ async function transformDomainset(parentSpan: Span, sourcePath: string, relative
     'domainset',
     path.resolve(outputSurgeDir, relativePath),
     path.resolve(outputClashDir, `${relativePath.slice(0, -path.extname(relativePath).length)}.txt`)
-  );
+  ));
 }
 
 /**

+ 12 - 30
Build/build-reject-domainset.ts

@@ -4,12 +4,12 @@ import path from 'path';
 import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter';
 import { createTrie } from './lib/trie';
 
-import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, PREDEFINED_ENFORCED_BACKLIST, DOMAIN_LISTS } from './lib/reject-data-source';
+import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS } from './lib/reject-data-source';
 import { createRuleset, compareAndWriteFile } from './lib/create-file';
 import { processLine } from './lib/process-line';
 import { domainDeduper } from './lib/domain-deduper';
 import createKeywordFilter from './lib/aho-corasick';
-import { readFileByLine } from './lib/fetch-text-by-line';
+import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
 import { sortDomains } from './lib/stable-sort-domain';
 import { task } from './trace';
 import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix';
@@ -63,25 +63,10 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
           setAddFromArray(domainSets, purePhishingDomains);
         }),
         childSpan.traceChild('process reject_sukka.conf').traceAsyncFn(async () => {
-          for await (const l of readFileByLine(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf'))) {
-            const line = processLine(l);
-            if (!line) continue;
-            domainSets.add(line);
-          }
+          setAddFromArray(domainSets, await readFileIntoProcessedArray(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf')));
         })
       ]);
 
-      // remove pre-defined enforced blacklist from whitelist
-      const trie0 = createTrie(filterRuleWhitelistDomainSets);
-
-      for (let i = 0, len1 = PREDEFINED_ENFORCED_BACKLIST.length; i < len1; i++) {
-        const enforcedBlack = PREDEFINED_ENFORCED_BACKLIST[i];
-        const found = trie0.find(enforcedBlack);
-        for (let j = 0, len2 = found.length; j < len2; j++) {
-          filterRuleWhitelistDomainSets.delete(found[j]);
-        }
-      }
-
       return shouldStop;
     });
 
@@ -116,25 +101,22 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
     });
     filterRuleWhitelistDomainSets.forEach(suffix => {
       trie1.find(suffix, true).forEach(f => domainSets.delete(f));
+
+      if (suffix[0] === '.') {
+        // e.g. remove `g.msn.com` because `.g.msn.com` is whitelisted (`@@||g.msn.com`)
+        domainSets.delete(suffix.slice(1));
+      } else {
+        // If `g.msn.com` is whitelisted, then `.g.msn.com` should be removed from domain set
+        domainSets.delete(`.${suffix}`);
+      }
     });
 
     // build a keyword matcher; domains matching any reject keyword are removed below
     const kwfilter = createKeywordFilter(domainKeywordsSet);
 
-    // handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`)
     for (const domain of domainSets) {
-      if (domain[0] === '.') {
-        if (filterRuleWhitelistDomainSets.has(domain)) {
-          domainSets.delete(domain);
-          continue;
-        }
-      } else if (filterRuleWhitelistDomainSets.has(`.${domain}`)) {
-        domainSets.delete(domain);
-        continue;
-      }
-
       // Remove keyword
-      if (kwfilter.search(domain)) {
+      if (kwfilter(domain)) {
         domainSets.delete(domain);
       }
     }

+ 1 - 5
Build/lib/aho-corasick.ts

@@ -77,7 +77,7 @@ const createKeywordFilter = (keys: string[] | Set<string>) => {
 
   build();
 
-  const search = (text: string) => {
+  return (text: string) => {
     let node: Node | undefined = root;
 
     for (let i = 0, textLen = text.length; i < textLen; i++) {
@@ -96,10 +96,6 @@ const createKeywordFilter = (keys: string[] | Set<string>) => {
 
     return false;
   };
-
-  return {
-    search
-  };
 };
 
 export default createKeywordFilter;

+ 21 - 15
Build/lib/get-phishing-domains.ts

@@ -99,17 +99,19 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
     SetAdd(domainSet, domainSet2);
   }
 
-  span.traceChild('whitelisting phishing domains').traceSyncFn(() => {
-    const trieForRemovingWhiteListed = createTrie(domainSet);
-
-    for (let i = 0, len = WHITELIST_DOMAIN.length; i < len; i++) {
-      const white = WHITELIST_DOMAIN[i];
-      const found = trieForRemovingWhiteListed.find(`.${white}`, true);
-      for (let j = 0, len2 = found.length; j < len2; j++) {
-        domainSet.delete(found[j]);
+  span.traceChild('whitelisting phishing domains').traceSyncFn((parentSpan) => {
+    const trieForRemovingWhiteListed = parentSpan.traceChild('create trie for whitelisting').traceSyncFn(() => createTrie(domainSet));
+
+    return parentSpan.traceChild('delete whitelisted from domainset').traceSyncFn(() => {
+      for (let i = 0, len = WHITELIST_DOMAIN.length; i < len; i++) {
+        const white = WHITELIST_DOMAIN[i];
+        const found = trieForRemovingWhiteListed.find(`.${white}`, true);
+        for (let j = 0, len2 = found.length; j < len2; j++) {
+          domainSet.delete(found[j]);
+        }
+        domainSet.delete(white);
       }
-      domainSet.delete(white);
-    }
+    });
   });
 
   const domainCountMap: Record<string, number> = {};
@@ -177,11 +179,15 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
     }
   });
 
-  const results = span.traceChild('get final phishing results').traceSyncFn(
-    () => Object.entries(domainCountMap)
-      .filter(entries => entries[1] >= 5)
-      .map(entries => entries[0])
-  );
+  const results = span.traceChild('get final phishing results').traceSyncFn(() => {
+    const results: string[] = [];
+    for (const domain in domainCountMap) {
+      if (domainCountMap[domain] > 5) {
+        results.push(domain);
+      }
+    }
+    return results;
+  });
 
   return [results, domainSet] as const;
 });

+ 0 - 4
Build/lib/reject-data-source.ts

@@ -211,10 +211,6 @@ export const PREDEFINED_WHITELIST = [
   'pstmrk.it'
 ];
 
-export const PREDEFINED_ENFORCED_BACKLIST = [
-  'telemetry.mozilla.org'
-];
-
 export const PREDEFINED_ENFORCED_WHITELIST = [
   'godaddysites.com',
   'web.app',

+ 4 - 0
Source/non_ip/reject.conf

@@ -43,6 +43,10 @@ DOMAIN-SUFFIX,pantheonsite.io
 DOMAIN-SUFFIX,sitebeat.crazydomains.com
 # >> Snowplow Analytics (publicsuffix)
 DOMAIN-SUFFIX,try-snowplow.com
+# >> Mozilla Telemetry (Enforcing)
+DOMAIN-SUFFIX,telemetry-coverage.mozilla.org
+DOMAIN-SUFFIX,telemetry.mozilla.org
+DOMAIN-SUFFIX,incoming-telemetry.thunderbird.net
 
 # >> Phishing
 DOMAIN-SUFFIX,gofenews.com