Browse Source

Perf: simplify white suffix dedupe

SukkaW 1 year ago
parent
commit
02bff12245

+ 12 - 17
Build/build-reject-domainset.ts

@@ -15,7 +15,7 @@ import * as tldts from 'tldts';
 import { SHARED_DESCRIPTION } from './lib/constants';
 import { SHARED_DESCRIPTION } from './lib/constants';
 import { getPhishingDomains } from './lib/get-phishing-domains';
 import { getPhishingDomains } from './lib/get-phishing-domains';
 
 
-import * as SetHelpers from 'mnemonist/set';
+import { add as SetAdd, subtract as SetSubstract } from 'mnemonist/set';
 import { setAddFromArray } from './lib/set-add-from-array';
 import { setAddFromArray } from './lib/set-add-from-array';
 import { sort } from './lib/timsort';
 import { sort } from './lib/timsort';
 
 
@@ -23,7 +23,7 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
   /** Whitelists */
   /** Whitelists */
   const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST);
   const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST);
 
 
-  const domainSets = new Set<string>();
+  let domainSets = new Set<string>();
 
 
   // Parse from AdGuard Filters
   // Parse from AdGuard Filters
   const shouldStop = await span
   const shouldStop = await span
@@ -33,9 +33,9 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
       let shouldStop = false;
       let shouldStop = false;
       await Promise.all([
       await Promise.all([
         // Parse from remote hosts & domain lists
         // Parse from remote hosts & domain lists
-        ...HOSTS.map(entry => processHosts(childSpan, entry[0], entry[1], entry[2], entry[3]).then(hosts => SetHelpers.add(domainSets, hosts))),
+        ...HOSTS.map(entry => processHosts(childSpan, entry[0], entry[1], entry[2], entry[3]).then(hosts => SetAdd(domainSets, hosts))),
 
 
-        ...DOMAIN_LISTS.map(entry => processDomainLists(childSpan, entry[0], entry[1], entry[2]).then(hosts => SetHelpers.add(domainSets, hosts))),
+        ...DOMAIN_LISTS.map(entry => processDomainLists(childSpan, entry[0], entry[1], entry[2]).then(hosts => SetAdd(domainSets, hosts))),
 
 
         ...ADGUARD_FILTERS.map(input => (
         ...ADGUARD_FILTERS.map(input => (
           typeof input === 'string'
           typeof input === 'string'
@@ -58,7 +58,7 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
           setAddFromArray(filterRuleWhitelistDomainSets, black);
           setAddFromArray(filterRuleWhitelistDomainSets, black);
         }))),
         }))),
         getPhishingDomains(childSpan).then(([purePhishingDomains, fullPhishingDomainSet]) => {
         getPhishingDomains(childSpan).then(([purePhishingDomains, fullPhishingDomainSet]) => {
-          SetHelpers.add(domainSets, fullPhishingDomainSet);
+          SetAdd(domainSets, fullPhishingDomainSet);
           setAddFromArray(domainSets, purePhishingDomains);
           setAddFromArray(domainSets, purePhishingDomains);
         }),
         }),
         childSpan.traceChildAsync('process reject_sukka.conf', async () => {
         childSpan.traceChildAsync('process reject_sukka.conf', async () => {
@@ -94,22 +94,17 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
     });
     });
 
 
     // Remove as many domains as possible from domainSets before creating trie
     // Remove as many domains as possible from domainSets before creating trie
-    SetHelpers.subtract(domainSets, filterRuleWhitelistDomainSets);
+    SetSubstract(domainSets, filterRuleWhitelistDomainSets);
 
 
-    childSpan.traceChildSync('dedupe from white suffixes', () => {
-      const trie = createTrie(domainSets);
+    domainSets = new Set(childSpan.traceChildSync('dedupe from white suffixes', () => {
+      const trie = createTrie(domainSets, true, true);
 
 
       filterRuleWhitelistDomainSets.forEach(suffix => {
       filterRuleWhitelistDomainSets.forEach(suffix => {
-        trie.substractSetInPlaceFromFound(suffix, domainSets);
-        if (suffix[0] === '.') {
-          domainSets.delete(suffix.slice(1));
-          domainSets.delete(suffix);
-        } else {
-          domainSets.delete(`.${suffix}`);
-          domainSets.delete(suffix);
-        }
+        trie.whitelist(suffix);
       });
       });
-    });
+
+      return trie.dump();
+    }));
 
 
     childSpan.traceChildSync('dedupe from black keywords', () => {
     childSpan.traceChildSync('dedupe from black keywords', () => {
       const kwfilter = createKeywordFilter(domainKeywordsSet);
       const kwfilter = createKeywordFilter(domainKeywordsSet);

+ 0 - 25
Build/lib/get-phishing-domains.ts

@@ -1,22 +1,11 @@
 import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
 import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
 import { processDomainLists } from './parse-filter';
 import { processDomainLists } from './parse-filter';
 import * as tldts from 'tldts';
 import * as tldts from 'tldts';
-import { createTrie } from './trie';
 import { TTL } from './cache-filesystem';
 import { TTL } from './cache-filesystem';
 
 
 import { add as SetAdd } from 'mnemonist/set';
 import { add as SetAdd } from 'mnemonist/set';
 import type { Span } from '../trace';
 import type { Span } from '../trace';
 
 
-const WHITELIST_DOMAIN = [
-  'w3s.link',
-  'dweb.link',
-  'nftstorage.link',
-  'square.site',
-  'business.site',
-  'page.link', // Firebase URL Shortener
-  'fleek.cool',
-  'notion.site'
-];
 const BLACK_TLD = new Set([
 const BLACK_TLD = new Set([
   'accountant',
   'accountant',
   'autos',
   'autos',
@@ -114,20 +103,6 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
     return domainSet;
     return domainSet;
   });
   });
 
 
-  span.traceChildSync('whitelisting phishing domains', (curSpan) => {
-    const trieForRemovingWhiteListed = curSpan.traceChildSync('create trie for whitelisting', () => createTrie(domainSet));
-
-    return curSpan.traceChild('delete whitelisted from domainset').traceSyncFn(() => {
-      for (let i = 0, len = WHITELIST_DOMAIN.length; i < len; i++) {
-        const white = WHITELIST_DOMAIN[i];
-        domainSet.delete(white);
-        domainSet.delete(`.${white}`);
-
-        trieForRemovingWhiteListed.substractSetInPlaceFromFound(`.${white}`, domainSet);
-      }
-    });
-  });
-
   const domainCountMap: Record<string, number> = {};
   const domainCountMap: Record<string, number> = {};
 
 
   span.traceChildSync('process phishing domain set', () => {
   span.traceChildSync('process phishing domain set', () => {

+ 9 - 1
Build/lib/reject-data-source.ts

@@ -224,8 +224,16 @@ export const PREDEFINED_WHITELIST = [
   'email.accounts.bitly.com', // Fuck Peter Lowe Hosts
   'email.accounts.bitly.com', // Fuck Peter Lowe Hosts
   'adsense.google.com', // Fuck Peter Lowe Hosts
   'adsense.google.com', // Fuck Peter Lowe Hosts
   'api.vip.miui.com', // Fuck Goodbye Xiaomi Ads
   'api.vip.miui.com', // Fuck Goodbye Xiaomi Ads
-  'stripe.com' // EasyPrivacy only blocks m.stripe.com wwith $third-party,
+  'stripe.com', // EasyPrivacy only blocks m.stripe.com with $third-party,
   // yet stupid AdGuardDNSFilter blocks all of it. Stupid AdGuard
   // yet stupid AdGuardDNSFilter blocks all of it. Stupid AdGuard
+  'w3s.link', // ipfs gateway
+  'dweb.link', // ipfs gateway
+  'nftstorage.link', // ipfs gateway
+  'fleek.cool', // ipfs gateway
+  'square.site', // Drag'n'Drop site building platform
+  'business.site', // Drag'n'Drop site building platform
+  'page.link', // Firebase URL Shortener
+  'notion.site'
 ];
 ];
 
 
 export const PREDEFINED_ENFORCED_WHITELIST = [
 export const PREDEFINED_ENFORCED_WHITELIST = [