
Perf: minor optimizations here and there

SukkaW committed 2 years ago · commit e8f3519479

+ 4 - 1
Build/build-microsoft-cdn.ts

@@ -32,7 +32,10 @@ export const getMicrosoftCdnRulesetPromise = createMemoizedPromise(async () => {
   });
 
   const trie2 = createTrie(set);
-  BLACKLIST.flatMap(domain => trie2.find(domain, true)).forEach(d => set.delete(d));
+  const black = BLACKLIST.flatMap(domain => trie2.find(domain, true));
+  for (let i = 0, len = black.length; i < len; i++) {
+    set.delete(black[i]);
+  }
 
   return Array.from(set).map(d => `DOMAIN-SUFFIX,${d}`).concat(WHITELIST);
 });
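
The chained flatMap(...).forEach(...) above cost one callback invocation per deleted domain; the replacement loop hoists the array length and deletes without per-element closures. A minimal standalone sketch of the same pattern (the names here are illustrative, not from the codebase):

    const set = new Set(['a.example.com', 'b.example.com', 'keep.example.com']);
    const matches = ['a.example.com', 'b.example.com'];

    // Hoist the length once; no per-element closure as with forEach
    for (let i = 0, len = matches.length; i < len; i++) {
      set.delete(matches[i]);
    }

    console.log(set); // Set { 'keep.example.com' }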

+ 20 - 17
Build/build-reject-domainset.ts

@@ -17,6 +17,9 @@ import * as tldts from 'tldts';
 import { SHARED_DESCRIPTION } from './lib/constants';
 import { getPhishingDomains } from './lib/get-phishing-domains';
 
+import * as SetHelpers from 'mnemonist/set';
+import { setAddFromArray } from './lib/set-add-from-array';
+
 export const buildRejectDomainSet = task(import.meta.path, async () => {
   /** Whitelists */
   const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST);
@@ -33,9 +36,7 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
       getGorhillPublicSuffixPromise(),
       // Parse from remote hosts & domain lists
       ...HOSTS.map(entry => processHosts(entry[0], entry[1], entry[2]).then(hosts => {
-        hosts.forEach(host => {
-          domainSets.add(host);
-        });
+        SetHelpers.add(domainSets, hosts);
       })),
       ...DOMAIN_LISTS.map(entry => processDomainLists(entry[0], entry[1], entry[2])),
       ...ADGUARD_FILTERS.map(input => {
@@ -48,24 +49,20 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
             shouldStop = true;
             // we should not break here, as we want to see full matches from all data source
           }
-          white.forEach(i => filterRuleWhitelistDomainSets.add(i));
-          black.forEach(i => domainSets.add(i));
+          setAddFromArray(filterRuleWhitelistDomainSets, white);
+          setAddFromArray(domainSets, black);
         });
       }),
       ...([
         'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exceptions.txt',
         'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exclusions.txt'
       ].map(input => processFilterRules(input).then(({ white, black }) => {
-        white.forEach(i => filterRuleWhitelistDomainSets.add(i));
-        black.forEach(i => filterRuleWhitelistDomainSets.add(i));
+        setAddFromArray(filterRuleWhitelistDomainSets, white);
+        setAddFromArray(filterRuleWhitelistDomainSets, black);
       }))),
-      getPhishingDomains().then(([purePhishingDomains, fullDomainSet]) => {
-        fullDomainSet.forEach(host => {
-          if (host) {
-            domainSets.add(host);
-          }
-        });
-        purePhishingDomains.forEach(suffix => domainSets.add(`.${suffix}`));
+      getPhishingDomains().then(([purePhishingDomains, fullPhishingDomainSet]) => {
+        SetHelpers.add(domainSets, fullPhishingDomainSet);
+        setAddFromArray(domainSets, purePhishingDomains);
       }),
       (async () => {
         for await (const l of readFileByLine(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf'))) {
@@ -79,9 +76,14 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
 
     // remove pre-defined enforced blacklist from whitelist
     const trie0 = createTrie(filterRuleWhitelistDomainSets);
-    PREDEFINED_ENFORCED_BACKLIST.forEach(enforcedBlack => {
-      trie0.find(enforcedBlack).forEach(found => filterRuleWhitelistDomainSets.delete(found));
-    });
+
+    for (let i = 0, len1 = PREDEFINED_ENFORCED_BACKLIST.length; i < len1; i++) {
+      const enforcedBlack = PREDEFINED_ENFORCED_BACKLIST[i];
+      const found = trie0.find(enforcedBlack);
+      for (let j = 0, len2 = found.length; j < len2; j++) {
+        filterRuleWhitelistDomainSets.delete(found[j]);
+      }
+    }
 
     return [gorhill, shouldStop] as const;
   });
@@ -109,6 +111,7 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
   // Dedupe domainSets
   traceSync('* Dedupe from black keywords/suffixes', () => {
     const trie1 = createTrie(domainSets);
+
     domainSuffixSet.forEach(suffix => {
       trie1.find(suffix, true).forEach(f => domainSets.delete(f));
     });
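
Note the two bulk-add paths: mnemonist's SetHelpers.add merges one Set into another in place, while the new setAddFromArray helper (introduced in Build/lib/set-add-from-array.ts below) covers plain arrays such as the white/black results. A small sketch of the split, assuming mnemonist's add mutates its first argument:

    import * as SetHelpers from 'mnemonist/set';
    import { setAddFromArray } from './lib/set-add-from-array';

    const target = new Set(['a']);

    // Set-to-Set bulk merge, in place
    SetHelpers.add(target, new Set(['b', 'c']));

    // Array-to-Set bulk merge via the new helper
    setAddFromArray(target, ['c', 'd']);

    console.log(target.size); // 4 ('a', 'b', 'c', 'd')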

+ 1 - 3
Build/lib/aho-corasick.ts

@@ -73,9 +73,7 @@ const createKeywordFilter = (keys: string[] | Set<string>) => {
     }
   };
 
-  keys.forEach(k => {
-    put(k, k.length);
-  });
+  keys.forEach(k => put(k, k.length));
 
   build();
 

+ 11 - 8
Build/lib/cache-filesystem.ts

@@ -34,10 +34,13 @@ interface CacheApplyStringOption {
 type CacheApplyOption<T> = T extends string ? CacheApplyStringOption : CacheApplyNonStringOption<T>;
 
 const randomInt = (min: number, max: number) => Math.floor(Math.random() * (max - min + 1)) + min;
+
+const ONE_HOUR = 60 * 60 * 1000;
+const ONE_DAY = 24 * ONE_HOUR;
 // Add some randomness to the cache ttl to avoid thundering herd
 export const TTL = {
   humanReadable(ttl: number) {
-    if (ttl >= 24 * 60 * 60 * 1000) {
+    if (ttl >= ONE_DAY) {
       return `${Math.round(ttl / 24 / 60 / 60 / 1000)}d`;
     }
     if (ttl >= 60 * 60 * 1000) {
@@ -45,13 +48,13 @@ export const TTL = {
     }
     return `${Math.round(ttl / 1000)}s`;
   },
-  THREE_HOURS: () => randomInt(1, 3) * 60 * 60 * 1000,
-  TWLVE_HOURS: () => randomInt(8, 12) * 60 * 60 * 1000,
-  ONE_DAY: () => randomInt(23, 25) * 60 * 60 * 1000,
-  THREE_DAYS: () => randomInt(1, 3) * 24 * 60 * 60 * 1000,
-  ONE_WEEK: () => randomInt(4, 7) * 24 * 60 * 60 * 1000,
-  TWO_WEEKS: () => randomInt(10, 14) * 24 * 60 * 60 * 1000,
-  TEN_DAYS: () => randomInt(7, 10) * 24 * 60 * 60 * 1000
+  THREE_HOURS: () => randomInt(1, 3) * ONE_HOUR,
+  TWLVE_HOURS: () => randomInt(8, 12) * ONE_HOUR,
+  ONE_DAY: () => randomInt(23, 25) * ONE_HOUR,
+  THREE_DAYS: () => randomInt(1, 3) * ONE_DAY,
+  ONE_WEEK: () => randomInt(4, 7) * ONE_DAY,
+  TEN_DAYS: () => randomInt(7, 10) * ONE_DAY,
+  TWO_WEEKS: () => randomInt(10, 14) * ONE_DAY
 };
 
 export class Cache {
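
The refactor is behavior-preserving: each TTL factory still returns a randomized duration, so entries cached in the same run do not all expire at once (the thundering-herd comment above). A standalone sketch of the jitter idea:

    const randomInt = (min: number, max: number) =>
      Math.floor(Math.random() * (max - min + 1)) + min;

    const ONE_HOUR = 60 * 60 * 1000;

    // Each call yields 23h, 24h or 25h, spreading expiry across +/- one hour
    const oneDayJittered = () => randomInt(23, 25) * ONE_HOUR;

    console.log(oneDayJittered()); // e.g. 82800000, 86400000 or 90000000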

+ 6 - 5
Build/lib/domain-deduper.ts

@@ -6,13 +6,16 @@ export function domainDeduper(inputDomains: string[], toArray = true): string[]
   const trie = createTrie(inputDomains);
   const sets = new Set(inputDomains);
 
-  for (let j = 0, len = inputDomains.length; j < len; j++) {
-    const d = inputDomains[j];
+  for (let i = 0, len = inputDomains.length; i < len; i++) {
+    const d = inputDomains[i];
     if (d[0] !== '.') {
       continue;
     }
 
-    trie.find(d, false).forEach(f => sets.delete(f));
+    const found = trie.find(d, true);
+    for (let j = 0, len = found.length; j < len; j++) {
+      sets.delete(found[j]);
+    }
 
     const a: string = d.slice(1);
 
@@ -27,5 +30,3 @@ export function domainDeduper(inputDomains: string[], toArray = true): string[]
 
   return sets;
 }
-
-export default domainDeduper;
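
For reference, a hypothetical call showing what domainDeduper does, assuming find(d, true) returns every stored entry covered by a '.'-prefixed suffix:

    import { domainDeduper } from './domain-deduper';

    // '.example.com' already covers the subdomain entries,
    // so only the wildcard entry survives
    domainDeduper(['.example.com', 'a.example.com', 'b.example.com']);
    // => ['.example.com'] (hypothetical output, given the semantics above)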

+ 11 - 6
Build/lib/get-phishing-domains.ts

@@ -8,6 +8,8 @@ import { processLine } from './process-line';
 import { TTL } from './cache-filesystem';
 import { isCI } from 'ci-info';
 
+import { add as SetAdd } from 'mnemonist/set';
+
 const WHITELIST_DOMAIN = new Set([
   'w3s.link',
   'dweb.link',
@@ -92,16 +94,19 @@ export const getPhishingDomains = () => traceAsync('get phishing domains', async
       : null,
     getGorhillPublicSuffixPromise()
   ]);
-  domainSet2?.forEach((domain) => domainSet.add(domain));
+  if (domainSet2) {
+    SetAdd(domainSet, domainSet2);
+  }
 
   traceSync.skip('* whitelisting phishing domains', () => {
     const trieForRemovingWhiteListed = createTrie(domainSet);
-    WHITELIST_DOMAIN.forEach(white => {
-      trieForRemovingWhiteListed.find(`.${white}`, false).forEach(f => domainSet.delete(f));
-      // if (trieForRemovingWhiteListed.has(white)) {
+    for (const white of WHITELIST_DOMAIN) {
+      const found = trieForRemovingWhiteListed.find(`.${white}`, false);
+      for (let i = 0, len = found.length; i < len; i++) {
+        domainSet.delete(found[i]);
+      }
       domainSet.delete(white);
-      // }
-    });
+    }
   });
 
   const domainCountMap: Record<string, number> = {};

+ 10 - 2
Build/lib/normalize-domain.ts

@@ -10,8 +10,16 @@ export const normalizeDomain = (domain: string) => {
   if (!parsed.isIcann && !parsed.isPrivate) return null;
 
   let h = parsed.hostname;
-  if (h[0] === '.') h = h.slice(1);
-  if (h.endsWith('.')) h = h.slice(0, -1);
+
+  let sliceStart = 0;
+  let sliceEnd = h.length;
+
+  if (h[0] === '.') sliceStart = 1;
+  if (h.endsWith('.')) sliceEnd = -1;
+
+  if (sliceStart !== 0 || sliceEnd !== h.length) {
+    h = h.slice(sliceStart, sliceEnd);
+  }
 
   if (h) return h;
   return null;
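
The rewrite computes both slice bounds up front so the hostname is sliced at most once, and not at all in the common case of no leading or trailing dot. A worked example:

    const h = '.example.com.';

    let sliceStart = 0;
    let sliceEnd = h.length;
    if (h[0] === '.') sliceStart = 1;   // drop leading dot
    if (h.endsWith('.')) sliceEnd = -1; // drop trailing dot

    // single allocation: '.example.com.' -> 'example.com'
    console.log(h.slice(sliceStart, sliceEnd)); // 'example.com'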

+ 3 - 3
Build/lib/parse-filter.ts

@@ -208,12 +208,12 @@ export async function processFilterRules(
     }
   ));
 
-  warningMessages.forEach(msg => {
+  for (let i = 0, len = warningMessages.length; i < len; i++) {
     console.warn(
-      picocolors.yellow(msg),
+      picocolors.yellow(warningMessages[i]),
       picocolors.gray(picocolors.underline(filterRulesUrl))
     );
-  });
+  }
 
   console.log(
     picocolors.gray('[process filter]'),

+ 6 - 6
Build/lib/reject-data-source.ts

@@ -15,17 +15,12 @@ export const HOSTS = [
   // Curben's UrlHaus Malicious URL Blocklist
   // 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-agh-online.txt',
   // 'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt',
-  ['https://curbengh.github.io/urlhaus-filter/urlhaus-filter-hosts.txt', true, TTL.THREE_HOURS()],
+  ['https://curbengh.github.io/urlhaus-filter/urlhaus-filter-hosts.txt', true, TTL.THREE_HOURS()]
   // Curben's Phishing URL Blocklist
   // Covered by lib/get-phishing-domains.ts
   // 'https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt'
   // 'https://phishing-filter.pages.dev/phishing-filter-agh.txt'
   // ['https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt', true, true],
-  // Curben's PUP Domains Blocklist
-  // 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
-  // 'https://pup-filter.pages.dev/pup-filter-agh.txt'
-  // The PUP filter has paused the update since 2023-05, so we set a 14 days cache ttl
-  ['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, TTL.TWO_WEEKS()]
 ] as const;
 
 export const DOMAIN_LISTS = [
@@ -38,6 +33,11 @@ export const DOMAIN_LISTS = [
   // DigitalSide Threat-Intel - OSINT Hub
   // Update once per day
   ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true, TTL.ONE_DAY()],
+  // Curben's PUP Domains Blocklist
+  // 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
+  // 'https://pup-filter.pages.dev/pup-filter-agh.txt'
+  // The PUP filter has paused the update since 2023-05, so we set a 14 days cache ttl
+  ['https://curbengh.github.io/pup-filter/pup-filter-domains.txt', true, TTL.TWO_WEEKS()],
   // AdGuard CNAME Filter Combined
   // Update on a 7 days basis, so we add a 3 hours cache ttl
   ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, TTL.THREE_DAYS()],

+ 8 - 0
Build/lib/set-add-from-array.ts

@@ -0,0 +1,8 @@
+/**
+ * In-place adding of elements from an array to a set.
+ */
+export function setAddFromArray<T>(set: Set<T>, arr: T[]): void {
+  for (let i = 0, len = arr.length; i < len; i++) {
+    set.add(arr[i]);
+  }
+}
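
A quick usage sketch of the new helper; it mutates the target set in place, and duplicates collapse via normal Set semantics:

    import { setAddFromArray } from './lib/set-add-from-array';

    const s = new Set<string>(['a']);
    setAddFromArray(s, ['b', 'c', 'b']);
    console.log(s.size); // 3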

+ 0 - 9
Build/validate-gfwlist.ts

@@ -93,15 +93,6 @@ export const parseGfwList = async () => {
     runAgainstRuleset(path.resolve(import.meta.dir, '../List/non_ip/stream.conf'))
   ]);
 
-  // for await (const l of readFileByLine(path.resolve(import.meta.dir, '../List/non_ip/stream.conf'))) {
-  //   const line = processLine(l);
-  //   if (!line) continue;
-  //   const domain = line[0] === '.' ? line.slice(1) : line;
-  //   if (top500Gfwed.has(domain)) {
-  //     notIncludedTop500Gfwed.delete(domain);
-  //   }
-  // }
-
   console.log(notIncludedTop500Gfwed);
 
   return [