Browse Source

Perf: move phishing hosts process into a worker

SukkaW 1 year ago
parent
commit
c9b53b1c1a
2 changed files with 291 additions and 263 deletions
  1. 109 0
      Build/constants/phishing-score-source.ts
  2. 182 263
      Build/lib/get-phishing-domains.ts

+ 109 - 0
Build/constants/phishing-score-source.ts

@@ -0,0 +1,109 @@
+import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie';
+
+export const BLACK_TLD = new Set([
+  'accountant', 'art', 'autos',
+  'bar', 'beauty', 'bid', 'bio', 'biz', 'bond', 'business', 'buzz',
+  'cc', 'cf', 'cfd', 'click', 'cloud', 'club', 'cn', 'codes',
+  'co.uk', 'co.in', 'com.br', 'com.cn', 'com.pl', 'com.vn',
+  'cool', 'cricket', 'cyou',
+  'date', 'design', 'digital', 'download',
+  'faith', 'fit', 'fun',
+  'ga', 'gd', 'gives', 'gq', 'group', 'host',
+  'icu', 'id', 'info', 'ink',
+  'lat', 'life', 'live', 'link', 'loan', 'lol', 'ltd',
+  'me', 'men', 'ml', 'mobi', 'mom', 'monster',
+  'net.pl',
+  'one', 'online',
+  'party', 'pro', 'pl', 'pw',
+  'racing', 'rest', 'review', 'rf.gd',
+  'sa.com', 'sbs', 'science', 'shop', 'site', 'skin', 'space', 'store', 'stream', 'su', 'surf',
+  'tech', 'tk', 'tokyo', 'top', 'trade',
+  'vip', 'vn',
+  'webcam', 'website', 'win',
+  'xyz',
+  'za.com'
+]);
+
+export const WHITELIST_MAIN_DOMAINS = new Set([
+  // 'w3s.link', // ipfs gateway
+  // 'dweb.link', // ipfs gateway
+  // 'nftstorage.link', // ipfs gateway
+  'fleek.cool', // ipfs gateway
+  'flk-ipfs.xyz', // ipfs gateway
+  'business.site', // Drag'n'Drop site building platform
+  'page.link', // Firebase URL Shortener
+  // 'notion.site',
+  // 'vercel.app',
+  'gitbook.io',
+  'zendesk.com',
+  'ipfs.eth.aragon.network',
+  'wordpress.com'
+]);
+
+export const leathalKeywords = createKeywordFilter([
+  'vinted-',
+  'inpost-pl',
+  'vlnted-',
+  'allegrolokalnie',
+  'thetollroads'
+]);
+
+export const sensitiveKeywords = createKeywordFilter([
+  '.amazon-',
+  '-amazon',
+  'fb-com',
+  'facebook-com',
+  '-facebook',
+  'facebook-',
+  'focebaak',
+  '.facebook.',
+  'metamask',
+  'www.apple',
+  '-coinbase',
+  'coinbase-',
+  'booking-com',
+  'booking.com-',
+  'booking-eu',
+  'vinted-',
+  'inpost-pl',
+  'login.microsoft',
+  'login-microsoft',
+  'microsoftonline',
+  'google.com-',
+  'minecraft',
+  'staemco',
+  'oferta'
+]);
+
+export const fakeTldKeywords = createKeywordFilter([
+  '.pl-',
+  '.com-',
+  '.net-'
+]);
+
+export const lowKeywords = createKeywordFilter([
+  'transactions-',
+  'payment',
+  'wallet',
+  '-transactions',
+  '-faceb', // facebook fake
+  '.faceb', // facebook fake
+  'facebook',
+  'virus-',
+  'icloud-',
+  'apple-',
+  '-roblox',
+  '-co-jp',
+  'customer.',
+  'customer-',
+  '.www-',
+  '.www.',
+  '.www2',
+  'instagram',
+  'microsof',
+  'passwordreset',
+  '.google-',
+  'recover',
+  'banking',
+  'shop'
+]);

+ 182 - 263
Build/lib/get-phishing-domains.ts

@@ -1,226 +1,190 @@
+import Worktank from 'worktank';
+
 import { processHostsWithPreload } from './parse-filter/hosts';
 import { processDomainListsWithPreload } from './parse-filter/domainlists';
-
-import * as tldts from 'tldts-experimental';
-
 import { dummySpan, printTraceResult } from '../trace';
 import type { Span } from '../trace';
 import { appendArrayInPlaceCurried } from './append-array-in-place';
-import { DEBUG_DOMAIN_TO_FIND, PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } from '../constants/reject-data-source';
-import { loosTldOptWithPrivateDomains } from '../constants/loose-tldts-opt';
-import picocolors from 'picocolors';
-import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie';
-import { deserializeArray, serializeArray } from './cache-filesystem';
-import { cache } from './fs-memo';
-import { isCI } from 'ci-info';
-
-const BLACK_TLD = new Set([
-  'accountant', 'art', 'autos',
-  'bar', 'beauty', 'bid', 'bio', 'biz', 'bond', 'business', 'buzz',
-  'cc', 'cf', 'cfd', 'click', 'cloud', 'club', 'cn', 'codes',
-  'co.uk', 'co.in', 'com.br', 'com.cn', 'com.pl', 'com.vn',
-  'cool', 'cricket', 'cyou',
-  'date', 'design', 'digital', 'download',
-  'faith', 'fit', 'fun',
-  'ga', 'gd', 'gives', 'gq', 'group', 'host',
-  'icu', 'id', 'info', 'ink',
-  'lat', 'life', 'live', 'link', 'loan', 'lol', 'ltd',
-  'me', 'men', 'ml', 'mobi', 'mom', 'monster',
-  'net.pl',
-  'one', 'online',
-  'party', 'pro', 'pl', 'pw',
-  'racing', 'rest', 'review', 'rf.gd',
-  'sa.com', 'sbs', 'science', 'shop', 'site', 'skin', 'space', 'store', 'stream', 'su', 'surf',
-  'tech', 'tk', 'tokyo', 'top', 'trade',
-  'vip', 'vn',
-  'webcam', 'website', 'win',
-  'xyz',
-  'za.com'
-]);
-
-const WHITELIST_MAIN_DOMAINS = new Set([
-  // 'w3s.link', // ipfs gateway
-  // 'dweb.link', // ipfs gateway
-  // 'nftstorage.link', // ipfs gateway
-  'fleek.cool', // ipfs gateway
-  'flk-ipfs.xyz', // ipfs gateway
-  'business.site', // Drag'n'Drop site building platform
-  'page.link', // Firebase URL Shortener
-  // 'notion.site',
-  // 'vercel.app',
-  'gitbook.io',
-  'zendesk.com',
-  'ipfs.eth.aragon.network',
-  'wordpress.com'
-]);
-
-const leathalKeywords = createKeywordFilter([
-  'vinted-',
-  'inpost-pl',
-  'vlnted-'
-]);
-
-const sensitiveKeywords = createKeywordFilter([
-  '.amazon-',
-  '-amazon',
-  'fb-com',
-  'facebook-com',
-  '-facebook',
-  'facebook-',
-  'focebaak',
-  '.facebook.',
-  'metamask',
-  'www.apple',
-  '-coinbase',
-  'coinbase-',
-  'booking-com',
-  'booking.com-',
-  'booking-eu',
-  'vinted-',
-  'inpost-pl',
-  'login.microsoft',
-  'login-microsoft',
-  'microsoftonline',
-  'google.com-',
-  'minecraft',
-  'staemco',
-  'oferta',
-  'allegrolokalnie',
-  'thetollroads'
-]);
-const fakeTldKeywords = createKeywordFilter([
-  '.pl-',
-  '.com-',
-  '.net-'
-]);
-const lowKeywords = createKeywordFilter([
-  'transactions-',
-  'payment',
-  'wallet',
-  '-transactions',
-  '-faceb', // facebook fake
-  '.faceb', // facebook fake
-  'facebook',
-  'virus-',
-  'icloud-',
-  'apple-',
-  '-roblox',
-  '-co-jp',
-  'customer.',
-  'customer-',
-  '.www-',
-  '.www.',
-  '.www2',
-  'instagram',
-  'microsof',
-  'passwordreset',
-  '.google-',
-  'recover',
-  'banking',
-  'shop'
-]);
-
-const processPhihsingDomains = cache(function processPhihsingDomains(domainArr: string[]): string[] {
-  const domainCountMap = new Map<string, number>();
-  const domainScoreMap: Record<string, number> = {};
-
-  let line = '';
-  let tld: string | null = '';
-  let apexDomain: string | null = '';
-  let subdomain: string | null = '';
-
-  // const set = new Set<string>();
-  // let duplicateCount = 0;
-
-  for (let i = 0, len = domainArr.length; i < len; i++) {
-    line = domainArr[i];
-
-    // if (set.has(line)) {
-    //   duplicateCount++;
-    // } else {
-    //   set.add(line);
-    // }
-
-    const parsed = tldts.parse(line, loosTldOptWithPrivateDomains);
-    if (parsed.isPrivate) {
-      continue;
-    }
+import { PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } from '../constants/reject-data-source';
 
-    tld = parsed.publicSuffix;
-    apexDomain = parsed.domain;
+const downloads = [
+  ...PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainListsWithPreload(...entry)),
+  ...PHISHING_HOSTS_EXTRA.map(entry => processHostsWithPreload(...entry))
+];
 
-    if (!tld) {
-      console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld });
-      continue;
-    }
-    if (!apexDomain) {
-      console.log(picocolors.yellow('[phishing domains] E0002'), 'missing domain', { line, apexDomain });
-      continue;
-    }
+const pool = new Worktank({
+  name: 'process-phishing-domains',
+  size: 1,
+  timeout: 10000, // Maximum number of milliseconds to wait for a result from the worker; if exceeded, the worker is terminated and the execution promise rejects
+  warmup: true,
+  autoterminate: 30000, // Interval in milliseconds at which to check whether the pool can be automatically terminated to free up resources; workers are spawned again if needed
+  env: {},
+  methods: {
+    // eslint-disable-next-line object-shorthand -- worktank
+    processPhihsingDomains: async function (
+      domainArr: string[],
+      importMetaUrl: string,
+      /** require.main === module */ isDebug = false
+    ): Promise<string[]> {
+      // TODO: createRequire is a temporary workaround for https://github.com/nodejs/node/issues/51956
+      const { default: module } = await import('node:module');
+      const __require = module.createRequire(importMetaUrl);
+
+      const picocolors = __require('picocolors') as typeof import('picocolors');
+      const tldts = __require('tldts-experimental') as typeof import('tldts-experimental');
+
+      const { loosTldOptWithPrivateDomains } = __require('../constants/loose-tldts-opt') as typeof import('../constants/loose-tldts-opt');
+      const { BLACK_TLD, WHITELIST_MAIN_DOMAINS, leathalKeywords, lowKeywords, fakeTldKeywords, sensitiveKeywords } = __require('../constants/phishing-score-source') as typeof import('../constants/phishing-score-source');
+
+      const domainCountMap = new Map<string, number>();
+      const domainScoreMap: Record<string, number> = {};
+
+      let line = '';
+      let tld: string | null = '';
+      let apexDomain: string | null = '';
+      let subdomain: string | null = '';
+
+      // const set = new Set<string>();
+      // let duplicateCount = 0;
+
+      for (let i = 0, len = domainArr.length; i < len; i++) {
+        line = domainArr[i];
+
+        // if (set.has(line)) {
+        //   duplicateCount++;
+        // } else {
+        //   set.add(line);
+        // }
+
+        const parsed = tldts.parse(line, loosTldOptWithPrivateDomains);
+        if (parsed.isPrivate) {
+          continue;
+        }
 
-    domainCountMap.set(
-      apexDomain,
-      domainCountMap.has(apexDomain)
-        ? domainCountMap.get(apexDomain)! + 1
-        : 1
-    );
+        tld = parsed.publicSuffix;
+        apexDomain = parsed.domain;
+
+        if (!tld) {
+          console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld });
+          continue;
+        }
+        if (!apexDomain) {
+          console.log(picocolors.yellow('[phishing domains] E0002'), 'missing domain', { line, apexDomain });
+          continue;
+        }
+
+        domainCountMap.set(
+          apexDomain,
+          domainCountMap.has(apexDomain)
+            ? domainCountMap.get(apexDomain)! + 1
+            : 1
+        );
+
+        if (!(apexDomain in domainScoreMap)) {
+          domainScoreMap[apexDomain] = 0;
+          if (BLACK_TLD.has(tld)) {
+            domainScoreMap[apexDomain] += 3;
+          } else if (tld.length > 6) {
+            domainScoreMap[apexDomain] += 2;
+          }
+          if (apexDomain.length >= 18) {
+            domainScoreMap[apexDomain] += 0.5;
+          }
+        }
 
-    if (!(apexDomain in domainScoreMap)) {
-      domainScoreMap[apexDomain] = 0;
-      if (BLACK_TLD.has(tld)) {
-        domainScoreMap[apexDomain] += 3;
-      } else if (tld.length > 6) {
-        domainScoreMap[apexDomain] += 2;
+        subdomain = parsed.subdomain;
+
+        if (
+          subdomain
+          && !WHITELIST_MAIN_DOMAINS.has(apexDomain)
+        ) {
+          domainScoreMap[apexDomain] += calcDomainAbuseScore(subdomain, line);
+        }
       }
-      if (apexDomain.length >= 18) {
-        domainScoreMap[apexDomain] += 0.5;
+
+      domainCountMap.forEach((count, apexDomain) => {
+        if (
+        // !WHITELIST_MAIN_DOMAINS.has(apexDomain)
+          (domainScoreMap[apexDomain] >= 24)
+          || (domainScoreMap[apexDomain] >= 16 && count >= 7)
+          || (domainScoreMap[apexDomain] >= 13 && count >= 11)
+          || (domainScoreMap[apexDomain] >= 5 && count >= 14)
+          || (domainScoreMap[apexDomain] >= 3 && count >= 21)
+          || (domainScoreMap[apexDomain] >= 1 && count >= 60)
+        ) {
+          domainArr.push('.' + apexDomain);
+        }
+      });
+
+      if (isDebug) {
+        console.log({
+          v: 1,
+          score: domainScoreMap['com-paytollbydv.world'],
+          count: domainCountMap.get('com-paytollbydv.worldx'),
+          domainArrLen: domainArr.length
+        });
       }
-    }
 
-    subdomain = parsed.subdomain;
+      return domainArr;
 
-    if (
-      subdomain
-      && !WHITELIST_MAIN_DOMAINS.has(apexDomain)
-    ) {
-      domainScoreMap[apexDomain] += calcDomainAbuseScore(subdomain, line);
-    }
-  }
+      function calcDomainAbuseScore(subdomain: string, fullDomain: string = subdomain) {
+        if (leathalKeywords(fullDomain)) {
+          return 100;
+        }
 
-  domainCountMap.forEach((count, apexDomain) => {
-    if (
-      // !WHITELIST_MAIN_DOMAINS.has(apexDomain)
-      (domainScoreMap[apexDomain] >= 24)
-      || (domainScoreMap[apexDomain] >= 16 && count >= 7)
-      || (domainScoreMap[apexDomain] >= 13 && count >= 11)
-      || (domainScoreMap[apexDomain] >= 5 && count >= 14)
-      || (domainScoreMap[apexDomain] >= 3 && count >= 21)
-      || (domainScoreMap[apexDomain] >= 1 && count >= 60)
-    ) {
-      domainArr.push('.' + apexDomain);
-    }
-  });
+        let weight = 0;
 
-  if (require.main === module) {
-    console.log({
-      v: 1,
-      score: domainScoreMap['com-paytollbydv.world'],
-      count: domainCountMap.get('com-paytollbydv.worldx'),
-      domainArrLen: domainArr.length
-    });
-  }
+        const hitLowKeywords = lowKeywords(fullDomain);
+        const sensitiveKeywordsHit = sensitiveKeywords(fullDomain);
+        const fakeTldKeywordsHit = fakeTldKeywords(fullDomain);
 
-  return domainArr;
-}, {
-  serializer: serializeArray,
-  deserializer: deserializeArray,
-  temporaryBypass: !isCI || DEBUG_DOMAIN_TO_FIND !== null
-});
+        if (sensitiveKeywordsHit) {
+          weight += 15;
+          if (hitLowKeywords) {
+            weight += 10;
+            if (fakeTldKeywordsHit) {
+              weight += 8;
+            }
+          }
+          // in addition to the bonus for a low-keyword hit, always add extra for a fake-TLD hit here
+          if (fakeTldKeywordsHit) {
+            weight += 10;
+          }
+        } else if (hitLowKeywords) {
+          weight += 1.8;
+          if (fakeTldKeywordsHit) {
+            weight += 5;
+          }
+        }
 
-const downloads = [
-  ...PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainListsWithPreload(...entry)),
-  ...PHISHING_HOSTS_EXTRA.map(entry => processHostsWithPreload(...entry))
-];
+        const subdomainLength = subdomain.length;
+
+        if (subdomainLength > 6) {
+          weight += 0.015;
+
+          if (subdomainLength > 13) {
+            weight += 0.2;
+            if (subdomainLength > 20) {
+              weight += 1;
+              if (subdomainLength > 30) {
+                weight += 5;
+                if (subdomainLength > 40) {
+                  weight += 10;
+                }
+              }
+            }
+
+            if (subdomain.indexOf('.', 1) > 1) {
+              weight += 1;
+            }
+          }
+        }
+
+        return weight;
+      }
+    }
+  }
+});
 
 export function getPhishingDomains(parentSpan: Span) {
   return parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => {
@@ -235,67 +199,22 @@ export function getPhishingDomains(parentSpan: Span) {
 
     return span.traceChildAsync(
       'process phishing domain set',
-      () => processPhihsingDomains(domainArr)
+      async () => {
+        const phishingDomains = await pool.exec(
+          'processPhihsingDomains',
+          [
+            domainArr,
+            import.meta.url,
+            require.main === module
+          ]
+        );
+        pool.terminate();
+        return phishingDomains;
+      }
     );
   });
 }
 
-export function calcDomainAbuseScore(subdomain: string, fullDomain: string = subdomain) {
-  if (leathalKeywords(fullDomain)) {
-    return 100;
-  }
-
-  let weight = 0;
-
-  const hitLowKeywords = lowKeywords(fullDomain);
-  const sensitiveKeywordsHit = sensitiveKeywords(fullDomain);
-  const fakeTldKeywordsHit = fakeTldKeywords(fullDomain);
-
-  if (sensitiveKeywordsHit) {
-    weight += 15;
-    if (hitLowKeywords) {
-      weight += 10;
-      if (fakeTldKeywordsHit) {
-        weight += 8;
-      }
-    }
-    // besides add for low hit, always add extra here
-    if (fakeTldKeywordsHit) {
-      weight += 10;
-    }
-  } else if (hitLowKeywords) {
-    weight += 1.8;
-    if (fakeTldKeywordsHit) {
-      weight += 5;
-    }
-  }
-
-  const subdomainLength = subdomain.length;
-
-  if (subdomainLength > 6) {
-    weight += 0.015;
-
-    if (subdomainLength > 13) {
-      weight += 0.2;
-      if (subdomainLength > 20) {
-        weight += 1;
-        if (subdomainLength > 30) {
-          weight += 5;
-          if (subdomainLength > 40) {
-            weight += 10;
-          }
-        }
-      }
-
-      if (subdomain.indexOf('.', 1) > 1) {
-        weight += 1;
-      }
-    }
-  }
-
-  return weight;
-}
-
 if (require.main === module) {
   getPhishingDomains(dummySpan)
     .catch(console.error)