ソースを参照

Perf: cache phishing hosts process

SukkaW 1 年前
コミット
deadf16ae8
4 ファイル変更、89 行追加、51 行削除
  1. 11 2
      Build/lib/cache-filesystem.ts
  2. 69 49
      Build/lib/get-phishing-domains.ts
  3. 1 0
      package.json
  4. 8 0
      pnpm-lock.yaml

+ 11 - 2
Build/lib/cache-filesystem.ts

@@ -29,7 +29,8 @@ export interface CacheOptions<S = string> {
 
 interface CacheApplyRawOption {
   ttl?: number | null, // entry lifetime; updateTtl() adds it to Date.now(), so it is a duration in milliseconds
-  temporaryBypass?: boolean
+  temporaryBypass?: boolean, // when true, apply() skips the cache and returns fn() directly
+  incrementTtlWhenHit?: boolean // when true, a cache hit in apply() refreshes the entry's expiry via updateTtl()
 }
 
 interface CacheApplyNonRawOption<T, S> extends CacheApplyRawOption {
@@ -158,6 +159,10 @@ export class Cache<S = string> {
     return rv ? (rv.ttl > now ? CacheStatus.Hit : CacheStatus.Stale) : CacheStatus.Miss;
   }
 
+  private updateTtl(key: string, ttl: number): void { // overwrite the stored expiry of `key` with Date.now() + ttl (ttl is therefore a millisecond duration)
+    this.db.prepare(`UPDATE ${this.tableName} SET ttl = ? WHERE key = ?;`).run(Date.now() + ttl, key);
+  }
+
   del(key: string): void { // delete the row stored under `key` from this cache's table
     this.db.prepare(`DELETE FROM ${this.tableName} WHERE key = ?`).run(key);
   }
@@ -167,7 +172,7 @@ export class Cache<S = string> {
     fn: () => Promise<T>,
     opt: CacheApplyOption<T, S>
   ): Promise<T> {
-    const { ttl, temporaryBypass } = opt;
+    const { ttl, temporaryBypass, incrementTtlWhenHit } = opt;
 
     if (temporaryBypass) {
       return fn();
@@ -193,6 +198,10 @@ export class Cache<S = string> {
 
     console.log(picocolors.green('[cache] hit'), picocolors.gray(key));
 
+    if (incrementTtlWhenHit) {
+      this.updateTtl(key, ttl);
+    }
+
     const deserializer = 'deserializer' in opt ? opt.deserializer : identity;
     return deserializer(cached);
   }

+ 69 - 49
Build/lib/get-phishing-domains.ts

@@ -7,7 +7,10 @@ import { PHISHING_DOMAIN_LISTS_EXTRA } from '../constants/reject-data-source';
 import { loosTldOptWithPrivateDomains } from '../constants/loose-tldts-opt';
 import picocolors from 'picocolors';
 import createKeywordFilter from './aho-corasick';
-import { createCacheKey } from './cache-filesystem';
+import { createCacheKey, deserializeArray, fsFetchCache, serializeArray } from './cache-filesystem';
+import { fastStringArrayJoin } from './misc';
+
+import { sha256 } from 'hash-wasm';
 
 const BLACK_TLD = new Set([
   'accountant',
@@ -158,65 +161,82 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
     return domainArr;
   });
 
-  const domainCountMap: Record<string, number> = {};
-  const domainScoreMap: Record<string, number> = {};
+  return span.traceChildAsync(
+    'process phishing domain set',
+    () => processPhihsingDomains(domainArr)
+  );
+});
 
-  span.traceChildSync('process phishing domain set', () => {
-    for (let i = 0, len = domainArr.length; i < len; i++) {
-      const line = domainArr[i];
+/**
+ * Scores the collected phishing hostnames per apex domain and appends
+ * `.${apexDomain}` entries for apex domains that look abusive enough.
+ *
+ * The result is cached through `fsFetchCache`, keyed by the SHA-256 of the
+ * joined input, so an unchanged input skips the tldts parsing and scoring.
+ * On a cache hit the deserialized cached array is returned and the entry's
+ * expiry is refreshed (`incrementTtlWhenHit`). On a cache miss `domainArr`
+ * itself is mutated in place and returned.
+ *
+ * @param domainArr hostnames gathered from the phishing lists
+ * @returns the input hostnames plus the appended `.${apexDomain}` entries
+ */
+async function processPhihsingDomains(domainArr: string[]) {
+  // The hash identifies this exact input; any change in the list yields a new cache key.
+  const hash = await sha256(fastStringArrayJoin(domainArr, '|'));
+  return fsFetchCache.apply(
+    cacheKey('processPhihsingDomains|' + hash),
+    () => {
+      const domainCountMap: Record<string, number> = {};
+      const domainScoreMap: Record<string, number> = {};
 
-      const {
-        publicSuffix: tld,
-        domain: apexDomain,
-        subdomain,
-        isPrivate
-      } = tldts.parse(line, loosTldOptWithPrivateDomains);
+      for (let i = 0, len = domainArr.length; i < len; i++) {
+        const line = domainArr[i];
 
-      if (isPrivate) {
-        continue;
-      }
+        const {
+          publicSuffix: tld,
+          domain: apexDomain,
+          subdomain,
+          isPrivate
+        } = tldts.parse(line, loosTldOptWithPrivateDomains);
 
-      if (!tld) {
-        console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld });
-        continue;
-      }
-      if (!apexDomain) {
-        console.log(picocolors.yellow('[phishing domains] E0002'), 'missing domain', { line, apexDomain });
-        continue;
-      }
+        if (isPrivate) {
+          continue;
+        }
 
-      domainCountMap[apexDomain] ||= 0;
-      domainCountMap[apexDomain] += 1;
+        if (!tld) {
+          console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld });
+          continue;
+        }
+        if (!apexDomain) {
+          console.log(picocolors.yellow('[phishing domains] E0002'), 'missing domain', { line, apexDomain });
+          continue;
+        }
 
-      if (!(apexDomain in domainScoreMap)) {
-        domainScoreMap[apexDomain] = 0;
-        if (BLACK_TLD.has(tld)) {
-          domainScoreMap[apexDomain] += 4;
-        } else if (tld.length > 6) {
-          domainScoreMap[apexDomain] += 2;
+        domainCountMap[apexDomain] ||= 0;
+        domainCountMap[apexDomain] += 1;
+
+        // The TLD-based base score is assigned once, the first time an apex domain is seen.
+        if (!(apexDomain in domainScoreMap)) {
+          domainScoreMap[apexDomain] = 0;
+          if (BLACK_TLD.has(tld)) {
+            domainScoreMap[apexDomain] += 4;
+          } else if (tld.length > 6) {
+            domainScoreMap[apexDomain] += 2;
+          }
+        }
+        if (
+          subdomain
+          && !WHITELIST_MAIN_DOMAINS.has(apexDomain)
+        ) {
+          domainScoreMap[apexDomain] += calcDomainAbuseScore(subdomain);
         }
       }
-      if (
-        subdomain
-        && !WHITELIST_MAIN_DOMAINS.has(apexDomain)
-      ) {
-        domainScoreMap[apexDomain] += calcDomainAbuseScore(subdomain);
+
+      for (const apexDomain in domainCountMap) {
+        if (
+          // !WHITELIST_MAIN_DOMAINS.has(apexDomain)
+          domainScoreMap[apexDomain] >= 12
+          || (domainScoreMap[apexDomain] >= 5 && domainCountMap[apexDomain] >= 4)
+        ) {
+          domainArr.push(`.${apexDomain}`);
+        }
       }
-    }
-  });
 
-  for (const apexDomain in domainCountMap) {
-    if (
-      // !WHITELIST_MAIN_DOMAINS.has(apexDomain)
-      domainScoreMap[apexDomain] >= 12
-      || (domainScoreMap[apexDomain] >= 5 && domainCountMap[apexDomain] >= 4)
-    ) {
-      domainArr.push(`.${apexDomain}`);
+      return Promise.resolve(domainArr);
+    },
+    {
+      // The cache adds ttl to Date.now() (milliseconds), so two days must be
+      // given in milliseconds; `2 * 86400` alone would expire after ~173 seconds.
+      ttl: 2 * 86400 * 1000,
+      serializer: serializeArray,
+      deserializer: deserializeArray,
+      incrementTtlWhenHit: true
     }
-  }
-
-  return domainArr;
-});
+  );
+}
 
 export function calcDomainAbuseScore(subdomain: string) {
   let weight = 0;

+ 1 - 0
package.json

@@ -30,6 +30,7 @@
     "fast-cidr-tools": "^0.2.5",
     "fdir": "^6.3.0",
     "foxact": "^0.2.38",
+    "hash-wasm": "^4.11.0",
     "json-stringify-pretty-compact": "^3.0.0",
     "mnemonist": "^0.39.8",
     "picocolors": "^1.1.0",

+ 8 - 0
pnpm-lock.yaml

@@ -41,6 +41,9 @@ importers:
       foxact:
         specifier: ^0.2.38
         version: 0.2.38
+      hash-wasm:
+        specifier: ^4.11.0
+        version: 4.11.0
       json-stringify-pretty-compact:
         specifier: ^3.0.0
         version: 3.0.0
@@ -996,6 +999,9 @@ packages:
     resolution: {integrity: sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==}
     engines: {node: '>=8'}
 
+  hash-wasm@4.11.0:
+    resolution: {integrity: sha512-HVusNXlVqHe0fzIzdQOGolnFN6mX/fqcrSAOcTBXdvzrXVHwTz11vXeKRmkR5gTuwVpvHZEIyKoePDvuAR+XwQ==}
+
   hasown@2.0.2:
     resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==}
     engines: {node: '>= 0.4'}
@@ -2519,6 +2525,8 @@ snapshots:
 
   has-flag@4.0.0: {}
 
+  hash-wasm@4.11.0: {}
+
   hasown@2.0.2:
     dependencies:
       function-bind: 1.1.2