ソースを参照

Minor changes to fs memo implementation / Adapt fs memo

SukkaW 1 年前
コミット
a8c9cc5ac5
3 ファイル変更、131 行追加、149 行削除
  1. 0 40
      Build/lib/cache-filesystem.ts
  2. 61 28
      Build/lib/fs-memo.ts
  3. 70 81
      Build/lib/get-phishing-domains.ts

+ 0 - 40
Build/lib/cache-filesystem.ts

@@ -28,7 +28,6 @@ export interface CacheOptions<S = string> {
 
 interface CacheApplyRawOption {
   ttl?: number | null,
-  cacheName?: string,
   temporaryBypass?: boolean,
   incrementTtlWhenHit?: boolean
 }
@@ -187,45 +186,6 @@ export class Cache<S = string> {
     this.db.prepare(`DELETE FROM ${this.tableName} WHERE key = ?`).run(key);
   }
 
-  async apply<T>(
-    key: string,
-    fn: () => Promise<T>,
-    opt: CacheApplyOption<T, S>
-  ): Promise<T> {
-    const { ttl, temporaryBypass, incrementTtlWhenHit, cacheName } = opt;
-
-    if (temporaryBypass) {
-      return fn();
-    }
-    if (ttl == null) {
-      this.del(key);
-      return fn();
-    }
-
-    const cached = this.get(key);
-    if (cached == null) {
-      console.log(picocolors.yellow('[cache] miss'), picocolors.gray(cacheName || key), picocolors.gray(`ttl: ${TTL.humanReadable(ttl)}`));
-
-      const serializer = 'serializer' in opt ? opt.serializer : identity as any;
-
-      const promise = fn();
-
-      return promise.then((value) => {
-        this.set(key, serializer(value), ttl);
-        return value;
-      });
-    }
-
-    console.log(picocolors.green('[cache] hit'), picocolors.gray(cacheName || key));
-
-    if (incrementTtlWhenHit) {
-      this.updateTtl(key, ttl);
-    }
-
-    const deserializer = 'deserializer' in opt ? opt.deserializer : identity as any;
-    return deserializer(cached);
-  }
-
   async applyWithHttp304<T>(
     url: string,
     extraCacheKey: string,

+ 61 - 28
Build/lib/fs-memo.ts

@@ -3,7 +3,17 @@ import { Cache } from './cache-filesystem';
 import type { CacheApplyOption } from './cache-filesystem';
 import { isCI } from 'ci-info';
 
-const fsMemoCache = new Cache({ cachePath: path.resolve(__dirname, '../../.cache') });
+import { Typeson, set, map, typedArrays } from 'typeson-registry';
+import picocolors from 'picocolors';
+import { identity } from './misc';
+
+const typeson = new Typeson().register([
+  typedArrays,
+  set,
+  map
+]);
+
+const fsMemoCache = new Cache({ cachePath: path.resolve(__dirname, '../../.cache'), tableName: 'fs_memo_cache' });
 
 const TTL = isCI
   // We run CI daily, so 1.5 days TTL is enough to persist the cache across runs
@@ -11,41 +21,64 @@ const TTL = isCI
   // We run locally less frequently, so we need to persist the cache for longer, 7 days
   : 7 * 86400 * 1000;
 
-  type JSONValue =
-    | string
-    | number
-    | boolean
-    | null
-    | JSONObject
-    | JSONArray;
+type TypesonValue =
+  | string
+  | number
+  | boolean
+  | null
+  | Set<any>
+  | Map<any, any>
+  | TypesonObject
+  | TypesonArray;
 
-interface JSONObject {
-  [key: string]: JSONValue
+interface TypesonObject {
+  [key: string]: TypesonValue
 }
 
-interface JSONArray extends Array<JSONValue> {}
+interface TypesonArray extends Array<TypesonValue> { }
+
+export type FsMemoCacheOptions<T> = CacheApplyOption<T, string> & {
+  ttl?: undefined | never
+};
 
-export function cache<Args extends JSONValue[], T>(
-  cb: (...args: Args) => Promise<T>,
-  opt: Omit<CacheApplyOption<T, string>, 'ttl'>
+export function cache<Args extends TypesonValue[], T>(
+  fn: (...args: Args) => Promise<T>,
+  opt: FsMemoCacheOptions<T>
 ): (...args: Args) => Promise<T> {
   // TODO if cb.toString() is long we should hash it
-  const fixedKey = cb.toString();
+  const fixedKey = fn.toString();
 
   return async function cachedCb(...args: Args) {
     // Construct the complete cache key for this function invocation
-    // TODO stringify is limited. For now we uses typescript to guard the args.
-    const cacheKey = `${fixedKey}|${JSON.stringify(args)}`;
-    const cacheName = cb.name || cacheKey;
-
-    return fsMemoCache.apply(
-      cacheKey,
-      cb,
-      {
-        cacheName,
-        ...opt,
-        ttl: TTL
-      } as CacheApplyOption<T, string>
-    );
+    // typeson.stringify is still limited. For now we use TypeScript to guard the args.
+    const cacheKey = `${fixedKey}|${typeson.stringifySync(args)}`;
+    const cacheName = fn.name || cacheKey;
+
+    const { temporaryBypass, incrementTtlWhenHit } = opt;
+
+    if (temporaryBypass) {
+      return fn(...args);
+    }
+
+    const cached = fsMemoCache.get(cacheKey);
+    if (cached == null) {
+      console.log(picocolors.yellow('[cache] miss'), picocolors.gray(cacheName || cacheKey));
+
+      const serializer = 'serializer' in opt ? opt.serializer : identity as any;
+
+      const value = await fn(...args);
+
+      fsMemoCache.set(cacheKey, serializer(value), TTL);
+      return value;
+    }
+
+    console.log(picocolors.green('[cache] hit'), picocolors.gray(cacheName || cacheKey));
+
+    if (incrementTtlWhenHit) {
+      fsMemoCache.updateTtl(cacheKey, TTL);
+    }
+
+    const deserializer = 'deserializer' in opt ? opt.deserializer : identity as any;
+    return deserializer(cached);
   };
 }

+ 70 - 81
Build/lib/get-phishing-domains.ts

@@ -8,9 +8,8 @@ import { PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } from '../constants/
 import { loosTldOptWithPrivateDomains } from '../constants/loose-tldts-opt';
 import picocolors from 'picocolors';
 import createKeywordFilter from './aho-corasick';
-import { createCacheKey, deserializeArray, fsFetchCache, serializeArray } from './cache-filesystem';
-import { fastStringArrayJoin } from './misc';
-import { stringHash } from './string-hash';
+import { createCacheKey, deserializeArray, serializeArray } from './cache-filesystem';
+import { cache } from './fs-memo';
 
 const BLACK_TLD = new Set([
   'accountant', 'art', 'autos',
@@ -102,6 +101,73 @@ const lowKeywords = createKeywordFilter([
 
 const cacheKey = createCacheKey(__filename);
 
+const processPhihsingDomains = cache(function processPhihsingDomains(domainArr: string[]): Promise<string[]> {
+  const domainCountMap: Record<string, number> = {};
+  const domainScoreMap: Record<string, number> = {};
+
+  for (let i = 0, len = domainArr.length; i < len; i++) {
+    const line = domainArr[i];
+
+    const {
+      publicSuffix: tld,
+      domain: apexDomain,
+      subdomain,
+      isPrivate
+    } = tldts.parse(line, loosTldOptWithPrivateDomains);
+
+    if (isPrivate) {
+      continue;
+    }
+
+    if (!tld) {
+      console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld });
+      continue;
+    }
+    if (!apexDomain) {
+      console.log(picocolors.yellow('[phishing domains] E0002'), 'missing domain', { line, apexDomain });
+      continue;
+    }
+
+    domainCountMap[apexDomain] ||= 0;
+    domainCountMap[apexDomain] += 1;
+
+    if (!(apexDomain in domainScoreMap)) {
+      domainScoreMap[apexDomain] = 0;
+      if (BLACK_TLD.has(tld)) {
+        domainScoreMap[apexDomain] += 4;
+      } else if (tld.length > 6) {
+        domainScoreMap[apexDomain] += 2;
+      }
+      if (apexDomain.length >= 18) {
+        domainScoreMap[apexDomain] += 0.5;
+      }
+    }
+    if (
+      subdomain
+      && !WHITELIST_MAIN_DOMAINS.has(apexDomain)
+    ) {
+      domainScoreMap[apexDomain] += calcDomainAbuseScore(subdomain, line);
+    }
+  }
+
+  for (const apexDomain in domainCountMap) {
+    if (
+      // !WHITELIST_MAIN_DOMAINS.has(apexDomain)
+      domainScoreMap[apexDomain] >= 16
+      || (domainScoreMap[apexDomain] >= 13 && domainCountMap[apexDomain] >= 7)
+      || (domainScoreMap[apexDomain] >= 5 && domainCountMap[apexDomain] >= 10)
+      || (domainScoreMap[apexDomain] >= 3 && domainCountMap[apexDomain] >= 16)
+    ) {
+      domainArr.push('.' + apexDomain);
+    }
+  }
+
+  return Promise.resolve(domainArr);
+}, {
+  serializer: serializeArray,
+  deserializer: deserializeArray
+});
+
 export function getPhishingDomains(parentSpan: Span) {
   return parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => {
     const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => {
@@ -115,90 +181,13 @@ export function getPhishingDomains(parentSpan: Span) {
       return domainArr;
     });
 
-    const cacheHash = span.traceChildSync('get hash', () => stringHash(fastStringArrayJoin(domainArr, '|')));
-
     return span.traceChildAsync(
       'process phishing domain set',
-      () => processPhihsingDomains(domainArr, cacheHash)
+      () => processPhihsingDomains(domainArr)
     );
   });
 }
 
-async function processPhihsingDomains(domainArr: string[], cacheHash = '') {
-  return fsFetchCache.apply(
-    cacheKey('processPhihsingDomains|' + cacheHash),
-    () => {
-      const domainCountMap: Record<string, number> = {};
-      const domainScoreMap: Record<string, number> = {};
-
-      for (let i = 0, len = domainArr.length; i < len; i++) {
-        const line = domainArr[i];
-
-        const {
-          publicSuffix: tld,
-          domain: apexDomain,
-          subdomain,
-          isPrivate
-        } = tldts.parse(line, loosTldOptWithPrivateDomains);
-
-        if (isPrivate) {
-          continue;
-        }
-
-        if (!tld) {
-          console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld });
-          continue;
-        }
-        if (!apexDomain) {
-          console.log(picocolors.yellow('[phishing domains] E0002'), 'missing domain', { line, apexDomain });
-          continue;
-        }
-
-        domainCountMap[apexDomain] ||= 0;
-        domainCountMap[apexDomain] += 1;
-
-        if (!(apexDomain in domainScoreMap)) {
-          domainScoreMap[apexDomain] = 0;
-          if (BLACK_TLD.has(tld)) {
-            domainScoreMap[apexDomain] += 4;
-          } else if (tld.length > 6) {
-            domainScoreMap[apexDomain] += 2;
-          }
-          if (apexDomain.length >= 18) {
-            domainScoreMap[apexDomain] += 0.5;
-          }
-        }
-        if (
-          subdomain
-          && !WHITELIST_MAIN_DOMAINS.has(apexDomain)
-        ) {
-          domainScoreMap[apexDomain] += calcDomainAbuseScore(subdomain, line);
-        }
-      }
-
-      for (const apexDomain in domainCountMap) {
-        if (
-          // !WHITELIST_MAIN_DOMAINS.has(apexDomain)
-          domainScoreMap[apexDomain] >= 16
-          || (domainScoreMap[apexDomain] >= 13 && domainCountMap[apexDomain] >= 7)
-          || (domainScoreMap[apexDomain] >= 5 && domainCountMap[apexDomain] >= 10)
-          || (domainScoreMap[apexDomain] >= 3 && domainCountMap[apexDomain] >= 16)
-        ) {
-          domainArr.push('.' + apexDomain);
-        }
-      }
-
-      return Promise.resolve(domainArr);
-    },
-    {
-      ttl: 2 * 86400 * 1000,
-      serializer: serializeArray,
-      deserializer: deserializeArray,
-      incrementTtlWhenHit: true
-    }
-  );
-}
-
 export function calcDomainAbuseScore(subdomain: string, fullDomain: string = subdomain) {
   let weight = 0;