Browse Source

Chore: improve phishing hosts

SukkaW 1 year ago
parent
commit
19228a8216
3 changed files with 59 additions and 43 deletions
  1. 17 11
      Build/lib/fs-memo.ts
  2. 41 31
      Build/lib/get-phishing-domains.ts
  3. 1 1
      Build/lib/misc.ts

+ 17 - 11
Build/lib/fs-memo.ts

@@ -6,7 +6,7 @@ import { isCI } from 'ci-info';
 import { xxhash64 } from 'hash-wasm';
 
 import picocolors from 'picocolors';
-import { identity } from './misc';
+import { fastStringArrayJoin, identity } from './misc';
 
 const fsMemoCache = new Cache({ cachePath: path.resolve(__dirname, '../../.cache'), tableName: 'fs_memo_cache' });
 
@@ -49,29 +49,35 @@ function createCache(onlyUseCachedIfFail: boolean) {
     fn: (...args: Args) => Promise<T>,
     opt: FsMemoCacheOptions<T>
   ): (...args: Args) => Promise<T> {
-    const fixedKey = fn.toString();
-
     if (opt.temporaryBypass) {
       return fn;
     }
 
+    const serializer = 'serializer' in opt ? opt.serializer : identity<T, string>;
+    const deserializer = 'deserializer' in opt ? opt.deserializer : identity<string, T>;
+
+    const fixedKey = fn.toString();
+
+    const fixedKeyHashPromise = xxhash64(fixedKey);
+    const devalueModulePromise = import('devalue');
+
     return async function cachedCb(...args: Args) {
-      const { stringify: devalueStringify } = await import('devalue');
+      const devalueStringify = (await devalueModulePromise).stringify;
 
       // Construct the complete cache key for this function invocation
       // typeson.stringify is still limited. For now we uses typescript to guard the args.
-      const cacheKey = (await Promise.all([
-        xxhash64(fixedKey),
-        xxhash64(devalueStringify(args))
-      ])).join('|');
+      const cacheKey = fastStringArrayJoin(
+        await Promise.all([
+          fixedKeyHashPromise,
+          xxhash64(devalueStringify(args))
+        ]),
+        '|'
+      );
 
       const cacheName = picocolors.gray(fn.name || fixedKey || cacheKey);
 
       const cached = fsMemoCache.get(cacheKey);
 
-      const serializer = 'serializer' in opt ? opt.serializer : identity as any;
-      const deserializer = 'deserializer' in opt ? opt.deserializer : identity as any;
-
       if (onlyUseCachedIfFail) {
         try {
           const value = await fn(...args);

+ 41 - 31
Build/lib/get-phishing-domains.ts

@@ -1,7 +1,7 @@
 import { processDomainLists, processHosts } from './parse-filter';
 import * as tldts from 'tldts-experimental';
 
-import { dummySpan } from '../trace';
+import { dummySpan, printTraceResult } from '../trace';
 import type { Span } from '../trace';
 import { appendArrayInPlaceCurried } from './append-array-in-place';
 import { PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } from '../constants/reject-data-source';
@@ -108,26 +108,25 @@ const lowKeywords = createKeywordFilter([
   'banking'
 ]);
 
-const cacheKey = createCacheKey(__filename);
-
 const processPhihsingDomains = cache(function processPhihsingDomains(domainArr: string[]): Promise<string[]> {
-  const domainCountMap: Record<string, number> = {};
+  const domainCountMap = new Map<string, number>();
   const domainScoreMap: Record<string, number> = {};
 
+  let tld: string | null = '';
+  let apexDomain: string | null = '';
+  let subdomain: string | null = '';
+
   for (let i = 0, len = domainArr.length; i < len; i++) {
     const line = domainArr[i];
 
-    const {
-      publicSuffix: tld,
-      domain: apexDomain,
-      subdomain,
-      isPrivate
-    } = tldts.parse(line, loosTldOptWithPrivateDomains);
-
-    if (isPrivate) {
+    const parsed = tldts.parse(line, loosTldOptWithPrivateDomains);
+    if (parsed.isPrivate) {
       continue;
     }
 
+    tld = parsed.publicSuffix;
+    apexDomain = parsed.domain;
+
     if (!tld) {
       console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld });
       continue;
@@ -137,8 +136,12 @@ const processPhihsingDomains = cache(function processPhihsingDomains(domainArr:
       continue;
     }
 
-    domainCountMap[apexDomain] ||= 0;
-    domainCountMap[apexDomain] += 1;
+    domainCountMap.set(
+      apexDomain,
+      domainCountMap.has(apexDomain)
+        ? domainCountMap.get(apexDomain)! + 1
+        : 1
+    );
 
     if (!(apexDomain in domainScoreMap)) {
       domainScoreMap[apexDomain] = 0;
@@ -151,6 +154,9 @@ const processPhihsingDomains = cache(function processPhihsingDomains(domainArr:
         domainScoreMap[apexDomain] += 0.5;
       }
     }
+
+    subdomain = parsed.subdomain;
+
     if (
       subdomain
       && !WHITELIST_MAIN_DOMAINS.has(apexDomain)
@@ -159,30 +165,33 @@ const processPhihsingDomains = cache(function processPhihsingDomains(domainArr:
     }
   }
 
-  for (const apexDomain in domainCountMap) {
+  domainCountMap.forEach((count, apexDomain) => {
     if (
       // !WHITELIST_MAIN_DOMAINS.has(apexDomain)
       (domainScoreMap[apexDomain] >= 24)
-      || (domainScoreMap[apexDomain] >= 16 && domainCountMap[apexDomain] >= 7)
-      || (domainScoreMap[apexDomain] >= 13 && domainCountMap[apexDomain] >= 11)
-      || (domainScoreMap[apexDomain] >= 5 && domainCountMap[apexDomain] >= 14)
-      || (domainScoreMap[apexDomain] >= 3 && domainCountMap[apexDomain] >= 21)
+      || (domainScoreMap[apexDomain] >= 16 && count >= 7)
+      || (domainScoreMap[apexDomain] >= 13 && count >= 11)
+      || (domainScoreMap[apexDomain] >= 5 && count >= 14)
+      || (domainScoreMap[apexDomain] >= 3 && count >= 21)
     ) {
       domainArr.push('.' + apexDomain);
     }
-  }
-
-  console.log({
-    score: domainScoreMap['flk-ipfs.xyz'],
-    count: domainCountMap['flk-ipfs.xyz']
   });
 
+  // console.log({
+  //   score: domainScoreMap['flk-ipfs.xyz'],
+  //   count: domainCountMap.get('flk-ipfs.xyz')
+  // });
+
   return Promise.resolve(domainArr);
 }, {
   serializer: serializeArray,
-  deserializer: deserializeArray
+  deserializer: deserializeArray,
+  temporaryBypass: true
 });
 
+const cacheKey = createCacheKey(__filename);
+
 export function getPhishingDomains(parentSpan: Span) {
   return parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => {
     const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => {
@@ -219,7 +228,7 @@ export function calcDomainAbuseScore(subdomain: string, fullDomain: string = sub
       weight += 6;
     }
   } else if (hitLowKeywords) {
-    weight += 1.5;
+    weight += 1.7;
   }
 
   const subdomainLength = subdomain.length;
@@ -236,11 +245,8 @@ export function calcDomainAbuseScore(subdomain: string, fullDomain: string = sub
       }
     }
 
-    if (subdomain.slice(1).includes('.')) {
+    if (subdomain.indexOf('.', 1) > 1) {
       weight += 1;
-      if (subdomain.includes('www.')) {
-        weight += 1;
-      }
     }
   }
 
@@ -249,5 +255,9 @@ export function calcDomainAbuseScore(subdomain: string, fullDomain: string = sub
 
 if (require.main === module) {
   getPhishingDomains(dummySpan)
-    .catch(console.error);
+    .catch(console.error)
+    .finally(() => {
+      dummySpan.stop();
+      printTraceResult(dummySpan.traceResult);
+    });
 }

+ 1 - 1
Build/lib/misc.ts

@@ -66,7 +66,7 @@ export function domainWildCardToRegex(domain: string) {
   return result;
 }
 
-export const identity = <T>(x: T): T => x;
+export const identity = <T, R = T>(x: T): R => x as any;
 
 export function appendArrayFromSet<T>(dest: T[], source: Set<T> | Array<Set<T>>, transformer: (item: T) => T = identity) {
   const casted = Array.isArray(source) ? source : [source];