ソースを参照

Feat: update phishing domain threshold

SukkaW 1 年前
コミット
b5a6e05a84

+ 0 - 1
Build/build-reject-domainset.ts

@@ -11,7 +11,6 @@ import createKeywordFilter from './lib/aho-corasick';
 import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
 import { sortDomains } from './lib/stable-sort-domain';
 import { task } from './trace';
-import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix';
 import * as tldts from 'tldts';
 import { SHARED_DESCRIPTION } from './lib/constants';
 import { getPhishingDomains } from './lib/get-phishing-domains';

+ 10 - 0
Build/lib/get-phishing-domains.test.ts

@@ -0,0 +1,10 @@
+// eslint-disable-next-line import-x/no-unresolved -- bun
+import { describe, expect, it } from 'bun:test';
+
+import { calcDomainAbuseScore } from './get-phishing-domains';
+/*
+ * Smoke test for calcDomainAbuseScore: runs it on one long dot-prefixed
+ * nmdj.pl entry and prints the resulting score.
+ * NOTE(review): the suite label below reads sortDomains although the case
+ * exercises calcDomainAbuseScore, and the case only logs the value without
+ * asserting on it, so it cannot fail on a wrong score. Confirm whether a
+ * pinned expectation was intended.
+ */
+describe('sortDomains', () => {
+  it('nmdj.pl', () => {
+    console.log(calcDomainAbuseScore('.01462ccca801fed55370d79231c876e5.nmdj.pl'));
+  });
+});

+ 82 - 62
Build/lib/get-phishing-domains.ts

@@ -103,21 +103,23 @@ const BLACK_TLD = new Set([
 ]);
 
 export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => {
-  const [domainSet, domainSet2, gorhill] = await Promise.all([
-    processDomainLists(span, 'https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt', true, TTL.THREE_HOURS()),
-    isCI
-      ? processDomainLists(span, 'https://phishing.army/download/phishing_army_blocklist.txt', true, TTL.THREE_HOURS())
-      : null,
-    getGorhillPublicSuffixPromise()
-  ]);
-  if (domainSet2) {
+  const gorhill = await getGorhillPublicSuffixPromise();
+
+  const domainSet = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => {
+    const [domainSet, domainSet2] = await Promise.all([
+      processDomainLists(curSpan, 'https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt', true, TTL.THREE_HOURS()),
+      processDomainLists(curSpan, 'https://phishing.army/download/phishing_army_blocklist.txt', true, TTL.THREE_HOURS())
+    ]);
+
     SetAdd(domainSet, domainSet2);
-  }
 
-  span.traceChildSync('whitelisting phishing domains', (parentSpan) => {
-    const trieForRemovingWhiteListed = parentSpan.traceChildSync('create trie for whitelisting', () => createTrie(domainSet));
+    return domainSet;
+  });
 
-    return parentSpan.traceChild('delete whitelisted from domainset').traceSyncFn(() => {
+  span.traceChildSync('whitelisting phishing domains', (curSpan) => {
+    const trieForRemovingWhiteListed = curSpan.traceChildSync('create trie for whitelisting', () => createTrie(domainSet));
+
+    return curSpan.traceChild('delete whitelisted from domainset').traceSyncFn(() => {
       for (let i = 0, len = WHITELIST_DOMAIN.length; i < len; i++) {
         const white = WHITELIST_DOMAIN[i];
         domainSet.delete(white);
@@ -134,68 +136,28 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
     const domainArr = Array.from(domainSet);
 
     for (let i = 0, len = domainArr.length; i < len; i++) {
-      const line = processLine(domainArr[i]);
-      if (!line) continue;
-
-      const apexDomain = gorhill.getDomain(line);
-      if (!apexDomain) continue;
-
-      domainCountMap[apexDomain] ||= 0;
+      const line = domainArr[i];
 
-      const isPhishingDomainMockingCoJp = line.includes('-co-jp');
-      if (isPhishingDomainMockingCoJp) {
-        domainCountMap[apexDomain] += 0.5;
-      }
+      const safeGorhillLine = line[0] === '.' ? line.slice(1) : line;
 
-      if (line.startsWith('.amaz')) {
-        domainCountMap[apexDomain] += 0.5;
-
-        if (line.startsWith('.amazon-')) {
-          domainCountMap[apexDomain] += 4.5;
-        }
-        if (isPhishingDomainMockingCoJp) {
-          domainCountMap[apexDomain] += 4;
-        }
-      } else if (line.startsWith('.customer')) {
-        domainCountMap[apexDomain] += 0.25;
+      const apexDomain = gorhill.getDomain(safeGorhillLine);
+      if (!apexDomain) {
+        console.log({ line });
+        continue;
       }
 
-      const tld = gorhill.getPublicSuffix(line[0] === '.' ? line.slice(1) : line);
+      const tld = gorhill.getPublicSuffix(safeGorhillLine);
       if (!tld || !BLACK_TLD.has(tld)) continue;
 
-      // Only when tld is black will this 1 weight be added
-      domainCountMap[apexDomain] += 1;
-
-      const lineLen = line.length;
-
-      if (lineLen > 19) {
-        // Add more weight if the domain is long enough
-        if (lineLen > 44) {
-          domainCountMap[apexDomain] += 3.5;
-        } else if (lineLen > 34) {
-          domainCountMap[apexDomain] += 2.5;
-        } else if (lineLen > 29) {
-          domainCountMap[apexDomain] += 1.5;
-        } else if (lineLen > 24) {
-          domainCountMap[apexDomain] += 0.75;
-        } else {
-          domainCountMap[apexDomain] += 0.25;
-        }
-
-        if (domainCountMap[apexDomain] < 5) {
-          const subdomain = tldts.getSubdomain(line, { detectIp: false });
-          if (subdomain?.includes('.')) {
-            domainCountMap[apexDomain] += 1.5;
-          }
-        }
-      }
+      domainCountMap[apexDomain] ||= 0;
+      domainCountMap[apexDomain] += calcDomainAbuseScore(line);
     }
   });
 
   const results = span.traceChildSync('get final phishing results', () => {
     const res: string[] = [];
     for (const domain in domainCountMap) {
-      if (domainCountMap[domain] >= 5) {
+      if (domainCountMap[domain] >= 8) {
         res.push(`.${domain}`);
       }
     }
@@ -204,3 +166,61 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
 
   return [results, domainSet] as const;
 });
+/**
+ * Computes a heuristic abuse score for a single phishing-list entry.
+ *
+ * The score starts at 1 and only ever grows: brand-mocking markers in the
+ * name, the overall string length, and the shape of the subdomain reported
+ * by tldts each add a fixed increment. In the caller visible in this change
+ * the scores are summed per apex domain and the apex is kept once the sum
+ * reaches 8.
+ *
+ * @param line - Entry to score. The visible caller passes entries that may
+ *   begin with a leading dot, and the amaz / amazon- prefix checks only match
+ *   entries that do.
+ * @returns The accumulated weight, never less than 1.
+ */
+export function calcDomainAbuseScore(line: string) {
+  let weight = 1; // baseline contribution of every entry
+
+  const isPhishingDomainMockingCoJp = line.includes('-co-jp'); // marker may appear anywhere in the entry
+  if (isPhishingDomainMockingCoJp) {
+    weight += 0.5;
+  }
+
+  if (line.startsWith('.amaz')) {
+    weight += 0.5;
+
+    if (line.startsWith('.amazon-')) {
+      weight += 4.5; // stacks with the 0.5 above
+    }
+    if (isPhishingDomainMockingCoJp) {
+      weight += 4; // extra penalty when both the amaz prefix and the co-jp marker are present
+    }
+  } else if (line.includes('.customer')) { // substring match, not limited to the start of the entry
+    weight += 0.25;
+  }
+
+  const lineLen = line.length; // full string length, including a leading dot if present
+
+  if (lineLen > 19) {
+    // Longer entries score higher; exactly one tier applies, checked from longest to shortest
+    if (lineLen > 44) {
+      weight += 3.5;
+    } else if (lineLen > 34) {
+      weight += 2.5;
+    } else if (lineLen > 29) {
+      weight += 1.5;
+    } else if (lineLen > 24) {
+      weight += 0.75;
+    } else {
+      weight += 0.25; // lengths 20 through 24
+    }
+  }
+
+  const subdomain = tldts.getSubdomain(line, { detectIp: false }); // NOTE(review): line is passed as-is, possibly dot-prefixed; confirm how tldts treats that
+
+  if (subdomain) {
+    if (subdomain.slice(1).includes('.')) { // NOTE(review): first character is skipped before the dot search, presumably to ignore a leading dot; confirm
+      weight += 1;
+    }
+    if (subdomain.length > 40) { // subdomain length tiers are independent of the whole-entry tiers above
+      weight += 3;
+    } else if (subdomain.length > 30) {
+      weight += 1.5;
+    } else if (subdomain.length > 20) {
+      weight += 1;
+    } else if (subdomain.length > 10) {
+      weight += 0.1;
+    }
+  }
+
+  return weight;
+}

+ 3 - 0
Source/domainset/reject_sukka.conf

@@ -302,6 +302,7 @@ inst.360safe.com
 .pages.net.br
 .myenotice.com
 .eu5.net
+.jdie.pl
 
 # --- AD Block ---
 
@@ -733,6 +734,8 @@ comments.gazo.space
 .footprintdns.com
 .measure.office.com
 
+.opinionjet.com
+
 # >> Tracking
 .mktg.tags.f5.com
 .trk.caseads.com