浏览代码

Update Phishing Hosts building

SukkaW 1 年之前
父节点
当前提交
72670e6243
共有 3 个文件被更改,包括 21 次插入和 23 次删除
  1. +2 -3
      Build/build-reject-domainset.ts
  2. +6 -9
      Build/constants/reject-data-source.ts
  3. +13 -11
      Build/lib/get-phishing-domains.ts

+ 2 - 3
Build/build-reject-domainset.ts

@@ -4,7 +4,7 @@ import process from 'node:process';
 
 import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter';
 
-import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA, ADGUARD_FILTERS_WHITELIST } from './constants/reject-data-source';
+import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_WHITELIST } from './constants/reject-data-source';
 import { compareAndWriteFile } from './lib/create-file';
 import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
 import { task } from './trace';
@@ -47,8 +47,7 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as
       ...HOSTS_EXTRA.map(host => ` - ${host[0]}`),
       ...DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`),
       ...ADGUARD_FILTERS_EXTRA.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`),
-      ...PHISHING_DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`),
-      ...PHISHING_HOSTS_EXTRA.map(host => ` - ${host[0]}`)
+      ...PHISHING_DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`)
     ]);
 
   const appendArrayToRejectOutput = rejectOutput.addFromDomainset.bind(rejectOutput);

+ 6 - 9
Build/constants/reject-data-source.ts

@@ -22,7 +22,12 @@ export const HOSTS_EXTRA: HostsSource[] = [
   // Dan Pollock's hosts file, 0.0.0.0 version is 30 KiB smaller
   ['https://someonewhocares.org/hosts/zero/hosts', null, true, TTL.THREE_HOURS()],
   // ad-wars is not actively maintained, so we set a 7 days cache ttl
-  ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.TWO_WEEKS()]
+  ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.TWO_WEEKS()],
+  [
+    'https://raw.githubusercontent.com/durablenapkin/scamblocklist/master/hosts.txt',
+    [],
+    true, TTL.TWLVE_HOURS()
+  ]
 ];
 
 export const DOMAIN_LISTS: HostsSource[] = [
@@ -97,14 +102,6 @@ export const PHISHING_DOMAIN_LISTS_EXTRA: HostsSource[] = [
   ]
 ];
 
-export const PHISHING_HOSTS_EXTRA: HostsSource[] = [
-  [
-    'https://raw.githubusercontent.com/durablenapkin/scamblocklist/master/hosts.txt',
-    [],
-    true, TTL.TWLVE_HOURS()
-  ]
-];
-
 type AdGuardFilterSource = [main: string, mirrors: string[] | null, ttl: number, allowThirdParty?: boolean];
 
 export const ADGUARD_FILTERS: AdGuardFilterSource[] = [

+ 13 - 11
Build/lib/get-phishing-domains.ts

@@ -1,10 +1,10 @@
-import { processDomainLists, processHosts } from './parse-filter';
+import { processDomainLists } from './parse-filter';
 import * as tldts from 'tldts-experimental';
 
 import { dummySpan } from '../trace';
 import type { Span } from '../trace';
 import { appendArrayInPlaceCurried } from './append-array-in-place';
-import { PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } from '../constants/reject-data-source';
+import { PHISHING_DOMAIN_LISTS_EXTRA } from '../constants/reject-data-source';
 import { loosTldOptWithPrivateDomains } from '../constants/loose-tldts-opt';
 import picocolors from 'picocolors';
 import createKeywordFilter from './aho-corasick';
@@ -153,7 +153,8 @@ const processPhihsingDomains = cache(function processPhihsingDomains(domainArr:
   for (const apexDomain in domainCountMap) {
     if (
       // !WHITELIST_MAIN_DOMAINS.has(apexDomain)
-      domainScoreMap[apexDomain] >= 16
+      (domainScoreMap[apexDomain] >= 24)
+      || (domainScoreMap[apexDomain] >= 16 && domainCountMap[apexDomain] >= 4)
       || (domainScoreMap[apexDomain] >= 13 && domainCountMap[apexDomain] >= 7)
       || (domainScoreMap[apexDomain] >= 5 && domainCountMap[apexDomain] >= 10)
       || (domainScoreMap[apexDomain] >= 3 && domainCountMap[apexDomain] >= 16)
@@ -162,6 +163,11 @@ const processPhihsingDomains = cache(function processPhihsingDomains(domainArr:
     }
   }
 
+  // console.log(
+  //   domainScoreMap['wordpress.com'],
+  //   domainCountMap['wordpress.com']
+  // );
+
   return Promise.resolve(domainArr);
 }, {
   serializer: serializeArray,
@@ -175,8 +181,6 @@ export function getPhishingDomains(parentSpan: Span) {
 
       (await Promise.all(PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(curSpan, ...entry, cacheKey))))
         .forEach(appendArrayInPlaceCurried(domainArr));
-      (await Promise.all(PHISHING_HOSTS_EXTRA.map(entry => processHosts(curSpan, ...entry, cacheKey))))
-        .forEach(appendArrayInPlaceCurried(domainArr));
 
       return domainArr;
     });
@@ -205,9 +209,9 @@ export function calcDomainAbuseScore(subdomain: string, fullDomain: string = sub
 
   const subdomainLength = subdomain.length;
 
-  if (subdomainLength > 4) {
-    weight += 0.5;
-    if (subdomainLength > 10) {
+  if (subdomainLength > 6) {
+    weight += 0.25;
+    if (subdomainLength > 11) {
       weight += 0.6;
       if (subdomainLength > 20) {
         weight += 1;
@@ -220,9 +224,7 @@ export function calcDomainAbuseScore(subdomain: string, fullDomain: string = sub
       }
     }
 
-    if (subdomain.startsWith('www.')) {
-      weight += 1;
-    } else if (subdomain.slice(1).includes('.')) {
+    if (subdomain.slice(1).includes('.')) {
       weight += 1;
       if (subdomain.includes('www.')) {
         weight += 1;