get-phishing-domains.ts 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. import Worktank from 'worktank';
  2. import { dummySpan, printTraceResult } from '../trace';
  3. import type { Span } from '../trace';
  4. import type { TldTsParsed } from './normalize-domain';
  5. const pool = new Worktank({
  6. name: 'process-phishing-domains',
  7. size: 1,
  8. timeout: 10000, // The maximum number of milliseconds to wait for the result from the worker, if exceeded the worker is terminated and the execution promise rejects
  9. warmup: true,
  10. autoterminate: 30000, // The interval of milliseconds at which to check if the pool can be automatically terminated, to free up resources, workers will be spawned up again if needed
  11. env: {},
  12. methods: {
  13. // eslint-disable-next-line object-shorthand -- workertank
  14. getPhishingDomains: async function (
  15. importMetaUrl: string,
  16. /** require.main === module */ isDebug = false
  17. ): Promise<string[]> {
  18. // TODO: createRequire is a temporary workaround for https://github.com/nodejs/node/issues/51956
  19. const { default: module } = await import('node:module');
  20. const __require = module.createRequire(importMetaUrl);
  21. const picocolors = __require('picocolors') as typeof import('picocolors');
  22. const tldts = __require('tldts-experimental') as typeof import('tldts-experimental');
  23. const { appendArrayInPlaceCurried } = __require('foxts/append-array-in-place') as typeof import('foxts/append-array-in-place');
  24. const { loosTldOptWithPrivateDomains } = __require('../constants/loose-tldts-opt') as typeof import('../constants/loose-tldts-opt');
  25. const { BLACK_TLD, WHITELIST_MAIN_DOMAINS, leathalKeywords, lowKeywords, sensitiveKeywords } = __require('../constants/phishing-score-source') as typeof import('../constants/phishing-score-source');
  26. const { PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } = __require('../constants/reject-data-source') as typeof import('../constants/reject-data-source');
  27. const { dummySpan } = __require('../trace') as typeof import('../trace');
  28. const NullPrototypeObject = __require('null-prototype-object') as typeof import('null-prototype-object');
  29. const { processHostsWithPreload } = __require('./parse-filter/hosts') as typeof import('./parse-filter/hosts');
  30. const { processDomainListsWithPreload } = __require('./parse-filter/domainlists') as typeof import('./parse-filter/domainlists');
  31. const downloads = [
  32. ...PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainListsWithPreload(...entry)),
  33. ...PHISHING_HOSTS_EXTRA.map(entry => processHostsWithPreload(...entry))
  34. ];
  35. const domainArr: string[] = [];
  36. const domainGroups = await Promise.all(downloads.map(task => task(dummySpan)));
  37. domainGroups.forEach(appendArrayInPlaceCurried(domainArr));
  38. // return domainArr;
  39. const domainCountMap = new Map<string, number>();
  40. const domainScoreMap: Record<string, number> = new NullPrototypeObject();
  41. let line = '';
  42. let tld: string | null = '';
  43. let apexDomain: string | null = '';
  44. let subdomain: string | null = '';
  45. let parsed: TldTsParsed;
  46. // const set = new Set<string>();
  47. // let duplicateCount = 0;
  48. for (let i = 0, len = domainArr.length; i < len; i++) {
  49. line = domainArr[i];
  50. // if (set.has(line)) {
  51. // duplicateCount++;
  52. // } else {
  53. // set.add(line);
  54. // }
  55. parsed = tldts.parse(line, loosTldOptWithPrivateDomains);
  56. if (parsed.isPrivate) {
  57. continue;
  58. }
  59. tld = parsed.publicSuffix;
  60. apexDomain = parsed.domain;
  61. if (!tld) {
  62. console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld });
  63. continue;
  64. }
  65. if (!apexDomain) {
  66. console.log(picocolors.yellow('[phishing domains] E0002'), 'missing domain', { line, apexDomain });
  67. continue;
  68. }
  69. if (WHITELIST_MAIN_DOMAINS.has(apexDomain)) {
  70. continue;
  71. }
  72. domainCountMap.set(
  73. apexDomain,
  74. domainCountMap.has(apexDomain)
  75. ? domainCountMap.get(apexDomain)! + 1
  76. : 1
  77. );
  78. let score = 0;
  79. if (apexDomain in domainScoreMap) {
  80. score = domainScoreMap[apexDomain];
  81. } else {
  82. if (BLACK_TLD.has(tld)) {
  83. score += 3;
  84. } else if (tld.length > 6) {
  85. score += 2;
  86. }
  87. if (apexDomain.length >= 18) {
  88. score += 0.5;
  89. }
  90. }
  91. subdomain = parsed.subdomain;
  92. if (subdomain) {
  93. score += calcDomainAbuseScore(subdomain, line);
  94. }
  95. domainScoreMap[apexDomain] = score;
  96. }
  97. domainCountMap.forEach((count, apexDomain) => {
  98. const score = domainScoreMap[apexDomain];
  99. if (
  100. // !WHITELIST_MAIN_DOMAINS.has(apexDomain)
  101. (score >= 24)
  102. || (score >= 16 && count >= 7)
  103. || (score >= 13 && count >= 11)
  104. || (score >= 5 && count >= 14)
  105. || (score >= 3 && count >= 21)
  106. || (score >= 1 && count >= 60)
  107. ) {
  108. domainArr.push('.' + apexDomain);
  109. }
  110. });
  111. if (isDebug) {
  112. console.log({
  113. v: 1,
  114. score: domainScoreMap['com-ticketry.world'],
  115. count: domainCountMap.get('com-ticketry.world'),
  116. domainArrLen: domainArr.length
  117. });
  118. }
  119. return domainArr;
  120. function calcDomainAbuseScore(subdomain: string, fullDomain: string = subdomain) {
  121. if (leathalKeywords(fullDomain)) {
  122. return 100;
  123. }
  124. let weight = 0;
  125. const hitLowKeywords = lowKeywords(fullDomain);
  126. const sensitiveKeywordsHit = sensitiveKeywords(fullDomain);
  127. if (sensitiveKeywordsHit) {
  128. weight += 15;
  129. if (hitLowKeywords) {
  130. weight += 10;
  131. }
  132. } else if (hitLowKeywords) {
  133. weight += 2;
  134. }
  135. const subdomainLength = subdomain.length;
  136. if (subdomainLength > 6) {
  137. weight += 0.015;
  138. if (subdomainLength > 13) {
  139. weight += 0.2;
  140. if (subdomainLength > 20) {
  141. weight += 1;
  142. if (subdomainLength > 30) {
  143. weight += 5;
  144. if (subdomainLength > 40) {
  145. weight += 10;
  146. }
  147. }
  148. }
  149. if (subdomain.indexOf('.', 1) > 1) {
  150. weight += 1;
  151. }
  152. }
  153. }
  154. return weight;
  155. }
  156. }
  157. }
  158. });
  159. export function getPhishingDomains(parentSpan: Span) {
  160. return parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => span.traceChildAsync(
  161. 'process phishing domain set',
  162. async () => {
  163. const phishingDomains = await pool.exec(
  164. 'getPhishingDomains',
  165. [
  166. import.meta.url,
  167. require.main === module
  168. ]
  169. );
  170. pool.terminate();
  171. return phishingDomains;
  172. }
  173. ));
  174. }
  175. if (require.main === module) {
  176. getPhishingDomains(dummySpan)
  177. .catch(console.error)
  178. .finally(() => {
  179. dummySpan.stop();
  180. printTraceResult(dummySpan.traceResult);
  181. });
  182. }