build-reject-domainset.js 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. const { default: got } = require('got-cjs');
  2. const { promises: fsPromises } = require('fs');
  3. const { resolve: pathResolve } = require('path');
  4. const { cpus } = require('os');
  // Modulus used further down when the deduplication input is split into
  // chunks for the worker pool: the CPU count, but never fewer than 12.
  5. const threads = Math.max(cpus().length, 12);
  // Dotted-quad IPv4 matcher. It is not anchored, so `.test()` succeeds for any
  // string that merely contains an address-shaped substring.
  6. const rIPv4 = /((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)/;
  // Anchored host name matcher: zero or more dot-terminated labels (each may
  // carry an `xn--` prefix and must end in a letter or digit), a final label,
  // then a TLD of at least two letters. There is no `i` flag, so only lowercase
  // input matches; the `m` flag lets `^`/`$` also match at line breaks.
  7. const rDomain = /^(((?!\-))(xn\-\-)?[a-z0-9\-_]{0,61}[a-z0-9]{1,1}\.)*(xn\-\-)?([a-z0-9\-]{1,61}|[a-z0-9\-]{1,30})\.[a-z]{2,}$/m
  8. const Piscina = require('piscina');
  /**
   * Download a plain-text domain list and return its unique entries.
   *
   * Lines starting with `#` or `!` (comments) and lines starting with a space
   * are skipped. Every other line is trimmed, and lines that are empty after
   * trimming (blank, CR-only or whitespace-only) are dropped so that an empty
   * string can never end up in the result.
   *
   * @param {string | URL} domainListsUrl
   * @returns {Promise<string[]>} unique entries, in order of first appearance
   */
  async function processDomainLists(domainListsUrl) {
    // Normalise into a local instead of reassigning the parameter.
    const url = typeof domainListsUrl === 'string'
      ? new URL(domainListsUrl)
      : domainListsUrl;

    /** @type Set<string> */
    const domainSets = new Set();

    const body = await got(url).text();
    for (const line of body.split('\n')) {
      if (line.startsWith('#') || line.startsWith('!') || line.startsWith(' ')) {
        continue;
      }

      // trim() also removes the trailing `\r` of CRLF files.
      const domain = line.trim();
      if (domain !== '') {
        domainSets.add(domain);
      }
    }

    return [...domainSets];
  }
  /**
   * Download a hosts-format file and return the unique host names it maps.
   *
   * Each entry is expected to look like `<address> <hostname> [aliases...]`.
   * Fields may be separated by any run of spaces or tabs, and everything from
   * a `#` to the end of the line is treated as a comment. The address field is
   * discarded; every remaining field that matches `rDomain` is collected.
   * Lines starting with a space are skipped.
   *
   * @param {string | URL} hostsUrl
   * @param {boolean} [includeAllSubDomain] when true every host name is
   *   returned with a leading `.`
   * @returns {Promise<string[]>} unique host names, in order of first appearance
   */
  async function processHosts(hostsUrl, includeAllSubDomain = false) {
    // Normalise into a local instead of reassigning the parameter.
    const url = typeof hostsUrl === 'string' ? new URL(hostsUrl) : hostsUrl;

    /** @type Set<string> */
    const domainSets = new Set();

    const body = await got(url).text();
    for (const line of body.split('\n')) {
      if (line.startsWith(' ')) {
        continue;
      }

      // Drop only the comment part so `0.0.0.0 example.com # note` still counts.
      const commentStart = line.indexOf('#');
      const entry = (commentStart === -1 ? line : line.slice(0, commentStart)).trim();
      if (entry === '') {
        continue;
      }

      // Splitting on /\s+/ accepts tab-separated files, which a plain
      // split(' ') turned into a single field and therefore lost entirely.
      const [, ...hostnames] = entry.split(/\s+/);
      for (const hostname of hostnames) {
        if (rDomain.test(hostname)) {
          domainSets.add(includeAllSubDomain ? `.${hostname}` : hostname);
        }
      }
    }

    return [...domainSets];
  }
  /**
   * Download an adblock-style filter list and extract its plain domain rules.
   *
   * Only three rule shapes are recognised, each of which must end in `^` or `^|`:
   *   - `@@||domain` -> added to the whitelist as `domain`
   *   - `||domain`   -> added to the blacklist as `.domain`
   *   - `://domain`  -> added to the blacklist as `domain`
   *
   * Everything else is ignored: lines starting with a space, blank lines, lines
   * containing an IPv4 address, and rules whose host part contains `#`, `!`,
   * `*`, `/` or `$` or does not match `rDomain`.
   *
   * The returned whitelist always contains a fixed set of seed entries in
   * addition to whatever the list itself whitelists.
   *
   * @param {string | URL} filterRulesUrl
   * @returns {Promise<{ white: Set<string>, black: Set<string> }>}
   */
  async function processFilterRules(filterRulesUrl) {
    // Normalise into a local instead of reassigning the parameter.
    const url = typeof filterRulesUrl === 'string'
      ? new URL(filterRulesUrl)
      : filterRulesUrl;

    /** @type Set<string> */
    const whitelistDomainSets = new Set([
      'localhost',
      'broadcasthost',
      'ip6-loopback',
      'ip6-localnet',
      'ip6-mcastprefix',
      'ip6-allnodes',
      'ip6-allrouters',
      'ip6-allhosts',
      'mcastprefix',
      'analytics.google.com',
      'msa.cdn.mediaset.net', // Added manually using DOMAIN-KEYWORDS
      'cloud.answerhub.com',
      'ae01.alicdn.com',
      'whoami.akamai.net',
      'whoami.ds.akahelp.net',
      'pxlk9.net.', // This one is malformed from EasyList, which I will manually add instead
      'instant.page', // No, it doesn't violate anyone's privacy. I will whitelist it
      'piwik.pro',
      'mixpanel.com',
      'heapanalytics.com',
      'dataunlocker.com',
      'segment.com',
      'segment.io',
      'segmentify.com'
    ]);
    /** @type Set<string> */
    const blacklistDomainSets = new Set();

    // Checked in order: `@@||` has to come before `||`.
    const ruleKinds = [
      { prefix: '@@||', add: domain => whitelistDomainSets.add(domain) },
      { prefix: '||', add: domain => blacklistDomainSets.add(`.${domain}`) },
      { prefix: '://', add: domain => blacklistDomainSets.add(domain) }
    ];
    const ruleSuffixes = ['^|', '^'];
    const unsupportedChars = ['#', '!', '*', '/', '$'];

    const body = await got(url).text();
    for (const line of body.split('\n')) {
      if (line.startsWith(' ')) {
        continue;
      }

      // Trim first so a trailing `\r` (CRLF lists) cannot defeat the suffix check.
      const rule = line.trim();
      if (rule === '' || rIPv4.test(rule)) {
        continue;
      }

      const kind = ruleKinds.find(({ prefix }) => rule.startsWith(prefix));
      const suffix = ruleSuffixes.find(candidate => rule.endsWith(candidate));
      if (kind === undefined || suffix === undefined) {
        continue;
      }

      // Strip the markers from the two ends only. The unsupported-character
      // check runs on the host part, not the whole rule, because a check for
      // `/` on the whole rule would reject every `://` rule up front.
      const domain = rule.slice(kind.prefix.length, rule.length - suffix.length).trim();
      if (unsupportedChars.some(char => domain.includes(char))) {
        continue;
      }

      if (rDomain.test(domain)) {
        kind.add(domain);
      }
    }

    return {
      white: whitelistDomainSets,
      black: blacklistDomainSets
    };
  }
  /**
   * Build entry point.
   *
   * 1. Collects reject candidates from remote hosts files, the local
   *    reject_sukka.conf and a set of remote adblock filter lists.
   * 2. Reads DOMAIN-KEYWORD / DOMAIN-SUFFIX rules from the local reject.conf.
   * 3. Hands the candidates to the worker pool, which reports the entries to
   *    delete (keyword/suffix matches, whitelisted entries, then a chunked
   *    `dedupe` pass against the full set).
   * 4. Writes the surviving entries, one per line, to List/domainset/reject.conf.
   *
   * Any failure is logged and reported through a non-zero exit code.
   */
  (async () => {
    /** @type Set<string> */
    const domainSets = new Set();

    // Parse from remote hosts & domain lists
    const hostLists = await Promise.all([
      processHosts('https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext', true),
      processHosts('https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt')
    ]);
    for (const hosts of hostLists) {
      for (const host of hosts) {
        if (host) {
          domainSets.add(host);
        }
      }
    }

    const hostsSize = domainSets.size;
    console.log(`Import ${hostsSize} rules from hosts files!`);

    const sukkaRules = await fsPromises.readFile(
      pathResolve(__dirname, '../List/domainset/reject_sukka.conf'),
      { encoding: 'utf-8' }
    );
    for (const line of sukkaRules.split('\n')) {
      // Skip comments, indented lines, blank lines and bare carriage returns.
      if (
        line === ''
        || line.startsWith('#')
        || line.startsWith(' ')
        || line.startsWith('\r')
      ) {
        continue;
      }
      domainSets.add(line.trim());
    }

    const sukkaSize = domainSets.size - hostsSize;
    console.log(`Import ${sukkaSize} rules from reject_sukka.conf!`);

    // Parse from AdGuard Filters
    // Each list appears exactly once; a repeated URL would only cost an extra
    // download because both sets ignore entries they already contain.
    const filterRuleUrls = [
      'https://easylist.to/easylist/easylist.txt',
      'https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt',
      'https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_11_Mobile/filter.txt',
      'https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_3_Spyware/filter.txt',
      'https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_2_English/filter.txt',
      'https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_224_Chinese/filter.txt',
      'https://filters.adtidy.org/extension/ublock/filters/224.txt',
      'https://easylist.to/easylist/easyprivacy.txt',
      'https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt',
      'https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt',
      'https://curben.gitlab.io/malware-filter/urlhaus-filter-agh-online.txt',
      'https://curben.gitlab.io/malware-filter/pup-filter-agh.txt',
      'https://curben.gitlab.io/malware-filter/phishing-filter-agh.txt'
    ];
    /** @type Set<string> */
    const filterRuleWhitelistDomainSets = new Set();
    const filterResults = await Promise.all(
      filterRuleUrls.map(url => processFilterRules(url))
    );
    for (const { white, black } of filterResults) {
      white.forEach(i => filterRuleWhitelistDomainSets.add(i));
      black.forEach(i => domainSets.add(i));
    }

    const adguardSize = domainSets.size - hostsSize - sukkaSize;
    console.log(`Import ${adguardSize} rules from adguard filters!`);

    // Read DOMAIN Keyword
    const domainKeywordsSet = new Set();
    const domainSuffixSet = new Set();
    const rejectRules = await fsPromises.readFile(
      pathResolve(__dirname, '../List/non_ip/reject.conf'),
      { encoding: 'utf-8' }
    );
    for (const line of rejectRules.split('\n')) {
      // Everything after the first comma is the rule value.
      const [, ...valueParts] = line.split(',');
      const value = valueParts.join(',').trim();
      if (line.startsWith('DOMAIN-KEYWORD')) {
        domainKeywordsSet.add(value);
      } else if (line.startsWith('DOMAIN-SUFFIX')) {
        domainSuffixSet.add(value);
      }
    }
    console.log(`Import ${domainKeywordsSet.size} black keywords!`);

    const beforeDeduping = domainSets.size;
    // Dedupe domainSets
    console.log(`Start deduping! (${beforeDeduping})`);

    const piscina = new Piscina({
      filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js')
    });

    // Each worker task resolves to a collection of entries to remove.
    const firstPass = await Promise.all([
      piscina.run({ keywords: domainKeywordsSet, suffixes: domainSuffixSet, input: domainSets }, { name: 'dedupeKeywords' }),
      piscina.run({ whiteList: filterRuleWhitelistDomainSets, input: domainSets }, { name: 'whitelisted' })
    ]);
    for (const removals of firstPass) {
      removals.forEach(i => domainSets.delete(i));
    }

    // Snapshot taken before the second pass starts deleting from domainSets.
    const originalFullSet = new Set(domainSets);

    // Deal the remaining entries round-robin into at most `threads` chunks.
    const chunks = [];
    let index = 0;
    for (const domain of domainSets) {
      const slot = index % threads;
      chunks[slot] ??= [];
      chunks[slot].push(domain);
      index += 1;
    }

    const secondPass = await Promise.all(
      chunks.map(chunk => piscina.run({ input: chunk, fullSet: originalFullSet }, { name: 'dedupe' }))
    );
    for (const removals of secondPass) {
      removals.forEach(i => domainSets.delete(i));
    }

    console.log(`Deduped ${beforeDeduping - domainSets.size} rules!`);

    await fsPromises.writeFile(
      pathResolve(__dirname, '../List/domainset/reject.conf'),
      `${[...domainSets].join('\n')}\n`,
      { encoding: 'utf-8' }
    );
  })().catch(err => {
    // Never leave the top-level promise floating: report and fail the build.
    console.error(err);
    process.exitCode = 1;
  });