build-reject-domainset.js 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263
  1. const { simpleGet } = require('./util-http-get');
  2. const { promises: fsPromises } = require('fs');
  3. const { resolve: pathResolve } = require('path');
  4. const { cpus } = require('os');
  // Upper bound on the number of chunks the final dedupe pass splits the domain
  // set into (elements are assigned round-robin via `index % threads`, one worker
  // task per chunk). `Math.max` keeps this at 12 or more even on smaller machines.
  5. const threads = Math.max(cpus().length, 12);
  // Unanchored IPv4 dotted-quad pattern: it matches when an address appears
  // anywhere in the tested string. No `g`/`y` flag, so repeated `.test()` calls
  // carry no `lastIndex` state between lines.
  6. const rIPv4 = /((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)/;
  7. const Piscina = require('piscina');
  /**
   * Download a plain-text domain list and return its unique entries.
   *
   * The response body is split on `\n`. A line is ignored when it starts with
   * `#`, a space or a carriage return, or when nothing is left after trimming
   * (so whitespace-only lines such as a lone tab never become an empty entry).
   * Every remaining line is trimmed, which also removes a trailing `\r` from
   * CRLF files.
   *
   * @param {string | URL} domainListsUrl Location of the list; a string is parsed with `new URL()`.
   * @returns {Promise<string[]>} Trimmed entries, de-duplicated, in first-seen order.
   */
  async function processDomainLists(domainListsUrl) {
    // Work on a local binding instead of reassigning the caller-visible parameter.
    const url = typeof domainListsUrl === 'string' ? new URL(domainListsUrl) : domainListsUrl;

    /** @type {Set<string>} */
    const domainSets = new Set();

    const body = await simpleGet.https(url);

    for (const line of body.split('\n')) {
      if (line.startsWith('#') || line.startsWith(' ') || line.startsWith('\r')) {
        continue;
      }

      const entry = line.trim();
      if (entry === '') {
        continue;
      }

      domainSets.add(entry);
    }

    return [...domainSets];
  }
  /**
   * Download a hosts-format file and collect the host names it lists.
   *
   * The response body is split on `\n`. A line is ignored when it contains `#`
   * anywhere (this also drops lines carrying an inline comment), or when it
   * starts with a space or a carriage return. The first whitespace-separated
   * field of every remaining line (the address column) is discarded and each
   * following field is recorded as its own host name.
   *
   * Fields are split on any run of whitespace, so tab-separated files and
   * aligned columns work, and a line that maps several names yields one entry
   * per name. Lines with no host name after the address contribute nothing.
   *
   * @param {string | URL} hostsUrl Location of the hosts file; a string is parsed with `new URL()`.
   * @param {boolean} [includeAllSubDomain=false] When true, every host name is
   *   emitted with a leading `.` (the parameter name suggests this marks the
   *   domain together with its subdomains in the generated list).
   * @returns {Promise<string[]>} Unique host names in first-seen order.
   */
  async function processHosts(hostsUrl, includeAllSubDomain = false) {
    // Work on a local binding instead of reassigning the caller-visible parameter.
    const url = typeof hostsUrl === 'string' ? new URL(hostsUrl) : hostsUrl;
    const prefix = includeAllSubDomain ? '.' : '';

    /** @type {Set<string>} */
    const domainSets = new Set();

    const body = await simpleGet.https(url);

    for (const line of body.split('\n')) {
      if (line.includes('#') || line.startsWith(' ') || line.startsWith('\r')) {
        continue;
      }

      // Trimming first removes the trailing `\r` of CRLF files; a blank line then
      // splits into a single empty field, which the destructuring discards.
      const [, ...hostnames] = line.trim().split(/\s+/);
      for (const hostname of hostnames) {
        domainSets.add(`${prefix}${hostname}`);
      }
    }

    return [...domainSets];
  }
  /**
   * Download an adblock-style filter list and extract the plain host rules.
   *
   * Only rules that end in `^` or `^|` are considered. A rule is ignored when
   * the raw line starts with a space or a carriage return, when it contains
   * `#`, `!`, `*` or `$`, when it contains something matching `rIPv4`, or when
   * it contains `/` (for rules starting with `://` the slash check applies to
   * the part after that prefix, otherwise such rules could never be accepted).
   *
   * Accepted rules are classified by prefix:
   * - `@@||host^` -> `white` receives `host`
   * - `||host^`   -> `black` receives `.host`
   * - `://host^`  -> `black` receives `host`
   *
   * Lines are trimmed before matching, so CRLF lists (`||host^\r`) are handled.
   *
   * @param {string | URL} filterRulesUrl Location of the filter list; a string is parsed with `new URL()`.
   * @returns {Promise<{ white: Set<string>, black: Set<string> }>} `white` is
   *   pre-seeded with a fixed set of host names and extended by the list's
   *   `@@||` rules; `black` holds the blocked entries.
   */
  async function processFilterRules(filterRulesUrl) {
    // Work on a local binding instead of reassigning the caller-visible parameter.
    const url = typeof filterRulesUrl === 'string' ? new URL(filterRulesUrl) : filterRulesUrl;

    /** @type {Set<string>} */
    const whitelistDomainSets = new Set([
      'localhost',
      'broadcasthost',
      'ip6-loopback',
      'ip6-localnet',
      'ip6-mcastprefix',
      'ip6-allnodes',
      'ip6-allrouters',
      'ip6-allhosts',
      'mcastprefix',
      'analytics.google.com',
      'msa.cdn.mediaset.net', // Added manually using DOMAIN-KEYWORDS
      'cloud.answerhub.com',
      'ae01.alicdn.com',
      'whoami.akamai.net',
      'whoami.ds.akahelp.net'
    ]);
    /** @type {Set<string>} */
    const blacklistDomainSets = new Set();

    // Any of these characters disqualifies a rule from being treated as a plain host rule.
    const unsupportedMarkers = ['#', '!', '*', '$'];
    const schemePrefix = '://';

    // Removes the given prefix and the `^` / `^|` anchors, leaving the bare host.
    const stripAnchors = (rule, prefix) => rule
      .replaceAll(prefix, '')
      .replaceAll('^|', '')
      .replaceAll('^', '')
      .trim();

    // NOTE(review): unlike the sibling loaders, only hostname and pathname are
    // forwarded here, so a port or query string in the URL would be dropped —
    // confirm against the `simpleGet.https` helper before relying on either.
    const body = await simpleGet.https(url.hostname, url.pathname);

    for (const line of body.split('\n')) {
      if (line.startsWith(' ') || line.startsWith('\r')) {
        continue;
      }

      const rule = line.trim();
      if (!rule.endsWith('^') && !rule.endsWith('^|')) {
        continue;
      }

      const isSchemeRule = rule.startsWith(schemePrefix);
      // The `://` prefix itself contains slashes; only look for `/` after it.
      const slashScope = isSchemeRule ? rule.slice(schemePrefix.length) : rule;
      if (
        unsupportedMarkers.some(marker => rule.includes(marker))
        || slashScope.includes('/')
        || rIPv4.test(rule)
      ) {
        continue;
      }

      if (rule.startsWith('@@||')) {
        whitelistDomainSets.add(stripAnchors(rule, '@@||'));
      } else if (rule.startsWith('||')) {
        blacklistDomainSets.add(`.${stripAnchors(rule, '||')}`);
      } else if (isSchemeRule) {
        blacklistDomainSets.add(stripAnchors(rule, schemePrefix));
      }
    }

    return {
      white: whitelistDomainSets,
      black: blacklistDomainSets
    };
  }
  /**
   * Entry point: builds `../List/domainset/reject.conf`.
   *
   * Steps, in order:
   * 1. Collect host names from two remote hosts files.
   * 2. Add the hand-maintained entries from `../List/domainset/reject_sukka.conf`.
   * 3. Add the blacklist entries of several remote filter lists and gather their
   *    whitelist entries.
   * 4. Read `DOMAIN-KEYWORD` / `DOMAIN-SUFFIX` values from `../List/non_ip/reject.conf`.
   * 5. Ask the Piscina worker (`worker/build-reject-domainset-worker.js`) which
   *    entries to drop (tasks `dedupeKeywords`, `whitelisted`, then `dedupe` on
   *    round-robin chunks) and delete them.
   * 6. Write the remaining entries, one per line.
   *
   * The returned promise is not caught here; a failure in any step surfaces as
   * an unhandled rejection.
   */
  (async () => {
    /** @type {Set<string>} */
    const domainSets = new Set();

    // Parse from remote hosts & domain lists
    const hostLists = await Promise.all([
      processHosts('https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext', true),
      processHosts('https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt')
    ]);
    for (const hosts of hostLists) {
      for (const host of hosts) {
        if (host) {
          domainSets.add(host.trim());
        }
      }
    }

    const hostsSize = domainSets.size;
    console.log(`Import ${hostsSize} rules from hosts files!`);

    const sukkaConf = await fsPromises.readFile(
      pathResolve(__dirname, '../List/domainset/reject_sukka.conf'),
      { encoding: 'utf-8' }
    );
    for (const line of sukkaConf.split('\n')) {
      if (line.startsWith('#') || line.startsWith(' ') || line.startsWith('\r')) {
        continue;
      }
      // Whitespace-only lines (e.g. a lone tab) must not become an empty entry.
      const entry = line.trim();
      if (entry === '') {
        continue;
      }
      domainSets.add(entry);
    }

    const sukkaSize = domainSets.size - hostsSize;
    console.log(`Import ${sukkaSize} rules from reject_sukka.conf!`);

    // Parse from AdGuard Filters
    /** @type {Set<string>} */
    const filterRuleWhitelistDomainSets = new Set();
    const filterResults = await Promise.all([
      processFilterRules('https://easylist.to/easylist/easylist.txt'),
      processFilterRules('https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt'),
      processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_11_Mobile/filter.txt'),
      processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_3_Spyware/filter.txt'),
      processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_2_English/filter.txt'),
      processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_224_Chinese/filter.txt'),
      processFilterRules('https://filters.adtidy.org/extension/ublock/filters/224.txt'),
      processFilterRules('https://easylist.to/easylist/easyprivacy.txt'),
      processFilterRules('https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt'),
      processFilterRules('https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt'),
      processFilterRules('https://curben.gitlab.io/malware-filter/urlhaus-filter-agh-online.txt')
    ]);
    for (const { white, black } of filterResults) {
      white.forEach(i => filterRuleWhitelistDomainSets.add(i));
      black.forEach(i => domainSets.add(i));
    }

    const adguardSize = domainSets.size - hostsSize - sukkaSize;
    console.log(`Import ${adguardSize} rules from adguard filters!`);

    // Read DOMAIN Keyword
    const domainKeywordsSet = new Set();
    const domainSuffixSet = new Set();
    const rejectConf = await fsPromises.readFile(
      pathResolve(__dirname, '../List/non_ip/reject.conf'),
      { encoding: 'utf-8' }
    );
    for (const line of rejectConf.split('\n')) {
      let target;
      if (line.startsWith('DOMAIN-KEYWORD')) {
        target = domainKeywordsSet;
      } else if (line.startsWith('DOMAIN-SUFFIX')) {
        target = domainSuffixSet;
      } else {
        continue;
      }

      // Everything after the first comma is the value; a rule line without a
      // value is skipped so no empty keyword/suffix reaches the worker.
      const [, ...parts] = line.split(',');
      const value = parts.join(',').trim();
      if (value !== '') {
        target.add(value);
      }
    }
    console.log(`Import ${domainKeywordsSet.size} black keywords!`);

    const beforeDeduping = domainSets.size;
    // Dedupe domainSets
    console.log(`Start deduping! (${beforeDeduping})`);

    const piscina = new Piscina({
      filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js')
    });

    // Each task resolves to a collection of entries that should be removed.
    const firstPass = await Promise.all([
      piscina.run({ keywords: domainKeywordsSet, suffixes: domainSuffixSet, input: domainSets }, { name: 'dedupeKeywords' }),
      piscina.run({ whiteList: filterRuleWhitelistDomainSets, input: domainSets }, { name: 'whitelisted' })
    ]);
    firstPass.forEach(set => {
      set.forEach(i => domainSets.delete(i));
    });

    // Snapshot taken after the first pass; every chunk is checked against it.
    const fullSet = new Set(domainSets);

    // Distribute the remaining entries round-robin over at most `threads` chunks.
    const chunks = [];
    [...domainSets].forEach((domain, index) => {
      const slot = index % threads;
      chunks[slot] ??= [];
      chunks[slot].push(domain);
    });

    const secondPass = await Promise.all(
      chunks.map(chunk => piscina.run({ input: chunk, fullSet }, { name: 'dedupe' }))
    );
    secondPass.forEach(set => {
      set.forEach(i => domainSets.delete(i));
    });

    console.log(`Deduped ${beforeDeduping - domainSets.size} rules!`);

    return fsPromises.writeFile(
      pathResolve(__dirname, '../List/domainset/reject.conf'),
      `${[...domainSets].join('\n')}\n`,
      { encoding: 'utf-8' }
    );
  })();