// build-reject-domainset.js
  1. const { simpleGet } = require('./util-http-get');
  2. const { promises: fsPromises } = require('fs');
  3. const { resolve: pathResolve } = require('path');
  4. let cliProgress;
  5. let Piscina;
  6. try {
  7. Piscina = require('piscina');
  8. cliProgress = require('cli-progress');
  9. } catch (e) {
  10. console.log('Dependencies not found');
  11. console.log('"npm i cli-progress piscina" then try again!');
  12. console.error(e);
  13. process.exit(1);
  14. }
  /**
   * Download a hosts-format list and collect the host names it contains.
   *
   * Every kept line is read as `<address> <host> [<host> ...]`: the first
   * whitespace-separated field is discarded and each remaining field becomes
   * its own entry. Text after a `#` on the same line is ignored.
   *
   * @param {string | URL} hostsUrl Location of the hosts file; a string is converted with `new URL`.
   * @param {boolean} [includeAllSubDomain=false] When true every entry is prefixed with `.`
   *   (presumably the domain-set notation for "this domain and all sub-domains" — confirm against the consumer).
   * @returns {Promise<string[]>} Unique entries in first-seen order.
   */
  async function processHosts(hostsUrl, includeAllSubDomain = false) {
    // Work on a local instead of reassigning the caller-visible parameter.
    const url = typeof hostsUrl === 'string' ? new URL(hostsUrl) : hostsUrl;

    /** @type {Set<string>} */
    const domainSets = new Set();
    const content = await simpleGet.https(url);

    for (const line of content.split('\n')) {
      // Skip comments, blank lines and lines that begin with a space or a stray CR.
      if (
        line === ''
        || line.startsWith('#')
        || line.startsWith(' ')
        || line.startsWith('\r')
      ) {
        continue;
      }

      // Cut off an inline "# comment", then split on any run of whitespace so
      // tabs, repeated spaces and a trailing CR never leak into an entry.
      const [, ...domains] = line.split('#', 1)[0].trim().split(/\s+/);

      // A line holding only an address yields no domains and adds nothing.
      for (const domain of domains) {
        domainSets.add(includeAllSubDomain ? `.${domain}` : domain);
      }
    }

    return [...domainSets];
  }
  /**
   * Download a filter-rule list and extract the plain domain rules from it.
   *
   * Only rules that end in `^` or `^|` are considered:
   *   - `@@||domain^` -> `domain` is added to the white set
   *   - `||domain^`   -> `.domain` is added to the black set
   *   - `://domain^`  -> `domain` is added to the black set
   * Comment lines (`#`, `!`), lines starting with a space, and rules containing
   * `*`, `$` or a path separator are ignored.
   *
   * @param {string | URL} filterRulesUrl Location of the list; a string is converted with `new URL`.
   * @returns {Promise<{ white: Set<string>, black: Set<string> }>}
   */
  async function processFilterRules(filterRulesUrl) {
    // Work on a local instead of reassigning the caller-visible parameter.
    const url = typeof filterRulesUrl === 'string' ? new URL(filterRulesUrl) : filterRulesUrl;

    /** @type {Set<string>} */
    const whitelistDomainSets = new Set([
      'localhost',
      'analytics.google.com',
      'msa.cdn.mediaset.net' // Added manually using DOMAIN-KEYWORDS
    ]);
    /** @type {Set<string>} */
    const blacklistDomainSets = new Set();

    // NOTE(review): only host and path are forwarded, so a query string on the URL is not sent.
    const content = await simpleGet.https(url.hostname, url.pathname);

    // Remove the rule prefix and the `^` / `^|` anchors, leaving the bare domain.
    const extractDomain = (rule, prefix) => rule
      .replaceAll(prefix, '')
      .replaceAll('^|', '')
      .replaceAll('^', '')
      .trim();

    for (const rawLine of content.split('\n')) {
      // Drop a trailing CR (CRLF lists); otherwise the `^` suffix check below never matches.
      const line = rawLine.trimEnd();

      if (
        line === ''
        || line.startsWith('#')
        || line.startsWith('!')
        || line.startsWith(' ')
        || line.includes('*')
        || line.includes('$')
      ) {
        continue;
      }

      // The slashes of a leading `://` are part of the rule prefix, not a path, so they
      // must not trigger the "contains a path" rejection (that made the branch unreachable).
      const isSchemeRule = line.startsWith('://');
      const ruleBody = isSchemeRule ? line.slice('://'.length) : line;
      if (ruleBody.includes('/')) {
        continue;
      }

      if (!line.endsWith('^') && !line.endsWith('^|')) {
        continue;
      }

      if (line.startsWith('@@||')) {
        whitelistDomainSets.add(extractDomain(line, '@@||'));
      } else if (line.startsWith('||')) {
        blacklistDomainSets.add(`.${extractDomain(line, '||')}`);
      } else if (isSchemeRule) {
        blacklistDomainSets.add(extractDomain(line, '://'));
      }
    }

    return {
      white: whitelistDomainSets,
      black: blacklistDomainSets
    };
  }
  /**
   * Script entry point: merge every rule source into one set, remove the
   * entries reported by the worker tasks, and write the result to
   * `../List/domainset/reject.conf`. Any failure is logged and ends the
   * process with exit code 1.
   */
  (async () => {
    /** @type {Set<string>} */
    const domainSets = new Set();

    // Parse from remote hosts
    const hostLists = await Promise.all([
      processHosts('https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=1&mimetype=plaintext', true),
      processHosts('https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt'),
      processHosts('https://cdn.jsdelivr.net/gh/neoFelhz/neohosts@gh-pages/full/hosts')
    ]);
    for (const hosts of hostLists) {
      for (const host of hosts) {
        if (host) {
          domainSets.add(host.trim());
        }
      }
    }
    console.log(`Import ${domainSets.size} rules from hosts files!`);

    // Hand-maintained rules: every line that is not a comment, blank, or space/CR-led is taken as-is.
    const sukkaRules = await fsPromises.readFile(
      pathResolve(__dirname, '../List/domainset/reject_sukka.conf'),
      { encoding: 'utf-8' }
    );
    for (const line of sukkaRules.split('\n')) {
      if (
        line === ''
        || line.startsWith('#')
        || line.startsWith(' ')
        || line.startsWith('\r')
      ) {
        continue;
      }
      domainSets.add(line.trim());
    }
    console.log(`Import rules from reject_sukka.conf!`);

    // Parse from AdGuard Filters
    /** @type {Set<string>} */
    const filterRuleWhitelistDomainSets = new Set();
    const filterResults = await Promise.all([
      processFilterRules('https://easylist.to/easylist/easylist.txt'),
      processFilterRules('https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt'),
      processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_11_Mobile/filter.txt'),
      processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_3_Spyware/filter.txt'),
      processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_2_English/filter.txt'),
      processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_224_Chinese/filter.txt')
    ]);
    for (const { white, black } of filterResults) {
      white.forEach(i => filterRuleWhitelistDomainSets.add(i));
      black.forEach(i => domainSets.add(i));
    }
    console.log(`Import rules from adguard filters!`);

    // Read DOMAIN Keyword
    const domainKeywordsSet = new Set();
    const rejectRules = await fsPromises.readFile(
      pathResolve(__dirname, '../List/non_ip/reject.conf'),
      { encoding: 'utf-8' }
    );
    for (const line of rejectRules.split('\n')) {
      if (line.startsWith('DOMAIN-KEYWORD')) {
        // Everything after the first comma is the keyword.
        const [, ...keywords] = line.split(',');
        domainKeywordsSet.add(keywords.join(',').trim());
      }
    }
    console.log(`Import ${domainKeywordsSet.size} black keywords!`);

    // Dedupe domainSets
    console.log(`Start deduping!`);
    const piscina = new Piscina({
      filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js')
    });
    // Each task resolves to a collection of entries that must be removed from domainSets.
    const removals = await Promise.all([
      piscina.run({ keywords: domainKeywordsSet, input: domainSets }, { name: 'dedupeKeywords' }),
      piscina.run({ whiteList: filterRuleWhitelistDomainSets, input: domainSets }, { name: 'whitelisted' }),
      ...sliceIntoChunks(Array.from(domainSets), 5000).map(chunk => piscina.run({ input: chunk, fullSet: domainSets }, { name: 'dedupe' }))
    ]);
    removals.forEach(set => {
      set.forEach(i => domainSets.delete(i));
    });

    await fsPromises.writeFile(
      pathResolve(__dirname, '../List/domainset/reject.conf'),
      `${[...domainSets].join('\n')}\n`
    );
  })().catch(err => {
    // Never leave the top-level promise floating: report the failure and fail the build.
    console.error(err);
    process.exit(1);
  });
  /**
   * Split an array into consecutive chunks of at most `chunkSize` elements.
   * The input array is not modified; the last chunk may be shorter.
   *
   * @template T
   * @param {T[]} arr Array to split.
   * @param {number} chunkSize Maximum chunk length; must be greater than zero.
   * @returns {T[][]} Chunks in original order (empty when `arr` is empty).
   * @throws {RangeError} When `chunkSize` is not a positive number, which would otherwise loop forever.
   */
  function sliceIntoChunks(arr, chunkSize) {
    // `!(x > 0)` also rejects NaN and undefined, not just zero and negatives.
    if (!(chunkSize > 0)) {
      throw new RangeError(`chunkSize must be a positive number, got ${String(chunkSize)}`);
    }

    const res = [];
    for (let i = 0; i < arr.length; i += chunkSize) {
      res.push(arr.slice(i, i + chunkSize));
    }
    return res;
  }