build-reject-domainset.js 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265
  1. const { simpleGet } = require('./util-http-get');
  2. const { promises: fsPromises } = require('fs');
  3. const { resolve: pathResolve } = require('path');
  4. let Piscina;
  5. try {
  6. Piscina = require('piscina');
  7. } catch (e) {
  8. console.log('Dependencies not found');
  9. console.log('"npm i piscina" then try again!');
  10. console.error(e);
  11. process.exit(1);
  12. }
  /**
   * Download a plain-text domain list and return its unique entries.
   *
   * A line is ignored when it is empty or when its first character is `#`,
   * a space, `\r` or `\n`. Every other line is trimmed before it is collected.
   *
   * @param {string | URL} domainListsUrl
   * @returns {Promise<string[]>} unique entries, in first-seen order
   */
  async function processDomainLists(domainListsUrl) {
    const listUrl = typeof domainListsUrl === 'string'
      ? new URL(domainListsUrl)
      : domainListsUrl;

    /** @type string */
    const body = await simpleGet.https(listUrl);

    /** @type Set<string> */
    const collected = new Set();
    for (const rawLine of body.split('\n')) {
      const isIgnored = rawLine === '' || ['#', ' ', '\r', '\n'].includes(rawLine[0]);
      if (!isIgnored) {
        collected.add(rawLine.trim());
      }
    }

    return Array.from(collected);
  }
  /**
   * Download a hosts file and return the unique host names it lists.
   *
   * A usable line has the form `<address> <host> [<host> ...]`. The first field
   * (the address) is discarded and every remaining field becomes its own entry.
   *
   * Skipped lines: empty lines, lines whose first character is a space, `\r`
   * or `\n`, and lines that contain `#` anywhere. The last rule also drops an
   * entry that is followed by a trailing comment.
   *
   * @param {string | URL} hostsUrl
   * @param {boolean} [includeAllSubDomain=false] when true, every entry is prefixed with `.`
   * @returns {Promise<string[]>} unique entries, in first-seen order
   */
  async function processHosts(hostsUrl, includeAllSubDomain = false) {
    const url = typeof hostsUrl === 'string' ? new URL(hostsUrl) : hostsUrl;
    const prefix = includeAllSubDomain ? '.' : '';

    /** @type Set<string> */
    const domainSets = new Set();

    /** @type string[] */
    const hosts = (await simpleGet.https(url)).split('\n');

    for (const line of hosts) {
      if (line.includes('#')) {
        continue;
      }
      if (line.startsWith(' ') || line === '' || line.startsWith('\r') || line.startsWith('\n')) {
        continue;
      }

      // Fields may be separated by tabs or by several spaces, so split on any
      // run of whitespace. Splitting on a single space produced '' / '.' for
      // tab-separated lines and '. host' for double-spaced ones.
      const [, ...domains] = line.trim().split(/\s+/);

      // One entry per host name; a line without any host contributes nothing.
      for (const domain of domains) {
        domainSets.add(`${prefix}${domain}`);
      }
    }

    return [...domainSets];
  }
  /**
   * Download an adblock-style filter list and extract its plain domain rules.
   *
   * Only rules that end with `^` or `^|` and start with one of these markers
   * are used:
   *   - `@@||domain` -> added to the whitelist as `domain`
   *   - `||domain`   -> added to the blacklist as `.domain`
   *   - `://domain`  -> added to the blacklist as `domain`
   *
   * Rules containing `*` or `$`, or a `/` outside the leading marker, are
   * ignored, as is every line that does not start with a marker (comments,
   * blank lines, other rule types). Trailing whitespace, including the `\r`
   * of a CRLF line ending, is removed before a line is inspected.
   *
   * The returned whitelist always contains the built-in entries listed below.
   *
   * @param {string | URL} filterRulesUrl
   * @returns {Promise<{ white: Set<string>, black: Set<string> }>}
   */
  async function processFilterRules(filterRulesUrl) {
    const url = typeof filterRulesUrl === 'string' ? new URL(filterRulesUrl) : filterRulesUrl;

    /** @type Set<string> */
    const whitelistDomainSets = new Set([
      'localhost',
      'broadcasthost',
      'ip6-loopback',
      'ip6-localnet',
      'ip6-mcastprefix',
      'ip6-allnodes',
      'ip6-allrouters',
      'ip6-allhosts',
      'mcastprefix',
      'analytics.google.com',
      'msa.cdn.mediaset.net', // Added manually using DOMAIN-KEYWORDS
      'cloud.answerhub.com',
      'ae01.alicdn.com',
      'whoami.akamai.net',
      'whoami.ds.akahelp.net'
    ]);
    /** @type Set<string> */
    const blacklistDomainSets = new Set();

    // Checked in order; the first marker the line starts with decides the rule kind.
    const ruleKinds = [
      { marker: '@@||', target: whitelistDomainSets, prefix: '' },
      { marker: '||', target: blacklistDomainSets, prefix: '.' },
      { marker: '://', target: blacklistDomainSets, prefix: '' }
    ];

    // NOTE(review): only hostname and pathname are forwarded, so a query string
    // or port in the URL would not reach the request — confirm against simpleGet.https.
    /** @type string[] */
    const filterRules = (await simpleGet.https(url.hostname, url.pathname)).split('\n');

    for (const rawLine of filterRules) {
      // Drop the trailing '\r' of CRLF lines; otherwise no endsWith('^') check can match.
      const line = rawLine.trimEnd();

      if (line.includes('*') || line.includes('$')) {
        continue;
      }
      if (!line.endsWith('^') && !line.endsWith('^|')) {
        continue;
      }

      const kind = ruleKinds.find(({ marker }) => line.startsWith(marker));
      if (kind === undefined) {
        continue;
      }

      const domain = line
        .replaceAll(kind.marker, '')
        .replaceAll('^|', '')
        .replaceAll('^', '')
        .trim();

      // The '/' test runs after the marker is removed so that '://' rules are
      // not rejected because of their own marker. Empty results such as the
      // one produced by '||^' are dropped.
      if (domain === '' || domain.includes('/')) {
        continue;
      }

      kind.target.add(`${kind.prefix}${domain}`);
    }

    return {
      white: whitelistDomainSets,
      black: blacklistDomainSets
    };
  }
  /*
   * Build script entry point. Steps, in order:
   *   1. collect entries from three remote hosts files;
   *   2. add the entries of ../List/domainset/reject_sukka.conf;
   *   3. add the blacklist rules of six remote filter lists and gather their whitelists;
   *   4. read DOMAIN-KEYWORD / DOMAIN-SUFFIX values from ../List/non_ip/reject.conf;
   *   5. ask the worker pool which entries to drop and delete them;
   *   6. write the result to ../List/domainset/reject.conf.
   */
  (async () => {
    /** @type Set<string> */
    const domainSets = new Set();

    // Parse from remote hosts & domain lists
    const hostLists = await Promise.all([
      processHosts('https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=1&mimetype=plaintext', true),
      processHosts('https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt'),
      processHosts('https://cdn.jsdelivr.net/gh/neoFelhz/neohosts@gh-pages/full/hosts')
    ]);
    for (const hosts of hostLists) {
      for (const host of hosts) {
        if (host) {
          domainSets.add(host.trim());
        }
      }
    }

    const hostsSize = domainSets.size;
    console.log(`Import ${hostsSize} rules from hosts files!`);

    const sukkaConf = await fsPromises.readFile(
      pathResolve(__dirname, '../List/domainset/reject_sukka.conf'),
      { encoding: 'utf-8' }
    );
    for (const line of sukkaConf.split('\n')) {
      if (
        line.startsWith('#')
        || line.startsWith(' ')
        || line === ''
        || line.startsWith('\r')
        || line.startsWith('\n')
      ) {
        continue;
      }
      domainSets.add(line.trim());
    }

    const sukkaSize = domainSets.size - hostsSize;
    console.log(`Import ${sukkaSize} rules from reject_sukka.conf!`);

    // Parse from AdGuard Filters
    /** @type Set<string> */
    const filterRuleWhitelistDomainSets = new Set();
    const filterResults = await Promise.all([
      processFilterRules('https://easylist.to/easylist/easylist.txt'),
      processFilterRules('https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt'),
      processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_11_Mobile/filter.txt'),
      processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_3_Spyware/filter.txt'),
      processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_2_English/filter.txt'),
      processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_224_Chinese/filter.txt')
    ]);
    for (const { white, black } of filterResults) {
      white.forEach(i => filterRuleWhitelistDomainSets.add(i));
      black.forEach(i => domainSets.add(i));
    }

    const adguardSize = domainSets.size - hostsSize - sukkaSize;
    console.log(`Import ${adguardSize} rules from adguard filters!`);

    // Read DOMAIN Keyword
    const domainKeywordsSet = new Set();
    const domainSuffixSet = new Set();
    const rejectConf = await fsPromises.readFile(
      pathResolve(__dirname, '../List/non_ip/reject.conf'),
      { encoding: 'utf-8' }
    );
    for (const line of rejectConf.split('\n')) {
      if (line.startsWith('DOMAIN-KEYWORD')) {
        const [, ...keywords] = line.split(',');
        domainKeywordsSet.add(keywords.join(',').trim());
      } else if (line.startsWith('DOMAIN-SUFFIX')) {
        const [, ...keywords] = line.split(',');
        domainSuffixSet.add(keywords.join(',').trim());
      }
    }

    console.log(`Import ${domainKeywordsSet.size} black keywords!`);

    const beforeDeduping = domainSets.size;
    // Dedupe domainSets
    console.log(`Start deduping! (${beforeDeduping})`);

    const piscina = new Piscina({
      filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js')
    });

    // Distribute the entries round-robin over a fixed number of 'dedupe' tasks.
    const dedupeChunkCount = 12;
    const chunks = Array.from(domainSets).reduce((result, element, index) => {
      const chunk = index % dedupeChunkCount;
      result[chunk] = result[chunk] ?? [];
      result[chunk].push(element);
      return result;
    }, []);

    // Promise.all awaits only its direct elements, so the chunk tasks must be
    // spread into the list. Passed as a nested array they were never awaited
    // and their results were never applied.
    const res2 = await Promise.all([
      piscina.run({ keywords: domainKeywordsSet, suffixes: domainSuffixSet, input: domainSets }, { name: 'dedupeKeywords' }),
      piscina.run({ whiteList: filterRuleWhitelistDomainSets, input: domainSets }, { name: 'whitelisted' }),
      ...chunks.map(chunk => piscina.run({ input: chunk, fullSet: domainSets }, { name: 'dedupe' }))
    ]);

    // Every task result is treated as a collection of entries to remove.
    for (const set of res2) {
      set.forEach(i => domainSets.delete(i));
    }

    const diffDeduping = beforeDeduping - domainSets.size;
    console.log(`Deduped ${diffDeduping} rules!`);

    await fsPromises.writeFile(
      pathResolve(__dirname, '../List/domainset/reject.conf'),
      `${[...domainSets].join('\n')}\n`
    );
  })().catch(err => {
    // Report the failure and exit with a non-zero status.
    console.error(err);
    process.exit(1);
  });
  /**
   * Split an array into consecutive chunks of at most `chunkSize` elements.
   * The last chunk holds whatever remains and may be shorter.
   *
   * @param {Array} arr array to split; it is not modified
   * @param {number} chunkSize maximum chunk length, must be greater than 0
   * @returns {Array[]} the chunks in original order; empty when `arr` is empty
   * @throws {RangeError} when `chunkSize` is not greater than 0
   */
  function sliceIntoChunks(arr, chunkSize) {
    // A step of 0, a negative step or NaN would never advance the loop correctly.
    if (!(chunkSize > 0)) {
      throw new RangeError(`chunkSize must be greater than 0, got ${String(chunkSize)}`);
    }

    const res = [];
    for (let i = 0; i < arr.length; i += chunkSize) {
      res.push(arr.slice(i, i + chunkSize));
    }
    return res;
  }