// build-reject-domainset.js (8.5 KB)
  1. const { simpleGet } = require('./util-http-get');
  2. const { promises: fsPromises } = require('fs');
  3. const { resolve: pathResolve } = require('path');
  4. const rIPv4 = /((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)/;
  5. let Piscina;
  6. try {
  7. Piscina = require('piscina');
  8. } catch (e) {
  9. console.log('Dependencies not found');
  10. console.log('"npm i piscina" then try again!');
  11. console.error(e);
  12. process.exit(1);
  13. }
  14. /**
  15. * @param {string | URL} domainListsUrl
  16. */
  17. async function processDomainLists(domainListsUrl) {
  18. if (typeof domainListsUrl === 'string') {
  19. domainListsUrl = new URL(domainListsUrl);
  20. }
  21. /** @type Set<string> */
  22. const domainSets = new Set();
  23. /** @type string[] */
  24. const domains = (await simpleGet.https(domainListsUrl)).split('\n');
  25. domains.forEach(line => {
  26. if (line.startsWith('#')) {
  27. return;
  28. }
  29. if (line.startsWith(' ') || line === '' || line.startsWith('\r') || line.startsWith('\n')) {
  30. return;
  31. }
  32. domainSets.add(line.trim());
  33. });
  34. return [...domainSets];
  35. }
  36. /**
  37. * @param {string | URL} hostsUrl
  38. */
  39. async function processHosts(hostsUrl, includeAllSubDomain = false) {
  40. if (typeof hostsUrl === 'string') {
  41. hostsUrl = new URL(hostsUrl);
  42. }
  43. /** @type Set<string> */
  44. const domainSets = new Set();
  45. /** @type string[] */
  46. const hosts = (await simpleGet.https(hostsUrl)).split('\n');
  47. hosts.forEach(line => {
  48. if (line.includes('#')) {
  49. return;
  50. }
  51. if (line.startsWith(' ') || line.startsWith('\r') || line.startsWith('\n') || line.trim() === '') {
  52. return;
  53. }
  54. const [, ...domains] = line.split(' ');
  55. if (includeAllSubDomain) {
  56. domainSets.add(`.${domains.join(' ')}`.trim());
  57. } else {
  58. domainSets.add(domains.join(' ').trim());
  59. }
  60. });
  61. return [...domainSets];
  62. }
  63. /**
  64. * @param {string | URL} filterRulesUrl
  65. * @returns {Promise<{ white: Set<string>, black: Set<string> }>}
  66. */
  67. async function processFilterRules(filterRulesUrl) {
  68. if (typeof filterRulesUrl === 'string') {
  69. filterRulesUrl = new URL(filterRulesUrl);
  70. }
  71. /** @type Set<string> */
  72. const whitelistDomainSets = new Set([
  73. 'localhost',
  74. 'broadcasthost',
  75. 'ip6-loopback',
  76. 'ip6-localnet',
  77. 'ip6-mcastprefix',
  78. 'ip6-allnodes',
  79. 'ip6-allrouters',
  80. 'ip6-allhosts',
  81. 'mcastprefix',
  82. 'analytics.google.com',
  83. 'msa.cdn.mediaset.net', // Added manually using DOMAIN-KEYWORDS
  84. 'cloud.answerhub.com',
  85. 'ae01.alicdn.com',
  86. 'whoami.akamai.net',
  87. 'whoami.ds.akahelp.net'
  88. ]);
  89. /** @type Set<string> */
  90. const blacklistDomainSets = new Set();
  91. /** @type string[] */
  92. const filterRules = (await simpleGet.https(filterRulesUrl.hostname, filterRulesUrl.pathname)).split('\n');
  93. filterRules.forEach(line => {
  94. if (
  95. line.includes('#')
  96. || line.includes('!')
  97. || line.startsWith(' ')
  98. || line.startsWith('\r')
  99. || line.startsWith('\n')
  100. || line.includes('*')
  101. || line.includes('/')
  102. || line.includes('$')
  103. || line.trim() === ''
  104. || rIPv4.test(line)
  105. ) {
  106. return;
  107. }
  108. if (line.startsWith('@@||')
  109. && (
  110. line.endsWith('^')
  111. || line.endsWith('^|')
  112. )
  113. ) {
  114. whitelistDomainSets.add(`${line.replaceAll('@@||', '').replaceAll('^|', '').replaceAll('^', '')}`.trim());
  115. } else if (
  116. line.startsWith('||')
  117. && (
  118. line.endsWith('^')
  119. || line.endsWith('^|')
  120. )
  121. ) {
  122. blacklistDomainSets.add(`.${line.replaceAll('||', '').replaceAll('^|', '').replaceAll('^', '')}`.trim());
  123. } else if (line.startsWith('://')
  124. && (
  125. line.endsWith('^')
  126. || line.endsWith('^|')
  127. )
  128. ) {
  129. blacklistDomainSets.add(`${line.replaceAll('://', '').replaceAll('^|', '').replaceAll('^', '')}`.trim());
  130. }
  131. });
  132. return {
  133. white: whitelistDomainSets,
  134. black: blacklistDomainSets
  135. };
  136. }
  137. (async () => {
  138. /** @type Set<string> */
  139. const domainSets = new Set();
  140. // Parse from remote hosts & domain lists
  141. (await Promise.all([
  142. processHosts('https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext', true),
  143. processHosts('https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt')
  144. ])).forEach(hosts => {
  145. hosts.forEach(host => {
  146. if (host) {
  147. domainSets.add(host.trim());
  148. }
  149. });
  150. });
  151. const hostsSize = domainSets.size;
  152. console.log(`Import ${hostsSize} rules from hosts files!`);
  153. await fsPromises.readFile(pathResolve(__dirname, '../List/domainset/reject_sukka.conf'), { encoding: 'utf-8' }).then(data => {
  154. data.split('\n').forEach(line => {
  155. if (
  156. line.startsWith('#')
  157. || line.startsWith(' ')
  158. || line === '' || line === ' '
  159. || line.startsWith('\r')
  160. || line.startsWith('\n')
  161. ) {
  162. return;
  163. }
  164. /* if (domainSets.has(line) || domainSets.has(`.${line}`)) {
  165. console.warn(`|${line}| is already in the list!`);
  166. } */
  167. domainSets.add(line.trim());
  168. });
  169. });
  170. const sukkaSize = domainSets.size - hostsSize;
  171. console.log(`Import ${sukkaSize} rules from reject_sukka.conf!`);
  172. // Parse from AdGuard Filters
  173. /** @type Set<string> */
  174. const filterRuleWhitelistDomainSets = new Set();
  175. (await Promise.all([
  176. processFilterRules('https://easylist.to/easylist/easylist.txt'),
  177. processFilterRules('https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt'),
  178. processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_11_Mobile/filter.txt'),
  179. processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_3_Spyware/filter.txt'),
  180. processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_2_English/filter.txt'),
  181. processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_224_Chinese/filter.txt'),
  182. processFilterRules('https://filters.adtidy.org/extension/ublock/filters/224.txt'),
  183. processFilterRules('https://easylist.to/easylist/easyprivacy.txt'),
  184. processFilterRules('https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt'),
  185. processFilterRules('https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt'),
  186. processFilterRules('https://curben.gitlab.io/malware-filter/urlhaus-filter-agh-online.txt')
  187. ])).forEach(({ white, black }) => {
  188. white.forEach(i => filterRuleWhitelistDomainSets.add(i));
  189. black.forEach(i => domainSets.add(i));
  190. });
  191. const adguardSize = domainSets.size - hostsSize - sukkaSize;
  192. console.log(`Import ${adguardSize} rules from adguard filters!`);
  193. // Read DOMAIN Keyword
  194. const domainKeywordsSet = new Set();
  195. const domainSuffixSet = new Set();
  196. await fsPromises.readFile(pathResolve(__dirname, '../List/non_ip/reject.conf'), { encoding: 'utf-8' }).then(data => {
  197. data.split('\n').forEach(line => {
  198. if (line.startsWith('DOMAIN-KEYWORD')) {
  199. const [, ...keywords] = line.split(',');
  200. domainKeywordsSet.add(keywords.join(',').trim());
  201. } else if (line.startsWith('DOMAIN-SUFFIX')) {
  202. const [, ...keywords] = line.split(',');
  203. domainSuffixSet.add(keywords.join(',').trim());
  204. }
  205. });
  206. });
  207. console.log(`Import ${domainKeywordsSet.size} black keywords!`);
  208. const beforeDeduping = domainSets.size;
  209. // Dedupe domainSets
  210. console.log(`Start deduping! (${beforeDeduping})`);
  211. const piscina = new Piscina({
  212. filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js')
  213. });
  214. (await Promise.all([
  215. piscina.run({ keywords: domainKeywordsSet, suffixes: domainSuffixSet, input: domainSets }, { name: 'dedupeKeywords' }),
  216. piscina.run({ whiteList: filterRuleWhitelistDomainSets, input: domainSets }, { name: 'whitelisted' })
  217. ])).forEach(set => {
  218. set.forEach(i => domainSets.delete(i));
  219. });
  220. const fullSet = new Set([...domainSets]);
  221. (await Promise.all(
  222. Array.from(domainSets).reduce((result, element, index) => {
  223. const chunk = index % 12;
  224. result[chunk] = result[chunk] ?? [];
  225. result[chunk].push(element);
  226. return result;
  227. }, []).map(chunk => piscina.run({ input: chunk, fullSet }, { name: 'dedupe' }))
  228. )).forEach(set => {
  229. set.forEach(i => domainSets.delete(i));
  230. });
  231. console.log(`Deduped ${beforeDeduping - domainSets.size} rules!`);
  232. return fsPromises.writeFile(
  233. pathResolve(__dirname, '../List/domainset/reject.conf'),
  234. `${[...domainSets].join('\n')}\n`,
  235. { encoding: 'utf-8' });
  236. })();