// build-reject-domainset.js
  1. const { default: got } = require('got-cjs');
  2. const { promises: fsPromises } = require('fs');
  3. const { resolve: pathResolve } = require('path');
  4. const { cpus } = require('os');
  5. const threads = Math.max(cpus().length, 12);
  6. const rIPv4 = /((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)/;
  7. const Piscina = require('piscina');
  8. /**
  9. * @param {string | URL} domainListsUrl
  10. */
  11. async function processDomainLists(domainListsUrl) {
  12. if (typeof domainListsUrl === 'string') {
  13. domainListsUrl = new URL(domainListsUrl);
  14. }
  15. /** @type Set<string> */
  16. const domainSets = new Set();
  17. /** @type string[] */
  18. const domains = (await got(domainListsUrl).text()).split('\n');
  19. domains.forEach(line => {
  20. if (
  21. line.startsWith('#')
  22. || line.startsWith('!')
  23. || line.startsWith(' ')
  24. || line === ''
  25. || line.startsWith('\r')
  26. || line.startsWith('\n')
  27. ) {
  28. return;
  29. }
  30. domainSets.add(line.trim());
  31. });
  32. return [...domainSets];
  33. }
  34. /**
  35. * @param {string | URL} hostsUrl
  36. */
  37. async function processHosts(hostsUrl, includeAllSubDomain = false) {
  38. if (typeof hostsUrl === 'string') {
  39. hostsUrl = new URL(hostsUrl);
  40. }
  41. /** @type Set<string> */
  42. const domainSets = new Set();
  43. /** @type string[] */
  44. const hosts = (await got(hostsUrl).text()).split('\n');
  45. hosts.forEach(line => {
  46. if (line.includes('#')) {
  47. return;
  48. }
  49. if (line.startsWith(' ') || line.startsWith('\r') || line.startsWith('\n') || line.trim() === '') {
  50. return;
  51. }
  52. const [, ...domains] = line.split(' ');
  53. if (includeAllSubDomain) {
  54. domainSets.add(`.${domains.join(' ')}`.trim());
  55. } else {
  56. domainSets.add(domains.join(' ').trim());
  57. }
  58. });
  59. return [...domainSets];
  60. }
  61. /**
  62. * @param {string | URL} filterRulesUrl
  63. * @returns {Promise<{ white: Set<string>, black: Set<string> }>}
  64. */
  65. async function processFilterRules(filterRulesUrl) {
  66. if (typeof filterRulesUrl === 'string') {
  67. filterRulesUrl = new URL(filterRulesUrl);
  68. }
  69. /** @type Set<string> */
  70. const whitelistDomainSets = new Set([
  71. 'localhost',
  72. 'broadcasthost',
  73. 'ip6-loopback',
  74. 'ip6-localnet',
  75. 'ip6-mcastprefix',
  76. 'ip6-allnodes',
  77. 'ip6-allrouters',
  78. 'ip6-allhosts',
  79. 'mcastprefix',
  80. 'analytics.google.com',
  81. 'msa.cdn.mediaset.net', // Added manually using DOMAIN-KEYWORDS
  82. 'cloud.answerhub.com',
  83. 'ae01.alicdn.com',
  84. 'whoami.akamai.net',
  85. 'whoami.ds.akahelp.net',
  86. 'pxlk9.net.', // This one is malformed from EasyList, which I will manually add instead
  87. 'instant.page', // No, it doesn't violate anyone's privacy. I will whitelist it
  88. 'piwik.pro',
  89. 'mixpanel.com',
  90. 'heapanalytics.com',
  91. 'dataunlocker.com',
  92. 'segment.com',
  93. 'segment.io',
  94. 'segmentify.com'
  95. ]);
  96. /** @type Set<string> */
  97. const blacklistDomainSets = new Set();
  98. /** @type Set<string> */
  99. const blackIPSets = new Set();
  100. /** @type string[] */
  101. const filterRules = (await got(filterRulesUrl).text()).split('\n');
  102. filterRules.forEach(line => {
  103. if (
  104. line.includes('#')
  105. || line.includes('!')
  106. || line.startsWith(' ')
  107. || line.startsWith('\r')
  108. || line.startsWith('\n')
  109. || line.includes('*')
  110. || line.includes('/')
  111. || line.includes('$')
  112. || line.trim() === ''
  113. || rIPv4.test(line)
  114. ) {
  115. return;
  116. }
  117. if (line.startsWith('@@||')
  118. && (
  119. line.endsWith('^')
  120. || line.endsWith('^|')
  121. )
  122. ) {
  123. whitelistDomainSets.add(`${line.replaceAll('@@||', '').replaceAll('^|', '').replaceAll('^', '')}`.trim());
  124. } else if (
  125. line.startsWith('||')
  126. && (
  127. line.endsWith('^')
  128. || line.endsWith('^|')
  129. )
  130. ) {
  131. blacklistDomainSets.add(`.${line.replaceAll('||', '').replaceAll('^|', '').replaceAll('^', '')}`.trim());
  132. } else if (line.startsWith('://')
  133. && (
  134. line.endsWith('^')
  135. || line.endsWith('^|')
  136. )
  137. ) {
  138. blacklistDomainSets.add(`${line.replaceAll('://', '').replaceAll('^|', '').replaceAll('^', '')}`.trim());
  139. }
  140. });
  141. return {
  142. white: whitelistDomainSets,
  143. black: blacklistDomainSets
  144. };
  145. }
  146. (async () => {
  147. /** @type Set<string> */
  148. const domainSets = new Set();
  149. // Parse from remote hosts & domain lists
  150. (await Promise.all([
  151. processHosts('https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext', true),
  152. processHosts('https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt')
  153. ])).forEach(hosts => {
  154. hosts.forEach(host => {
  155. if (host) {
  156. domainSets.add(host);
  157. }
  158. });
  159. });
  160. const hostsSize = domainSets.size;
  161. console.log(`Import ${hostsSize} rules from hosts files!`);
  162. await fsPromises.readFile(pathResolve(__dirname, '../List/domainset/reject_sukka.conf'), { encoding: 'utf-8' }).then(data => {
  163. data.split('\n').forEach(line => {
  164. if (
  165. line.startsWith('#')
  166. || line.startsWith(' ')
  167. || line === '' || line === ' '
  168. || line.startsWith('\r')
  169. || line.startsWith('\n')
  170. ) {
  171. return;
  172. }
  173. /* if (domainSets.has(line) || domainSets.has(`.${line}`)) {
  174. console.warn(`|${line}| is already in the list!`);
  175. } */
  176. domainSets.add(line.trim());
  177. });
  178. });
  179. const sukkaSize = domainSets.size - hostsSize;
  180. console.log(`Import ${sukkaSize} rules from reject_sukka.conf!`);
  181. // Parse from AdGuard Filters
  182. /** @type Set<string> */
  183. const filterRuleWhitelistDomainSets = new Set();
  184. (await Promise.all([
  185. processFilterRules('https://easylist.to/easylist/easylist.txt'),
  186. processFilterRules('https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt'),
  187. processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_11_Mobile/filter.txt'),
  188. processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_3_Spyware/filter.txt'),
  189. processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_2_English/filter.txt'),
  190. processFilterRules('https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_224_Chinese/filter.txt'),
  191. processFilterRules('https://filters.adtidy.org/extension/ublock/filters/224.txt'),
  192. processFilterRules('https://easylist.to/easylist/easyprivacy.txt'),
  193. processFilterRules('https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt'),
  194. processFilterRules('https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt'),
  195. processFilterRules('https://curben.gitlab.io/malware-filter/urlhaus-filter-agh-online.txt'),
  196. processFilterRules('https://curben.gitlab.io/malware-filter/pup-filter-agh.txt'),
  197. processFilterRules('https://curben.gitlab.io/malware-filter/phishing-filter-agh.txt'),
  198. processFilterRules('https://curben.gitlab.io/malware-filter/pup-filter-agh.txt')
  199. ])).forEach(({ white, black }) => {
  200. white.forEach(i => filterRuleWhitelistDomainSets.add(i));
  201. black.forEach(i => domainSets.add(i));
  202. });
  203. const adguardSize = domainSets.size - hostsSize - sukkaSize;
  204. console.log(`Import ${adguardSize} rules from adguard filters!`);
  205. // Read DOMAIN Keyword
  206. const domainKeywordsSet = new Set();
  207. const domainSuffixSet = new Set();
  208. await fsPromises.readFile(pathResolve(__dirname, '../List/non_ip/reject.conf'), { encoding: 'utf-8' }).then(data => {
  209. data.split('\n').forEach(line => {
  210. if (line.startsWith('DOMAIN-KEYWORD')) {
  211. const [, ...keywords] = line.split(',');
  212. domainKeywordsSet.add(keywords.join(',').trim());
  213. } else if (line.startsWith('DOMAIN-SUFFIX')) {
  214. const [, ...keywords] = line.split(',');
  215. domainSuffixSet.add(keywords.join(',').trim());
  216. }
  217. });
  218. });
  219. console.log(`Import ${domainKeywordsSet.size} black keywords!`);
  220. const beforeDeduping = domainSets.size;
  221. // Dedupe domainSets
  222. console.log(`Start deduping! (${beforeDeduping})`);
  223. const piscina = new Piscina({
  224. filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js')
  225. });
  226. (await Promise.all([
  227. piscina.run({ keywords: domainKeywordsSet, suffixes: domainSuffixSet, input: domainSets }, { name: 'dedupeKeywords' }),
  228. piscina.run({ whiteList: filterRuleWhitelistDomainSets, input: domainSets }, { name: 'whitelisted' })
  229. ])).forEach(set => {
  230. set.forEach(i => domainSets.delete(i));
  231. });
  232. const originalFullSet = new Set([...domainSets]);
  233. (await Promise.all(
  234. Array.from(domainSets).reduce((result, element, index) => {
  235. const chunk = index % threads;
  236. result[chunk] ??= [];
  237. result[chunk].push(element);
  238. return result;
  239. }, []).map(chunk => piscina.run({ input: chunk, fullSet: originalFullSet }, { name: 'dedupe' }))
  240. )).forEach(set => {
  241. set.forEach(i => domainSets.delete(i));
  242. });
  243. console.log(`Deduped ${beforeDeduping - domainSets.size} rules!`);
  244. return fsPromises.writeFile(
  245. pathResolve(__dirname, '../List/domainset/reject.conf'),
  246. `${[...domainSets].join('\n')}\n`,
  247. { encoding: 'utf-8' });
  248. })();