build-reject-domainset.js 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. const { default: got } = require('got-cjs');
  2. const { promises: fsPromises } = require('fs');
  3. const { resolve: pathResolve } = require('path');
  4. const { cpus } = require('os');
  5. const { isIP } = require('net');
  6. const threads = Math.max(cpus().length, 12);
  7. const rDomain = /^(((?!\-))(xn\-\-)?[a-z0-9\-_]{0,61}[a-z0-9]{1,1}\.)*(xn\-\-)?([a-z0-9\-]{1,61}|[a-z0-9\-]{1,30})\.[a-z]{2,}$/m
  8. const Piscina = require('piscina');
  /**
   * Download a plain-text domain list and return its unique entries.
   *
   * A line is ignored when it is blank after trimming, or when it starts with
   * `#`, `!`, a space or `\r`. Every other line is trimmed and collected, in
   * first-seen order.
   *
   * @param {string | URL} domainListsUrl Location of the list; strings are parsed with `new URL()`.
   * @returns {Promise<string[]>} De-duplicated, trimmed entries.
   */
  async function processDomainLists(domainListsUrl) {
    const listUrl = typeof domainListsUrl === 'string'
      ? new URL(domainListsUrl)
      : domainListsUrl;

    /** @type Set<string> */
    const domainSets = new Set();

    const body = await got(listUrl).text();

    for (const line of body.split('\n')) {
      const entry = line.trim();

      // Testing the trimmed text (instead of `line === ''`) keeps whitespace-only
      // lines such as "\t" or "\t\r" from being recorded as an empty entry.
      if (
        entry === ''
        || line.startsWith('#')
        || line.startsWith('!')
        || line.startsWith(' ')
        || line.startsWith('\r')
      ) {
        continue;
      }

      domainSets.add(entry);
    }

    return [...domainSets];
  }
  /**
   * Download a hosts-format file and return the unique host names it maps.
   *
   * Each usable line is expected to look like `<address> <hostname> [<hostname> ...]`,
   * with fields separated by any run of spaces or tabs. The first field (the
   * address) is discarded and every remaining field that matches `rDomain` is
   * collected.
   *
   * Ignored lines: any line containing `#` (including lines with a trailing
   * comment), lines starting with a space or `\r`, and blank lines.
   *
   * @param {string | URL} hostsUrl Location of the hosts file; strings are parsed with `new URL()`.
   * @param {boolean} [includeAllSubDomain=false] When true, each host is emitted with a leading `.`.
   * @returns {Promise<string[]>} De-duplicated host names in first-seen order.
   */
  async function processHosts(hostsUrl, includeAllSubDomain = false) {
    const sourceUrl = typeof hostsUrl === 'string' ? new URL(hostsUrl) : hostsUrl;

    /** @type Set<string> */
    const domainSets = new Set();

    const body = await got(sourceUrl).text();

    for (const line of body.split('\n')) {
      if (line.includes('#')) {
        continue;
      }
      if (line.startsWith(' ') || line.startsWith('\r') || line.trim() === '') {
        continue;
      }

      // Split on any whitespace run: hosts files commonly separate fields with
      // tabs or several spaces, and one address may be followed by more than
      // one host name.
      const [, ...hostnames] = line.trim().split(/\s+/);

      for (const hostname of hostnames) {
        if (rDomain.test(hostname)) {
          domainSets.add(includeAllSubDomain ? `.${hostname}` : hostname);
        }
      }
    }

    return [...domainSets];
  }
  /**
   * Download an adblock-style filter list and extract the plain domain rules.
   *
   * Recognised rule shapes (after trimming each line):
   *   - `@@||domain^`, `@@||domain^|`, `@@||domain^$badfilter`, `@@||domain^$1p`
   *     -> `domain` is added to the whitelist.
   *   - `||domain^`, `||domain^|` -> `.domain` is added to the blacklist.
   *   - `://domain^`, `://domain^|` -> `domain` is added to the blacklist.
   *
   * A line is ignored when it is empty, contains `#`, `!`, `*` or `/`, contains
   * `$` without starting with `@@`, or is a bare IP address. Extracted domains
   * are only kept when they match `rDomain`.
   *
   * @param {string | URL} filterRulesUrl Location of the list; strings are parsed with `new URL()`.
   * @returns {Promise<{ white: Set<string>, black: Set<string> }>}
   */
  async function processFilterRules(filterRulesUrl) {
    const listUrl = typeof filterRulesUrl === 'string'
      ? new URL(filterRulesUrl)
      : filterRulesUrl;

    /** @type Set<string> */
    const whitelistDomainSets = new Set();
    /** @type Set<string> */
    const blacklistDomainSets = new Set();

    const SCHEME_ANCHOR = '://';

    /** True when `text` ends with at least one of `suffixes`. */
    const endsWithAny = (text, suffixes) => suffixes.some(suffix => text.endsWith(suffix));
    /** Remove every occurrence of each token (in the given order), then trim. */
    const stripTokens = (text, tokens) => tokens
      .reduce((remaining, token) => remaining.replaceAll(token, ''), text)
      .trim();

    const body = await got(listUrl).text();

    for (const rawLine of body.split('\n')) {
      const line = rawLine.trim();

      // The `://` anchor itself contains slashes. Only the text after it is
      // checked for `/`, otherwise the `://domain^` branch below could never run.
      const slashProbe = line.startsWith(SCHEME_ANCHOR)
        ? line.slice(SCHEME_ANCHOR.length)
        : line;

      if (
        line === ''
        || line.includes('#')
        || line.includes('!')
        || line.includes('*')
        || slashProbe.includes('/')
        || (line.includes('$') && !line.startsWith('@@'))
        || isIP(line) !== 0
      ) {
        continue;
      }

      if (
        line.startsWith('@@||')
        && endsWithAny(line, ['^', '^|', '^$badfilter', '^$1p'])
      ) {
        const domain = stripTokens(line, ['@@||', '^$badfilter', '^$1p', '^|', '^']);
        if (rDomain.test(domain)) {
          whitelistDomainSets.add(domain);
        }
      } else if (line.startsWith('||') && endsWithAny(line, ['^', '^|'])) {
        const domain = stripTokens(line, ['||', '^|', '^']);
        if (rDomain.test(domain)) {
          // `||` rules cover the domain and its subdomains, hence the leading dot.
          blacklistDomainSets.add(`.${domain}`);
        }
      } else if (line.startsWith(SCHEME_ANCHOR) && endsWithAny(line, ['^', '^|'])) {
        const domain = stripTokens(line, [SCHEME_ANCHOR, '^|', '^']);
        if (rDomain.test(domain)) {
          blacklistDomainSets.add(domain);
        }
      }
    }

    return {
      white: whitelistDomainSets,
      black: blacklistDomainSets
    };
  }
  /*
   * Build pipeline for List/domainset/reject.conf:
   *   1. collect hosts from remote hosts files,
   *   2. merge the hand-maintained reject_sukka.conf,
   *   3. merge domain rules extracted from adblock filter lists,
   *   4. read DOMAIN-KEYWORD / DOMAIN-SUFFIX entries from non_ip/reject.conf,
   *   5. ask the Piscina worker which entries to drop (keyword/suffix matches,
   *      whitelisted entries, then the chunked `dedupe` task),
   *   6. write the remaining entries, one per line.
   */
  (async () => {
    /** @type Set<string> */
    const domainSets = new Set();

    // Parse from remote hosts & domain lists
    const hostsResults = await Promise.all([
      processHosts('https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext', true),
      processHosts('https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt')
    ]);
    for (const hosts of hostsResults) {
      for (const host of hosts) {
        if (host) {
          domainSets.add(host);
        }
      }
    }

    const hostsSize = domainSets.size;
    console.log(`Import ${hostsSize} rules from hosts files!`);

    // Merge the locally maintained list, skipping comments, indented and blank lines
    const sukkaConf = await fsPromises.readFile(
      pathResolve(__dirname, '../List/domainset/reject_sukka.conf'),
      { encoding: 'utf-8' }
    );
    for (const line of sukkaConf.split('\n')) {
      const isSkippable = line.startsWith('#')
        || line.startsWith(' ')
        || line.startsWith('\r')
        || line.startsWith('\n')
        || line.trim() === '';
      if (!isSkippable) {
        domainSets.add(line.trim());
      }
    }

    const sukkaSize = domainSets.size - hostsSize;
    console.log(`Import ${sukkaSize} rules from reject_sukka.conf!`);

    // Parse from AdGuard Filters
    /** @type Set<string> */
    const filterRuleWhitelistDomainSets = new Set([
      'localhost',
      'broadcasthost',
      'ip6-loopback',
      'ip6-localnet',
      'ip6-mcastprefix',
      'ip6-allnodes',
      'ip6-allrouters',
      'ip6-allhosts',
      'mcastprefix',
      'analytics.google.com',
      'msa.cdn.mediaset.net', // Added manually using DOMAIN-KEYWORDS
      'cloud.answerhub.com',
      'ae01.alicdn.com',
      'whoami.akamai.net',
      'whoami.ds.akahelp.net',
      'pxlk9.net.', // This one is malformed from EasyList, which I will manually add instead
      'instant.page', // No, it doesn't violate anyone's privacy. I will whitelist it
      'piwik.pro',
      'mixpanel.com',
      'heapanalytics.com',
      'dataunlocker.com',
      'segment.com',
      'segment.io',
      'segmentify.com'
    ]);

    const filterListUrls = [
      // Easy List
      'https://easylist.to/easylist/easylist.txt',
      // AdGuard DNS Filter
      'https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt',
      // uBlock Origin Filter List
      'https://ublockorigin.github.io/uAssetsCDN/filters/filters.txt',
      'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2020.txt',
      'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2021.txt',
      'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2022.txt',
      // uBlock Origin Badware Risk List
      'https://ublockorigin.github.io/uAssets/filters/badware.txt',
      // uBlock Origin Privacy List
      'https://ublockorigin.github.io/uAssets/filters/privacy.txt',
      // uBlock Origin Resource Abuse
      'https://ublockorigin.github.io/uAssets/filters/resource-abuse.txt',
      // uBlock Origin Unbreak
      'https://ublockorigin.github.io/uAssets/filters/unbreak.txt',
      // AdGuard Base Filter
      'https://filters.adtidy.org/extension/ublock/filters/2_without_easylist.txt',
      // AdGuard Mobile AD
      'https://filters.adtidy.org/extension/ublock/filters/11.txt',
      // AdGuard Tracking Protection
      'https://filters.adtidy.org/extension/ublock/filters/3.txt',
      // AdGuard Japanese filter
      'https://filters.adtidy.org/extension/ublock/filters/7.txt',
      // AdGuard Chinese filter (EasyList China + AdGuard Chinese filter)
      'https://filters.adtidy.org/extension/ublock/filters/224.txt',
      // Easy Privacy
      'https://easylist.to/easylist/easyprivacy.txt',
      // Curben's Malware Online UrlHaus
      'https://curben.gitlab.io/malware-filter/urlhaus-filter-agh-online.txt',
      // Curben's Phishing Online Filter
      'https://curben.gitlab.io/malware-filter/phishing-filter-agh.txt',
      // Curben's PUP List
      'https://curben.gitlab.io/malware-filter/pup-filter-agh.txt',
      // GameConsoleAdblockList
      'https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt',
      // PiHoleBlocklist
      'https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt'
    ];

    const filterResults = await Promise.all(
      filterListUrls.map(url => processFilterRules(url))
    );
    for (const { white, black } of filterResults) {
      for (const allowed of white) {
        filterRuleWhitelistDomainSets.add(allowed);
      }
      for (const blocked of black) {
        domainSets.add(blocked);
      }
    }

    const adguardSize = domainSets.size - hostsSize - sukkaSize;
    console.log(`Import ${adguardSize} rules from adguard filters!`);

    // Read DOMAIN Keyword
    const domainKeywordsSet = new Set();
    const domainSuffixSet = new Set();

    // Everything after the first comma of a rule line, trimmed
    const ruleValue = line => line.split(',').slice(1).join(',').trim();

    const rejectConf = await fsPromises.readFile(
      pathResolve(__dirname, '../List/non_ip/reject.conf'),
      { encoding: 'utf-8' }
    );
    for (const line of rejectConf.split('\n')) {
      if (line.startsWith('DOMAIN-KEYWORD')) {
        domainKeywordsSet.add(ruleValue(line));
      } else if (line.startsWith('DOMAIN-SUFFIX')) {
        domainSuffixSet.add(ruleValue(line));
      }
    }

    console.log(`Import ${domainKeywordsSet.size} black keywords and ${domainSuffixSet.size} black suffixes!`);

    const beforeDeduping = domainSets.size;
    // Dedupe domainSets
    console.log(`Start deduping! (${beforeDeduping})`);

    const piscina = new Piscina({
      filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js')
    });

    // Each worker task resolves to a collection of entries to remove from domainSets
    const dropFromDomainSets = removable => {
      removable.forEach(entry => domainSets.delete(entry));
    };

    const firstPass = await Promise.all([
      piscina.run(
        { keywords: domainKeywordsSet, suffixes: domainSuffixSet, input: domainSets },
        { name: 'dedupeKeywords' }
      ),
      piscina.run(
        { whiteList: filterRuleWhitelistDomainSets, input: domainSets },
        { name: 'whitelisted' }
      )
    ]);
    firstPass.forEach(dropFromDomainSets);

    const originalFullSet = new Set([...domainSets]);

    // Deal the remaining entries round-robin into at most `threads` chunks
    const chunks = [];
    let position = 0;
    for (const element of domainSets) {
      const slot = position % threads;
      if (chunks[slot] == null) {
        chunks[slot] = [];
      }
      chunks[slot].push(element);
      position += 1;
    }

    const secondPass = await Promise.all(
      chunks.map(chunk => piscina.run(
        { input: chunk, fullSet: originalFullSet },
        { name: 'dedupe' }
      ))
    );
    secondPass.forEach(dropFromDomainSets);

    console.log(`Deduped ${beforeDeduping - domainSets.size} rules!`);

    return fsPromises.writeFile(
      pathResolve(__dirname, '../List/domainset/reject.conf'),
      `${[...domainSets].join('\n')}\n`,
      { encoding: 'utf-8' }
    );
  })();