parse-filter.js 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. // @ts-check
  2. const { fetchWithRetry } = require('./fetch-retry');
  3. const tldts = require('tldts');
  4. const { fetchRemoteTextAndCreateReadlineInterface } = require('./fetch-remote-text-by-line');
  5. const { NetworkFilter } = require('@cliqz/adblocker');
  6. const { processLine } = require('./process-line');
  7. const { performance } = require('perf_hooks');
  8. const DEBUG_DOMAIN_TO_FIND = null; // example.com | null
  9. let foundDebugDomain = false;
  10. const warnOnceUrl = new Set();
  11. const warnOnce = (url, isWhite, ...message) => {
  12. const key = `${url}${isWhite ? 'white' : 'black'}`;
  13. if (warnOnceUrl.has(key)) {
  14. return;
  15. }
  16. warnOnceUrl.add(key);
  17. console.warn(url, isWhite ? '(white)' : '(black)', ...message);
  18. };
  19. const normalizeDomain = (domain) => {
  20. if (!domain) return null;
  21. const { isIcann, isPrivate, hostname, isIp } = tldts.parse(domain);
  22. if (isIp) return null;
  23. if (isIcann || isPrivate) {
  24. if (hostname?.[0] === '.') {
  25. return hostname.slice(1);
  26. }
  27. return hostname;
  28. }
  29. return null;
  30. };
  31. /**
  32. * @param {string | URL} domainListsUrl
  33. */
  34. async function processDomainLists(domainListsUrl) {
  35. if (typeof domainListsUrl === 'string') {
  36. domainListsUrl = new URL(domainListsUrl);
  37. }
  38. /** @type Set<string> */
  39. const domainSets = new Set();
  40. for await (const line of await fetchRemoteTextAndCreateReadlineInterface(domainListsUrl)) {
  41. if (line[0] === '!') {
  42. continue;
  43. }
  44. const domainToAdd = processLine(line);
  45. if (!domainToAdd) {
  46. continue;
  47. }
  48. if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
  49. warnOnce(domainListsUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  50. foundDebugDomain = true;
  51. }
  52. domainSets.add(domainToAdd);
  53. }
  54. return domainSets;
  55. }
  56. /**
  57. * @param {string | URL} hostsUrl
  58. */
  59. async function processHosts(hostsUrl, includeAllSubDomain = false) {
  60. console.time(` - processHosts: ${hostsUrl}`);
  61. if (typeof hostsUrl === 'string') {
  62. hostsUrl = new URL(hostsUrl);
  63. }
  64. /** @type Set<string> */
  65. const domainSets = new Set();
  66. for await (const l of await fetchRemoteTextAndCreateReadlineInterface(hostsUrl)) {
  67. const line = processLine(l);
  68. if (!line) {
  69. continue;
  70. }
  71. const [, ...domains] = line.split(' ');
  72. const _domain = domains.join(' ').trim();
  73. if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
  74. warnOnce(hostsUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  75. foundDebugDomain = true;
  76. }
  77. const domain = normalizeDomain(_domain);
  78. if (domain) {
  79. if (includeAllSubDomain) {
  80. domainSets.add(`.${domain}`);
  81. } else {
  82. domainSets.add(domain);
  83. }
  84. }
  85. }
  86. console.timeEnd(` - processHosts: ${hostsUrl}`);
  87. return domainSets;
  88. }
  89. /**
  90. * @param {string | URL} filterRulesUrl
  91. * @param {readonly (string | URL)[] | undefined} [fallbackUrls]
  92. * @returns {Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }>}
  93. */
  94. async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdParties = false) {
  95. const runStart = performance.now();
  96. /** @type Set<string> */
  97. const whitelistDomainSets = new Set();
  98. /** @type Set<string> */
  99. const blacklistDomainSets = new Set();
  100. const __addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
  101. if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
  102. warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  103. foundDebugDomain = true;
  104. }
  105. if (isSubDomain && domainToBeAddedToBlack[0] !== '.') {
  106. blacklistDomainSets.add(`.${domainToBeAddedToBlack}`);
  107. } else {
  108. blacklistDomainSets.add(domainToBeAddedToBlack);
  109. }
  110. };
  111. const addToBlackList = DEBUG_DOMAIN_TO_FIND == null
  112. ? __addToBlackList
  113. : (domainToBeAddedToBlack, isSubDomain) => {
  114. if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
  115. warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  116. foundDebugDomain = true;
  117. }
  118. __addToBlackList(domainToBeAddedToBlack, isSubDomain);
  119. };
  120. const __addToWhiteList = (domainToBeAddedToWhite) => {
  121. whitelistDomainSets.add(domainToBeAddedToWhite);
  122. };
  123. const addToWhiteList = DEBUG_DOMAIN_TO_FIND == null
  124. ? __addToWhiteList
  125. : (domainToBeAddedToWhite) => {
  126. if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToWhite.includes(DEBUG_DOMAIN_TO_FIND)) {
  127. warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND);
  128. foundDebugDomain = true;
  129. }
  130. __addToWhiteList(domainToBeAddedToWhite);
  131. };
  132. let downloadTime = 0;
  133. const lineCb = (line) => {
  134. const result = parse(line, includeThirdParties);
  135. if (result) {
  136. const flag = result[1];
  137. const hostname = result[0];
  138. switch (flag) {
  139. case 0:
  140. addToWhiteList(hostname);
  141. break;
  142. case 1:
  143. addToBlackList(hostname, false);
  144. break;
  145. case 2:
  146. addToBlackList(hostname, true);
  147. break;
  148. default:
  149. throw new Error(`Unknown flag: ${flag}`);
  150. }
  151. }
  152. };
  153. if (!fallbackUrls || fallbackUrls.length === 0) {
  154. const downloadStart = performance.now();
  155. for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) {
  156. lineCb(line.trim());
  157. }
  158. downloadTime = performance.now() - downloadStart;
  159. } else {
  160. let filterRules;
  161. const downloadStart = performance.now();
  162. try {
  163. const controller = new AbortController();
  164. /** @type string[] */
  165. filterRules = (
  166. await Promise.any(
  167. [filterRulesUrl, ...(fallbackUrls || [])].map(async url => {
  168. const text = await fetchWithRetry(url, { signal: controller.signal }).then(r => r.text());
  169. controller.abort();
  170. return text;
  171. })
  172. )
  173. ).split('\n').map(line => line.trim());
  174. } catch (e) {
  175. console.log(`Download Rule for [${filterRulesUrl}] failed`);
  176. throw e;
  177. }
  178. downloadTime = performance.now() - downloadStart;
  179. for (let i = 0, len = filterRules.length; i < len; i++) {
  180. lineCb(filterRules[i].trim());
  181. }
  182. }
  183. console.log(` ┬ processFilterRules (${filterRulesUrl}): ${(performance.now() - runStart).toFixed(3)}ms`);
  184. console.log(` └── download time: ${downloadTime.toFixed(3)}ms`);
  185. return {
  186. white: whitelistDomainSets,
  187. black: blacklistDomainSets,
  188. foundDebugDomain
  189. };
  190. }
  191. const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#&%~=]/;
  192. const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;
  193. /**
  194. * @param {string} $line
  195. * @param {boolean} includeThirdParties
  196. * @returns {null | [string, 0 | 1 | 2]} - 0 white, 1 black abosulte, 2 black include subdomain
  197. */
  198. function parse($line, includeThirdParties) {
  199. const line = $line.trim();
  200. if (
  201. line === ''
  202. || line[0] === '/'
  203. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test(line)
  204. // doesn't include
  205. || !line.includes('.') // rule with out dot can not be a domain
  206. // includes
  207. // || line.includes('#')
  208. || line.includes('!')
  209. || line.includes('?')
  210. || line.includes('*')
  211. // || line.includes('=')
  212. || line.includes('[')
  213. || line.includes('(')
  214. || line.includes(']')
  215. || line.includes(')')
  216. || line.includes(',')
  217. // || line.includes('~')
  218. // || line.includes('&')
  219. // || line.includes('%')
  220. // ends with
  221. || line.endsWith('.')
  222. || line.endsWith('-')
  223. || line.endsWith('_')
  224. // special modifier
  225. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
  226. || ((line.includes('/') || line.includes(':')) && !line.includes('://'))
  227. // || line.includes('$popup')
  228. // || line.includes('$removeparam')
  229. // || line.includes('$popunder')
  230. ) {
  231. return null;
  232. }
  233. const filter = NetworkFilter.parse(line);
  234. if (filter) {
  235. if (
  236. filter.isElemHide()
  237. || filter.isGenericHide()
  238. || filter.isSpecificHide()
  239. || filter.isRedirect()
  240. || filter.isRedirectRule()
  241. || filter.hasDomains()
  242. || filter.isCSP() // must not be csp rule
  243. || (!filter.fromAny() && !filter.fromDocument())
  244. ) {
  245. // not supported type
  246. return null;
  247. }
  248. if (
  249. filter.hasHostname() // must have
  250. && filter.isPlain()
  251. && (!filter.isRegex())
  252. && (!filter.isFullRegex())
  253. ) {
  254. const hostname = normalizeDomain(filter.getHostname());
  255. if (hostname) {
  256. if (filter.isException() || filter.isBadFilter()) {
  257. return [hostname, 0];
  258. }
  259. if (filter.firstParty() === filter.thirdParty()) {
  260. return [hostname, 2];
  261. }
  262. if (filter.thirdParty()) {
  263. if (includeThirdParties) {
  264. return [hostname, 2];
  265. }
  266. return null;
  267. }
  268. if (filter.firstParty()) {
  269. return null;
  270. }
  271. } else {
  272. return null;
  273. }
  274. }
  275. }
  276. if (line.includes('$third-party') || line.includes('$frame')) {
  277. return null;
  278. }
  279. const lineEndsWithCaret = line.endsWith('^');
  280. const lineEndsWithCaretVerticalBar = line.endsWith('^|');
  281. if (line[0] === '@' && line[1] === '@') {
  282. if (line.endsWith('$cname')) {
  283. return null;
  284. }
  285. if (
  286. // (line.startsWith('@@|') || line.startsWith('@@.'))
  287. (line[2] === '|' || line[2] === '.')
  288. && (
  289. lineEndsWithCaret
  290. || lineEndsWithCaretVerticalBar
  291. || line.endsWith('$genericblock')
  292. || line.endsWith('$document')
  293. )
  294. ) {
  295. const _domain = line
  296. .replace('@@||', '')
  297. .replace('@@|', '')
  298. .replace('@@.', '')
  299. .replace('^|', '')
  300. .replace('^$genericblock', '')
  301. .replace('$genericblock', '')
  302. .replace('^$document', '')
  303. .replace('$document', '')
  304. .replaceAll('^', '')
  305. .trim();
  306. const domain = normalizeDomain(_domain);
  307. if (domain) {
  308. return [domain, 0];
  309. }
  310. console.warn(' * [parse-filter E0001] (black) invalid domain:', _domain);
  311. return null;
  312. }
  313. }
  314. if (
  315. line.startsWith('||')
  316. && (
  317. lineEndsWithCaret
  318. || lineEndsWithCaretVerticalBar
  319. || line.endsWith('$cname')
  320. )
  321. ) {
  322. const _domain = line
  323. .replace('||', '')
  324. .replace('^|', '')
  325. .replace('$cname', '')
  326. .replaceAll('^', '')
  327. .trim();
  328. const domain = normalizeDomain(_domain);
  329. if (domain) {
  330. return [domain, 2];
  331. }
  332. console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
  333. return null;
  334. }
  335. const lineStartsWithSingleDot = line[0] === '.';
  336. if (
  337. lineStartsWithSingleDot
  338. && (
  339. lineEndsWithCaret
  340. || lineEndsWithCaretVerticalBar
  341. )
  342. ) {
  343. const _domain = line
  344. .replace('^|', '')
  345. .replaceAll('^', '')
  346. .slice(1)
  347. .trim();
  348. const domain = normalizeDomain(_domain);
  349. if (domain) {
  350. return [domain, 2];
  351. }
  352. console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
  353. return null;
  354. }
  355. if (
  356. (
  357. line.startsWith('://')
  358. || line.startsWith('http://')
  359. || line.startsWith('https://')
  360. || line.startsWith('|http://')
  361. || line.startsWith('|https://')
  362. )
  363. && (
  364. lineEndsWithCaret
  365. || lineEndsWithCaretVerticalBar
  366. )
  367. ) {
  368. const _domain = line
  369. .replace('|https://', '')
  370. .replace('https://', '')
  371. .replace('|http://', '')
  372. .replace('http://', '')
  373. .replace('://', '')
  374. .replace('^|', '')
  375. .replaceAll('^', '')
  376. .trim();
  377. const domain = normalizeDomain(_domain);
  378. if (domain) {
  379. return [domain, 1];
  380. }
  381. console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
  382. return null;
  383. }
  384. if (line[0] !== '|' && lineEndsWithCaret) {
  385. const _domain = line.slice(0, -1);
  386. const domain = normalizeDomain(_domain);
  387. if (domain) {
  388. return [domain, 1];
  389. }
  390. console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
  391. return null;
  392. }
  393. const tryNormalizeDomain = normalizeDomain(line);
  394. if (
  395. tryNormalizeDomain
  396. && (
  397. lineStartsWithSingleDot
  398. ? tryNormalizeDomain.length === line.length - 1
  399. : tryNormalizeDomain === line
  400. )
  401. ) {
  402. return [line, 2];
  403. }
  404. if (!line.endsWith('.js')) {
  405. console.warn(' * [parse-filter E0010] can not parse:', line);
  406. }
  407. return null;
  408. }
  409. module.exports.processDomainLists = processDomainLists;
  410. module.exports.processHosts = processHosts;
  411. module.exports.processFilterRules = processFilterRules;