// parse-filter.js (16 KB)
  1. // @ts-check
  2. const { fetchWithRetry } = require('./fetch-retry');
  3. const tldts = require('./cached-tld-parse');
  4. const { fetchRemoteTextAndCreateReadlineInterface } = require('./fetch-remote-text-by-line');
  5. const { NetworkFilter } = require('@cliqz/adblocker');
  6. const { processLine } = require('./process-line');
  7. const { performance } = require('perf_hooks');
  8. const { getGorhillPublicSuffixPromise } = require('./get-gorhill-publicsuffix');
  /**
   * Debug switch: when set to a domain string, every loader in this module
   * reports (once per source) the lists in which that string appears.
   * Leave as `null` to disable.
   */
  const DEBUG_DOMAIN_TO_FIND = null; // example.com | null

  /** Set to `true` as soon as any loader encounters `DEBUG_DOMAIN_TO_FIND`. */
  let foundDebugDomain = false;

  /** Keys (`url` + list colour) for which `warnOnce` has already printed. */
  const warnOnceUrl = new Set();

  /**
   * Print a warning through `console.warn` at most once per
   * (url, white/black) combination; repeated calls with the same pair are
   * silently ignored.
   *
   * @param {string} url source the warning is about; printed first
   * @param {boolean} isWhite `true` tags the output `(white)`, otherwise `(black)`
   * @param {...any} message extra values forwarded to `console.warn`
   * @returns {void}
   */
  const warnOnce = (url, isWhite, ...message) => {
    const key = `${url}${isWhite ? 'white' : 'black'}`;
    if (warnOnceUrl.has(key)) {
      return;
    }
    warnOnceUrl.add(key);
    console.warn(url, isWhite ? '(white)' : '(black)', ...message);
  };
  /**
   * Reduce a raw host string to a bare hostname, or reject it.
   *
   * Returns `null` for falsy input, for IP addresses, for hosts the TLD
   * parser flags as neither ICANN nor private, and when the parser reports
   * no hostname. A single leading dot on the hostname is stripped.
   *
   * @param {string} domain
   * @returns {string | null}
   */
  const normalizeDomain = (domain) => {
    if (!domain) {
      return null;
    }

    const { isIp, isIcann, isPrivate, hostname } = tldts.parse(domain);

    const hasKnownSuffix = isIcann || isPrivate;
    if (isIp || !hasKnownSuffix || hostname === null) {
      return null;
    }

    return hostname.startsWith('.') ? hostname.slice(1) : hostname;
  };
  /**
   * Download a plain domain list and collect its entries.
   *
   * Lines whose first character is `!` are skipped; every other line goes
   * through `processLine`, and each truthy result is added to the set.
   *
   * @param {string | URL} domainListsUrl
   * @returns {Promise<Set<string>>}
   */
  async function processDomainLists(domainListsUrl) {
    const listUrl = typeof domainListsUrl === 'string'
      ? new URL(domainListsUrl)
      : domainListsUrl;

    /** @type {Set<string>} */
    const collected = new Set();

    const remoteLines = await fetchRemoteTextAndCreateReadlineInterface(listUrl);
    for await (const rawLine of remoteLines) {
      const candidate = rawLine[0] === '!' ? null : processLine(rawLine);
      if (candidate) {
        if (DEBUG_DOMAIN_TO_FIND && candidate.includes(DEBUG_DOMAIN_TO_FIND)) {
          warnOnce(listUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
          foundDebugDomain = true;
        }
        collected.add(candidate);
      }
    }

    return collected;
  }
  /**
   * Download a hosts-style list and collect the normalised domains in it.
   *
   * Each line is passed through `processLine`; the first space-separated
   * field is discarded (presumably the IP column - verify against the lists
   * in use) and the remainder is normalised with `normalizeDomain`. Lines
   * that do not yield a valid domain are ignored.
   *
   * @param {string | URL} hostsUrl
   * @param {boolean} [includeAllSubDomain] when `true`, each entry is stored
   *   with a leading dot (`.example.com`) instead of the bare domain
   * @returns {Promise<Set<string>>}
   */
  async function processHosts(hostsUrl, includeAllSubDomain = false) {
    // Build the label once: a string argument and its `URL` form can serialise
    // differently (e.g. a trailing slash is added), and `console.timeEnd` only
    // closes a timer whose label matches the one given to `console.time`.
    const timerLabel = ` - processHosts: ${hostsUrl}`;
    console.time(timerLabel);

    try {
      const url = typeof hostsUrl === 'string' ? new URL(hostsUrl) : hostsUrl;

      /** @type {Set<string>} */
      const domainSets = new Set();

      for await (const l of await fetchRemoteTextAndCreateReadlineInterface(url)) {
        const line = processLine(l);
        if (!line) {
          continue;
        }

        const [, ...domains] = line.split(' ');
        const _domain = domains.join(' ').trim();

        if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
          warnOnce(url.href, false, DEBUG_DOMAIN_TO_FIND);
          foundDebugDomain = true;
        }

        const domain = normalizeDomain(_domain);
        if (domain) {
          domainSets.add(includeAllSubDomain ? `.${domain}` : domain);
        }
      }

      return domainSets;
    } finally {
      // Always close the timer, even when the download or parsing throws.
      console.timeEnd(timerLabel);
    }
  }
  /**
   * Download an adblock-style filter list and split the rules that `parse`
   * understands into a whitelist and a blacklist of domains.
   *
   * Without `fallbackUrls` the list is streamed line by line. With
   * `fallbackUrls` the primary URL and all fallbacks are requested
   * concurrently; the first response body to arrive is used and the other
   * requests are aborted.
   *
   * Entries that cover sub-domains are stored with a leading dot.
   *
   * @param {string | URL} filterRulesUrl
   * @param {readonly (string | URL)[] | undefined} [fallbackUrls]
   * @returns {Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }>}
   * @throws rethrows the download error when every URL fails, and throws
   *   `Error` when `parse` returns a flag this function does not know
   */
  async function processFilterRules(filterRulesUrl, fallbackUrls) {
    const runStart = performance.now();

    /** @type Set<string> */
    const whitelistDomainSets = new Set();
    /** @type Set<string> */
    const blacklistDomainSets = new Set();

    /**
     * @param {string} domainToBeAddedToBlack
     * @param {boolean} isSubDomain
     */
    const addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
      if (isSubDomain && domainToBeAddedToBlack[0] !== '.') {
        blacklistDomainSets.add(`.${domainToBeAddedToBlack}`);
      } else {
        blacklistDomainSets.add(domainToBeAddedToBlack);
      }
    };

    /**
     * @param {string} domainToBeAddedToWhite
     * @param {boolean} [isSubDomain]
     */
    const addToWhiteList = (domainToBeAddedToWhite, isSubDomain = true) => {
      if (isSubDomain && domainToBeAddedToWhite[0] !== '.') {
        whitelistDomainSets.add(`.${domainToBeAddedToWhite}`);
      } else {
        whitelistDomainSets.add(domainToBeAddedToWhite);
      }
    };

    let downloadTime = 0;
    const gorhill = await getGorhillPublicSuffixPromise();

    /**
     * Parse one raw rule and file the result into the matching set.
     *
     * @param {string} line
     */
    const lineCb = (line) => {
      const result = parse(line, gorhill);
      if (!result) {
        return;
      }

      const [hostname, flag] = result;

      if (DEBUG_DOMAIN_TO_FIND && hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
        warnOnce(filterRulesUrl.toString(), flag === 0 || flag === -1, DEBUG_DOMAIN_TO_FIND);
        foundDebugDomain = true;
        console.log({ result, flag });
      }

      switch (flag) {
        case 0:
          addToWhiteList(hostname, true);
          break;
        case -1:
          addToWhiteList(hostname, false);
          break;
        case 1:
          addToBlackList(hostname, false);
          break;
        case 2:
          addToBlackList(hostname, true);
          break;
        default:
          throw new Error(`Unknown flag: ${flag}`);
      }
    };

    if (!fallbackUrls || fallbackUrls.length === 0) {
      // Streaming mode: only the time spent waiting for the next line counts
      // as download time, so the clock restarts after each line is parsed.
      let waitStart = performance.now();
      for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) {
        downloadTime += performance.now() - waitStart;
        // don't trim here
        lineCb(line);
        waitStart = performance.now();
      }
    } else {
      /** @type string[] */
      let filterRules;
      const downloadStart = performance.now();

      try {
        const controller = new AbortController();
        const firstBody = await Promise.any(
          [filterRulesUrl, ...fallbackUrls].map(async (url) => {
            const r = await fetchWithRetry(url, { signal: controller.signal });
            const text = await r.text();
            // First body wins: cancel the requests that are still in flight.
            controller.abort();
            return text;
          })
        );
        filterRules = firstBody.split('\n');
      } catch (e) {
        console.log(`Download Rule for [${filterRulesUrl}] failed`);
        throw e;
      }

      downloadTime = performance.now() - downloadStart;

      for (let i = 0, len = filterRules.length; i < len; i++) {
        lineCb(filterRules[i]);
      }
    }

    console.log(` ┬ processFilterRules (${filterRulesUrl}): ${(performance.now() - runStart).toFixed(3)}ms`);
    console.log(` └── download time: ${downloadTime.toFixed(3)}ms`);

    return {
      white: whitelistDomainSets,
      black: blacklistDomainSets,
      foundDebugDomain
    };
  }
  /** Characters whose presence means the rule is rejected before any parsing. */
  const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#%&=~]/;
  /** Modifiers that disqualify a rule from being treated as a domain rule. */
  const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;

  /**
   * Turn one filter-list rule into a `[hostname, flag]` pair, or `null` when
   * the rule is not a pure domain rule.
   *
   * The rule is first given to `NetworkFilter.parse`; rules that library can
   * not handle are then matched against a series of hand-written patterns.
   *
   * Flags:
   *  - `0`  white, include subdomain
   *  - `-1` white, exact host only
   *  - `1`  black, absolute (exact host only)
   *  - `2`  black, include subdomain
   *
   * @param {string} $line raw rule; trimmed internally
   * @param {import('gorhill-publicsuffixlist').default} gorhill
   * @returns {null | [hostname: string, flag: 0 | 1 | 2 | -1]}
   */
  function parse($line, gorhill) {
    if (
      // doesn't include
      !$line.includes('.') // rule without a dot can not be a domain
      // includes
      || $line.includes('!')
      || $line.includes('?')
      || $line.includes('*')
      || $line.includes('[')
      || $line.includes('(')
      || $line.includes(']')
      || $line.includes(')')
      || $line.includes(',')
      || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test($line)
    ) {
      return null;
    }

    const line = $line.trim();

    /** @example line.length */
    const len = line.length;
    if (len === 0) {
      return null;
    }

    const firstChar = line[0];
    const lastChar = line[len - 1];

    if (
      firstChar === '/'
      // ends with
      || lastChar === '.'
      || lastChar === '-'
      || lastChar === '_'
      // special modifier ($popup, $removeparam, $popunder)
      || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
    ) {
      return null;
    }

    // A path or port without a scheme separator is not a bare host rule.
    if ((line.includes('/') || line.includes(':')) && !line.includes('://')) {
      return null;
    }

    const filter = NetworkFilter.parse(line);
    if (filter) {
      if (
        filter.isElemHide()
        || filter.isGenericHide()
        || filter.isSpecificHide()
        || filter.isRedirect()
        || filter.isRedirectRule()
        || filter.hasDomains()
        || filter.isCSP() // must not be csp rule
        || (!filter.fromAny() && !filter.fromDocument())
      ) {
        // not supported type
        return null;
      }

      if (
        filter.hostname // must have
        && filter.isPlain() // isPlain() === !isRegex()
        && (!filter.isFullRegex())
      ) {
        if (!gorhill.getDomain(filter.hostname)) {
          return null;
        }

        const hostname = normalizeDomain(filter.hostname);
        if (!hostname) {
          return null;
        }

        // `||` anchored rules cover the host and all of its subdomains.
        const isIncludeAllSubDomain = filter.isHostnameAnchor();

        if (filter.isException() || filter.isBadFilter()) {
          return [hostname, isIncludeAllSubDomain ? 0 : -1];
        }

        const _1p = filter.firstParty();
        const _3p = filter.thirdParty();

        if (_1p) {
          // Only rules that apply to both first and third party are kept.
          if (_1p === _3p) {
            return [hostname, isIncludeAllSubDomain ? 2 : 1];
          }
          return null;
        }
        if (_3p) {
          return null;
        }
      }
    }

    /**
     * abnormal filter that can not be parsed by NetworkFilter
     */
    if (line.includes('$third-party') || line.includes('$frame')) {
      /*
       * `.bbelements.com^$third-party`
       * `://o0e.ru^$third-party`
       */
      return null;
    }

    /** @example line.endsWith('^') */
    const linedEndsWithCaret = lastChar === '^';
    /** @example line.endsWith('^|') */
    const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^';
    /** @example line.endsWith('^') || line.endsWith('^|') */
    const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;

    // whitelist (exception)
    if (firstChar === '@' && line[1] === '@') {
      /**
       * cname exceptional filter can not be parsed by NetworkFilter
       *
       * `@@||m.faz.net^$cname`
       *
       * Surge / Clash can't handle CNAME either, so we just ignore them
       */
      if (line.endsWith('$cname')) {
        return null;
      }

      /**
       * Some "malformed" regex-based filters can not be parsed by NetworkFilter
       * "$genericblock`" is also not supported by NetworkFilter
       *
       * `@@||cmechina.net^$genericblock`
       * `@@|ftp.bmp.ovh^|`
       * `@@|adsterra.com^|`
       */
      if (
        (
          // line.startsWith('@@|')
          line[2] === '|'
          // line.startsWith('@@.')
          || line[2] === '.'
          /**
           * line.startsWith('@@://')
           *
           * `@@://googleadservices.com^|`
           * `@@://www.googleadservices.com^|`
           */
          || (line[2] === ':' && line[3] === '/' && line[4] === '/')
        )
        && (
          lineEndsWithCaretOrCaretVerticalBar
          || line.endsWith('$genericblock')
          || line.endsWith('$document')
        )
      ) {
        const _domain = line
          .replace('@@||', '')
          .replace('@@://', '')
          .replace('@@|', '')
          .replace('@@.', '')
          .replace('^|', '')
          .replace('^$genericblock', '')
          .replace('$genericblock', '')
          .replace('^$document', '')
          .replace('$document', '')
          .replaceAll('^', '')
          .trim();

        const domain = normalizeDomain(_domain);
        if (domain) {
          return [domain, 0];
        }

        console.warn(' * [parse-filter E0001] (white) invalid domain:', _domain);
        return null;
      }

      // An exception rule must never reach the branches below: every one of
      // them can only produce a blacklist entry.
      console.warn(' * [parse-filter E0010] can not parse:', line);
      return null;
    }

    if (firstChar === '|') {
      const lineEndswithCname = line.endsWith('$cname');

      if (lineEndsWithCaretOrCaretVerticalBar || lineEndswithCname) {
        /**
         * Some malformed filters can not be parsed by NetworkFilter:
         *
         * `||smetrics.teambeachbody.com^.com^`
         * `||solutions.|pages.indigovision.com^`
         * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
         * `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
         */
        const includeAllSubDomain = line[1] === '|';
        const sliceStart = includeAllSubDomain ? 2 : 1;

        let sliceEnd;
        if (linedEndsWithCaret) {
          sliceEnd = -1; // drop `^`
        } else if (lineEndsWithCaretVerticalBar) {
          sliceEnd = -2; // drop `^|`
        } else {
          // `$cname` modifier: also drop the separator caret in front of it,
          // otherwise the candidate domain keeps a trailing `^`.
          sliceEnd = line.endsWith('^$cname') ? -7 : -6;
        }

        const _domain = line
          .slice(sliceStart, sliceEnd) // we already made sure line starts with "|"
          .trim();

        const domain = normalizeDomain(_domain);
        if (domain) {
          return [domain, includeAllSubDomain ? 2 : 1];
        }

        console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
        return null;
      }
    }

    const lineStartsWithSingleDot = firstChar === '.';
    if (
      lineStartsWithSingleDot
      && lineEndsWithCaretOrCaretVerticalBar
    ) {
      /**
       * `.ay.delivery^`
       * `.m.bookben.com^`
       * `.wap.x4399.com^`
       */
      const _domain = line.slice(
        1, // remove prefix dot
        linedEndsWithCaret ? -1 : -2 // drop `^` or `^|`
      );

      const suffix = gorhill.getPublicSuffix(_domain);
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This exclude domain-like resource like `1.1.4.514.js`
        return null;
      }

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, 2];
      }

      console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
      return null;
    }

    /**
     * `|http://x.o2.pl^`
     * `://mine.torrent.pw^`
     * `://say.ac^`
     */
    if (
      (
        line.startsWith('://')
        || line.startsWith('http://')
        || line.startsWith('https://')
        || line.startsWith('|http://')
        || line.startsWith('|https://')
      )
      && lineEndsWithCaretOrCaretVerticalBar
    ) {
      const _domain = line
        .replace('|https://', '')
        .replace('https://', '')
        .replace('|http://', '')
        .replace('http://', '')
        .replace('://', '')
        .replace('^|', '')
        .replaceAll('^', '')
        .trim();

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, 1];
      }

      console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
      return null;
    }

    /**
     * `_vmind.qqvideo.tc.qq.com^`
     * `arketing.indianadunes.com^`
     * `charlestownwyllie.oaklawnnonantum.com^`
     * `-telemetry.officeapps.live.com^`
     * `-tracker.biliapi.net`
     * `-logging.nextmedia.com`
     * `_social_tracking.js^`
     */
    if (firstChar !== '|' && lastChar === '^') {
      const _domain = line.slice(0, -1);

      const suffix = gorhill.getPublicSuffix(_domain);
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This exclude domain-like resource like `_social_tracking.js^`
        return null;
      }

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, 1];
      }

      console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
      return null;
    }

    if (lineStartsWithSingleDot) {
      /**
       * `.cookielaw.js`
       * `.content_tracking.js`
       * `.ads.css`
       */
      const _domain = line.slice(1);

      const suffix = gorhill.getPublicSuffix(_domain);
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This exclude domain-like resource like `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
        return null;
      }

      const tryNormalizeDomain = normalizeDomain(_domain);
      if (tryNormalizeDomain === _domain) {
        // the entire rule is domain
        return [line, 2];
      }
    } else {
      /**
       * `_prebid.js`
       * `t.yesware.com`
       * `ubmcmm.baidustatic.com`
       * `://www.smfg-card.$document`
       * `portal.librus.pl$$advertisement-module`
       * `@@-ds.metric.gstatic.com^|`
       * `://gom.ge/cookie.js`
       * `://accout-update-smba.jp.$document`
       * `_200x250.png`
       * `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
       */
      const tryNormalizeDomain = normalizeDomain(line);
      if (tryNormalizeDomain === line) {
        // the entire rule is domain
        return [line, 2];
      }
    }

    console.warn(' * [parse-filter E0010] can not parse:', line);
    return null;
  }
  // Public API: one loader per supported list format.
  module.exports.processDomainLists = processDomainLists;
  module.exports.processHosts = processHosts;
  module.exports.processFilterRules = processFilterRules;