parse-filter.js 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611
  1. // @ts-check
  2. const { fetchWithRetry } = require('./fetch-retry');
  3. const tldts = require('./cached-tld-parse');
  4. const { fetchRemoteTextAndCreateReadlineInterface } = require('./fetch-remote-text-by-line');
  5. const { NetworkFilter } = require('@cliqz/adblocker');
  6. const { processLine } = require('./process-line');
  7. const { performance } = require('perf_hooks');
  8. const { getGorhillPublicSuffixPromise } = require('./get-gorhill-publicsuffix');
  9. const DEBUG_DOMAIN_TO_FIND = null; // example.com | null
  10. let foundDebugDomain = false;
  11. const warnOnceUrl = new Set();
  12. const warnOnce = (url, isWhite, ...message) => {
  13. const key = `${url}${isWhite ? 'white' : 'black'}`;
  14. if (warnOnceUrl.has(key)) {
  15. return;
  16. }
  17. warnOnceUrl.add(key);
  18. console.warn(url, isWhite ? '(white)' : '(black)', ...message);
  19. };
  20. /**
  21. * @param {string} domain
  22. */
  23. const normalizeDomain = (domain) => {
  24. if (!domain) return null;
  25. const parsed = tldts.parse(domain);
  26. if (parsed.isIp) return null;
  27. if (parsed.isIcann || parsed.isPrivate) {
  28. const h = parsed.hostname;
  29. if (h === null) return null;
  30. return h[0] === '.' ? h.slice(1) : h;
  31. }
  32. return null;
  33. };
  34. /**
  35. * @param {string | URL} domainListsUrl
  36. */
  37. async function processDomainLists(domainListsUrl) {
  38. if (typeof domainListsUrl === 'string') {
  39. domainListsUrl = new URL(domainListsUrl);
  40. }
  41. /** @type Set<string> */
  42. const domainSets = new Set();
  43. for await (const line of await fetchRemoteTextAndCreateReadlineInterface(domainListsUrl)) {
  44. if (line[0] === '!') {
  45. continue;
  46. }
  47. const domainToAdd = processLine(line);
  48. if (!domainToAdd) {
  49. continue;
  50. }
  51. if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
  52. warnOnce(domainListsUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  53. foundDebugDomain = true;
  54. }
  55. domainSets.add(domainToAdd);
  56. }
  57. return domainSets;
  58. }
  59. /**
  60. * @param {string | URL} hostsUrl
  61. */
  62. async function processHosts(hostsUrl, includeAllSubDomain = false) {
  63. console.time(` - processHosts: ${hostsUrl}`);
  64. if (typeof hostsUrl === 'string') {
  65. hostsUrl = new URL(hostsUrl);
  66. }
  67. /** @type Set<string> */
  68. const domainSets = new Set();
  69. for await (const l of await fetchRemoteTextAndCreateReadlineInterface(hostsUrl)) {
  70. const line = processLine(l);
  71. if (!line) {
  72. continue;
  73. }
  74. const [, ...domains] = line.split(' ');
  75. const _domain = domains.join(' ').trim();
  76. if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
  77. warnOnce(hostsUrl.href, false, DEBUG_DOMAIN_TO_FIND);
  78. foundDebugDomain = true;
  79. }
  80. const domain = normalizeDomain(_domain);
  81. if (domain) {
  82. if (includeAllSubDomain) {
  83. domainSets.add(`.${domain}`);
  84. } else {
  85. domainSets.add(domain);
  86. }
  87. }
  88. }
  89. console.timeEnd(` - processHosts: ${hostsUrl}`);
  90. return domainSets;
  91. }
  92. /**
  93. * @param {string | URL} filterRulesUrl
  94. * @param {readonly (string | URL)[] | undefined} [fallbackUrls]
  95. * @returns {Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }>}
  96. */
  97. async function processFilterRules(filterRulesUrl, fallbackUrls) {
  98. const runStart = performance.now();
  99. /** @type Set<string> */
  100. const whitelistDomainSets = new Set();
  101. /** @type Set<string> */
  102. const blacklistDomainSets = new Set();
  103. /**
  104. * @param {string} domainToBeAddedToBlack
  105. * @param {boolean} isSubDomain
  106. */
  107. const addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
  108. if (isSubDomain && domainToBeAddedToBlack[0] !== '.') {
  109. blacklistDomainSets.add(`.${domainToBeAddedToBlack}`);
  110. } else {
  111. blacklistDomainSets.add(domainToBeAddedToBlack);
  112. }
  113. };
  114. /**
  115. * @param {string} domainToBeAddedToWhite
  116. * @param {boolean} [isSubDomain]
  117. */
  118. const addToWhiteList = (domainToBeAddedToWhite, isSubDomain = true) => {
  119. if (isSubDomain && domainToBeAddedToWhite[0] !== '.') {
  120. whitelistDomainSets.add(`.${domainToBeAddedToWhite}`);
  121. } else {
  122. whitelistDomainSets.add(domainToBeAddedToWhite);
  123. }
  124. };
  125. let downloadTime = 0;
  126. const gorhill = await getGorhillPublicSuffixPromise();
  127. const lineCb = (line) => {
  128. const result = parse(line, gorhill);
  129. if (result) {
  130. const flag = result[1];
  131. const hostname = result[0];
  132. if (DEBUG_DOMAIN_TO_FIND) {
  133. if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
  134. warnOnce(filterRulesUrl.toString(), flag === 0 || flag === -1, DEBUG_DOMAIN_TO_FIND);
  135. foundDebugDomain = true;
  136. console.log({ result, flag });
  137. }
  138. }
  139. switch (flag) {
  140. case 0:
  141. addToWhiteList(hostname, true);
  142. break;
  143. case -1:
  144. addToWhiteList(hostname, false);
  145. break;
  146. case 1:
  147. addToBlackList(hostname, false);
  148. break;
  149. case 2:
  150. addToBlackList(hostname, true);
  151. break;
  152. default:
  153. throw new Error(`Unknown flag: ${flag}`);
  154. }
  155. }
  156. };
  157. if (!fallbackUrls || fallbackUrls.length === 0) {
  158. downloadTime = 0;
  159. let last = performance.now();
  160. for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) {
  161. const now = performance.now();
  162. downloadTime += performance.now() - last;
  163. last = now;
  164. // don't trim here
  165. lineCb(line);
  166. }
  167. } else {
  168. let filterRules;
  169. const downloadStart = performance.now();
  170. try {
  171. const controller = new AbortController();
  172. /** @type string[] */
  173. filterRules = (
  174. await Promise.any(
  175. [filterRulesUrl, ...(fallbackUrls || [])].map(async url => {
  176. const r = await fetchWithRetry(url, { signal: controller.signal });
  177. const text = await r.text();
  178. controller.abort();
  179. return text;
  180. })
  181. )
  182. ).split('\n');
  183. } catch (e) {
  184. console.log(`Download Rule for [${filterRulesUrl}] failed`);
  185. throw e;
  186. }
  187. downloadTime = performance.now() - downloadStart;
  188. for (let i = 0, len = filterRules.length; i < len; i++) {
  189. lineCb(filterRules[i]);
  190. }
  191. }
  192. console.log(` ┬ processFilterRules (${filterRulesUrl}): ${(performance.now() - runStart).toFixed(3)}ms`);
  193. console.log(` └── download time: ${downloadTime.toFixed(3)}ms`);
  194. return {
  195. white: whitelistDomainSets,
  196. black: blacklistDomainSets,
  197. foundDebugDomain
  198. };
  199. }
  200. const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#%&=~]/;
  201. const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;
  202. /**
  203. * @param {string} $line
  204. * @param {import('gorhill-publicsuffixlist').default} gorhill
  205. * @returns {null | [hostname: string, flag: 0 | 1 | 2 | -1]} - 0 white include subdomain, 1 black abosulte, 2 black include subdomain, -1 white
  206. */
  207. function parse($line, gorhill) {
  208. if (
  209. // doesn't include
  210. !$line.includes('.') // rule with out dot can not be a domain
  211. // includes
  212. || $line.includes('!')
  213. || $line.includes('?')
  214. || $line.includes('*')
  215. || $line.includes('[')
  216. || $line.includes('(')
  217. || $line.includes(']')
  218. || $line.includes(')')
  219. || $line.includes(',')
  220. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test($line)
  221. ) {
  222. return null;
  223. }
  224. const line = $line.trim();
  225. /** @example line.length */
  226. const len = line.length;
  227. if (len === 0) {
  228. return null;
  229. }
  230. const firstChar = line[0];
  231. const lastChar = line[len - 1];
  232. if (
  233. firstChar === '/'
  234. // ends with
  235. || lastChar === '.' // || line.endsWith('.')
  236. || lastChar === '-' // || line.endsWith('-')
  237. || lastChar === '_' // || line.endsWith('_')
  238. // special modifier
  239. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
  240. // || line.includes('$popup')
  241. // || line.includes('$removeparam')
  242. // || line.includes('$popunder')
  243. ) {
  244. return null;
  245. }
  246. if ((line.includes('/') || line.includes(':')) && !line.includes('://')) {
  247. return null;
  248. }
  249. const filter = NetworkFilter.parse(line);
  250. if (filter) {
  251. if (
  252. filter.isElemHide()
  253. || filter.isGenericHide()
  254. || filter.isSpecificHide()
  255. || filter.isRedirect()
  256. || filter.isRedirectRule()
  257. || filter.hasDomains()
  258. || filter.isCSP() // must not be csp rule
  259. || (!filter.fromAny() && !filter.fromDocument())
  260. ) {
  261. // not supported type
  262. return null;
  263. }
  264. if (
  265. filter.hostname // filter.hasHostname() // must have
  266. && filter.isPlain()
  267. // && (!filter.isRegex()) // isPlain() === !isRegex()
  268. && (!filter.isFullRegex())
  269. ) {
  270. if (!gorhill.getDomain(filter.hostname)) {
  271. return null;
  272. }
  273. const hostname = normalizeDomain(filter.hostname);
  274. if (!hostname) {
  275. return null;
  276. }
  277. // console.log({
  278. // '||': filter.isHostnameAnchor(),
  279. // '|': filter.isLeftAnchor(),
  280. // '|https://': !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
  281. // });
  282. const isIncludeAllSubDomain = filter.isHostnameAnchor();
  283. if (filter.isException() || filter.isBadFilter()) {
  284. return [hostname, isIncludeAllSubDomain ? 0 : -1];
  285. }
  286. const _1p = filter.firstParty();
  287. const _3p = filter.thirdParty();
  288. if (_1p) {
  289. if (_1p === _3p) {
  290. return [hostname, isIncludeAllSubDomain ? 2 : 1];
  291. }
  292. return null;
  293. }
  294. if (_3p) {
  295. return null;
  296. }
  297. }
  298. }
  299. /**
  300. * abnormal filter that can not be parsed by NetworkFilter
  301. */
  302. if (line.includes('$third-party') || line.includes('$frame')) {
  303. /*
  304. * `.bbelements.com^$third-party`
  305. * `://o0e.ru^$third-party`
  306. */
  307. return null;
  308. }
  309. /** @example line.endsWith('^') */
  310. const linedEndsWithCaret = lastChar === '^';
  311. /** @example line.endsWith('^|') */
  312. const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^';
  313. /** @example line.endsWith('^') || line.endsWith('^|') */
  314. const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;
  315. // whitelist (exception)
  316. if (firstChar === '@' && line[1] === '@') {
  317. /**
  318. * cname exceptional filter can not be parsed by NetworkFilter
  319. *
  320. * `@@||m.faz.net^$cname`
  321. *
  322. * Surge / Clash can't handle CNAME either, so we just ignore them
  323. */
  324. if (line.endsWith('$cname')) {
  325. return null;
  326. }
  327. /**
  328. * Some "malformed" regex-based filters can not be parsed by NetworkFilter
  329. * "$genericblock`" is also not supported by NetworkFilter
  330. *
  331. * `@@||cmechina.net^$genericblock`
  332. * `@@|ftp.bmp.ovh^|`
  333. * `@@|adsterra.com^|`
  334. */
  335. if (
  336. (
  337. // line.startsWith('@@|')
  338. line[2] === '|'
  339. // line.startsWith('@@.')
  340. || line[2] === '.'
  341. /**
  342. * line.startsWith('@@://')
  343. *
  344. * `@@://googleadservices.com^|`
  345. * `@@://www.googleadservices.com^|`
  346. */
  347. || (line[2] === ':' && line[3] === '/' && line[4] === '/')
  348. )
  349. && (
  350. lineEndsWithCaretOrCaretVerticalBar
  351. || line.endsWith('$genericblock')
  352. || line.endsWith('$document')
  353. )
  354. ) {
  355. const _domain = line
  356. .replace('@@||', '')
  357. .replace('@@://', '')
  358. .replace('@@|', '')
  359. .replace('@@.', '')
  360. .replace('^|', '')
  361. .replace('^$genericblock', '')
  362. .replace('$genericblock', '')
  363. .replace('^$document', '')
  364. .replace('$document', '')
  365. .replaceAll('^', '')
  366. .trim();
  367. const domain = normalizeDomain(_domain);
  368. if (domain) {
  369. return [domain, 0];
  370. }
  371. console.warn(' * [parse-filter E0001] (black) invalid domain:', _domain);
  372. return null;
  373. }
  374. }
  375. if (firstChar === '|') {
  376. const lineEndswithCname = line.endsWith('$cname');
  377. if (lineEndsWithCaretOrCaretVerticalBar || lineEndswithCname) {
  378. /**
  379. * Some malformed filters can not be parsed by NetworkFilter:
  380. *
  381. * `||smetrics.teambeachbody.com^.com^`
  382. * `||solutions.|pages.indigovision.com^`
  383. * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
  384. * `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
  385. */
  386. const includeAllSubDomain = line[1] === '|';
  387. const sliceStart = includeAllSubDomain ? 2 : 1;
  388. const sliceEnd = lastChar === '^'
  389. ? -1
  390. : lineEndsWithCaretOrCaretVerticalBar
  391. ? -2
  392. // eslint-disable-next-line sukka/unicorn/no-nested-ternary -- speed
  393. : (lineEndswithCname ? -6 : 0);
  394. const _domain = line
  395. .slice(sliceStart, sliceEnd) // we already make sure line startsWith "|"
  396. .trim();
  397. const domain = normalizeDomain(_domain);
  398. if (domain) {
  399. return [domain, includeAllSubDomain ? 2 : 1];
  400. }
  401. console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
  402. return null;
  403. }
  404. }
  405. const lineStartsWithSingleDot = firstChar === '.';
  406. if (
  407. lineStartsWithSingleDot
  408. && lineEndsWithCaretOrCaretVerticalBar
  409. ) {
  410. /**
  411. * `.ay.delivery^`
  412. * `.m.bookben.com^`
  413. * `.wap.x4399.com^`
  414. */
  415. const _domain = line.slice(
  416. 1, // remove prefix dot
  417. linedEndsWithCaret // replaceAll('^', '')
  418. ? -1
  419. : (lineEndsWithCaretVerticalBar ? -2 : 0) // replace('^|', '')
  420. );
  421. const suffix = gorhill.getPublicSuffix(_domain);
  422. if (!gorhill.suffixInPSL(suffix)) {
  423. // This exclude domain-like resource like `1.1.4.514.js`
  424. return null;
  425. }
  426. const domain = normalizeDomain(_domain);
  427. if (domain) {
  428. return [domain, 2];
  429. }
  430. console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
  431. return null;
  432. }
  433. /**
  434. * `|http://x.o2.pl^`
  435. * `://mine.torrent.pw^`
  436. * `://say.ac^`
  437. */
  438. if (
  439. (
  440. line.startsWith('://')
  441. || line.startsWith('http://')
  442. || line.startsWith('https://')
  443. || line.startsWith('|http://')
  444. || line.startsWith('|https://')
  445. )
  446. && lineEndsWithCaretOrCaretVerticalBar
  447. ) {
  448. const _domain = line
  449. .replace('|https://', '')
  450. .replace('https://', '')
  451. .replace('|http://', '')
  452. .replace('http://', '')
  453. .replace('://', '')
  454. .replace('^|', '')
  455. .replaceAll('^', '')
  456. .trim();
  457. const domain = normalizeDomain(_domain);
  458. if (domain) {
  459. return [domain, 1];
  460. }
  461. console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
  462. return null;
  463. }
  464. /**
  465. * `_vmind.qqvideo.tc.qq.com^`
  466. * `arketing.indianadunes.com^`
  467. * `charlestownwyllie.oaklawnnonantum.com^`
  468. * `-telemetry.officeapps.live.com^`
  469. * `-tracker.biliapi.net`
  470. * `-logging.nextmedia.com`
  471. * `_social_tracking.js^`
  472. */
  473. if (firstChar !== '|' && lastChar === '^') {
  474. const _domain = line.slice(0, -1);
  475. const suffix = gorhill.getPublicSuffix(_domain);
  476. if (!suffix || !gorhill.suffixInPSL(suffix)) {
  477. // This exclude domain-like resource like `_social_tracking.js^`
  478. return null;
  479. }
  480. const domain = normalizeDomain(_domain);
  481. if (domain) {
  482. return [domain, 1];
  483. }
  484. console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
  485. return null;
  486. }
  487. if (lineStartsWithSingleDot) {
  488. /**
  489. * `.cookielaw.js`
  490. * `.content_tracking.js`
  491. * `.ads.css`
  492. */
  493. const _domain = line.slice(1);
  494. const suffix = gorhill.getPublicSuffix(_domain);
  495. if (!suffix || !gorhill.suffixInPSL(suffix)) {
  496. // This exclude domain-like resource like `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
  497. return null;
  498. }
  499. const tryNormalizeDomain = normalizeDomain(_domain);
  500. if (tryNormalizeDomain === _domain) {
  501. // the entire rule is domain
  502. return [line, 2];
  503. }
  504. } else {
  505. /**
  506. * `_prebid.js`
  507. * `t.yesware.com`
  508. * `ubmcmm.baidustatic.com`
  509. * `://www.smfg-card.$document`
  510. * `portal.librus.pl$$advertisement-module`
  511. * `@@-ds.metric.gstatic.com^|`
  512. * `://gom.ge/cookie.js`
  513. * `://accout-update-smba.jp.$document`
  514. * `_200x250.png`
  515. * `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
  516. */
  517. const tryNormalizeDomain = normalizeDomain(line);
  518. if (tryNormalizeDomain === line) {
  519. // the entire rule is domain
  520. return [line, 2];
  521. }
  522. }
  523. console.warn(' * [parse-filter E0010] can not parse:', line);
  524. return null;
  525. }
  526. module.exports.processDomainLists = processDomainLists;
  527. module.exports.processHosts = processHosts;
  528. module.exports.processFilterRules = processFilterRules;