parse-filter.ts 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627
  1. // @ts-check
  2. import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
  3. import * as tldts from './cached-tld-parse';
  4. import { fetchRemoteTextAndReadByLine } from './fetch-text-by-line';
  5. import { NetworkFilter } from '@cliqz/adblocker';
  6. import { processLine } from './process-line';
  7. import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
  8. import type { PublicSuffixList } from 'gorhill-publicsuffixlist';
  9. import { isProbablyIpv4 } from './is-fast-ip';
  10. import { traceAsync } from './trace-runner';
  11. import picocolors from 'picocolors';
  12. const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
  13. let foundDebugDomain = false;
  14. const warnOnceUrl = new Set<string>();
  15. const warnOnce = (url: string, isWhite: boolean, ...message: any[]) => {
  16. const key = `${url}${isWhite ? 'white' : 'black'}`;
  17. if (warnOnceUrl.has(key)) {
  18. return;
  19. }
  20. warnOnceUrl.add(key);
  21. console.warn(url, isWhite ? '(white)' : '(black)', ...message);
  22. };
  23. const normalizeDomain = (domain: string) => {
  24. if (!domain) return null;
  25. if (isProbablyIpv4(domain)) return null;
  26. const parsed = tldts.parse2(domain);
  27. if (parsed.isIp) return null;
  28. if (!parsed.isIcann && !parsed.isPrivate) return null;
  29. const h = parsed.hostname;
  30. if (!h) return null;
  31. return h[0] === '.' ? h.slice(1) : h;
  32. };
  33. export async function processDomainLists(domainListsUrl: string, includeAllSubDomain = false) {
  34. const domainSets = new Set<string>();
  35. for await (const line of await fetchRemoteTextAndReadByLine(domainListsUrl)) {
  36. const domainToAdd = processLine(line);
  37. if (!domainToAdd) {
  38. continue;
  39. }
  40. if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
  41. warnOnce(domainListsUrl, false, DEBUG_DOMAIN_TO_FIND);
  42. foundDebugDomain = true;
  43. }
  44. if (includeAllSubDomain) {
  45. domainSets.add(`.${domainToAdd}`);
  46. } else {
  47. domainSets.add(domainToAdd);
  48. }
  49. }
  50. return domainSets;
  51. }
  52. export async function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false) {
  53. return traceAsync(`- processHosts: ${hostsUrl}`, async () => {
  54. const domainSets = new Set<string>();
  55. for await (const l of await fetchRemoteTextAndReadByLine(hostsUrl)) {
  56. const line = processLine(l);
  57. if (!line) {
  58. continue;
  59. }
  60. const [, ...domains] = line.split(' ');
  61. const _domain = domains.join(' ').trim();
  62. if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
  63. warnOnce(hostsUrl, false, DEBUG_DOMAIN_TO_FIND);
  64. foundDebugDomain = true;
  65. }
  66. const domain = skipDomainCheck ? _domain : normalizeDomain(_domain);
  67. if (domain) {
  68. if (includeAllSubDomain) {
  69. domainSets.add(`.${domain}`);
  70. } else {
  71. domainSets.add(domain);
  72. }
  73. }
  74. }
  75. return domainSets;
  76. });
  77. }
  78. // eslint-disable-next-line sukka-ts/no-const-enum -- bun bundler is smart, maybe?
  79. const enum ParseType {
  80. WhiteIncludeSubdomain = 0,
  81. WhiteAbsolute = -1,
  82. BlackAbsolute = 1,
  83. BlackIncludeSubdomain = 2,
  84. ErrorMessage = 10
  85. }
  86. export async function processFilterRules(
  87. filterRulesUrl: string,
  88. fallbackUrls?: readonly string[] | undefined
  89. ): Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }> {
  90. const whitelistDomainSets = new Set<string>();
  91. const blacklistDomainSets = new Set<string>();
  92. const warningMessages: string[] = [];
  93. await traceAsync(`- processFilterRules: ${filterRulesUrl}`, async () => {
  94. const gorhill = await getGorhillPublicSuffixPromise();
  95. /**
  96. * @param {string} line
  97. */
  98. const lineCb = (line: string) => {
  99. const result = parse(line, gorhill);
  100. if (!result) {
  101. return;
  102. }
  103. const flag = result[1];
  104. const hostname = result[0];
  105. if (DEBUG_DOMAIN_TO_FIND) {
  106. if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
  107. warnOnce(filterRulesUrl, flag === ParseType.WhiteIncludeSubdomain || flag === ParseType.WhiteAbsolute, DEBUG_DOMAIN_TO_FIND);
  108. foundDebugDomain = true;
  109. }
  110. }
  111. switch (flag) {
  112. case ParseType.WhiteIncludeSubdomain:
  113. if (hostname[0] !== '.') {
  114. whitelistDomainSets.add(`.${hostname}`);
  115. } else {
  116. whitelistDomainSets.add(hostname);
  117. }
  118. break;
  119. case ParseType.WhiteAbsolute:
  120. whitelistDomainSets.add(hostname);
  121. break;
  122. case ParseType.BlackAbsolute:
  123. blacklistDomainSets.add(hostname);
  124. break;
  125. case ParseType.BlackIncludeSubdomain:
  126. if (hostname[0] !== '.') {
  127. blacklistDomainSets.add(`.${hostname}`);
  128. } else {
  129. blacklistDomainSets.add(hostname);
  130. }
  131. break;
  132. case ParseType.ErrorMessage:
  133. warningMessages.push(hostname);
  134. break;
  135. default:
  136. throw new Error(`Unknown flag: ${flag as any}`);
  137. }
  138. };
  139. if (!fallbackUrls || fallbackUrls.length === 0) {
  140. for await (const line of await fetchRemoteTextAndReadByLine(filterRulesUrl)) {
  141. // don't trim here
  142. lineCb(line);
  143. }
  144. } else {
  145. const filterRules = (await traceAsync(
  146. picocolors.gray(`- download ${filterRulesUrl}`),
  147. () => fetchAssets(filterRulesUrl, fallbackUrls),
  148. picocolors.gray
  149. )).split('\n');
  150. for (let i = 0, len = filterRules.length; i < len; i++) {
  151. lineCb(filterRules[i]);
  152. }
  153. }
  154. });
  155. warningMessages.forEach(msg => {
  156. console.warn(
  157. picocolors.yellow(msg),
  158. picocolors.gray(picocolors.underline(filterRulesUrl))
  159. );
  160. });
  161. return {
  162. white: whitelistDomainSets,
  163. black: blacklistDomainSets,
  164. foundDebugDomain
  165. };
  166. }
  167. const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#%&=~]/;
  168. const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;
  169. function parse($line: string, gorhill: PublicSuffixList): null | [hostname: string, flag: ParseType] {
  170. if (
  171. // doesn't include
  172. !$line.includes('.') // rule with out dot can not be a domain
  173. // includes
  174. || $line.includes('!')
  175. || $line.includes('?')
  176. || $line.includes('*')
  177. || $line.includes('[')
  178. || $line.includes('(')
  179. || $line.includes(']')
  180. || $line.includes(')')
  181. || $line.includes(',')
  182. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test($line)
  183. ) {
  184. return null;
  185. }
  186. const line = $line.trim();
  187. /** @example line.length */
  188. const len = line.length;
  189. if (len === 0) {
  190. return null;
  191. }
  192. const firstChar = line[0];
  193. const lastChar = line[len - 1];
  194. if (
  195. firstChar === '/'
  196. // ends with
  197. || lastChar === '.' // || line.endsWith('.')
  198. || lastChar === '-' // || line.endsWith('-')
  199. || lastChar === '_' // || line.endsWith('_')
  200. // special modifier
  201. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
  202. // || line.includes('$popup')
  203. // || line.includes('$removeparam')
  204. // || line.includes('$popunder')
  205. ) {
  206. return null;
  207. }
  208. if ((line.includes('/') || line.includes(':')) && !line.includes('://')) {
  209. return null;
  210. }
  211. const filter = NetworkFilter.parse(line);
  212. if (filter) {
  213. if (
  214. filter.isElemHide()
  215. || filter.isGenericHide()
  216. || filter.isSpecificHide()
  217. || filter.isRedirect()
  218. || filter.isRedirectRule()
  219. || filter.hasDomains()
  220. || filter.isCSP() // must not be csp rule
  221. || (!filter.fromAny() && !filter.fromDocument())
  222. ) {
  223. // not supported type
  224. return null;
  225. }
  226. if (
  227. filter.hostname // filter.hasHostname() // must have
  228. && filter.isPlain()
  229. // && (!filter.isRegex()) // isPlain() === !isRegex()
  230. && (!filter.isFullRegex())
  231. ) {
  232. const hostname = normalizeDomain(filter.hostname);
  233. if (!hostname) {
  234. return null;
  235. }
  236. // |: filter.isHostnameAnchor(),
  237. // |: filter.isLeftAnchor(),
  238. // |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
  239. const isIncludeAllSubDomain = filter.isHostnameAnchor();
  240. if (filter.isException() || filter.isBadFilter()) {
  241. return [hostname, isIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute];
  242. }
  243. const _1p = filter.firstParty();
  244. const _3p = filter.thirdParty();
  245. if (_1p) {
  246. if (_1p === _3p) {
  247. return [hostname, isIncludeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute];
  248. }
  249. return null;
  250. }
  251. if (_3p) {
  252. return null;
  253. }
  254. }
  255. }
  256. /**
  257. * abnormal filter that can not be parsed by NetworkFilter
  258. */
  259. if (line.includes('$third-party') || line.includes('$frame')) {
  260. /*
  261. * `.bbelements.com^$third-party`
  262. * `://o0e.ru^$third-party`
  263. */
  264. return null;
  265. }
  266. /** @example line.endsWith('^') */
  267. const linedEndsWithCaret = lastChar === '^';
  268. /** @example line.endsWith('^|') */
  269. const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^';
  270. /** @example line.endsWith('^') || line.endsWith('^|') */
  271. const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;
  272. // whitelist (exception)
  273. if (firstChar === '@' && line[1] === '@') {
  274. /**
  275. * cname exceptional filter can not be parsed by NetworkFilter
  276. *
  277. * `@@||m.faz.net^$cname`
  278. *
  279. * Surge / Clash can't handle CNAME either, so we just ignore them
  280. */
  281. if (line.endsWith('$cname')) {
  282. return null;
  283. }
  284. /**
  285. * Some "malformed" regex-based filters can not be parsed by NetworkFilter
  286. * "$genericblock`" is also not supported by NetworkFilter
  287. *
  288. * `@@||cmechina.net^$genericblock`
  289. * `@@|ftp.bmp.ovh^|`
  290. * `@@|adsterra.com^|`
  291. */
  292. if (
  293. (
  294. // line.startsWith('@@|')
  295. line[2] === '|'
  296. // line.startsWith('@@.')
  297. || line[2] === '.'
  298. /**
  299. * line.startsWith('@@://')
  300. *
  301. * `@@://googleadservices.com^|`
  302. * `@@://www.googleadservices.com^|`
  303. */
  304. || (line[2] === ':' && line[3] === '/' && line[4] === '/')
  305. )
  306. && (
  307. lineEndsWithCaretOrCaretVerticalBar
  308. || line.endsWith('$genericblock')
  309. || line.endsWith('$document')
  310. )
  311. ) {
  312. const _domain = line
  313. .replace('@@||', '')
  314. .replace('@@://', '')
  315. .replace('@@|', '')
  316. .replace('@@.', '')
  317. .replace('^|', '')
  318. .replace('^$genericblock', '')
  319. .replace('$genericblock', '')
  320. .replace('^$document', '')
  321. .replace('$document', '')
  322. .replaceAll('^', '')
  323. .trim();
  324. const domain = normalizeDomain(_domain);
  325. if (domain) {
  326. return [domain, ParseType.WhiteIncludeSubdomain];
  327. }
  328. return [
  329. `[parse-filter E0001] (white) invalid domain: ${_domain}`,
  330. ParseType.ErrorMessage
  331. ];
  332. }
  333. }
  334. if (firstChar === '|') {
  335. const lineEndswithCname = line.endsWith('$cname');
  336. if (lineEndsWithCaretOrCaretVerticalBar || lineEndswithCname) {
  337. /**
  338. * Some malformed filters can not be parsed by NetworkFilter:
  339. *
  340. * `||smetrics.teambeachbody.com^.com^`
  341. * `||solutions.|pages.indigovision.com^`
  342. * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
  343. * `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
  344. */
  345. const includeAllSubDomain = line[1] === '|';
  346. const sliceStart = includeAllSubDomain ? 2 : 1;
  347. const sliceEnd = lastChar === '^'
  348. ? -1
  349. : lineEndsWithCaretOrCaretVerticalBar
  350. ? -2
  351. // eslint-disable-next-line sukka/unicorn/no-nested-ternary -- speed
  352. : (lineEndswithCname ? -6 : 0);
  353. const _domain = line
  354. .slice(sliceStart, sliceEnd) // we already make sure line startsWith "|"
  355. .trim();
  356. const domain = normalizeDomain(_domain);
  357. if (domain) {
  358. return [domain, includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute];
  359. }
  360. return [
  361. `[parse-filter E0002] (black) invalid domain: ${_domain}`,
  362. ParseType.ErrorMessage
  363. ];
  364. }
  365. }
  366. const lineStartsWithSingleDot = firstChar === '.';
  367. if (
  368. lineStartsWithSingleDot
  369. && lineEndsWithCaretOrCaretVerticalBar
  370. ) {
  371. /**
  372. * `.ay.delivery^`
  373. * `.m.bookben.com^`
  374. * `.wap.x4399.com^`
  375. */
  376. const _domain = line.slice(
  377. 1, // remove prefix dot
  378. linedEndsWithCaret // replaceAll('^', '')
  379. ? -1
  380. : (lineEndsWithCaretVerticalBar ? -2 : 0) // replace('^|', '')
  381. );
  382. const suffix = gorhill.getPublicSuffix(_domain);
  383. if (!gorhill.suffixInPSL(suffix)) {
  384. // This exclude domain-like resource like `1.1.4.514.js`
  385. return null;
  386. }
  387. const domain = normalizeDomain(_domain);
  388. if (domain) {
  389. return [domain, ParseType.BlackIncludeSubdomain];
  390. }
  391. return [
  392. `[paparse-filter E0003] (black) invalid domain: ${_domain}`,
  393. ParseType.ErrorMessage
  394. ];
  395. }
  396. /**
  397. * `|http://x.o2.pl^`
  398. * `://mine.torrent.pw^`
  399. * `://say.ac^`
  400. */
  401. if (
  402. (
  403. line.startsWith('://')
  404. || line.startsWith('http://')
  405. || line.startsWith('https://')
  406. || line.startsWith('|http://')
  407. || line.startsWith('|https://')
  408. )
  409. && lineEndsWithCaretOrCaretVerticalBar
  410. ) {
  411. const _domain = line
  412. .replace('|https://', '')
  413. .replace('https://', '')
  414. .replace('|http://', '')
  415. .replace('http://', '')
  416. .replace('://', '')
  417. .replace('^|', '')
  418. .replaceAll('^', '')
  419. .trim();
  420. const domain = normalizeDomain(_domain);
  421. if (domain) {
  422. return [domain, ParseType.BlackAbsolute];
  423. }
  424. return [
  425. `[parse-filter E0004] (black) invalid domain: ${_domain}`,
  426. ParseType.ErrorMessage
  427. ];
  428. }
  429. /**
  430. * `_vmind.qqvideo.tc.qq.com^`
  431. * `arketing.indianadunes.com^`
  432. * `charlestownwyllie.oaklawnnonantum.com^`
  433. * `-telemetry.officeapps.live.com^`
  434. * `-tracker.biliapi.net`
  435. * `-logging.nextmedia.com`
  436. * `_social_tracking.js^`
  437. */
  438. if (firstChar !== '|' && lastChar === '^') {
  439. const _domain = line.slice(0, -1);
  440. const suffix = gorhill.getPublicSuffix(_domain);
  441. if (!suffix || !gorhill.suffixInPSL(suffix)) {
  442. // This exclude domain-like resource like `_social_tracking.js^`
  443. return null;
  444. }
  445. const domain = normalizeDomain(_domain);
  446. if (domain) {
  447. return [domain, ParseType.BlackAbsolute];
  448. }
  449. return [
  450. `[parse-filter E0005] (black) invalid domain: ${_domain}`,
  451. ParseType.ErrorMessage
  452. ];
  453. }
  454. if (lineStartsWithSingleDot) {
  455. /**
  456. * `.cookielaw.js`
  457. * `.content_tracking.js`
  458. * `.ads.css`
  459. */
  460. const _domain = line.slice(1);
  461. const suffix = gorhill.getPublicSuffix(_domain);
  462. if (!suffix || !gorhill.suffixInPSL(suffix)) {
  463. // This exclude domain-like resource like `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
  464. return null;
  465. }
  466. const tryNormalizeDomain = normalizeDomain(_domain);
  467. if (tryNormalizeDomain === _domain) {
  468. // the entire rule is domain
  469. return [line, ParseType.BlackIncludeSubdomain];
  470. }
  471. } else {
  472. /**
  473. * `_prebid.js`
  474. * `t.yesware.com`
  475. * `ubmcmm.baidustatic.com`
  476. * `://www.smfg-card.$document`
  477. * `portal.librus.pl$$advertisement-module`
  478. * `@@-ds.metric.gstatic.com^|`
  479. * `://gom.ge/cookie.js`
  480. * `://accout-update-smba.jp.$document`
  481. * `_200x250.png`
  482. * `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
  483. */
  484. const tryNormalizeDomain = normalizeDomain(line);
  485. if (tryNormalizeDomain === line) {
  486. // the entire rule is domain
  487. return [line, ParseType.BlackIncludeSubdomain];
  488. }
  489. }
  490. return [
  491. `[parse-filter E0010] can not parse: ${line}`,
  492. ParseType.ErrorMessage
  493. ];
  494. }
  495. class CustomAbortError extends Error {
  496. public readonly name = 'AbortError';
  497. public readonly digest = 'AbortError';
  498. }
  499. function sleepWithAbort(ms: number, signal: AbortSignal) {
  500. return new Promise<void>((resolve, reject) => {
  501. signal.throwIfAborted();
  502. signal.addEventListener('abort', stop);
  503. Bun.sleep(ms).then(done).catch(doReject);
  504. function done() {
  505. signal.removeEventListener('abort', stop);
  506. resolve();
  507. }
  508. function stop(this: AbortSignal) {
  509. reject(this.reason);
  510. }
  511. function doReject(reason: unknown) {
  512. signal.removeEventListener('abort', stop);
  513. reject(reason);
  514. }
  515. });
  516. }
  517. async function fetchAssets(url: string, fallbackUrls: string[] | readonly string[]) {
  518. const controller = new AbortController();
  519. const fetchMainPromise = fetchWithRetry(url, { signal: controller.signal, ...defaultRequestInit })
  520. .then(r => r.text())
  521. .then(text => {
  522. console.log(picocolors.gray('[fetch finish]'), picocolors.gray(url));
  523. controller.abort();
  524. return text;
  525. });
  526. const createFetchFallbackPromise = async (url: string, index: number) => {
  527. // Most assets can be downloaded within 250ms. To avoid wasting bandwidth, we will wait for 350ms before downloading from the fallback URL.
  528. try {
  529. await sleepWithAbort(200 + (index + 1) * 10, controller.signal);
  530. } catch {
  531. console.log(picocolors.gray('[fetch cancelled early]'), picocolors.gray(url));
  532. throw new CustomAbortError();
  533. }
  534. if (controller.signal.aborted) {
  535. console.log(picocolors.gray('[fetch cancelled]'), picocolors.gray(url));
  536. throw new CustomAbortError();
  537. }
  538. const res = await fetchWithRetry(url, { signal: controller.signal, ...defaultRequestInit });
  539. const text = await res.text();
  540. controller.abort();
  541. return text;
  542. };
  543. return Promise.any([
  544. fetchMainPromise,
  545. ...fallbackUrls.map(createFetchFallbackPromise)
  546. ]).catch(e => {
  547. console.log(`Download Rule for [${url}] failed`);
  548. throw e;
  549. });
  550. }