parse-filter.ts 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615
  1. // @ts-check
  2. import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
  3. import { fetchRemoteTextAndReadByLine } from './fetch-text-by-line';
  4. import { NetworkFilter } from '@cliqz/adblocker';
  5. import { processLine } from './process-line';
  6. import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
  7. import type { PublicSuffixList } from 'gorhill-publicsuffixlist';
  8. import { traceAsync } from './trace-runner';
  9. import picocolors from 'picocolors';
  10. import { normalizeDomain } from './normalize-domain';
  /**
   * Debug aid: set to a domain (e.g. `example.com`) to have every processor in
   * this module report which upstream list contains it. `null` disables it.
   */
  const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
  /** Becomes `true` as soon as any processor encounters `DEBUG_DOMAIN_TO_FIND`. */
  let foundDebugDomain = false;
  /** Keys (`url` + `white`/`black`) for which a warning has already been printed. */
  const warnOnceUrl = new Set<string>();
  /**
   * Prints a warning through `console.warn`, but only the first time a given
   * (url, white/black) combination is seen; later calls are silently ignored.
   *
   * @param url - Source list the warning refers to.
   * @param isWhite - Whether the hit was in a whitelist (`true`) or blacklist (`false`) rule.
   * @param message - Extra values forwarded to `console.warn`.
   */
  const warnOnce = (url: string, isWhite: boolean, ...message: any[]) => {
    const listKind = isWhite ? 'white' : 'black';
    const dedupeKey = `${url}${listKind}`;
    if (!warnOnceUrl.has(dedupeKey)) {
      warnOnceUrl.add(dedupeKey);
      console.warn(url, `(${listKind})`, ...message);
    }
  };
  /**
   * Downloads a plain domain list and collects its entries into a set.
   *
   * Every remote line goes through `processLine`; lines for which it returns a
   * falsy value are skipped.
   *
   * @param domainListsUrl - URL of the remote domain list.
   * @param includeAllSubDomain - When `true`, each entry is stored with a leading `.`.
   * @returns Whatever `traceAsync` yields for the collecting callback (the set of entries).
   */
  export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false) {
    const collectDomains = async () => {
      const collected = new Set<string>();
      const remoteLines = await fetchRemoteTextAndReadByLine(domainListsUrl);

      for await (const rawLine of remoteLines) {
        const entry = processLine(rawLine);
        if (entry) {
          if (DEBUG_DOMAIN_TO_FIND && entry.includes(DEBUG_DOMAIN_TO_FIND)) {
            warnOnce(domainListsUrl, false, DEBUG_DOMAIN_TO_FIND);
            foundDebugDomain = true;
          }

          if (includeAllSubDomain) {
            collected.add(`.${entry}`);
          } else {
            collected.add(entry);
          }
        }
      }

      return collected;
    };

    return traceAsync(`- processDomainLists: ${domainListsUrl}`, collectDomains);
  }
  /**
   * Downloads a hosts-format file (`<address> <hostname> ...`) and collects the
   * first hostname of every entry into a set.
   *
   * Every remote line goes through `processLine`; lines for which it returns a
   * falsy value are skipped. Fields are separated by runs of whitespace, so
   * entries aligned with several spaces or tabs are handled the same way as
   * entries using a single separator.
   *
   * @param hostsUrl - URL of the remote hosts file.
   * @param includeAllSubDomain - When `true`, each entry is stored with a leading `.`.
   * @param skipDomainCheck - When `true`, the hostname is stored as-is instead of going through `normalizeDomain`.
   * @returns Whatever `traceAsync` yields for the collecting callback (the set of hostnames).
   */
  export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false) {
    // Hoisted so the expression is not re-created for every line.
    const fieldSeparator = /\s+/;

    return traceAsync(`- processHosts: ${hostsUrl}`, async () => {
      const domainSets = new Set<string>();

      for await (const l of await fetchRemoteTextAndReadByLine(hostsUrl)) {
        const line = processLine(l);
        if (!line) {
          continue;
        }

        // Splitting on a single whitespace character would turn
        // `0.0.0.0<TAB><TAB>example.com` into ['0.0.0.0', '', 'example.com'] and
        // drop the entry; trimming first keeps the address at index 0 even for
        // indented lines.
        const [, domain] = line.trim().split(fieldSeparator);
        if (!domain) {
          continue;
        }

        if (DEBUG_DOMAIN_TO_FIND && domain.includes(DEBUG_DOMAIN_TO_FIND)) {
          warnOnce(hostsUrl, false, DEBUG_DOMAIN_TO_FIND);
          foundDebugDomain = true;
        }

        const domainToAdd = skipDomainCheck ? domain : normalizeDomain(domain);
        if (domainToAdd) {
          domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
        }
      }

      console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size));

      return domainSets;
    });
  }
  /**
   * Classification attached to every tuple returned by `parse`.
   * Members are listed in ascending numeric order.
   */
  // eslint-disable-next-line sukka-ts/no-const-enum -- bun bundler is smart, maybe?
  const enum ParseType {
    /** Exception rule; the hostname is whitelisted as written. */
    WhiteAbsolute = -1,
    /** Exception rule; the hostname is whitelisted with a leading `.`. */
    WhiteIncludeSubdomain = 0,
    /** Blocking rule; the hostname is blacklisted as written. */
    BlackAbsolute = 1,
    /** Blocking rule; the hostname is blacklisted with a leading `.`. */
    BlackIncludeSubdomain = 2,
    /** The first tuple element is a warning message, not a hostname. */
    ErrorMessage = 10
  }
  /**
   * Downloads an adblock-style filter list and sorts every rule that `parse`
   * can reduce to a hostname into a whitelist or a blacklist.
   *
   * Without fallback URLs the list is streamed line by line; with fallback URLs
   * the whole text is fetched through `fetchAssets` and split on `\n`. Lines are
   * handed to `parse` untrimmed in both cases. Messages produced by `parse` are
   * collected and printed once the whole list has been processed.
   *
   * @param filterRulesUrl - Primary URL of the filter list.
   * @param fallbackUrls - Optional mirrors; a non-empty list switches to `fetchAssets`.
   * @returns The whitelist and blacklist sets plus the module-wide debug-domain flag.
   */
  export async function processFilterRules(
    filterRulesUrl: string,
    fallbackUrls?: readonly string[] | undefined
  ): Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }> {
    const white = new Set<string>();
    const black = new Set<string>();
    const warnings: string[] = [];

    /** Returns `host` unchanged when it already starts with `.`, otherwise prefixes one. */
    const withLeadingDot = (host: string) => (host[0] === '.' ? host : `.${host}`);

    await traceAsync(`- processFilterRules: ${filterRulesUrl}`, async () => {
      const gorhill = await getGorhillPublicSuffixPromise();

      const handleLine = (rawLine: string) => {
        const parsed = parse(rawLine, gorhill);
        if (!parsed) {
          return;
        }

        const [hostname, flag] = parsed;

        if (DEBUG_DOMAIN_TO_FIND && hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
          const isWhite = flag === ParseType.WhiteIncludeSubdomain || flag === ParseType.WhiteAbsolute;
          warnOnce(filterRulesUrl, isWhite, DEBUG_DOMAIN_TO_FIND);
          foundDebugDomain = true;
        }

        if (flag === ParseType.WhiteIncludeSubdomain) {
          white.add(withLeadingDot(hostname));
        } else if (flag === ParseType.WhiteAbsolute) {
          white.add(hostname);
        } else if (flag === ParseType.BlackAbsolute) {
          black.add(hostname);
        } else if (flag === ParseType.BlackIncludeSubdomain) {
          black.add(withLeadingDot(hostname));
        } else if (flag === ParseType.ErrorMessage) {
          // Here `hostname` carries the warning text, not a domain.
          warnings.push(hostname);
        }
      };

      if (fallbackUrls && fallbackUrls.length > 0) {
        const downloaded = await traceAsync(
          picocolors.gray(`- download ${filterRulesUrl}`),
          () => fetchAssets(filterRulesUrl, fallbackUrls),
          picocolors.gray
        );
        for (const rule of downloaded.split('\n')) {
          handleLine(rule);
        }
      } else {
        for await (const streamedLine of await fetchRemoteTextAndReadByLine(filterRulesUrl)) {
          // don't trim here
          handleLine(streamedLine);
        }
      }
    });

    for (const warning of warnings) {
      console.warn(
        picocolors.yellow(warning),
        picocolors.gray(picocolors.underline(filterRulesUrl))
      );
    }

    console.log(
      picocolors.gray('[process filter]'),
      picocolors.gray(filterRulesUrl),
      picocolors.gray(`white: ${white.size}`),
      picocolors.gray(`black: ${black.size}`)
    );

    return { white, black, foundDebugDomain };
  }
  /** Characters that only appear in cosmetic / parameterised rules, never in a bare domain rule. */
  const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#%&=~]/;
  /** Modifiers that make a rule irrelevant for a domain-level block list. */
  const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;

  /**
   * Tries to reduce one adblock-style filter rule to a single hostname.
   *
   * The rule is first screened with cheap string checks, then handed to
   * `NetworkFilter.parse`. Rules that `NetworkFilter` cannot classify fall
   * through to a series of hand-written patterns for known malformed shapes.
   *
   * @param $line - Raw, untrimmed rule text.
   * @param gorhill - Public suffix list used to tell domains from file-like names (e.g. `.ads.js`).
   * @returns `null` when the rule is ignored; otherwise a tuple of
   *   `[hostname, ParseType]`. When the flag is `ParseType.ErrorMessage` the
   *   first element is a warning message instead of a hostname.
   */
  function parse($line: string, gorhill: PublicSuffixList): null | [hostname: string, flag: ParseType] {
    if (
      // doesn't include
      !$line.includes('.') // rule with out dot can not be a domain
      // includes
      || $line.includes('!')
      || $line.includes('?')
      || $line.includes('*')
      || $line.includes('[')
      || $line.includes('(')
      || $line.includes(']')
      || $line.includes(')')
      || $line.includes(',')
      || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test($line)
    ) {
      return null;
    }

    const line = $line.trim();

    // cached line.length
    const len = line.length;
    if (len === 0) {
      return null;
    }

    const firstChar = line[0];
    const lastChar = line[len - 1];

    if (
      firstChar === '/'
      // ends with
      || lastChar === '.' // || line.endsWith('.')
      || lastChar === '-' // || line.endsWith('-')
      || lastChar === '_' // || line.endsWith('_')
      // special modifier
      || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
      // || line.includes('$popup')
      // || line.includes('$removeparam')
      // || line.includes('$popunder')
    ) {
      return null;
    }

    // A path or port without a scheme separator cannot be reduced to a hostname.
    if ((line.includes('/') || line.includes(':')) && !line.includes('://')) {
      return null;
    }

    const filter = NetworkFilter.parse(line);
    if (filter) {
      if (
        filter.isElemHide()
        || filter.isGenericHide()
        || filter.isSpecificHide()
        || filter.isRedirect()
        || filter.isRedirectRule()
        || filter.hasDomains()
        || filter.isCSP() // must not be csp rule
        || (!filter.fromAny() && !filter.fromDocument())
      ) {
        // not supported type
        return null;
      }

      if (
        filter.hostname // filter.hasHostname() // must have
        && filter.isPlain()
        // && (!filter.isRegex()) // isPlain() === !isRegex()
        && (!filter.isFullRegex())
      ) {
        const hostname = normalizeDomain(filter.hostname);
        if (!hostname) {
          return null;
        }

        // |: filter.isHostnameAnchor(),
        // |: filter.isLeftAnchor(),
        // |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
        const isIncludeAllSubDomain = filter.isHostnameAnchor();

        if (filter.isException() || filter.isBadFilter()) {
          return [hostname, isIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute];
        }

        const _1p = filter.firstParty();
        const _3p = filter.thirdParty();

        if (_1p) {
          // Only rules applying to both first and third party become block entries.
          if (_1p === _3p) {
            return [hostname, isIncludeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute];
          }
          return null;
        }
        if (_3p) {
          return null;
        }
      }
    }

    /**
     * abnormal filter that can not be parsed by NetworkFilter
     */

    if (line.includes('$third-party') || line.includes('$frame')) {
      /*
       * `.bbelements.com^$third-party`
       * `://o0e.ru^$third-party`
       */
      return null;
    }

    // line.endsWith('^')
    const linedEndsWithCaret = lastChar === '^';
    // line.endsWith('^|')
    const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^';
    // line.endsWith('^') || line.endsWith('^|')
    const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;

    // whitelist (exception)
    if (firstChar === '@' && line[1] === '@') {
      /**
       * cname exceptional filter can not be parsed by NetworkFilter
       *
       * `@@||m.faz.net^$cname`
       *
       * Surge / Clash can't handle CNAME either, so we just ignore them
       */
      if (line.endsWith('$cname')) {
        return null;
      }

      /**
       * Some "malformed" regex-based filters can not be parsed by NetworkFilter
       * "$genericblock`" is also not supported by NetworkFilter
       *
       * `@@||cmechina.net^$genericblock`
       * `@@|ftp.bmp.ovh^|`
       * `@@|adsterra.com^|`
       */
      if (
        (
          // line.startsWith('@@|')
          line[2] === '|'
          // line.startsWith('@@.')
          || line[2] === '.'
          /**
           * line.startsWith('@@://')
           *
           * `@@://googleadservices.com^|`
           * `@@://www.googleadservices.com^|`
           */
          || (line[2] === ':' && line[3] === '/' && line[4] === '/')
        )
        && (
          lineEndsWithCaretOrCaretVerticalBar
          || line.endsWith('$genericblock')
          || line.endsWith('$document')
        )
      ) {
        const _domain = line
          .replace('@@||', '')
          .replace('@@://', '')
          .replace('@@|', '')
          .replace('@@.', '')
          .replace('^|', '')
          .replace('^$genericblock', '')
          .replace('$genericblock', '')
          .replace('^$document', '')
          .replace('$document', '')
          .replaceAll('^', '')
          .trim();

        const domain = normalizeDomain(_domain);
        if (domain) {
          return [domain, ParseType.WhiteIncludeSubdomain];
        }

        return [
          `[parse-filter E0001] (white) invalid domain: ${_domain}`,
          ParseType.ErrorMessage
        ];
      }
    }

    if (firstChar === '|') {
      const lineEndswithCname = line.endsWith('$cname');

      if (lineEndsWithCaretOrCaretVerticalBar || lineEndswithCname) {
        /**
         * Some malformed filters can not be parsed by NetworkFilter:
         *
         * `||smetrics.teambeachbody.com^.com^`
         * `||solutions.|pages.indigovision.com^`
         * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
         * `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
         */
        const includeAllSubDomain = line[1] === '|';
        const sliceStart = includeAllSubDomain ? 2 : 1;

        // Exactly one of the three suffixes is present here (guarded by the `if` above).
        let sliceEnd: number;
        if (linedEndsWithCaret) {
          sliceEnd = -1; // strip '^'
        } else if (lineEndsWithCaretVerticalBar) {
          sliceEnd = -2; // strip '^|'
        } else {
          // strip '$cname'
          // NOTE(review): a `^` preceding `$cname` stays in the candidate; confirm
          // that normalizeDomain treats such input as intended.
          sliceEnd = -6;
        }

        const _domain = line
          .slice(sliceStart, sliceEnd) // we already make sure line startsWith "|"
          .trim();

        const domain = normalizeDomain(_domain);
        if (domain) {
          return [domain, includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute];
        }

        return [
          `[parse-filter E0002] (black) invalid domain: ${_domain}`,
          ParseType.ErrorMessage
        ];
      }
    }

    const lineStartsWithSingleDot = firstChar === '.';
    if (
      lineStartsWithSingleDot
      && lineEndsWithCaretOrCaretVerticalBar
    ) {
      /**
       * `.ay.delivery^`
       * `.m.bookben.com^`
       * `.wap.x4399.com^`
       */
      const _domain = line.slice(
        1, // remove prefix dot
        linedEndsWithCaret ? -1 : -2 // strip '^' or '^|' (one of them is guaranteed here)
      );

      const suffix = gorhill.getPublicSuffix(_domain);
      // Same guard as the other suffix checks in this function.
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This exclude domain-like resource like `1.1.4.514.js`
        return null;
      }

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, ParseType.BlackIncludeSubdomain];
      }

      return [
        `[parse-filter E0003] (black) invalid domain: ${_domain}`,
        ParseType.ErrorMessage
      ];
    }

    /**
     * `|http://x.o2.pl^`
     * `://mine.torrent.pw^`
     * `://say.ac^`
     */
    if (
      (
        line.startsWith('://')
        || line.startsWith('http://')
        || line.startsWith('https://')
        || line.startsWith('|http://')
        || line.startsWith('|https://')
      )
      && lineEndsWithCaretOrCaretVerticalBar
    ) {
      const _domain = line
        .replace('|https://', '')
        .replace('https://', '')
        .replace('|http://', '')
        .replace('http://', '')
        .replace('://', '')
        .replace('^|', '')
        .replaceAll('^', '')
        .trim();

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, ParseType.BlackAbsolute];
      }

      return [
        `[parse-filter E0004] (black) invalid domain: ${_domain}`,
        ParseType.ErrorMessage
      ];
    }

    /**
     * `_vmind.qqvideo.tc.qq.com^`
     * `arketing.indianadunes.com^`
     * `charlestownwyllie.oaklawnnonantum.com^`
     * `-telemetry.officeapps.live.com^`
     * `-tracker.biliapi.net`
     * `-logging.nextmedia.com`
     * `_social_tracking.js^`
     */
    if (firstChar !== '|' && lastChar === '^') {
      const _domain = line.slice(0, -1);

      const suffix = gorhill.getPublicSuffix(_domain);
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This exclude domain-like resource like `_social_tracking.js^`
        return null;
      }

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, ParseType.BlackAbsolute];
      }

      return [
        `[parse-filter E0005] (black) invalid domain: ${_domain}`,
        ParseType.ErrorMessage
      ];
    }

    if (lineStartsWithSingleDot) {
      /**
       * `.cookielaw.js`
       * `.content_tracking.js`
       * `.ads.css`
       */
      const _domain = line.slice(1);

      const suffix = gorhill.getPublicSuffix(_domain);
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This exclude domain-like resource like `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
        return null;
      }

      const tryNormalizeDomain = normalizeDomain(_domain);
      if (tryNormalizeDomain === _domain) {
        // the entire rule is domain (returned with its leading dot)
        return [line, ParseType.BlackIncludeSubdomain];
      }
    } else {
      /**
       * `_prebid.js`
       * `t.yesware.com`
       * `ubmcmm.baidustatic.com`
       * `://www.smfg-card.$document`
       * `portal.librus.pl$$advertisement-module`
       * `@@-ds.metric.gstatic.com^|`
       * `://gom.ge/cookie.js`
       * `://accout-update-smba.jp.$document`
       * `_200x250.png`
       * `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
       */
      const tryNormalizeDomain = normalizeDomain(line);
      if (tryNormalizeDomain === line) {
        // the entire rule is domain
        return [line, ParseType.BlackIncludeSubdomain];
      }
    }

    return [
      `[parse-filter E0010] can not parse: ${line}`,
      ParseType.ErrorMessage
    ];
  }
  /**
   * Error thrown by the fallback downloads in `fetchAssets` when they are
   * cancelled. Both `name` and `digest` are fixed to `'AbortError'`.
   */
  class CustomAbortError extends Error {
    public readonly name = 'AbortError' as const;

    public readonly digest = 'AbortError' as const;
  }
  /**
   * Resolves after `ms` milliseconds unless `signal` is aborted first.
   *
   * - If `signal` is already aborted, the returned promise rejects immediately
   *   with the signal's reason.
   * - If `signal` aborts while waiting, the promise rejects with the signal's
   *   reason and the pending timer is cancelled, so it does not linger.
   *
   * @param ms - Delay in milliseconds.
   * @param signal - Abort signal that cancels the wait.
   */
  const sleepWithAbort = (ms: number, signal: AbortSignal) => new Promise<void>((resolve, reject) => {
    // Throwing inside the executor rejects the promise with the abort reason.
    signal.throwIfAborted();

    const timer = setTimeout(onElapsed, ms);
    // `once` lets the runtime drop the listener by itself after an abort.
    signal.addEventListener('abort', onAbort, { once: true });

    function onElapsed() {
      signal.removeEventListener('abort', onAbort);
      resolve();
    }

    function onAbort() {
      // A clearable timer is used so an aborted sleep leaves nothing pending.
      clearTimeout(timer);
      reject(signal.reason);
    }
  });
  /**
   * Downloads `url` as text, racing it against delayed downloads from
   * `fallbackUrls`. The first download to succeed wins and aborts the others
   * through a shared `AbortController`.
   *
   * @param url - Primary URL, requested immediately.
   * @param fallbackUrls - Mirrors; the n-th one (0-based) starts after `300 + (n + 1) * 20` ms.
   * @returns The text of the first successful response.
   * @throws Rethrows the `Promise.any` rejection when every download fails.
   */
  async function fetchAssets(url: string, fallbackUrls: string[] | readonly string[]) {
    const controller = new AbortController();
    const { signal } = controller;

    const downloadPrimary = async () => {
      const response = await fetchWithRetry(url, { signal, ...defaultRequestInit });
      const body = await response.text();
      console.log(picocolors.gray('[fetch finish]'), picocolors.gray(url));
      controller.abort();
      return body;
    };

    const downloadFallback = async (fallbackUrl: string, index: number) => {
      // Most assets can be downloaded within 250ms. To avoid wasting bandwidth, the first
      // fallback waits 320ms and every further fallback an extra 20ms before it starts.
      const delay = 300 + (index + 1) * 20;
      try {
        await sleepWithAbort(delay, signal);
      } catch {
        console.log(picocolors.gray('[fetch cancelled early]'), picocolors.gray(fallbackUrl));
        throw new CustomAbortError();
      }

      if (signal.aborted) {
        console.log(picocolors.gray('[fetch cancelled]'), picocolors.gray(fallbackUrl));
        throw new CustomAbortError();
      }

      const response = await fetchWithRetry(fallbackUrl, { signal, ...defaultRequestInit });
      const body = await response.text();
      controller.abort();
      return body;
    };

    try {
      return await Promise.any([
        downloadPrimary(),
        ...fallbackUrls.map(downloadFallback)
      ]);
    } catch (e) {
      console.log(`Download Rule for [${url}] failed`);
      throw e;
    }
  }
  539. }