// parse-filter.ts
  1. // @ts-check
  2. import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
  3. import * as tldts from './cached-tld-parse';
  4. import { fetchRemoteTextAndCreateReadlineInterface } from './fetch-remote-text-by-line';
  5. import { NetworkFilter } from '@cliqz/adblocker';
  6. import { processLine } from './process-line';
  7. import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
  8. import type { PublicSuffixList } from 'gorhill-publicsuffixlist';
  9. import { isProbablyIpv4 } from './is-fast-ip';
  /**
   * Debugging aid: set to a domain (e.g. `example.com`) to have every list that
   * mentions it reported through `warnOnce`; leave as `null` to disable.
   */
  const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
  /** Set to `true` by the list processors once `DEBUG_DOMAIN_TO_FIND` has been seen. */
  let foundDebugDomain = false;

  /** Keys (`url` + `'white'`/`'black'`) that `warnOnce` has already reported. */
  const warnOnceUrl = new Set<string>();

  /**
   * Emits a `console.warn` at most once per `(url, isWhite)` pair.
   *
   * @param url source list the warning refers to
   * @param isWhite whether the hit was on the whitelist (`true`) or blacklist (`false`) side
   * @param message extra values forwarded verbatim to `console.warn`
   */
  const warnOnce = (url: string, isWhite: boolean, ...message: unknown[]) => {
    const key = `${url}${isWhite ? 'white' : 'black'}`;
    if (warnOnceUrl.has(key)) {
      return;
    }
    warnOnceUrl.add(key);
    console.warn(url, isWhite ? '(white)' : '(black)', ...message);
  };
  /**
   * Validates a candidate hostname and returns it in normalised form.
   *
   * Rejects (returns `null` for) empty input, anything `isProbablyIpv4` or
   * `tldts.parse2` flags as an IP address, names whose suffix is neither ICANN
   * nor private according to `tldts.parse2`, and inputs for which no hostname
   * could be extracted.
   *
   * @param domain raw hostname candidate
   * @returns the parsed hostname with a single leading dot removed, or `null`
   */
  const normalizeDomain = (domain: string): string | null => {
    if (!domain) return null;
    // Cheap pre-check before the full parse
    if (isProbablyIpv4(domain)) return null;

    const parsed = tldts.parse2(domain);
    if (parsed.isIp) return null;
    if (!parsed.isIcann && !parsed.isPrivate) return null;

    const h = parsed.hostname;
    if (!h) return null;

    return h[0] === '.' ? h.slice(1) : h;
  };
  /**
   * Downloads a plain domain list and collects its entries into a set.
   *
   * Each line is passed through `processLine`; lines for which it returns a
   * falsy value are skipped.
   *
   * @param domainListsUrl location of the list (a string is converted with `new URL`)
   * @param includeAllSubDomain when `true`, every entry is stored with a leading `.`
   * @returns the collected entries
   */
  export async function processDomainLists(domainListsUrl: string | URL, includeAllSubDomain = false): Promise<Set<string>> {
    const url = typeof domainListsUrl === 'string' ? new URL(domainListsUrl) : domainListsUrl;

    const domainSets = new Set<string>();

    for await (const line of await fetchRemoteTextAndCreateReadlineInterface(url)) {
      const domainToAdd = processLine(line);
      if (!domainToAdd) {
        continue;
      }

      if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
        warnOnce(url.toString(), false, DEBUG_DOMAIN_TO_FIND);
        foundDebugDomain = true;
      }

      domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
    }

    return domainSets;
  }
  /**
   * Downloads a hosts-style list (`<address> <hostname>` per line) and collects
   * the hostnames into a set.
   *
   * Each line is passed through `processLine`; lines for which it returns a
   * falsy value are skipped. The first space-separated column is discarded and
   * the remainder is treated as the hostname.
   *
   * @param hostsUrl location of the list (a string is converted with `new URL`)
   * @param includeAllSubDomain when `true`, every entry is stored with a leading `.`
   * @param skipDomainCheck when `true`, hostnames are stored as-is instead of going through `normalizeDomain`
   * @returns the collected entries
   */
  export async function processHosts(hostsUrl: string | URL, includeAllSubDomain = false, skipDomainCheck = false): Promise<Set<string>> {
    const url = typeof hostsUrl === 'string' ? new URL(hostsUrl) : hostsUrl;

    // Build the label once: console.timeEnd only reports when it receives exactly
    // the string that was given to console.time.
    const timerLabel = ` - processHosts: ${url.toString()}`;
    console.time(timerLabel);

    const domainSets = new Set<string>();

    for await (const l of await fetchRemoteTextAndCreateReadlineInterface(url)) {
      const line = processLine(l);
      if (!line) {
        continue;
      }

      // Drop the address column, keep whatever follows it
      const [, ...domains] = line.split(' ');
      const _domain = domains.join(' ').trim();

      if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
        warnOnce(url.href, false, DEBUG_DOMAIN_TO_FIND);
        foundDebugDomain = true;
      }

      const domain = skipDomainCheck ? _domain : normalizeDomain(_domain);
      if (domain) {
        domainSets.add(includeAllSubDomain ? `.${domain}` : domain);
      }
    }

    console.timeEnd(timerLabel);

    return domainSets;
  }
  /**
   * Downloads an adblock-style filter list and sorts the domain rules it
   * contains into a whitelist and a blacklist.
   *
   * Without `fallbackUrls` the list is consumed line by line from
   * `fetchRemoteTextAndCreateReadlineInterface`. With `fallbackUrls` the primary
   * URL and all fallbacks are requested concurrently through `fetchWithRetry`;
   * the first body to arrive wins and the remaining requests are aborted.
   *
   * Two timing lines are printed via `console.log`: total run time and the
   * portion spent waiting for the download.
   *
   * @param filterRulesUrl primary location of the filter list
   * @param fallbackUrls optional mirrors raced against the primary URL
   * @returns the two domain sets plus the module-wide `foundDebugDomain` flag
   * @throws rethrows the download error when the primary URL and every fallback fail
   */
  export async function processFilterRules(
    filterRulesUrl: string | URL,
    fallbackUrls?: ReadonlyArray<string | URL> | undefined
  ): Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }> {
    const runStart = Bun.nanoseconds();

    const whitelistDomainSets = new Set<string>();
    const blacklistDomainSets = new Set<string>();

    /** Adds a domain to the blacklist, prefixing `.` when subdomains are included. */
    const addToBlackList = (domainToBeAddedToBlack: string, isSubDomain: boolean) => {
      if (isSubDomain && domainToBeAddedToBlack[0] !== '.') {
        blacklistDomainSets.add(`.${domainToBeAddedToBlack}`);
      } else {
        blacklistDomainSets.add(domainToBeAddedToBlack);
      }
    };

    /** Adds a domain to the whitelist, prefixing `.` when subdomains are included. */
    const addToWhiteList = (domainToBeAddedToWhite: string, isSubDomain = true) => {
      if (isSubDomain && domainToBeAddedToWhite[0] !== '.') {
        whitelistDomainSets.add(`.${domainToBeAddedToWhite}`);
      } else {
        whitelistDomainSets.add(domainToBeAddedToWhite);
      }
    };

    let downloadTime = 0;
    const gorhill = await getGorhillPublicSuffixPromise();

    /** Parses one raw rule line and files the result into the matching set. */
    const lineCb = (line: string) => {
      const result = parse(line, gorhill);
      if (!result) {
        return;
      }

      const [hostname, flag] = result;

      if (DEBUG_DOMAIN_TO_FIND) {
        if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
          warnOnce(filterRulesUrl.toString(), flag === 0 || flag === -1, DEBUG_DOMAIN_TO_FIND);
          foundDebugDomain = true;
          console.log({ result, flag });
        }
      }

      switch (flag) {
        case 0: // white, include subdomain
          addToWhiteList(hostname, true);
          break;
        case -1: // white, exact
          addToWhiteList(hostname, false);
          break;
        case 1: // black, exact
          addToBlackList(hostname, false);
          break;
        case 2: // black, include subdomain
          addToBlackList(hostname, true);
          break;
        default:
          throw new Error(`Unknown flag: ${String(flag)}`);
      }
    };

    if (!fallbackUrls || fallbackUrls.length === 0) {
      let last = Bun.nanoseconds();
      for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) {
        // Time spent waiting for this line to arrive
        downloadTime += Bun.nanoseconds() - last;
        // don't trim here
        lineCb(line);
        // Restart the clock only after parsing, so parse time is not booked as download time
        last = Bun.nanoseconds();
      }
    } else {
      let filterRules: string[];

      const downloadStart = Bun.nanoseconds();
      try {
        const controller = new AbortController();

        const text = await Promise.any(
          [filterRulesUrl, ...fallbackUrls].map(async url => {
            const r = await fetchWithRetry(url, { signal: controller.signal, ...defaultRequestInit });
            const body = await r.text();
            // First complete body wins: cancel the requests still in flight
            controller.abort();
            return body;
          })
        );
        filterRules = text.split('\n');
      } catch (e) {
        console.log(`Download Rule for [${filterRulesUrl.toString()}] failed`);
        throw e;
      }
      downloadTime = Bun.nanoseconds() - downloadStart;

      for (let i = 0, len = filterRules.length; i < len; i++) {
        lineCb(filterRules[i]);
      }
    }

    console.log(` ┬ processFilterRules (${filterRulesUrl.toString()}): ${((Bun.nanoseconds() - runStart) / 1e6).toFixed(3)}ms`);
    console.log(` └── download time: ${(downloadTime / 1e6).toFixed(3)}ms`);

    return {
      white: whitelistDomainSets,
      black: blacklistDomainSets,
      foundDebugDomain
    };
  }
  /** Any of these characters makes `parse` reject the line before further inspection. */
  const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#%&=~]/;
  /** Any of these modifiers (`$popup`, `$removeparam`, `$popunder`) makes `parse` reject the line. */
  const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;

  /**
   * Tries to reduce one adblock-style filter line to a hostname plus a list flag.
   *
   * Flags:
   * - `0`: white, include subdomain
   * - `-1`: white, exact hostname
   * - `1`: black, absolute (exact hostname)
   * - `2`: black, include subdomain
   *
   * The line is first offered to `NetworkFilter.parse`. Lines it cannot turn
   * into a usable result are then matched against a series of hand-written
   * patterns for filters seen in real lists.
   *
   * @param $line raw rule line; it is trimmed internally after the cheap rejects
   * @param gorhill public suffix list, used to tell domains from domain-like file names (`foo.js`)
   * @returns `[hostname, flag]`, or `null` when the line yields no domain rule
   */
  function parse($line: string, gorhill: PublicSuffixList): null | [hostname: string, flag: 0 | 1 | 2 | -1] {
    if (
      // doesn't include
      !$line.includes('.') // rule with out dot can not be a domain
      // includes
      || $line.includes('!')
      || $line.includes('?')
      || $line.includes('*')
      || $line.includes('[')
      || $line.includes('(')
      || $line.includes(']')
      || $line.includes(')')
      || $line.includes(',')
      || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test($line)
    ) {
      return null;
    }

    const line = $line.trim();

    /** @example line.length */
    const len = line.length;
    if (len === 0) {
      return null;
    }

    const firstChar = line[0];
    const lastChar = line[len - 1];

    if (
      firstChar === '/'
      // ends with
      || lastChar === '.' // || line.endsWith('.')
      || lastChar === '-' // || line.endsWith('-')
      || lastChar === '_' // || line.endsWith('_')
      // special modifier: $popup, $removeparam, $popunder
      || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
    ) {
      return null;
    }

    // A path or port is only tolerated when the rule carries a scheme separator
    if ((line.includes('/') || line.includes(':')) && !line.includes('://')) {
      return null;
    }

    const filter = NetworkFilter.parse(line);
    if (filter) {
      if (
        filter.isElemHide()
        || filter.isGenericHide()
        || filter.isSpecificHide()
        || filter.isRedirect()
        || filter.isRedirectRule()
        || filter.hasDomains()
        || filter.isCSP() // must not be csp rule
        || (!filter.fromAny() && !filter.fromDocument())
      ) {
        // not supported type
        return null;
      }

      if (
        filter.hostname // filter.hasHostname() // must have
        && filter.isPlain() // isPlain() === !isRegex()
        && (!filter.isFullRegex())
      ) {
        const hostname = normalizeDomain(filter.hostname);
        if (!hostname) {
          console.log(' * [parse-filter E0000] invalid domain:', filter.hostname);
          return null;
        }

        // ||: filter.isHostnameAnchor()
        // |: filter.isLeftAnchor()
        // |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
        const isIncludeAllSubDomain = filter.isHostnameAnchor();

        if (filter.isException() || filter.isBadFilter()) {
          return [hostname, isIncludeAllSubDomain ? 0 : -1];
        }

        const _1p = filter.firstParty();
        const _3p = filter.thirdParty();

        // Only rules that apply to both first- and third-party requests become block entries
        if (_1p && _3p) {
          return [hostname, isIncludeAllSubDomain ? 2 : 1];
        }
        if (_1p || _3p) {
          return null;
        }
        // neither: fall through to the manual patterns below
      }
    }

    /**
     * abnormal filter that can not be parsed by NetworkFilter
     */

    if (line.includes('$third-party') || line.includes('$frame')) {
      /*
       * `.bbelements.com^$third-party`
       * `://o0e.ru^$third-party`
       */
      return null;
    }

    /** @example line.endsWith('^') */
    const linedEndsWithCaret = lastChar === '^';
    /** @example line.endsWith('^|') */
    const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^';
    /** @example line.endsWith('^') || line.endsWith('^|') */
    const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;

    // whitelist (exception)
    if (firstChar === '@' && line[1] === '@') {
      /**
       * cname exceptional filter can not be parsed by NetworkFilter
       *
       * `@@||m.faz.net^$cname`
       *
       * Surge / Clash can't handle CNAME either, so we just ignore them
       */
      if (line.endsWith('$cname')) {
        return null;
      }

      /**
       * Some "malformed" regex-based filters can not be parsed by NetworkFilter
       * "$genericblock`" is also not supported by NetworkFilter
       *
       * `@@||cmechina.net^$genericblock`
       * `@@|ftp.bmp.ovh^|`
       * `@@|adsterra.com^|`
       */
      if (
        (
          // line.startsWith('@@|')
          line[2] === '|'
          // line.startsWith('@@.')
          || line[2] === '.'
          /**
           * line.startsWith('@@://')
           *
           * `@@://googleadservices.com^|`
           * `@@://www.googleadservices.com^|`
           */
          || (line[2] === ':' && line[3] === '/' && line[4] === '/')
        )
        && (
          lineEndsWithCaretOrCaretVerticalBar
          || line.endsWith('$genericblock')
          || line.endsWith('$document')
        )
      ) {
        const _domain = line
          .replace('@@||', '')
          .replace('@@://', '')
          .replace('@@|', '')
          .replace('@@.', '')
          .replace('^|', '')
          .replace('^$genericblock', '')
          .replace('$genericblock', '')
          .replace('^$document', '')
          .replace('$document', '')
          .replace(/\^/g, '')
          .trim();

        const domain = normalizeDomain(_domain);
        if (domain) {
          return [domain, 0];
        }

        console.warn(' * [parse-filter E0001] (white) invalid domain:', _domain);
        return null;
      }
    }

    if (firstChar === '|') {
      const lineEndswithCname = line.endsWith('$cname');

      if (lineEndsWithCaretOrCaretVerticalBar || lineEndswithCname) {
        /**
         * Some malformed filters can not be parsed by NetworkFilter:
         *
         * `||smetrics.teambeachbody.com^.com^`
         * `||solutions.|pages.indigovision.com^`
         * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
         * `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
         */
        const includeAllSubDomain = line[1] === '|';
        const sliceStart = includeAllSubDomain ? 2 : 1; // we already make sure line startsWith "|"

        let _domain: string;
        if (linedEndsWithCaret) {
          _domain = line.slice(sliceStart, -1); // drop "^"
        } else if (lineEndsWithCaretVerticalBar) {
          _domain = line.slice(sliceStart, -2); // drop "^|"
        } else {
          _domain = line.slice(sliceStart, -6); // drop "$cname"
          // `||host^$cname` still carries the separator after the modifier is
          // removed; without this the hostname would be rejected as invalid
          if (_domain.endsWith('^')) {
            _domain = _domain.slice(0, -1);
          }
        }
        _domain = _domain.trim();

        const domain = normalizeDomain(_domain);
        if (domain) {
          return [domain, includeAllSubDomain ? 2 : 1];
        }

        console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
        return null;
      }
    }

    const lineStartsWithSingleDot = firstChar === '.';
    if (
      lineStartsWithSingleDot
      && lineEndsWithCaretOrCaretVerticalBar
    ) {
      /**
       * `.ay.delivery^`
       * `.m.bookben.com^`
       * `.wap.x4399.com^`
       */
      const _domain = line.slice(
        1, // remove prefix dot
        linedEndsWithCaret ? -1 : -2 // remove "^" or "^|" (one of the two is guaranteed here)
      );

      const suffix = gorhill.getPublicSuffix(_domain);
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This exclude domain-like resource like `1.1.4.514.js`
        return null;
      }

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, 2];
      }

      console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
      return null;
    }

    /**
     * `|http://x.o2.pl^`
     * `://mine.torrent.pw^`
     * `://say.ac^`
     */
    if (
      (
        line.startsWith('://')
        || line.startsWith('http://')
        || line.startsWith('https://')
        || line.startsWith('|http://')
        || line.startsWith('|https://')
      )
      && lineEndsWithCaretOrCaretVerticalBar
    ) {
      const _domain = line
        .replace('|https://', '')
        .replace('https://', '')
        .replace('|http://', '')
        .replace('http://', '')
        .replace('://', '')
        .replace('^|', '')
        .replace(/\^/g, '')
        .trim();

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, 1];
      }

      console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
      return null;
    }

    /**
     * `_vmind.qqvideo.tc.qq.com^`
     * `arketing.indianadunes.com^`
     * `charlestownwyllie.oaklawnnonantum.com^`
     * `-telemetry.officeapps.live.com^`
     * `-tracker.biliapi.net`
     * `-logging.nextmedia.com`
     * `_social_tracking.js^`
     */
    if (firstChar !== '|' && lastChar === '^') {
      const _domain = line.slice(0, -1);

      const suffix = gorhill.getPublicSuffix(_domain);
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This exclude domain-like resource like `_social_tracking.js^`
        return null;
      }

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, 1];
      }

      console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
      return null;
    }

    if (lineStartsWithSingleDot) {
      /**
       * `.cookielaw.js`
       * `.content_tracking.js`
       * `.ads.css`
       */
      const _domain = line.slice(1);

      const suffix = gorhill.getPublicSuffix(_domain);
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This exclude domain-like resource like `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
        return null;
      }

      const tryNormalizeDomain = normalizeDomain(_domain);
      if (tryNormalizeDomain === _domain) {
        // the entire rule is domain
        return [line, 2];
      }
    } else {
      /**
       * `_prebid.js`
       * `t.yesware.com`
       * `ubmcmm.baidustatic.com`
       * `://www.smfg-card.$document`
       * `portal.librus.pl$$advertisement-module`
       * `@@-ds.metric.gstatic.com^|`
       * `://gom.ge/cookie.js`
       * `://accout-update-smba.jp.$document`
       * `_200x250.png`
       * `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
       */
      const tryNormalizeDomain = normalizeDomain(line);
      if (tryNormalizeDomain === line) {
        // the entire rule is domain
        return [line, 2];
      }
    }

    console.warn(' * [parse-filter E0010] can not parse:', line);
    return null;
  }