parse-filter.ts 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574
  1. // @ts-check
  2. import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
  3. import * as tldts from './cached-tld-parse';
  4. import { fetchRemoteTextAndCreateReadlineInterface } from './fetch-remote-text-by-line';
  5. import { NetworkFilter } from '@cliqz/adblocker';
  6. import { processLine } from './process-line';
  7. import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
  8. import type { PublicSuffixList } from 'gorhill-publicsuffixlist';
  9. import { isProbablyIpv4 } from './is-fast-ip';
  10. const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
  11. let foundDebugDomain = false;
  12. const warnOnceUrl = new Set<string>();
  13. const warnOnce = (url: string, isWhite: boolean, ...message: any[]) => {
  14. const key = `${url}${isWhite ? 'white' : 'black'}`;
  15. if (warnOnceUrl.has(key)) {
  16. return;
  17. }
  18. warnOnceUrl.add(key);
  19. console.warn(url, isWhite ? '(white)' : '(black)', ...message);
  20. };
  21. const normalizeDomain = (domain: string) => {
  22. if (!domain) return null;
  23. if (isProbablyIpv4(domain)) return null;
  24. const parsed = tldts.parse2(domain);
  25. if (parsed.isIp) return null;
  26. if (!parsed.isIcann && !parsed.isPrivate) return null;
  27. const h = parsed.hostname;
  28. if (!h) return null;
  29. return h[0] === '.' ? h.slice(1) : h;
  30. };
  31. export async function processDomainLists(domainListsUrl: string | URL, includeAllSubDomain = false) {
  32. if (typeof domainListsUrl === 'string') {
  33. domainListsUrl = new URL(domainListsUrl);
  34. }
  35. const domainSets = new Set<string>();
  36. for await (const line of await fetchRemoteTextAndCreateReadlineInterface(domainListsUrl)) {
  37. const domainToAdd = processLine(line);
  38. if (!domainToAdd) {
  39. continue;
  40. }
  41. if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
  42. warnOnce(domainListsUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  43. foundDebugDomain = true;
  44. }
  45. if (includeAllSubDomain) {
  46. domainSets.add(`.${domainToAdd}`);
  47. } else {
  48. domainSets.add(domainToAdd);
  49. }
  50. }
  51. return domainSets;
  52. }
  53. export async function processHosts(hostsUrl: string | URL, includeAllSubDomain = false, skipDomainCheck = false) {
  54. console.time(`- processHosts: ${hostsUrl.toString()}`);
  55. if (typeof hostsUrl === 'string') {
  56. hostsUrl = new URL(hostsUrl);
  57. }
  58. const domainSets = new Set<string>();
  59. for await (const l of await fetchRemoteTextAndCreateReadlineInterface(hostsUrl)) {
  60. const line = processLine(l);
  61. if (!line) {
  62. continue;
  63. }
  64. const [, ...domains] = line.split(' ');
  65. const _domain = domains.join(' ').trim();
  66. if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
  67. warnOnce(hostsUrl.href, false, DEBUG_DOMAIN_TO_FIND);
  68. foundDebugDomain = true;
  69. }
  70. const domain = skipDomainCheck ? _domain : normalizeDomain(_domain);
  71. if (domain) {
  72. if (includeAllSubDomain) {
  73. domainSets.add(`.${domain}`);
  74. } else {
  75. domainSets.add(domain);
  76. }
  77. }
  78. }
  79. console.timeEnd(` - processHosts: ${hostsUrl.toString()}`);
  80. return domainSets;
  81. }
  82. export async function processFilterRules(
  83. filterRulesUrl: string | URL,
  84. fallbackUrls?: ReadonlyArray<string | URL> | undefined
  85. ): Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }> {
  86. const runStart = Bun.nanoseconds();
  87. const whitelistDomainSets = new Set<string>();
  88. const blacklistDomainSets = new Set<string>();
  89. let downloadTime = 0;
  90. const gorhill = await getGorhillPublicSuffixPromise();
  91. /**
  92. * @param {string} line
  93. */
  94. const lineCb = (line: string) => {
  95. const result = parse(line, gorhill);
  96. if (!result) {
  97. return;
  98. }
  99. const flag = result[1];
  100. const hostname = result[0];
  101. if (DEBUG_DOMAIN_TO_FIND) {
  102. if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
  103. warnOnce(filterRulesUrl.toString(), flag === 0 || flag === -1, DEBUG_DOMAIN_TO_FIND);
  104. foundDebugDomain = true;
  105. console.log({ result, flag });
  106. }
  107. }
  108. switch (flag) {
  109. case 0:
  110. if (hostname[0] !== '.') {
  111. whitelistDomainSets.add(`.${hostname}`);
  112. } else {
  113. whitelistDomainSets.add(hostname);
  114. }
  115. break;
  116. case -1:
  117. whitelistDomainSets.add(hostname);
  118. break;
  119. case 1:
  120. blacklistDomainSets.add(hostname);
  121. break;
  122. case 2:
  123. if (hostname[0] !== '.') {
  124. blacklistDomainSets.add(`.${hostname}`);
  125. } else {
  126. blacklistDomainSets.add(hostname);
  127. }
  128. break;
  129. default:
  130. throw new Error(`Unknown flag: ${flag as any}`);
  131. }
  132. };
  133. if (!fallbackUrls || fallbackUrls.length === 0) {
  134. downloadTime = 0;
  135. let last = Bun.nanoseconds();
  136. for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) {
  137. const now = Bun.nanoseconds();
  138. downloadTime += Bun.nanoseconds() - last;
  139. last = now;
  140. // don't trim here
  141. lineCb(line);
  142. }
  143. } else {
  144. let filterRules;
  145. const downloadStart = Bun.nanoseconds();
  146. try {
  147. const controller = new AbortController();
  148. /** @type string[] */
  149. filterRules = (
  150. await Promise.any(
  151. [filterRulesUrl, ...fallbackUrls].map(async url => {
  152. const r = await fetchWithRetry(url, { signal: controller.signal, ...defaultRequestInit });
  153. const text = await r.text();
  154. controller.abort();
  155. return text;
  156. })
  157. )
  158. ).split('\n');
  159. } catch (e) {
  160. console.log(`Download Rule for [${filterRulesUrl.toString()}] failed`);
  161. throw e;
  162. }
  163. downloadTime = Bun.nanoseconds() - downloadStart;
  164. for (let i = 0, len = filterRules.length; i < len; i++) {
  165. lineCb(filterRules[i]);
  166. }
  167. }
  168. console.log(` ┬ processFilterRules (${filterRulesUrl.toString()}): ${((Bun.nanoseconds() - runStart) / 1e6).toFixed(3)}ms`);
  169. console.log(` └── download time: ${(downloadTime / 1e6).toFixed(3)}ms`);
  170. return {
  171. white: whitelistDomainSets,
  172. black: blacklistDomainSets,
  173. foundDebugDomain
  174. };
  175. }
  176. const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#%&=~]/;
  177. const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;
  178. /**
  179. * 0 white include subdomain, 1 black abosulte, 2 black include subdomain, -1 white
  180. */
  181. function parse($line: string, gorhill: PublicSuffixList): null | [hostname: string, flag: 0 | 1 | 2 | -1] {
  182. if (
  183. // doesn't include
  184. !$line.includes('.') // rule with out dot can not be a domain
  185. // includes
  186. || $line.includes('!')
  187. || $line.includes('?')
  188. || $line.includes('*')
  189. || $line.includes('[')
  190. || $line.includes('(')
  191. || $line.includes(']')
  192. || $line.includes(')')
  193. || $line.includes(',')
  194. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test($line)
  195. ) {
  196. return null;
  197. }
  198. const line = $line.trim();
  199. /** @example line.length */
  200. const len = line.length;
  201. if (len === 0) {
  202. return null;
  203. }
  204. const firstChar = line[0];
  205. const lastChar = line[len - 1];
  206. if (
  207. firstChar === '/'
  208. // ends with
  209. || lastChar === '.' // || line.endsWith('.')
  210. || lastChar === '-' // || line.endsWith('-')
  211. || lastChar === '_' // || line.endsWith('_')
  212. // special modifier
  213. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
  214. // || line.includes('$popup')
  215. // || line.includes('$removeparam')
  216. // || line.includes('$popunder')
  217. ) {
  218. return null;
  219. }
  220. if ((line.includes('/') || line.includes(':')) && !line.includes('://')) {
  221. return null;
  222. }
  223. const filter = NetworkFilter.parse(line);
  224. if (filter) {
  225. if (
  226. filter.isElemHide()
  227. || filter.isGenericHide()
  228. || filter.isSpecificHide()
  229. || filter.isRedirect()
  230. || filter.isRedirectRule()
  231. || filter.hasDomains()
  232. || filter.isCSP() // must not be csp rule
  233. || (!filter.fromAny() && !filter.fromDocument())
  234. ) {
  235. // not supported type
  236. return null;
  237. }
  238. if (
  239. filter.hostname // filter.hasHostname() // must have
  240. && filter.isPlain()
  241. // && (!filter.isRegex()) // isPlain() === !isRegex()
  242. && (!filter.isFullRegex())
  243. ) {
  244. const hostname = normalizeDomain(filter.hostname);
  245. if (!hostname) {
  246. return null;
  247. }
  248. // |: filter.isHostnameAnchor(),
  249. // |: filter.isLeftAnchor(),
  250. // |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
  251. const isIncludeAllSubDomain = filter.isHostnameAnchor();
  252. if (filter.isException() || filter.isBadFilter()) {
  253. return [hostname, isIncludeAllSubDomain ? 0 : -1];
  254. }
  255. const _1p = filter.firstParty();
  256. const _3p = filter.thirdParty();
  257. if (_1p) {
  258. if (_1p === _3p) {
  259. return [hostname, isIncludeAllSubDomain ? 2 : 1];
  260. }
  261. return null;
  262. }
  263. if (_3p) {
  264. return null;
  265. }
  266. }
  267. }
  268. /**
  269. * abnormal filter that can not be parsed by NetworkFilter
  270. */
  271. if (line.includes('$third-party') || line.includes('$frame')) {
  272. /*
  273. * `.bbelements.com^$third-party`
  274. * `://o0e.ru^$third-party`
  275. */
  276. return null;
  277. }
  278. /** @example line.endsWith('^') */
  279. const linedEndsWithCaret = lastChar === '^';
  280. /** @example line.endsWith('^|') */
  281. const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^';
  282. /** @example line.endsWith('^') || line.endsWith('^|') */
  283. const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;
  284. // whitelist (exception)
  285. if (firstChar === '@' && line[1] === '@') {
  286. /**
  287. * cname exceptional filter can not be parsed by NetworkFilter
  288. *
  289. * `@@||m.faz.net^$cname`
  290. *
  291. * Surge / Clash can't handle CNAME either, so we just ignore them
  292. */
  293. if (line.endsWith('$cname')) {
  294. return null;
  295. }
  296. /**
  297. * Some "malformed" regex-based filters can not be parsed by NetworkFilter
  298. * "$genericblock`" is also not supported by NetworkFilter
  299. *
  300. * `@@||cmechina.net^$genericblock`
  301. * `@@|ftp.bmp.ovh^|`
  302. * `@@|adsterra.com^|`
  303. */
  304. if (
  305. (
  306. // line.startsWith('@@|')
  307. line[2] === '|'
  308. // line.startsWith('@@.')
  309. || line[2] === '.'
  310. /**
  311. * line.startsWith('@@://')
  312. *
  313. * `@@://googleadservices.com^|`
  314. * `@@://www.googleadservices.com^|`
  315. */
  316. || (line[2] === ':' && line[3] === '/' && line[4] === '/')
  317. )
  318. && (
  319. lineEndsWithCaretOrCaretVerticalBar
  320. || line.endsWith('$genericblock')
  321. || line.endsWith('$document')
  322. )
  323. ) {
  324. const _domain = line
  325. .replace('@@||', '')
  326. .replace('@@://', '')
  327. .replace('@@|', '')
  328. .replace('@@.', '')
  329. .replace('^|', '')
  330. .replace('^$genericblock', '')
  331. .replace('$genericblock', '')
  332. .replace('^$document', '')
  333. .replace('$document', '')
  334. .replaceAll('^', '')
  335. .trim();
  336. const domain = normalizeDomain(_domain);
  337. if (domain) {
  338. return [domain, 0];
  339. }
  340. console.warn(' * [parse-filter E0001] (white) invalid domain:', _domain);
  341. return null;
  342. }
  343. }
  344. if (firstChar === '|') {
  345. const lineEndswithCname = line.endsWith('$cname');
  346. if (lineEndsWithCaretOrCaretVerticalBar || lineEndswithCname) {
  347. /**
  348. * Some malformed filters can not be parsed by NetworkFilter:
  349. *
  350. * `||smetrics.teambeachbody.com^.com^`
  351. * `||solutions.|pages.indigovision.com^`
  352. * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
  353. * `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
  354. */
  355. const includeAllSubDomain = line[1] === '|';
  356. const sliceStart = includeAllSubDomain ? 2 : 1;
  357. const sliceEnd = lastChar === '^'
  358. ? -1
  359. : lineEndsWithCaretOrCaretVerticalBar
  360. ? -2
  361. // eslint-disable-next-line sukka/unicorn/no-nested-ternary -- speed
  362. : (lineEndswithCname ? -6 : 0);
  363. const _domain = line
  364. .slice(sliceStart, sliceEnd) // we already make sure line startsWith "|"
  365. .trim();
  366. const domain = normalizeDomain(_domain);
  367. if (domain) {
  368. return [domain, includeAllSubDomain ? 2 : 1];
  369. }
  370. console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
  371. return null;
  372. }
  373. }
  374. const lineStartsWithSingleDot = firstChar === '.';
  375. if (
  376. lineStartsWithSingleDot
  377. && lineEndsWithCaretOrCaretVerticalBar
  378. ) {
  379. /**
  380. * `.ay.delivery^`
  381. * `.m.bookben.com^`
  382. * `.wap.x4399.com^`
  383. */
  384. const _domain = line.slice(
  385. 1, // remove prefix dot
  386. linedEndsWithCaret // replaceAll('^', '')
  387. ? -1
  388. : (lineEndsWithCaretVerticalBar ? -2 : 0) // replace('^|', '')
  389. );
  390. const suffix = gorhill.getPublicSuffix(_domain);
  391. if (!gorhill.suffixInPSL(suffix)) {
  392. // This exclude domain-like resource like `1.1.4.514.js`
  393. return null;
  394. }
  395. const domain = normalizeDomain(_domain);
  396. if (domain) {
  397. return [domain, 2];
  398. }
  399. console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
  400. return null;
  401. }
  402. /**
  403. * `|http://x.o2.pl^`
  404. * `://mine.torrent.pw^`
  405. * `://say.ac^`
  406. */
  407. if (
  408. (
  409. line.startsWith('://')
  410. || line.startsWith('http://')
  411. || line.startsWith('https://')
  412. || line.startsWith('|http://')
  413. || line.startsWith('|https://')
  414. )
  415. && lineEndsWithCaretOrCaretVerticalBar
  416. ) {
  417. const _domain = line
  418. .replace('|https://', '')
  419. .replace('https://', '')
  420. .replace('|http://', '')
  421. .replace('http://', '')
  422. .replace('://', '')
  423. .replace('^|', '')
  424. .replaceAll('^', '')
  425. .trim();
  426. const domain = normalizeDomain(_domain);
  427. if (domain) {
  428. return [domain, 1];
  429. }
  430. console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
  431. return null;
  432. }
  433. /**
  434. * `_vmind.qqvideo.tc.qq.com^`
  435. * `arketing.indianadunes.com^`
  436. * `charlestownwyllie.oaklawnnonantum.com^`
  437. * `-telemetry.officeapps.live.com^`
  438. * `-tracker.biliapi.net`
  439. * `-logging.nextmedia.com`
  440. * `_social_tracking.js^`
  441. */
  442. if (firstChar !== '|' && lastChar === '^') {
  443. const _domain = line.slice(0, -1);
  444. const suffix = gorhill.getPublicSuffix(_domain);
  445. if (!suffix || !gorhill.suffixInPSL(suffix)) {
  446. // This exclude domain-like resource like `_social_tracking.js^`
  447. return null;
  448. }
  449. const domain = normalizeDomain(_domain);
  450. if (domain) {
  451. return [domain, 1];
  452. }
  453. console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
  454. return null;
  455. }
  456. if (lineStartsWithSingleDot) {
  457. /**
  458. * `.cookielaw.js`
  459. * `.content_tracking.js`
  460. * `.ads.css`
  461. */
  462. const _domain = line.slice(1);
  463. const suffix = gorhill.getPublicSuffix(_domain);
  464. if (!suffix || !gorhill.suffixInPSL(suffix)) {
  465. // This exclude domain-like resource like `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
  466. return null;
  467. }
  468. const tryNormalizeDomain = normalizeDomain(_domain);
  469. if (tryNormalizeDomain === _domain) {
  470. // the entire rule is domain
  471. return [line, 2];
  472. }
  473. } else {
  474. /**
  475. * `_prebid.js`
  476. * `t.yesware.com`
  477. * `ubmcmm.baidustatic.com`
  478. * `://www.smfg-card.$document`
  479. * `portal.librus.pl$$advertisement-module`
  480. * `@@-ds.metric.gstatic.com^|`
  481. * `://gom.ge/cookie.js`
  482. * `://accout-update-smba.jp.$document`
  483. * `_200x250.png`
  484. * `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
  485. */
  486. const tryNormalizeDomain = normalizeDomain(line);
  487. if (tryNormalizeDomain === line) {
  488. // the entire rule is domain
  489. return [line, 2];
  490. }
  491. }
  492. console.warn(' * [parse-filter E0010] can not parse:', line);
  493. return null;
  494. }