parse-filter.ts 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593
  1. // @ts-check
  2. import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
  3. import * as tldts from './cached-tld-parse';
  4. import { fetchRemoteTextAndCreateReadlineInterface } from './fetch-remote-text-by-line';
  5. import { NetworkFilter } from '@cliqz/adblocker';
  6. import { processLine } from './process-line';
  7. import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
  8. import type { PublicSuffixList } from 'gorhill-publicsuffixlist';
  9. const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
  10. let foundDebugDomain = false;
  11. const warnOnceUrl = new Set<string>();
  12. const warnOnce = (url: string, isWhite: boolean, ...message: any[]) => {
  13. const key = `${url}${isWhite ? 'white' : 'black'}`;
  14. if (warnOnceUrl.has(key)) {
  15. return;
  16. }
  17. warnOnceUrl.add(key);
  18. console.warn(url, isWhite ? '(white)' : '(black)', ...message);
  19. };
  20. const normalizeDomain = (domain: string) => {
  21. if (!domain) return null;
  22. const parsed = tldts.parse(domain);
  23. if (parsed.isIp) return null;
  24. if (parsed.isIcann || parsed.isPrivate) {
  25. const h = parsed.hostname;
  26. if (h === null) return null;
  27. return h[0] === '.' ? h.slice(1) : h;
  28. }
  29. return null;
  30. };
  31. export async function processDomainLists(domainListsUrl: string | URL, includeAllSubDomain = false) {
  32. if (typeof domainListsUrl === 'string') {
  33. domainListsUrl = new URL(domainListsUrl);
  34. }
  35. const domainSets = new Set<string>();
  36. for await (const line of await fetchRemoteTextAndCreateReadlineInterface(domainListsUrl)) {
  37. const domainToAdd = processLine(line);
  38. if (!domainToAdd) {
  39. continue;
  40. }
  41. if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
  42. warnOnce(domainListsUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  43. foundDebugDomain = true;
  44. }
  45. if (includeAllSubDomain) {
  46. domainSets.add(`.${domainToAdd}`);
  47. } else {
  48. domainSets.add(domainToAdd);
  49. }
  50. }
  51. return domainSets;
  52. }
  53. export async function processHosts(hostsUrl: string | URL, includeAllSubDomain = false, skipDomainCheck = false) {
  54. console.time(`- processHosts: ${hostsUrl.toString()}`);
  55. if (typeof hostsUrl === 'string') {
  56. hostsUrl = new URL(hostsUrl);
  57. }
  58. const domainSets = new Set<string>();
  59. for await (const l of await fetchRemoteTextAndCreateReadlineInterface(hostsUrl)) {
  60. const line = processLine(l);
  61. if (!line) {
  62. continue;
  63. }
  64. const [, ...domains] = line.split(' ');
  65. const _domain = domains.join(' ').trim();
  66. if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
  67. warnOnce(hostsUrl.href, false, DEBUG_DOMAIN_TO_FIND);
  68. foundDebugDomain = true;
  69. }
  70. const domain = skipDomainCheck ? _domain : normalizeDomain(_domain);
  71. if (domain) {
  72. if (includeAllSubDomain) {
  73. domainSets.add(`.${domain}`);
  74. } else {
  75. domainSets.add(domain);
  76. }
  77. }
  78. }
  79. console.timeEnd(` - processHosts: ${hostsUrl.toString()}`);
  80. return domainSets;
  81. }
  82. export async function processFilterRules(
  83. filterRulesUrl: string | URL,
  84. fallbackUrls?: ReadonlyArray<string | URL> | undefined
  85. ): Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }> {
  86. const runStart = Bun.nanoseconds();
  87. const whitelistDomainSets = new Set<string>();
  88. const blacklistDomainSets = new Set<string>();
  89. /**
  90. * @param {string} domainToBeAddedToBlack
  91. * @param {boolean} isSubDomain
  92. */
  93. const addToBlackList = (domainToBeAddedToBlack: string, isSubDomain: boolean) => {
  94. if (isSubDomain && domainToBeAddedToBlack[0] !== '.') {
  95. blacklistDomainSets.add(`.${domainToBeAddedToBlack}`);
  96. } else {
  97. blacklistDomainSets.add(domainToBeAddedToBlack);
  98. }
  99. };
  100. /**
  101. * @param {string} domainToBeAddedToWhite
  102. * @param {boolean} [isSubDomain]
  103. */
  104. const addToWhiteList = (domainToBeAddedToWhite: string, isSubDomain = true) => {
  105. if (isSubDomain && domainToBeAddedToWhite[0] !== '.') {
  106. whitelistDomainSets.add(`.${domainToBeAddedToWhite}`);
  107. } else {
  108. whitelistDomainSets.add(domainToBeAddedToWhite);
  109. }
  110. };
  111. let downloadTime = 0;
  112. const gorhill = await getGorhillPublicSuffixPromise();
  113. /**
  114. * @param {string} line
  115. */
  116. const lineCb = (line: string) => {
  117. const result = parse(line, gorhill);
  118. if (result) {
  119. const flag = result[1];
  120. const hostname = result[0];
  121. if (DEBUG_DOMAIN_TO_FIND) {
  122. if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
  123. warnOnce(filterRulesUrl.toString(), flag === 0 || flag === -1, DEBUG_DOMAIN_TO_FIND);
  124. foundDebugDomain = true;
  125. console.log({ result, flag });
  126. }
  127. }
  128. switch (flag) {
  129. case 0:
  130. addToWhiteList(hostname, true);
  131. break;
  132. case -1:
  133. addToWhiteList(hostname, false);
  134. break;
  135. case 1:
  136. addToBlackList(hostname, false);
  137. break;
  138. case 2:
  139. addToBlackList(hostname, true);
  140. break;
  141. default:
  142. throw new Error(`Unknown flag: ${flag as any}`);
  143. }
  144. }
  145. };
  146. if (!fallbackUrls || fallbackUrls.length === 0) {
  147. downloadTime = 0;
  148. let last = Bun.nanoseconds();
  149. for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) {
  150. const now = Bun.nanoseconds();
  151. downloadTime += Bun.nanoseconds() - last;
  152. last = now;
  153. // don't trim here
  154. lineCb(line);
  155. }
  156. } else {
  157. let filterRules;
  158. const downloadStart = Bun.nanoseconds();
  159. try {
  160. const controller = new AbortController();
  161. /** @type string[] */
  162. filterRules = (
  163. await Promise.any(
  164. [filterRulesUrl, ...fallbackUrls].map(async url => {
  165. const r = await fetchWithRetry(url, { signal: controller.signal, ...defaultRequestInit });
  166. const text = await r.text();
  167. controller.abort();
  168. return text;
  169. })
  170. )
  171. ).split('\n');
  172. } catch (e) {
  173. console.log(`Download Rule for [${filterRulesUrl.toString()}] failed`);
  174. throw e;
  175. }
  176. downloadTime = Bun.nanoseconds() - downloadStart;
  177. for (let i = 0, len = filterRules.length; i < len; i++) {
  178. lineCb(filterRules[i]);
  179. }
  180. }
  181. console.log(` ┬ processFilterRules (${filterRulesUrl.toString()}): ${((Bun.nanoseconds() - runStart) / 1e6).toFixed(3)}ms`);
  182. console.log(` └── download time: ${(downloadTime / 1e6).toFixed(3)}ms`);
  183. return {
  184. white: whitelistDomainSets,
  185. black: blacklistDomainSets,
  186. foundDebugDomain
  187. };
  188. }
  189. const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#%&=~]/;
  190. const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;
  191. /**
  192. * 0 white include subdomain, 1 black abosulte, 2 black include subdomain, -1 white
  193. */
  194. function parse($line: string, gorhill: PublicSuffixList): null | [hostname: string, flag: 0 | 1 | 2 | -1] {
  195. if (
  196. // doesn't include
  197. !$line.includes('.') // rule with out dot can not be a domain
  198. // includes
  199. || $line.includes('!')
  200. || $line.includes('?')
  201. || $line.includes('*')
  202. || $line.includes('[')
  203. || $line.includes('(')
  204. || $line.includes(']')
  205. || $line.includes(')')
  206. || $line.includes(',')
  207. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test($line)
  208. ) {
  209. return null;
  210. }
  211. const line = $line.trim();
  212. /** @example line.length */
  213. const len = line.length;
  214. if (len === 0) {
  215. return null;
  216. }
  217. const firstChar = line[0];
  218. const lastChar = line[len - 1];
  219. if (
  220. firstChar === '/'
  221. // ends with
  222. || lastChar === '.' // || line.endsWith('.')
  223. || lastChar === '-' // || line.endsWith('-')
  224. || lastChar === '_' // || line.endsWith('_')
  225. // special modifier
  226. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
  227. // || line.includes('$popup')
  228. // || line.includes('$removeparam')
  229. // || line.includes('$popunder')
  230. ) {
  231. return null;
  232. }
  233. if ((line.includes('/') || line.includes(':')) && !line.includes('://')) {
  234. return null;
  235. }
  236. const filter = NetworkFilter.parse(line);
  237. if (filter) {
  238. if (
  239. filter.isElemHide()
  240. || filter.isGenericHide()
  241. || filter.isSpecificHide()
  242. || filter.isRedirect()
  243. || filter.isRedirectRule()
  244. || filter.hasDomains()
  245. || filter.isCSP() // must not be csp rule
  246. || (!filter.fromAny() && !filter.fromDocument())
  247. ) {
  248. // not supported type
  249. return null;
  250. }
  251. if (
  252. filter.hostname // filter.hasHostname() // must have
  253. && filter.isPlain()
  254. // && (!filter.isRegex()) // isPlain() === !isRegex()
  255. && (!filter.isFullRegex())
  256. ) {
  257. if (!gorhill.getDomain(filter.hostname)) {
  258. return null;
  259. }
  260. const hostname = normalizeDomain(filter.hostname);
  261. if (!hostname) {
  262. return null;
  263. }
  264. // console.log({
  265. // '||': filter.isHostnameAnchor(),
  266. // '|': filter.isLeftAnchor(),
  267. // '|https://': !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
  268. // });
  269. const isIncludeAllSubDomain = filter.isHostnameAnchor();
  270. if (filter.isException() || filter.isBadFilter()) {
  271. return [hostname, isIncludeAllSubDomain ? 0 : -1];
  272. }
  273. const _1p = filter.firstParty();
  274. const _3p = filter.thirdParty();
  275. if (_1p) {
  276. if (_1p === _3p) {
  277. return [hostname, isIncludeAllSubDomain ? 2 : 1];
  278. }
  279. return null;
  280. }
  281. if (_3p) {
  282. return null;
  283. }
  284. }
  285. }
  286. /**
  287. * abnormal filter that can not be parsed by NetworkFilter
  288. */
  289. if (line.includes('$third-party') || line.includes('$frame')) {
  290. /*
  291. * `.bbelements.com^$third-party`
  292. * `://o0e.ru^$third-party`
  293. */
  294. return null;
  295. }
  296. /** @example line.endsWith('^') */
  297. const linedEndsWithCaret = lastChar === '^';
  298. /** @example line.endsWith('^|') */
  299. const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^';
  300. /** @example line.endsWith('^') || line.endsWith('^|') */
  301. const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;
  302. // whitelist (exception)
  303. if (firstChar === '@' && line[1] === '@') {
  304. /**
  305. * cname exceptional filter can not be parsed by NetworkFilter
  306. *
  307. * `@@||m.faz.net^$cname`
  308. *
  309. * Surge / Clash can't handle CNAME either, so we just ignore them
  310. */
  311. if (line.endsWith('$cname')) {
  312. return null;
  313. }
  314. /**
  315. * Some "malformed" regex-based filters can not be parsed by NetworkFilter
  316. * "$genericblock`" is also not supported by NetworkFilter
  317. *
  318. * `@@||cmechina.net^$genericblock`
  319. * `@@|ftp.bmp.ovh^|`
  320. * `@@|adsterra.com^|`
  321. */
  322. if (
  323. (
  324. // line.startsWith('@@|')
  325. line[2] === '|'
  326. // line.startsWith('@@.')
  327. || line[2] === '.'
  328. /**
  329. * line.startsWith('@@://')
  330. *
  331. * `@@://googleadservices.com^|`
  332. * `@@://www.googleadservices.com^|`
  333. */
  334. || (line[2] === ':' && line[3] === '/' && line[4] === '/')
  335. )
  336. && (
  337. lineEndsWithCaretOrCaretVerticalBar
  338. || line.endsWith('$genericblock')
  339. || line.endsWith('$document')
  340. )
  341. ) {
  342. const _domain = line
  343. .replace('@@||', '')
  344. .replace('@@://', '')
  345. .replace('@@|', '')
  346. .replace('@@.', '')
  347. .replace('^|', '')
  348. .replace('^$genericblock', '')
  349. .replace('$genericblock', '')
  350. .replace('^$document', '')
  351. .replace('$document', '')
  352. .replaceAll('^', '')
  353. .trim();
  354. const domain = normalizeDomain(_domain);
  355. if (domain) {
  356. return [domain, 0];
  357. }
  358. console.warn(' * [parse-filter E0001] (white) invalid domain:', _domain);
  359. return null;
  360. }
  361. }
  362. if (firstChar === '|') {
  363. const lineEndswithCname = line.endsWith('$cname');
  364. if (lineEndsWithCaretOrCaretVerticalBar || lineEndswithCname) {
  365. /**
  366. * Some malformed filters can not be parsed by NetworkFilter:
  367. *
  368. * `||smetrics.teambeachbody.com^.com^`
  369. * `||solutions.|pages.indigovision.com^`
  370. * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
  371. * `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
  372. */
  373. const includeAllSubDomain = line[1] === '|';
  374. const sliceStart = includeAllSubDomain ? 2 : 1;
  375. const sliceEnd = lastChar === '^'
  376. ? -1
  377. : lineEndsWithCaretOrCaretVerticalBar
  378. ? -2
  379. // eslint-disable-next-line sukka/unicorn/no-nested-ternary -- speed
  380. : (lineEndswithCname ? -6 : 0);
  381. const _domain = line
  382. .slice(sliceStart, sliceEnd) // we already make sure line startsWith "|"
  383. .trim();
  384. const domain = normalizeDomain(_domain);
  385. if (domain) {
  386. return [domain, includeAllSubDomain ? 2 : 1];
  387. }
  388. console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
  389. return null;
  390. }
  391. }
  392. const lineStartsWithSingleDot = firstChar === '.';
  393. if (
  394. lineStartsWithSingleDot
  395. && lineEndsWithCaretOrCaretVerticalBar
  396. ) {
  397. /**
  398. * `.ay.delivery^`
  399. * `.m.bookben.com^`
  400. * `.wap.x4399.com^`
  401. */
  402. const _domain = line.slice(
  403. 1, // remove prefix dot
  404. linedEndsWithCaret // replaceAll('^', '')
  405. ? -1
  406. : (lineEndsWithCaretVerticalBar ? -2 : 0) // replace('^|', '')
  407. );
  408. const suffix = gorhill.getPublicSuffix(_domain);
  409. if (!gorhill.suffixInPSL(suffix)) {
  410. // This exclude domain-like resource like `1.1.4.514.js`
  411. return null;
  412. }
  413. const domain = normalizeDomain(_domain);
  414. if (domain) {
  415. return [domain, 2];
  416. }
  417. console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
  418. return null;
  419. }
  420. /**
  421. * `|http://x.o2.pl^`
  422. * `://mine.torrent.pw^`
  423. * `://say.ac^`
  424. */
  425. if (
  426. (
  427. line.startsWith('://')
  428. || line.startsWith('http://')
  429. || line.startsWith('https://')
  430. || line.startsWith('|http://')
  431. || line.startsWith('|https://')
  432. )
  433. && lineEndsWithCaretOrCaretVerticalBar
  434. ) {
  435. const _domain = line
  436. .replace('|https://', '')
  437. .replace('https://', '')
  438. .replace('|http://', '')
  439. .replace('http://', '')
  440. .replace('://', '')
  441. .replace('^|', '')
  442. .replaceAll('^', '')
  443. .trim();
  444. const domain = normalizeDomain(_domain);
  445. if (domain) {
  446. return [domain, 1];
  447. }
  448. console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
  449. return null;
  450. }
  451. /**
  452. * `_vmind.qqvideo.tc.qq.com^`
  453. * `arketing.indianadunes.com^`
  454. * `charlestownwyllie.oaklawnnonantum.com^`
  455. * `-telemetry.officeapps.live.com^`
  456. * `-tracker.biliapi.net`
  457. * `-logging.nextmedia.com`
  458. * `_social_tracking.js^`
  459. */
  460. if (firstChar !== '|' && lastChar === '^') {
  461. const _domain = line.slice(0, -1);
  462. const suffix = gorhill.getPublicSuffix(_domain);
  463. if (!suffix || !gorhill.suffixInPSL(suffix)) {
  464. // This exclude domain-like resource like `_social_tracking.js^`
  465. return null;
  466. }
  467. const domain = normalizeDomain(_domain);
  468. if (domain) {
  469. return [domain, 1];
  470. }
  471. console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
  472. return null;
  473. }
  474. if (lineStartsWithSingleDot) {
  475. /**
  476. * `.cookielaw.js`
  477. * `.content_tracking.js`
  478. * `.ads.css`
  479. */
  480. const _domain = line.slice(1);
  481. const suffix = gorhill.getPublicSuffix(_domain);
  482. if (!suffix || !gorhill.suffixInPSL(suffix)) {
  483. // This exclude domain-like resource like `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
  484. return null;
  485. }
  486. const tryNormalizeDomain = normalizeDomain(_domain);
  487. if (tryNormalizeDomain === _domain) {
  488. // the entire rule is domain
  489. return [line, 2];
  490. }
  491. } else {
  492. /**
  493. * `_prebid.js`
  494. * `t.yesware.com`
  495. * `ubmcmm.baidustatic.com`
  496. * `://www.smfg-card.$document`
  497. * `portal.librus.pl$$advertisement-module`
  498. * `@@-ds.metric.gstatic.com^|`
  499. * `://gom.ge/cookie.js`
  500. * `://accout-update-smba.jp.$document`
  501. * `_200x250.png`
  502. * `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
  503. */
  504. const tryNormalizeDomain = normalizeDomain(line);
  505. if (tryNormalizeDomain === line) {
  506. // the entire rule is domain
  507. return [line, 2];
  508. }
  509. }
  510. console.warn(' * [parse-filter E0010] can not parse:', line);
  511. return null;
  512. }