// parse-filter.ts
  1. // @ts-check
  2. import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
  3. import * as tldts from './cached-tld-parse';
  4. import { fetchRemoteTextAndReadByLine } from './fetch-text-by-line';
  5. import { NetworkFilter } from '@cliqz/adblocker';
  6. import { processLine } from './process-line';
  7. import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
  8. import type { PublicSuffixList } from 'gorhill-publicsuffixlist';
  9. import { isProbablyIpv4 } from './is-fast-ip';
  10. import { traceAsync } from './trace-runner';
  11. const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
  12. let foundDebugDomain = false;
  13. const warnOnceUrl = new Set<string>();
  14. const warnOnce = (url: string, isWhite: boolean, ...message: any[]) => {
  15. const key = `${url}${isWhite ? 'white' : 'black'}`;
  16. if (warnOnceUrl.has(key)) {
  17. return;
  18. }
  19. warnOnceUrl.add(key);
  20. console.warn(url, isWhite ? '(white)' : '(black)', ...message);
  21. };
  22. const normalizeDomain = (domain: string) => {
  23. if (!domain) return null;
  24. if (isProbablyIpv4(domain)) return null;
  25. const parsed = tldts.parse2(domain);
  26. if (parsed.isIp) return null;
  27. if (!parsed.isIcann && !parsed.isPrivate) return null;
  28. const h = parsed.hostname;
  29. if (!h) return null;
  30. return h[0] === '.' ? h.slice(1) : h;
  31. };
  32. export async function processDomainLists(domainListsUrl: string | URL, includeAllSubDomain = false) {
  33. if (typeof domainListsUrl === 'string') {
  34. domainListsUrl = new URL(domainListsUrl);
  35. }
  36. const domainSets = new Set<string>();
  37. for await (const line of await fetchRemoteTextAndReadByLine(domainListsUrl)) {
  38. const domainToAdd = processLine(line);
  39. if (!domainToAdd) {
  40. continue;
  41. }
  42. if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
  43. warnOnce(domainListsUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  44. foundDebugDomain = true;
  45. }
  46. if (includeAllSubDomain) {
  47. domainSets.add(`.${domainToAdd}`);
  48. } else {
  49. domainSets.add(domainToAdd);
  50. }
  51. }
  52. return domainSets;
  53. }
  54. export async function processHosts(hostsUrl: string | URL, includeAllSubDomain = false, skipDomainCheck = false) {
  55. return traceAsync(`- processHosts: ${hostsUrl.toString()}`, async () => {
  56. if (typeof hostsUrl === 'string') {
  57. hostsUrl = new URL(hostsUrl);
  58. }
  59. const domainSets = new Set<string>();
  60. for await (const l of await fetchRemoteTextAndReadByLine(hostsUrl)) {
  61. const line = processLine(l);
  62. if (!line) {
  63. continue;
  64. }
  65. const [, ...domains] = line.split(' ');
  66. const _domain = domains.join(' ').trim();
  67. if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
  68. warnOnce(hostsUrl.href, false, DEBUG_DOMAIN_TO_FIND);
  69. foundDebugDomain = true;
  70. }
  71. const domain = skipDomainCheck ? _domain : normalizeDomain(_domain);
  72. if (domain) {
  73. if (includeAllSubDomain) {
  74. domainSets.add(`.${domain}`);
  75. } else {
  76. domainSets.add(domain);
  77. }
  78. }
  79. }
  80. return domainSets;
  81. });
  82. }
  83. export async function processFilterRules(
  84. filterRulesUrl: string | URL,
  85. fallbackUrls?: ReadonlyArray<string | URL> | undefined
  86. ): Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }> {
  87. const whitelistDomainSets = new Set<string>();
  88. const blacklistDomainSets = new Set<string>();
  89. await traceAsync(`- processFilterRules: ${filterRulesUrl.toString()}`, async () => {
  90. const gorhill = await getGorhillPublicSuffixPromise();
  91. /**
  92. * @param {string} line
  93. */
  94. const lineCb = (line: string) => {
  95. const result = parse(line, gorhill);
  96. if (!result) {
  97. return;
  98. }
  99. const flag = result[1];
  100. const hostname = result[0];
  101. if (DEBUG_DOMAIN_TO_FIND) {
  102. if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
  103. warnOnce(filterRulesUrl.toString(), flag === 0 || flag === -1, DEBUG_DOMAIN_TO_FIND);
  104. foundDebugDomain = true;
  105. console.log({ result, flag });
  106. }
  107. }
  108. switch (flag) {
  109. case 0:
  110. if (hostname[0] !== '.') {
  111. whitelistDomainSets.add(`.${hostname}`);
  112. } else {
  113. whitelistDomainSets.add(hostname);
  114. }
  115. break;
  116. case -1:
  117. whitelistDomainSets.add(hostname);
  118. break;
  119. case 1:
  120. blacklistDomainSets.add(hostname);
  121. break;
  122. case 2:
  123. if (hostname[0] !== '.') {
  124. blacklistDomainSets.add(`.${hostname}`);
  125. } else {
  126. blacklistDomainSets.add(hostname);
  127. }
  128. break;
  129. default:
  130. throw new Error(`Unknown flag: ${flag as any}`);
  131. }
  132. };
  133. if (!fallbackUrls || fallbackUrls.length === 0) {
  134. for await (const line of await fetchRemoteTextAndReadByLine(filterRulesUrl)) {
  135. // don't trim here
  136. lineCb(line);
  137. }
  138. } else {
  139. let filterRules;
  140. try {
  141. const controller = new AbortController();
  142. /** @type string[] */
  143. filterRules = (
  144. await Promise.any(
  145. [filterRulesUrl, ...fallbackUrls].map(async url => {
  146. const r = await fetchWithRetry(url, { signal: controller.signal, ...defaultRequestInit });
  147. const text = await r.text();
  148. console.log('[fetch finish]', url.toString());
  149. controller.abort();
  150. return text;
  151. })
  152. )
  153. ).split('\n');
  154. } catch (e) {
  155. console.log(`Download Rule for [${filterRulesUrl.toString()}] failed`);
  156. throw e;
  157. }
  158. for (let i = 0, len = filterRules.length; i < len; i++) {
  159. lineCb(filterRules[i]);
  160. }
  161. }
  162. });
  163. return {
  164. white: whitelistDomainSets,
  165. black: blacklistDomainSets,
  166. foundDebugDomain
  167. };
  168. }
  169. const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#%&=~]/;
  170. const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;
  171. /**
  172. * 0 white include subdomain, 1 black abosulte, 2 black include subdomain, -1 white
  173. */
  174. function parse($line: string, gorhill: PublicSuffixList): null | [hostname: string, flag: 0 | 1 | 2 | -1] {
  175. if (
  176. // doesn't include
  177. !$line.includes('.') // rule with out dot can not be a domain
  178. // includes
  179. || $line.includes('!')
  180. || $line.includes('?')
  181. || $line.includes('*')
  182. || $line.includes('[')
  183. || $line.includes('(')
  184. || $line.includes(']')
  185. || $line.includes(')')
  186. || $line.includes(',')
  187. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test($line)
  188. ) {
  189. return null;
  190. }
  191. const line = $line.trim();
  192. /** @example line.length */
  193. const len = line.length;
  194. if (len === 0) {
  195. return null;
  196. }
  197. const firstChar = line[0];
  198. const lastChar = line[len - 1];
  199. if (
  200. firstChar === '/'
  201. // ends with
  202. || lastChar === '.' // || line.endsWith('.')
  203. || lastChar === '-' // || line.endsWith('-')
  204. || lastChar === '_' // || line.endsWith('_')
  205. // special modifier
  206. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
  207. // || line.includes('$popup')
  208. // || line.includes('$removeparam')
  209. // || line.includes('$popunder')
  210. ) {
  211. return null;
  212. }
  213. if ((line.includes('/') || line.includes(':')) && !line.includes('://')) {
  214. return null;
  215. }
  216. const filter = NetworkFilter.parse(line);
  217. if (filter) {
  218. if (
  219. filter.isElemHide()
  220. || filter.isGenericHide()
  221. || filter.isSpecificHide()
  222. || filter.isRedirect()
  223. || filter.isRedirectRule()
  224. || filter.hasDomains()
  225. || filter.isCSP() // must not be csp rule
  226. || (!filter.fromAny() && !filter.fromDocument())
  227. ) {
  228. // not supported type
  229. return null;
  230. }
  231. if (
  232. filter.hostname // filter.hasHostname() // must have
  233. && filter.isPlain()
  234. // && (!filter.isRegex()) // isPlain() === !isRegex()
  235. && (!filter.isFullRegex())
  236. ) {
  237. const hostname = normalizeDomain(filter.hostname);
  238. if (!hostname) {
  239. return null;
  240. }
  241. // |: filter.isHostnameAnchor(),
  242. // |: filter.isLeftAnchor(),
  243. // |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
  244. const isIncludeAllSubDomain = filter.isHostnameAnchor();
  245. if (filter.isException() || filter.isBadFilter()) {
  246. return [hostname, isIncludeAllSubDomain ? 0 : -1];
  247. }
  248. const _1p = filter.firstParty();
  249. const _3p = filter.thirdParty();
  250. if (_1p) {
  251. if (_1p === _3p) {
  252. return [hostname, isIncludeAllSubDomain ? 2 : 1];
  253. }
  254. return null;
  255. }
  256. if (_3p) {
  257. return null;
  258. }
  259. }
  260. }
  261. /**
  262. * abnormal filter that can not be parsed by NetworkFilter
  263. */
  264. if (line.includes('$third-party') || line.includes('$frame')) {
  265. /*
  266. * `.bbelements.com^$third-party`
  267. * `://o0e.ru^$third-party`
  268. */
  269. return null;
  270. }
  271. /** @example line.endsWith('^') */
  272. const linedEndsWithCaret = lastChar === '^';
  273. /** @example line.endsWith('^|') */
  274. const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^';
  275. /** @example line.endsWith('^') || line.endsWith('^|') */
  276. const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;
  277. // whitelist (exception)
  278. if (firstChar === '@' && line[1] === '@') {
  279. /**
  280. * cname exceptional filter can not be parsed by NetworkFilter
  281. *
  282. * `@@||m.faz.net^$cname`
  283. *
  284. * Surge / Clash can't handle CNAME either, so we just ignore them
  285. */
  286. if (line.endsWith('$cname')) {
  287. return null;
  288. }
  289. /**
  290. * Some "malformed" regex-based filters can not be parsed by NetworkFilter
  291. * "$genericblock`" is also not supported by NetworkFilter
  292. *
  293. * `@@||cmechina.net^$genericblock`
  294. * `@@|ftp.bmp.ovh^|`
  295. * `@@|adsterra.com^|`
  296. */
  297. if (
  298. (
  299. // line.startsWith('@@|')
  300. line[2] === '|'
  301. // line.startsWith('@@.')
  302. || line[2] === '.'
  303. /**
  304. * line.startsWith('@@://')
  305. *
  306. * `@@://googleadservices.com^|`
  307. * `@@://www.googleadservices.com^|`
  308. */
  309. || (line[2] === ':' && line[3] === '/' && line[4] === '/')
  310. )
  311. && (
  312. lineEndsWithCaretOrCaretVerticalBar
  313. || line.endsWith('$genericblock')
  314. || line.endsWith('$document')
  315. )
  316. ) {
  317. const _domain = line
  318. .replace('@@||', '')
  319. .replace('@@://', '')
  320. .replace('@@|', '')
  321. .replace('@@.', '')
  322. .replace('^|', '')
  323. .replace('^$genericblock', '')
  324. .replace('$genericblock', '')
  325. .replace('^$document', '')
  326. .replace('$document', '')
  327. .replaceAll('^', '')
  328. .trim();
  329. const domain = normalizeDomain(_domain);
  330. if (domain) {
  331. return [domain, 0];
  332. }
  333. console.warn(' * [parse-filter E0001] (white) invalid domain:', _domain);
  334. return null;
  335. }
  336. }
  337. if (firstChar === '|') {
  338. const lineEndswithCname = line.endsWith('$cname');
  339. if (lineEndsWithCaretOrCaretVerticalBar || lineEndswithCname) {
  340. /**
  341. * Some malformed filters can not be parsed by NetworkFilter:
  342. *
  343. * `||smetrics.teambeachbody.com^.com^`
  344. * `||solutions.|pages.indigovision.com^`
  345. * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
  346. * `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
  347. */
  348. const includeAllSubDomain = line[1] === '|';
  349. const sliceStart = includeAllSubDomain ? 2 : 1;
  350. const sliceEnd = lastChar === '^'
  351. ? -1
  352. : lineEndsWithCaretOrCaretVerticalBar
  353. ? -2
  354. // eslint-disable-next-line sukka/unicorn/no-nested-ternary -- speed
  355. : (lineEndswithCname ? -6 : 0);
  356. const _domain = line
  357. .slice(sliceStart, sliceEnd) // we already make sure line startsWith "|"
  358. .trim();
  359. const domain = normalizeDomain(_domain);
  360. if (domain) {
  361. return [domain, includeAllSubDomain ? 2 : 1];
  362. }
  363. console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
  364. return null;
  365. }
  366. }
  367. const lineStartsWithSingleDot = firstChar === '.';
  368. if (
  369. lineStartsWithSingleDot
  370. && lineEndsWithCaretOrCaretVerticalBar
  371. ) {
  372. /**
  373. * `.ay.delivery^`
  374. * `.m.bookben.com^`
  375. * `.wap.x4399.com^`
  376. */
  377. const _domain = line.slice(
  378. 1, // remove prefix dot
  379. linedEndsWithCaret // replaceAll('^', '')
  380. ? -1
  381. : (lineEndsWithCaretVerticalBar ? -2 : 0) // replace('^|', '')
  382. );
  383. const suffix = gorhill.getPublicSuffix(_domain);
  384. if (!gorhill.suffixInPSL(suffix)) {
  385. // This exclude domain-like resource like `1.1.4.514.js`
  386. return null;
  387. }
  388. const domain = normalizeDomain(_domain);
  389. if (domain) {
  390. return [domain, 2];
  391. }
  392. console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
  393. return null;
  394. }
  395. /**
  396. * `|http://x.o2.pl^`
  397. * `://mine.torrent.pw^`
  398. * `://say.ac^`
  399. */
  400. if (
  401. (
  402. line.startsWith('://')
  403. || line.startsWith('http://')
  404. || line.startsWith('https://')
  405. || line.startsWith('|http://')
  406. || line.startsWith('|https://')
  407. )
  408. && lineEndsWithCaretOrCaretVerticalBar
  409. ) {
  410. const _domain = line
  411. .replace('|https://', '')
  412. .replace('https://', '')
  413. .replace('|http://', '')
  414. .replace('http://', '')
  415. .replace('://', '')
  416. .replace('^|', '')
  417. .replaceAll('^', '')
  418. .trim();
  419. const domain = normalizeDomain(_domain);
  420. if (domain) {
  421. return [domain, 1];
  422. }
  423. console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
  424. return null;
  425. }
  426. /**
  427. * `_vmind.qqvideo.tc.qq.com^`
  428. * `arketing.indianadunes.com^`
  429. * `charlestownwyllie.oaklawnnonantum.com^`
  430. * `-telemetry.officeapps.live.com^`
  431. * `-tracker.biliapi.net`
  432. * `-logging.nextmedia.com`
  433. * `_social_tracking.js^`
  434. */
  435. if (firstChar !== '|' && lastChar === '^') {
  436. const _domain = line.slice(0, -1);
  437. const suffix = gorhill.getPublicSuffix(_domain);
  438. if (!suffix || !gorhill.suffixInPSL(suffix)) {
  439. // This exclude domain-like resource like `_social_tracking.js^`
  440. return null;
  441. }
  442. const domain = normalizeDomain(_domain);
  443. if (domain) {
  444. return [domain, 1];
  445. }
  446. console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
  447. return null;
  448. }
  449. if (lineStartsWithSingleDot) {
  450. /**
  451. * `.cookielaw.js`
  452. * `.content_tracking.js`
  453. * `.ads.css`
  454. */
  455. const _domain = line.slice(1);
  456. const suffix = gorhill.getPublicSuffix(_domain);
  457. if (!suffix || !gorhill.suffixInPSL(suffix)) {
  458. // This exclude domain-like resource like `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
  459. return null;
  460. }
  461. const tryNormalizeDomain = normalizeDomain(_domain);
  462. if (tryNormalizeDomain === _domain) {
  463. // the entire rule is domain
  464. return [line, 2];
  465. }
  466. } else {
  467. /**
  468. * `_prebid.js`
  469. * `t.yesware.com`
  470. * `ubmcmm.baidustatic.com`
  471. * `://www.smfg-card.$document`
  472. * `portal.librus.pl$$advertisement-module`
  473. * `@@-ds.metric.gstatic.com^|`
  474. * `://gom.ge/cookie.js`
  475. * `://accout-update-smba.jp.$document`
  476. * `_200x250.png`
  477. * `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
  478. */
  479. const tryNormalizeDomain = normalizeDomain(line);
  480. if (tryNormalizeDomain === line) {
  481. // the entire rule is domain
  482. return [line, 2];
  483. }
  484. }
  485. console.warn(' * [parse-filter E0010] can not parse:', line);
  486. return null;
  487. }