parse-filter.ts 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590
  1. // @ts-check
  2. import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
  3. import * as tldts from './cached-tld-parse';
  4. import { fetchRemoteTextAndCreateReadlineInterface } from './fetch-remote-text-by-line';
  5. import { NetworkFilter } from '@cliqz/adblocker';
  6. import { processLine } from './process-line';
  7. import { performance } from 'perf_hooks';
  8. import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
  9. import type { PublicSuffixList } from 'gorhill-publicsuffixlist';
  /**
   * Domain to trace through every processor in this file, or `null` to disable tracing.
   * When a processed entry contains this string, the source URL is reported via `warnOnce`.
   *
   * The annotation is deliberate: without it the type is inferred as the literal `null`,
   * every guarded use narrows to `never`, and the tracing branches are not type-checked.
   */
  const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
  /** Set to `true` by the processors in this file once the debug domain has been seen. */
  let foundDebugDomain = false;
  /** Keys (`url` followed by `white` / `black`) for which a warning was already printed. */
  const warnOnceUrl = new Set<string>();

  /**
   * Print a warning at most once per `(url, isWhite)` pair.
   *
   * @param url - source list URL; printed first and used as part of the de-duplication key
   * @param isWhite - `true` tags the warning `(white)`, `false` tags it `(black)`
   * @param message - extra values forwarded verbatim to `console.warn`
   */
  const warnOnce = (url: string, isWhite: boolean, ...message: unknown[]) => {
    const key = `${url}${isWhite ? 'white' : 'black'}`;
    if (warnOnceUrl.has(key)) {
      return;
    }
    warnOnceUrl.add(key);
    console.warn(url, isWhite ? '(white)' : '(black)', ...message);
  };
  /**
   * Reduce a candidate string to a bare hostname, or reject it.
   *
   * Returns `null` for an empty input, for anything `tldts` classifies as an IP,
   * for anything whose suffix is neither ICANN nor private, and when no hostname
   * could be extracted. A single leading dot on the hostname is removed.
   */
  const normalizeDomain = (domain: string) => {
    if (!domain) {
      return null;
    }

    const { isIp, isIcann, isPrivate, hostname } = tldts.parse(domain);
    const hasKnownSuffix = isIcann || isPrivate;

    if (isIp || !hasKnownSuffix || hostname === null) {
      return null;
    }

    return hostname[0] === '.' ? hostname.slice(1) : hostname;
  };
  /**
   * Download a plain domain list and collect every non-empty processed line.
   *
   * @param domainListsUrl - list location; a string is converted with `new URL`
   * @returns the set of entries produced by `processLine` for each remote line
   */
  export async function processDomainLists(domainListsUrl: string | URL) {
    const listUrl = typeof domainListsUrl === 'string'
      ? new URL(domainListsUrl)
      : domainListsUrl;

    const collected = new Set<string>();
    const remoteLines = await fetchRemoteTextAndCreateReadlineInterface(listUrl);

    for await (const rawLine of remoteLines) {
      const entry = processLine(rawLine);
      if (entry) {
        // Debug tracing: report the list that contributed the traced domain.
        if (DEBUG_DOMAIN_TO_FIND && entry.includes(DEBUG_DOMAIN_TO_FIND)) {
          warnOnce(listUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
          foundDebugDomain = true;
        }
        collected.add(entry);
      }
    }

    return collected;
  }
  /**
   * Download a hosts-style list and collect the domains it names.
   *
   * For every processed line the first space-separated field is dropped
   * (presumably the IP column of a hosts file) and the remainder is used as the domain.
   *
   * @param hostsUrl - list location; a string is converted with `new URL`
   * @param includeAllSubDomain - when `true`, each domain is stored with a leading `.`
   * @param skipDomainCheck - when `true`, the domain is stored without going through `normalizeDomain`
   * @returns the set of collected domains
   */
  export async function processHosts(hostsUrl: string | URL, includeAllSubDomain = false, skipDomainCheck = false) {
    // `console.timeEnd` only stops a timer whose label matches exactly, and a string URL
    // may serialize differently once wrapped in `URL`, so the label is built exactly once.
    const timerLabel = `- processHosts: ${hostsUrl}`;
    console.time(timerLabel);

    const url = typeof hostsUrl === 'string' ? new URL(hostsUrl) : hostsUrl;
    const domainSets = new Set<string>();

    for await (const l of await fetchRemoteTextAndCreateReadlineInterface(url)) {
      const line = processLine(l);
      if (!line) {
        continue;
      }

      // Drop the first field, keep everything after it.
      const [, ...domains] = line.split(' ');
      const _domain = domains.join(' ').trim();

      if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
        warnOnce(url.href, false, DEBUG_DOMAIN_TO_FIND);
        foundDebugDomain = true;
      }

      const domain = skipDomainCheck ? _domain : normalizeDomain(_domain);
      if (domain) {
        domainSets.add(includeAllSubDomain ? `.${domain}` : domain);
      }
    }

    console.timeEnd(timerLabel);
    return domainSets;
  }
  /**
   * Download an adblock-style filter list and split its domain rules into a
   * whitelist and a blacklist.
   *
   * Without `fallbackUrls` the list is streamed line by line. With fallbacks, the
   * primary URL and every fallback are fetched concurrently, the first response to
   * finish is used, and the remaining requests are aborted.
   *
   * @param filterRulesUrl - primary location of the filter list
   * @param fallbackUrls - optional mirrors raced against the primary location
   * @returns both domain sets plus the module-level `foundDebugDomain` flag
   * @throws rethrows the download error when every racing request fails, and throws
   *   `Error` when `parse` yields a flag outside `0 | 1 | 2 | -1`
   */
  export async function processFilterRules(
    filterRulesUrl: string | URL,
    fallbackUrls?: readonly (string | URL)[] | undefined
  ): Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }> {
    const runStart = performance.now();

    const whitelistDomainSets = new Set<string>();
    const blacklistDomainSets = new Set<string>();

    /** Add to the blacklist; subdomain rules are stored with exactly one leading dot. */
    const addToBlackList = (domainToBeAddedToBlack: string, isSubDomain: boolean) => {
      if (isSubDomain && domainToBeAddedToBlack[0] !== '.') {
        blacklistDomainSets.add(`.${domainToBeAddedToBlack}`);
      } else {
        blacklistDomainSets.add(domainToBeAddedToBlack);
      }
    };

    /** Add to the whitelist; subdomain rules (the default) are stored with exactly one leading dot. */
    const addToWhiteList = (domainToBeAddedToWhite: string, isSubDomain = true) => {
      if (isSubDomain && domainToBeAddedToWhite[0] !== '.') {
        whitelistDomainSets.add(`.${domainToBeAddedToWhite}`);
      } else {
        whitelistDomainSets.add(domainToBeAddedToWhite);
      }
    };

    let downloadTime = 0;
    const gorhill = await getGorhillPublicSuffixPromise();

    /** Parse one rule line and file the result into the matching set. */
    const lineCb = (line: string) => {
      const result = parse(line, gorhill);
      if (!result) {
        return;
      }

      const [hostname, flag] = result;

      if (DEBUG_DOMAIN_TO_FIND) {
        if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
          warnOnce(filterRulesUrl.toString(), flag === 0 || flag === -1, DEBUG_DOMAIN_TO_FIND);
          foundDebugDomain = true;
          console.log({ result, flag });
        }
      }

      switch (flag) {
        case 0: // white, include subdomain
          addToWhiteList(hostname, true);
          break;
        case -1: // white, exact
          addToWhiteList(hostname, false);
          break;
        case 1: // black, exact
          addToBlackList(hostname, false);
          break;
        case 2: // black, include subdomain
          addToBlackList(hostname, true);
          break;
        default:
          throw new Error(`Unknown flag: ${flag}`);
      }
    };

    if (!fallbackUrls || fallbackUrls.length === 0) {
      // Streaming: only the time spent waiting for the next line counts as download time.
      // The clock is restarted after `lineCb` so parsing time is not attributed to the download.
      let last = performance.now();
      for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) {
        downloadTime += performance.now() - last;
        // don't trim here
        lineCb(line);
        last = performance.now();
      }
    } else {
      let filterRules: string[];
      const downloadStart = performance.now();
      try {
        // One controller for all requests: the first body to arrive cancels the rest.
        const controller = new AbortController();
        filterRules = (
          await Promise.any(
            [filterRulesUrl, ...fallbackUrls].map(async url => {
              const r = await fetchWithRetry(url, { signal: controller.signal, ...defaultRequestInit });
              const text = await r.text();
              controller.abort();
              return text;
            })
          )
        ).split('\n');
      } catch (e) {
        console.log(`Download Rule for [${filterRulesUrl}] failed`);
        throw e;
      }
      downloadTime = performance.now() - downloadStart;

      for (let i = 0, len = filterRules.length; i < len; i++) {
        lineCb(filterRules[i]);
      }
    }

    console.log(` ┬ processFilterRules (${filterRulesUrl}): ${(performance.now() - runStart).toFixed(3)}ms`);
    console.log(` └── download time: ${downloadTime.toFixed(3)}ms`);

    return {
      white: whitelistDomainSets,
      black: blacklistDomainSets,
      foundDebugDomain
    };
  }
  /** `parse` rejects any raw line containing one of `#`, `%`, `&`, `=` or `~`. */
  const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#%&=~]/;
  /**
   * `parse` rejects any line carrying the `$popup`, `$removeparam` or `$popunder` modifier.
   * Only `.test()` is used, so the alternation is a non-capturing group.
   */
  const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /\$(?:popup|removeparam|popunder)/;
  /**
   * Try to turn one adblock-style filter line into a `[hostname, flag]` pair.
   *
   * Flag values:
   * - `0`: white, include subdomain
   * - `-1`: white, exact hostname
   * - `1`: black, absolute (exact hostname)
   * - `2`: black, include subdomain
   *
   * The line is first offered to `NetworkFilter.parse`; rules it cannot handle fall
   * through to a series of hand-written shape checks. The order of those checks matters.
   *
   * @param $line - the raw rule line (it is trimmed inside this function)
   * @param gorhill - public suffix list used to tell domains apart from file-name-like rules
   * @returns the pair, or `null` when the line is not usable as a plain domain rule
   */
  function parse($line: string, gorhill: PublicSuffixList): null | [hostname: string, flag: 0 | 1 | 2 | -1] {
    // Cheap rejection on the raw line before any allocation.
    if (
      // doesn't include
      !$line.includes('.') // rule without a dot can not be a domain
      // includes
      || $line.includes('!')
      || $line.includes('?')
      || $line.includes('*')
      || $line.includes('[')
      || $line.includes('(')
      || $line.includes(']')
      || $line.includes(')')
      || $line.includes(',')
      || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test($line)
    ) {
      return null;
    }

    const line = $line.trim();

    /** @example line.length */
    const len = line.length;
    if (len === 0) {
      return null;
    }

    const firstChar = line[0];
    const lastChar = line[len - 1];

    if (
      firstChar === '/'
      // ends with
      || lastChar === '.' // || line.endsWith('.')
      || lastChar === '-' // || line.endsWith('-')
      || lastChar === '_' // || line.endsWith('_')
      // special modifier
      || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
    ) {
      return null;
    }

    // A `/` or `:` is only tolerated when the line also contains `://`.
    if ((line.includes('/') || line.includes(':')) && !line.includes('://')) {
      return null;
    }

    const filter = NetworkFilter.parse(line);
    if (filter) {
      if (
        filter.isElemHide()
        || filter.isGenericHide()
        || filter.isSpecificHide()
        || filter.isRedirect()
        || filter.isRedirectRule()
        || filter.hasDomains()
        || filter.isCSP() // must not be csp rule
        || (!filter.fromAny() && !filter.fromDocument())
      ) {
        // not supported type
        return null;
      }

      if (
        filter.hostname // filter.hasHostname() // must have
        && filter.isPlain()
        // && (!filter.isRegex()) // isPlain() === !isRegex()
        && (!filter.isFullRegex())
      ) {
        if (!gorhill.getDomain(filter.hostname)) {
          return null;
        }
        const hostname = normalizeDomain(filter.hostname);
        if (!hostname) {
          return null;
        }

        // `||` anchored rules cover subdomains as well.
        const isIncludeAllSubDomain = filter.isHostnameAnchor();

        if (filter.isException() || filter.isBadFilter()) {
          return [hostname, isIncludeAllSubDomain ? 0 : -1];
        }

        const _1p = filter.firstParty();
        const _3p = filter.thirdParty();

        if (_1p) {
          // Only rules that apply to first- and third-party alike are kept.
          if (_1p === _3p) {
            return [hostname, isIncludeAllSubDomain ? 2 : 1];
          }
          return null;
        }
        if (_3p) {
          return null;
        }
      }
    }

    /**
     * abnormal filter that can not be parsed by NetworkFilter
     */

    if (line.includes('$third-party') || line.includes('$frame')) {
      /*
       * `.bbelements.com^$third-party`
       * `://o0e.ru^$third-party`
       */
      return null;
    }

    /** @example line.endsWith('^') */
    const linedEndsWithCaret = lastChar === '^';
    /** @example line.endsWith('^|') */
    const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^';
    /** @example line.endsWith('^') || line.endsWith('^|') */
    const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;

    // whitelist (exception)
    if (firstChar === '@' && line[1] === '@') {
      /**
       * cname exceptional filter can not be parsed by NetworkFilter
       *
       * `@@||m.faz.net^$cname`
       *
       * Surge / Clash can't handle CNAME either, so we just ignore them
       */
      if (line.endsWith('$cname')) {
        return null;
      }

      /**
       * Some "malformed" regex-based filters can not be parsed by NetworkFilter
       * "$genericblock`" is also not supported by NetworkFilter
       *
       * `@@||cmechina.net^$genericblock`
       * `@@|ftp.bmp.ovh^|`
       * `@@|adsterra.com^|`
       */
      if (
        (
          // line.startsWith('@@|')
          line[2] === '|'
          // line.startsWith('@@.')
          || line[2] === '.'
          /**
           * line.startsWith('@@://')
           *
           * `@@://googleadservices.com^|`
           * `@@://www.googleadservices.com^|`
           */
          || (line[2] === ':' && line[3] === '/' && line[4] === '/')
        )
        && (
          lineEndsWithCaretOrCaretVerticalBar
          || line.endsWith('$genericblock')
          || line.endsWith('$document')
        )
      ) {
        const _domain = line
          .replace('@@||', '')
          .replace('@@://', '')
          .replace('@@|', '')
          .replace('@@.', '')
          .replace('^|', '')
          .replace('^$genericblock', '')
          .replace('$genericblock', '')
          .replace('^$document', '')
          .replace('$document', '')
          .replaceAll('^', '')
          .trim();

        const domain = normalizeDomain(_domain);
        if (domain) {
          return [domain, 0];
        }

        console.warn(' * [parse-filter E0001] (white) invalid domain:', _domain);
        return null;
      }
    }

    if (firstChar === '|') {
      const lineEndswithCname = line.endsWith('$cname');

      if (lineEndsWithCaretOrCaretVerticalBar || lineEndswithCname) {
        /**
         * Some malformed filters can not be parsed by NetworkFilter:
         *
         * `||smetrics.teambeachbody.com^.com^`
         * `||solutions.|pages.indigovision.com^`
         * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
         * `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
         */
        const includeAllSubDomain = line[1] === '|';

        const sliceStart = includeAllSubDomain ? 2 : 1;

        let sliceEnd: number;
        if (linedEndsWithCaret) {
          sliceEnd = -1; // strip `^`
        } else if (lineEndsWithCaretVerticalBar) {
          sliceEnd = -2; // strip `^|`
        } else {
          // Only `$cname` can have led here. Strip `^$cname` (7 chars) when the separator
          // is present, otherwise `$cname` (6 chars); leaving the `^` behind would make
          // the hostname invalid and the rule would be dropped.
          sliceEnd = line[len - 7] === '^' ? -7 : -6;
        }

        const _domain = line
          .slice(sliceStart, sliceEnd) // we already make sure line startsWith "|"
          .trim();

        const domain = normalizeDomain(_domain);
        if (domain) {
          return [domain, includeAllSubDomain ? 2 : 1];
        }

        console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
        return null;
      }
    }

    const lineStartsWithSingleDot = firstChar === '.';
    if (
      lineStartsWithSingleDot
      && lineEndsWithCaretOrCaretVerticalBar
    ) {
      /**
       * `.ay.delivery^`
       * `.m.bookben.com^`
       * `.wap.x4399.com^`
       */
      const _domain = line.slice(
        1, // remove prefix dot
        linedEndsWithCaret // replaceAll('^', '')
          ? -1
          : (lineEndsWithCaretVerticalBar ? -2 : 0) // replace('^|', '')
      );

      const suffix = gorhill.getPublicSuffix(_domain);
      if (!gorhill.suffixInPSL(suffix)) {
        // This exclude domain-like resource like `1.1.4.514.js`
        return null;
      }

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, 2];
      }

      console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
      return null;
    }

    /**
     * `|http://x.o2.pl^`
     * `://mine.torrent.pw^`
     * `://say.ac^`
     */
    if (
      (
        line.startsWith('://')
        || line.startsWith('http://')
        || line.startsWith('https://')
        || line.startsWith('|http://')
        || line.startsWith('|https://')
      )
      && lineEndsWithCaretOrCaretVerticalBar
    ) {
      const _domain = line
        .replace('|https://', '')
        .replace('https://', '')
        .replace('|http://', '')
        .replace('http://', '')
        .replace('://', '')
        .replace('^|', '')
        .replaceAll('^', '')
        .trim();

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, 1];
      }

      console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
      return null;
    }

    /**
     * `_vmind.qqvideo.tc.qq.com^`
     * `arketing.indianadunes.com^`
     * `charlestownwyllie.oaklawnnonantum.com^`
     * `-telemetry.officeapps.live.com^`
     * `-tracker.biliapi.net`
     * `-logging.nextmedia.com`
     * `_social_tracking.js^`
     */
    if (firstChar !== '|' && lastChar === '^') {
      const _domain = line.slice(0, -1);

      const suffix = gorhill.getPublicSuffix(_domain);
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This exclude domain-like resource like `_social_tracking.js^`
        return null;
      }

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, 1];
      }

      console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
      return null;
    }

    if (lineStartsWithSingleDot) {
      /**
       * `.cookielaw.js`
       * `.content_tracking.js`
       * `.ads.css`
       */
      const _domain = line.slice(1);

      const suffix = gorhill.getPublicSuffix(_domain);
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This exclude domain-like resource like `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
        return null;
      }

      const tryNormalizeDomain = normalizeDomain(_domain);
      if (tryNormalizeDomain === _domain) {
        // the entire rule is domain (returned with its leading dot)
        return [line, 2];
      }
    } else {
      /**
       * `_prebid.js`
       * `t.yesware.com`
       * `ubmcmm.baidustatic.com`
       * `://www.smfg-card.$document`
       * `portal.librus.pl$$advertisement-module`
       * `@@-ds.metric.gstatic.com^|`
       * `://gom.ge/cookie.js`
       * `://accout-update-smba.jp.$document`
       * `_200x250.png`
       * `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
       */
      const tryNormalizeDomain = normalizeDomain(line);
      if (tryNormalizeDomain === line) {
        // the entire rule is domain
        return [line, 2];
      }
    }

    console.warn(' * [parse-filter E0010] can not parse:', line);

    return null;
  }