parse-filter.ts 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602
  1. // @ts-check
  2. import { fetchRemoteTextAndReadByLine } from './fetch-text-by-line';
  3. import { NetworkFilter } from '@cliqz/adblocker';
  4. import { processLine } from './process-line';
  5. import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
  6. import type { PublicSuffixList } from '@gorhill/publicsuffixlist';
  7. import { traceAsync, traceSync } from './trace-runner';
  8. import picocolors from 'picocolors';
  9. import { normalizeDomain } from './normalize-domain';
  10. import { fetchAssets } from './fetch-assets';
  /**
   * Debugging aid: set this to a domain (e.g. `example.com`) to be told which upstream
   * list contains it. `null` turns the feature off.
   */
  const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
  /** Flipped to `true` by the processors in this module once the debug domain has been seen. */
  let foundDebugDomain = false;

  /** Keys (`url` + list colour) that have already produced a warning. */
  const warnOnceUrl = new Set<string>();

  /**
   * Emits a `console.warn` for a given URL / list colour pair, but only the first time
   * that pair is reported; later calls with the same pair are silent.
   *
   * @param url source list the warning is about
   * @param isWhite `true` when the hit came from an allow (white) rule, `false` for block (black)
   * @param message extra strings appended to the warning
   */
  const warnOnce = (url: string, isWhite: boolean, ...message: string[]) => {
    const dedupeKey = url + (isWhite ? 'white' : 'black');
    if (!warnOnceUrl.has(dedupeKey)) {
      warnOnceUrl.add(dedupeKey);
      const colourTag = isWhite ? '(white)' : '(black)';
      console.warn(url, colourTag, ...message);
    }
  };
  /**
   * Reads a remote plain domain list line by line and collects every entry that
   * `processLine` keeps.
   *
   * @param domainListsUrl URL of the list to download
   * @param includeAllSubDomain when `true`, every entry is stored with a leading `.`
   * @returns whatever `traceAsync` returns for a callback resolving to the collected `Set<string>`
   */
  export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false) {
    const collectDomains = async () => {
      const collected = new Set<string>();
      const lines = await fetchRemoteTextAndReadByLine(domainListsUrl);

      for await (const rawLine of lines) {
        const entry = processLine(rawLine);
        if (entry) {
          // Debug aid only: report (once per list) where the debug domain shows up.
          if (DEBUG_DOMAIN_TO_FIND && entry.includes(DEBUG_DOMAIN_TO_FIND)) {
            warnOnce(domainListsUrl, false, DEBUG_DOMAIN_TO_FIND);
            foundDebugDomain = true;
          }

          if (includeAllSubDomain) {
            collected.add(`.${entry}`);
          } else {
            collected.add(entry);
          }
        }
      }

      return collected;
    };

    return traceAsync(`- processDomainLists: ${domainListsUrl}`, collectDomains);
  }
  /**
   * Reads a remote hosts-format file (`<address> <hostname> ...`) line by line and
   * collects the host name found in the second field of every line `processLine` keeps.
   *
   * Fields may be separated by any run of blanks and/or tabs; leading whitespace on the
   * line is tolerated.
   *
   * @param hostsUrl URL of the hosts file to download
   * @param includeAllSubDomain when `true`, every entry is stored with a leading `.`
   * @param skipDomainCheck when `true`, the host name is stored as-is instead of being passed
   * through `normalizeDomain` (entries `normalizeDomain` rejects are dropped otherwise)
   * @returns whatever `traceAsync` returns for a callback resolving to the collected `Set<string>`
   */
  export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false) {
    return traceAsync(`- processHosts: ${hostsUrl}`, async () => {
      const domainSets = new Set<string>();
      // Hoisted so the pattern is built once rather than once per line.
      // `\s+` (not `\s`): with a single-character separator, `0.0.0.0   example.com`
      // splits into empty fields and the host name in field 1 is lost.
      const fieldSeparator = /\s+/;

      for await (const l of await fetchRemoteTextAndReadByLine(hostsUrl)) {
        const line = processLine(l);
        if (!line) {
          continue;
        }

        // Field 0 is the address, field 1 is the host name.
        const domain = line.trim().split(fieldSeparator)[1];
        if (!domain) {
          continue;
        }

        // Debug aid only: report (once per list) where the debug domain shows up.
        if (DEBUG_DOMAIN_TO_FIND && domain.includes(DEBUG_DOMAIN_TO_FIND)) {
          warnOnce(hostsUrl, false, DEBUG_DOMAIN_TO_FIND);
          foundDebugDomain = true;
        }

        const domainToAdd = skipDomainCheck ? domain : normalizeDomain(domain);
        if (domainToAdd) {
          domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
        }
      }

      console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size));

      return domainSets;
    });
  }
  /**
   * Tag carried in the second slot of every tuple returned by `parse`.
   * It tells the caller how to interpret the string in the first slot.
   */
  // eslint-disable-next-line sukka-ts/no-const-enum -- bun bundler is smart, maybe?
  const enum ParseType {
    /** Allow rule for exactly this host name. */
    WhiteAbsolute = -1,
    /** Allow rule for this host name and all of its subdomains. */
    WhiteIncludeSubdomain = 0,
    /** Block rule for exactly this host name. */
    BlackAbsolute = 1,
    /** Block rule for this host name and all of its subdomains. */
    BlackIncludeSubdomain = 2,
    /** The first slot holds a warning message instead of a host name. */
    ErrorMessage = 10
  }
  /**
   * Downloads an adblock-style filter list, runs every line through `parse`, and sorts the
   * results into an allow set and a block set. Messages that `parse` reports as
   * `ParseType.ErrorMessage` are printed with `console.warn` once processing has finished.
   *
   * @param filterRulesUrl primary URL of the filter list
   * @param fallbackUrls optional mirrors; when at least one is given the list is fetched in one
   * piece through `fetchAssets` and split on `\n`, otherwise it is read line by line
   * @returns the two sets plus the module-level `foundDebugDomain` flag at the time of return
   */
  export async function processFilterRules(
    filterRulesUrl: string,
    fallbackUrls?: readonly string[] | undefined
  ): Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }> {
    const white = new Set<string>();
    const black = new Set<string>();
    const warnings: string[] = [];

    /** "Include subdomains" entries are stored with exactly one leading dot. */
    const ensureLeadingDot = (hostname: string) => (hostname[0] === '.' ? hostname : `.${hostname}`);

    await traceAsync(`- processFilterRules: ${filterRulesUrl}`, async () => {
      const gorhill = await getGorhillPublicSuffixPromise();

      /** Parses one raw line and files the result into the matching collection. */
      const consume = (rawLine: string) => {
        const parsed = parse(rawLine, gorhill);
        if (!parsed) {
          return;
        }

        const [hostname, flag] = parsed;

        // Debug aid only: report (once per list and colour) where the debug domain shows up.
        if (DEBUG_DOMAIN_TO_FIND && hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
          const isWhite = flag === ParseType.WhiteIncludeSubdomain || flag === ParseType.WhiteAbsolute;
          warnOnce(filterRulesUrl, isWhite, DEBUG_DOMAIN_TO_FIND);
          foundDebugDomain = true;
        }

        if (flag === ParseType.WhiteIncludeSubdomain) {
          white.add(ensureLeadingDot(hostname));
        } else if (flag === ParseType.WhiteAbsolute) {
          white.add(hostname);
        } else if (flag === ParseType.BlackAbsolute) {
          black.add(hostname);
        } else if (flag === ParseType.BlackIncludeSubdomain) {
          black.add(ensureLeadingDot(hostname));
        } else if (flag === ParseType.ErrorMessage) {
          // For this flag the "hostname" slot actually carries the warning text.
          warnings.push(hostname);
        }
      };

      if (fallbackUrls && fallbackUrls.length > 0) {
        const downloaded = await traceAsync(
          picocolors.gray(`- download ${filterRulesUrl}`),
          () => fetchAssets(filterRulesUrl, fallbackUrls),
          picocolors.gray
        );
        const rules = downloaded.split('\n');

        const timerLabel = picocolors.gray(`- parse adguard filter ${filterRulesUrl}`);
        console.time(timerLabel);
        for (const rule of rules) {
          consume(rule);
        }
        console.timeEnd(timerLabel);
      } else {
        for await (const rule of await fetchRemoteTextAndReadByLine(filterRulesUrl)) {
          // don't trim here
          consume(rule);
        }
      }
    });

    for (const warning of warnings) {
      console.warn(
        picocolors.yellow(warning),
        picocolors.gray(picocolors.underline(filterRulesUrl))
      );
    }

    console.log(
      picocolors.gray('[process filter]'),
      picocolors.gray(filterRulesUrl),
      picocolors.gray(`white: ${white.size}`),
      picocolors.gray(`black: ${black.size}`)
    );

    return { white, black, foundDebugDomain };
  }
  /** Any line containing one of these characters is rejected by `parse` up front. */
  const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#%&=~]/;
  /**
   * Modifiers that make `parse` drop the rule outright.
   *
   * cname exceptional filter can not be parsed by NetworkFilter
   * Surge / Clash can't handle CNAME either, so we just ignore them
   */
  const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder|\$cname)/;

  /**
   * Turns one raw adblock-style filter line into a `[hostname, flag]` tuple.
   *
   * The line is first screened with cheap character checks, then handed to
   * `NetworkFilter.parse`. Lines that cannot be resolved that way go through a series of
   * hand-written "salvage" branches keyed on the line's prefix and suffix.
   *
   * @param $line the raw line; it is trimmed here, callers should not pre-trim
   * @param gorhill public suffix list used to tell real domains from domain-like file names
   * @returns
   * - `null` when the line is ignored (not a domain rule, or a rule type that is not supported),
   * - `[hostname, ParseType.White* | ParseType.Black*]` for a usable rule,
   * - `[message, ParseType.ErrorMessage]` when the line looked salvageable but did not yield a
   *   valid domain; the first slot then carries a human-readable warning.
   */
  function parse($line: string, gorhill: PublicSuffixList): null | [hostname: string, flag: ParseType] {
    if (
      // doesn't include
      !$line.includes('.') // rule with out dot can not be a domain
      // includes
      || $line.includes('!')
      || $line.includes('?')
      || $line.includes('*')
      || $line.includes('[')
      || $line.includes('(')
      || $line.includes(']')
      || $line.includes(')')
      || $line.includes(',')
      || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test($line)
    ) {
      return null;
    }

    const line = $line.trim();
    const len = line.length;
    if (len === 0) {
      return null;
    }

    const firstCharCode = line.charCodeAt(0);
    const lastCharCode = line.charCodeAt(len - 1);

    if (
      firstCharCode === 47 // 47 `/`
      // ends with
      || lastCharCode === 46 // 46 `.`, line.endsWith('.')
      || lastCharCode === 45 // 45 `-`, line.endsWith('-')
      || lastCharCode === 95 // 95 `_`, line.endsWith('_')
      // special modifier
      || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
    ) {
      return null;
    }

    // A path or port without a scheme separator cannot be reduced to a bare domain.
    if ((line.includes('/') || line.includes(':')) && !line.includes('://')) {
      return null;
    }

    const filter = NetworkFilter.parse(line);
    if (filter) {
      if (
        // filter.isCosmeticFilter() // always false
        // filter.isNetworkFilter() // always true
        filter.isElemHide()
        || filter.isGenericHide()
        || filter.isSpecificHide()
        || filter.isRedirect()
        || filter.isRedirectRule()
        || filter.hasDomains()
        || filter.isCSP() // must not be csp rule
        || (!filter.fromAny() && !filter.fromDocument())
      ) {
        // not supported type
        return null;
      }

      if (
        filter.hostname // filter.hasHostname() // must have
        && filter.isPlain() // isPlain() === !isRegex()
        && (!filter.isFullRegex())
      ) {
        const hostname = normalizeDomain(filter.hostname);
        if (!hostname) {
          return null;
        }

        // |: filter.isHostnameAnchor(),
        // |: filter.isLeftAnchor(),
        // |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
        const isIncludeAllSubDomain = filter.isHostnameAnchor();

        if (filter.isException() || filter.isBadFilter()) {
          return [hostname, isIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute];
        }

        const _1p = filter.firstParty();
        const _3p = filter.thirdParty();

        if (_1p) {
          // Only rules that apply to both first- and third-party requests are kept.
          if (_1p === _3p) {
            return [hostname, isIncludeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute];
          }
          return null;
        }
        if (_3p) {
          return null;
        }
      }
    }

    // After NetworkFilter.parse, it means the line can not be parsed by cliqz NetworkFilter
    // We now need to "salvage" the line as much as possible

    /*
     * From now on, we are mostly facing non-standard domain rules (some are regex like)
     * We first skip third-party and frame rules, as Surge / Clash can't handle them
     *
     * `.sharecounter.$third-party`
     * `.bbelements.com^$third-party`
     * `://o0e.ru^$third-party`
     * `.1.1.1.l80.js^$third-party`
     */
    if (line.includes('$third-party') || line.includes('$frame')) {
      return null;
    }

    /** line.endsWith('^') */
    const lineEndsWithCaret = lastCharCode === 94; // 94 `^`
    /** line.endsWith('^|') */
    const lineEndsWithCaretVerticalBar = lastCharCode === 124 /* `|` */ && line[len - 2] === '^';
    /** line.endsWith('^') || line.endsWith('^|') */
    const lineEndsWithCaretOrCaretVerticalBar = lineEndsWithCaret || lineEndsWithCaretVerticalBar;
    /** `slice()` end index that strips a trailing `^` or `^|`; `undefined` when neither is present. */
    const caretSliceEnd = lineEndsWithCaret
      ? -1
      : (lineEndsWithCaretVerticalBar ? -2 : undefined);

    // whitelist (exception)
    if (
      firstCharCode === 64 // 64 `@`
      && line[1] === '@'
    ) {
      let whiteIncludeAllSubDomain = true;

      /**
       * Some "malformed" regex-based filters can not be parsed by NetworkFilter
       * "$genericblock`" is also not supported by NetworkFilter, see:
       * https://github.com/ghostery/adblocker/blob/62caf7786ba10ef03beffecd8cd4eec111bcd5ec/packages/adblocker/test/parsing.test.ts#L950
       *
       * `@@||cmechina.net^$genericblock`
       * `@@|ftp.bmp.ovh^|`
       * `@@|adsterra.com^|`
       * `@@.atlassian.net$document`
       * `@@||ad.alimama.com^$genericblock`
       */
      let sliceStart = 0;
      let sliceEnd: number | undefined;

      if (line[2] === '|') { // line.startsWith('@@|')
        sliceStart = 3;
        whiteIncludeAllSubDomain = false;

        if (line[3] === '|') { // line.startsWith('@@||')
          sliceStart = 4;
          whiteIncludeAllSubDomain = true;
        }
      } else if (line[2] === '.') { // line.startsWith('@@.')
        sliceStart = 3;
        whiteIncludeAllSubDomain = true;
      } else if (
        /**
         * line.startsWith('@@://')
         *
         * `@@://googleadservices.com^|`
         * `@@://www.googleadservices.com^|`
         */
        line[2] === ':' && line[3] === '/' && line[4] === '/'
      ) {
        whiteIncludeAllSubDomain = false;
        sliceStart = 5;
      }

      if (caretSliceEnd !== undefined) {
        // Strip exactly the suffix that is present: one char for `^`, two for `^|`.
        // (Always stripping two would eat the last character of the domain for a lone `^`.)
        sliceEnd = caretSliceEnd;
      } else if (line.endsWith('$genericblock')) {
        sliceEnd = -13;
        if (line[len - 14] === '^') { // line.endsWith('^$genericblock')
          sliceEnd = -14;
        }
      } else if (line.endsWith('$document')) {
        sliceEnd = -9;
        if (line[len - 10] === '^') { // line.endsWith('^$document')
          sliceEnd = -10;
        }
      }

      if (sliceStart !== 0 || sliceEnd !== undefined) {
        const sliced = line.slice(sliceStart, sliceEnd);
        const domain = normalizeDomain(sliced);
        if (domain) {
          return [domain, whiteIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute];
        }

        return [
          `[parse-filter E0001] (white) invalid domain: ${JSON.stringify({
            line, sliced, sliceStart, sliceEnd
          })}`,
          ParseType.ErrorMessage
        ];
      }

      return [
        `[parse-filter E0006] (white) failed to parse: ${JSON.stringify({
          line, sliceStart, sliceEnd
        })}`,
        ParseType.ErrorMessage
      ];
    }

    if (
      // 124 `|`
      // line.startsWith('|')
      firstCharCode === 124
      && lineEndsWithCaretOrCaretVerticalBar
    ) {
      /**
       * Some malformed filters can not be parsed by NetworkFilter:
       *
       * `||smetrics.teambeachbody.com^.com^`
       * `||solutions.|pages.indigovision.com^`
       * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
       * `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
       */
      const includeAllSubDomain = line[1] === '|';

      const sliceStart = includeAllSubDomain ? 2 : 1;
      const sliced = line.slice(sliceStart, caretSliceEnd); // we already make sure line startsWith "|"

      const domain = normalizeDomain(sliced);
      if (domain) {
        return [domain, includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute];
      }

      return [
        `[parse-filter E0002] (black) invalid domain: ${sliced}`,
        ParseType.ErrorMessage
      ];
    }

    const lineStartsWithSingleDot = firstCharCode === 46; // 46 `.`
    if (
      lineStartsWithSingleDot
      && lineEndsWithCaretOrCaretVerticalBar
    ) {
      /**
       * `.ay.delivery^`
       * `.m.bookben.com^`
       * `.wap.x4399.com^`
       */
      const sliced = line.slice(
        1, // remove prefix dot
        caretSliceEnd // remove trailing `^` / `^|`
      );

      const suffix = gorhill.getPublicSuffix(sliced);
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This exclude domain-like resource like `1.1.4.514.js`
        return null;
      }

      const domain = normalizeDomain(sliced);
      if (domain) {
        return [domain, ParseType.BlackIncludeSubdomain];
      }

      return [
        `[parse-filter E0003] (black) invalid domain: ${sliced}`,
        ParseType.ErrorMessage
      ];
    }

    /**
     * `|http://x.o2.pl^`
     * `://mine.torrent.pw^`
     * `://say.ac^`
     */
    if (lineEndsWithCaretOrCaretVerticalBar) {
      let sliceStart = 0;
      // Always defined inside this branch, since the line ends with `^` or `^|`.
      const sliceEnd = caretSliceEnd;

      if (line.startsWith('://')) {
        sliceStart = 3;
      } else if (line.startsWith('http://')) {
        sliceStart = 7;
      } else if (line.startsWith('https://')) {
        sliceStart = 8;
      } else if (line.startsWith('|http://')) {
        sliceStart = 8;
      } else if (line.startsWith('|https://')) {
        sliceStart = 9;
      }

      if (sliceStart !== 0 || sliceEnd !== undefined) {
        const sliced = line.slice(sliceStart, sliceEnd);
        const domain = normalizeDomain(sliced);
        if (domain) {
          return [domain, ParseType.BlackIncludeSubdomain];
        }
        return [
          `[parse-filter E0004] (black) invalid domain: ${JSON.stringify({
            line, sliced, sliceStart, sliceEnd
          })}`,
          ParseType.ErrorMessage
        ];
      }
    }

    /**
     * `_vmind.qqvideo.tc.qq.com^`
     * `arketing.indianadunes.com^`
     * `charlestownwyllie.oaklawnnonantum.com^`
     * `-telemetry.officeapps.live.com^`
     * `-tracker.biliapi.net`
     * `-logging.nextmedia.com`
     * `_social_tracking.js^`
     *
     * NOTE(review): as written this branch looks unreachable. Any line ending in `^` has
     * already returned from the branch above, whose `sliceEnd` is always defined. Kept as-is
     * because dropping it (or moving it up) would decide the intended behaviour; confirm and
     * clean up.
     */
    if (
      firstCharCode !== 124 // 124 `|`
      && lastCharCode === 94 // 94 `^`
    ) {
      const _domain = line.slice(0, -1);

      const suffix = gorhill.getPublicSuffix(_domain);
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This exclude domain-like resource like `_social_tracking.js^`
        return null;
      }

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, ParseType.BlackAbsolute];
      }

      return [
        `[parse-filter E0005] (black) invalid domain: ${_domain}`,
        ParseType.ErrorMessage
      ];
    }

    // Possibly that entire rule is domain

    /**
     * lineStartsWithSingleDot:
     *
     * `.cookielaw.js`
     * `.content_tracking.js`
     * `.ads.css`
     *
     * else:
     *
     * `_prebid.js`
     * `t.yesware.com`
     * `ubmcmm.baidustatic.com`
     * `://www.smfg-card.$document`
     * `portal.librus.pl$$advertisement-module`
     * `@@-ds.metric.gstatic.com^|`
     * `://gom.ge/cookie.js`
     * `://accout-update-smba.jp.$document`
     * `_200x250.png`
     * `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
     */
    let sliceStart = 0;
    let sliceEnd: number | undefined;

    if (lineStartsWithSingleDot) {
      sliceStart = 1;
    }

    if (line.endsWith('^$all')) { // This salvage line `thepiratebay3.com^$all`
      sliceEnd = -5;
    } else if (
      // Try to salvage line like `://account.smba.$document`
      // For this specific line, it will fail anyway though.
      line.endsWith('$document')
    ) {
      sliceEnd = -9;
    }

    const sliced = (sliceStart !== 0 || sliceEnd !== undefined) ? line.slice(sliceStart, sliceEnd) : line;
    const suffix = gorhill.getPublicSuffix(sliced);

    /**
     * Fast exclude definitely not domain-like resource
     *
     * `.gatracking.js`, suffix is `js`,
     * `.ads.css`, suffix is `css`,
     * `-cpm-ads.$badfilter`, suffix is `$badfilter`,
     * `portal.librus.pl$$advertisement-module`, suffix is `pl$$advertisement-module`
     */
    if (!suffix || !gorhill.suffixInPSL(suffix)) {
      // This exclude domain-like resource like `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
      // Rejected silently, like every other "not a domain" path in this function.
      return null;
    }

    const tryNormalizeDomain = normalizeDomain(sliced);
    if (tryNormalizeDomain === sliced) {
      // the entire rule is domain
      return [sliced, ParseType.BlackIncludeSubdomain];
    }

    return [
      `[parse-filter E0010] can not parse: ${line}`,
      ParseType.ErrorMessage
    ];
  }