parse-filter.ts 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597
  1. // @ts-check
  2. import { fetchRemoteTextAndReadByLine } from './fetch-text-by-line';
  3. import { NetworkFilter } from '@cliqz/adblocker';
  4. import { processLine } from './process-line';
  5. import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
  6. import type { PublicSuffixList } from '@gorhill/publicsuffixlist';
  7. import { traceAsync } from './trace-runner';
  8. import picocolors from 'picocolors';
  9. import { normalizeDomain } from './normalize-domain';
  10. import { fetchAssets } from './fetch-assets';
  /**
   * Debug aid: set this to a domain (e.g. `example.com`) to have the processors
   * in this file log a warning whenever an entry containing it is encountered.
   * `null` disables the tracing.
   */
  const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
  /**
   * Becomes `true` once any processor has matched `DEBUG_DOMAIN_TO_FIND`;
   * `processFilterRules` reports the current value back to its caller.
   */
  let foundDebugDomain = false;
  /**
   * Fetches a plain domain list and collects its entries into a set.
   *
   * Every fetched line goes through `processLine`; lines for which it returns a
   * falsy value are skipped.
   *
   * @param domainListsUrl URL of the remote list, one entry per line.
   * @param includeAllSubDomain When `true`, each entry is stored with a leading `.`.
   * @returns Whatever `traceAsync` returns for the collecting task, which itself
   * resolves to the set of collected entries.
   */
  export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false) {
    const collect = async () => {
      const collected = new Set<string>();
      const prefix = includeAllSubDomain ? '.' : '';
      const lines = await fetchRemoteTextAndReadByLine(domainListsUrl);

      for await (const rawLine of lines) {
        const entry = processLine(rawLine);
        if (entry) {
          if (DEBUG_DOMAIN_TO_FIND && entry.includes(DEBUG_DOMAIN_TO_FIND)) {
            console.warn(picocolors.red(domainListsUrl), '(black)', picocolors.bold(DEBUG_DOMAIN_TO_FIND));
            foundDebugDomain = true;
          }
          collected.add(`${prefix}${entry}`);
        }
      }

      return collected;
    };

    return traceAsync(`- processDomainLists: ${domainListsUrl}`, collect);
  }
  /**
   * Fetches a hosts-format list and collects the hostname of every entry.
   *
   * Each fetched line goes through `processLine` (lines for which it returns a
   * falsy value are skipped); the second whitespace-separated field of the
   * remaining lines is taken as the hostname.
   *
   * @param hostsUrl URL of the remote hosts file.
   * @param includeAllSubDomain When `true`, each hostname is stored with a leading `.`.
   * @param skipDomainCheck When `true`, hostnames are stored as-is instead of
   * being passed through `normalizeDomain` (which may reject them).
   * @returns Whatever `traceAsync` returns for the collecting task, which itself
   * resolves to the set of collected hostnames.
   */
  export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false) {
    return traceAsync(`- processHosts: ${hostsUrl}`, async () => {
      const domainSets = new Set<string>();
      // Fields in a hosts file may be separated by any run of blanks and/or tabs.
      // Splitting on a single whitespace character would yield an empty second
      // field for `0.0.0.0<TAB><TAB>example.com` and silently drop the entry.
      const fieldSeparator = /\s+/;

      for await (const l of await fetchRemoteTextAndReadByLine(hostsUrl)) {
        const line = processLine(l);
        if (!line) {
          continue;
        }

        // Fields produced by the `\s+` split can not contain whitespace,
        // so no additional trimming is needed.
        const domain = line.split(fieldSeparator)[1];
        if (!domain) {
          continue;
        }

        if (DEBUG_DOMAIN_TO_FIND && domain.includes(DEBUG_DOMAIN_TO_FIND)) {
          console.warn(picocolors.red(hostsUrl), '(black)', picocolors.bold(DEBUG_DOMAIN_TO_FIND));
          foundDebugDomain = true;
        }

        const domainToAdd = skipDomainCheck ? domain : normalizeDomain(domain);
        if (domainToAdd) {
          domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
        }
      }

      console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size));

      return domainSets;
    });
  }
  /**
   * Classification attached to every non-null result of `parse`.
   * `processFilterRules` uses it to decide which set a hostname goes into.
   */
  // eslint-disable-next-line sukka-ts/no-const-enum -- bun bundler is smart, maybe?
  const enum ParseType {
    /** Exception (whitelist) rule, stored exactly as parsed. */
    WhiteAbsolute = -1,
    /** Exception (whitelist) rule, stored with a leading `.`. */
    WhiteIncludeSubdomain = 0,
    /** Blocking (blacklist) rule, stored exactly as parsed. */
    BlackAbsolute = 1,
    /** Blocking (blacklist) rule, stored with a leading `.`. */
    BlackIncludeSubdomain = 2,
    /** The "hostname" slot of the result carries a warning message, not a domain. */
    ErrorMessage = 10
  }
  /**
   * Downloads a filter list, runs every line through `parse`, and sorts the
   * resulting hostnames into a whitelist set and a blacklist set.
   *
   * Without `fallbackUrls` the list is streamed line by line from
   * `filterRulesUrl`; with at least one fallback URL the whole text is obtained
   * through `fetchAssets` and split on `\n`. Lines are handed to `parse`
   * untrimmed in both cases.
   *
   * Warning messages produced by `parse` are printed once the list has been
   * fully processed, followed by a one-line summary of both set sizes.
   *
   * @param filterRulesUrl Primary URL of the filter list.
   * @param fallbackUrls Optional alternative URLs passed to `fetchAssets`.
   * @returns The collected sets plus the current value of the module-level
   * `foundDebugDomain` flag.
   */
  export async function processFilterRules(
    filterRulesUrl: string,
    fallbackUrls?: readonly string[] | undefined
  ): Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }> {
    const white = new Set<string>();
    const black = new Set<string>();
    const warnings: string[] = [];

    /** Prefixes `host` with `.` unless it already starts with one. */
    const withLeadingDot = (host: string) => (host[0] === '.' ? host : `.${host}`);

    await traceAsync(`- processFilterRules: ${filterRulesUrl}`, async () => {
      const gorhill = await getGorhillPublicSuffixPromise();

      /** Parses one raw line and records its outcome in the sets above. */
      const handleLine = (rawLine: string) => {
        const parsed = parse(rawLine, gorhill);
        if (!parsed) {
          return;
        }

        const [hostname, flag] = parsed;

        if (DEBUG_DOMAIN_TO_FIND && hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
          const isWhite = flag === ParseType.WhiteIncludeSubdomain || flag === ParseType.WhiteAbsolute;
          console.warn(
            picocolors.red(filterRulesUrl),
            isWhite ? '(white)' : '(black)',
            picocolors.bold(DEBUG_DOMAIN_TO_FIND)
          );
          foundDebugDomain = true;
        }

        if (flag === ParseType.WhiteIncludeSubdomain) {
          white.add(withLeadingDot(hostname));
        } else if (flag === ParseType.WhiteAbsolute) {
          white.add(hostname);
        } else if (flag === ParseType.BlackAbsolute) {
          black.add(hostname);
        } else if (flag === ParseType.BlackIncludeSubdomain) {
          black.add(withLeadingDot(hostname));
        } else if (flag === ParseType.ErrorMessage) {
          // For this flag the "hostname" slot holds the warning text.
          warnings.push(hostname);
        }
      };

      if (fallbackUrls && fallbackUrls.length > 0) {
        const text = await traceAsync(
          picocolors.gray(`- download ${filterRulesUrl}`),
          () => fetchAssets(filterRulesUrl, fallbackUrls),
          picocolors.gray
        );
        const filterRules = text.split('\n');

        const timerLabel = picocolors.gray(`- parse adguard filter ${filterRulesUrl}`);
        console.time(timerLabel);
        for (const rule of filterRules) {
          handleLine(rule);
        }
        console.timeEnd(timerLabel);
      } else {
        for await (const streamedLine of await fetchRemoteTextAndReadByLine(filterRulesUrl)) {
          // don't trim here
          handleLine(streamedLine);
        }
      }
    });

    for (const message of warnings) {
      console.warn(
        picocolors.yellow(message),
        picocolors.gray(picocolors.underline(filterRulesUrl))
      );
    }

    console.log(
      picocolors.gray('[process filter]'),
      picocolors.gray(filterRulesUrl),
      picocolors.gray(`white: ${white.size}`),
      picocolors.gray(`black: ${black.size}`)
    );

    return { white, black, foundDebugDomain };
  }
  /** Characters whose presence makes `parse` reject a line outright. */
  const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#%&=~]/;
  /**
   * Modifiers whose presence makes `parse` reject a line outright.
   *
   * `$cname` exception filters can not be parsed by NetworkFilter, and
   * Surge / Clash can't handle CNAME either, so such rules are just ignored.
   */
  const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder|\$cname)/;

  /**
   * Tries to turn one filter-list line into a hostname plus a classification.
   *
   * The line is first screened with cheap textual checks, then handed to
   * `NetworkFilter.parse`. When that does not produce a usable plain-hostname
   * filter, a series of hand-written "salvage" branches tries to recover a domain
   * from common non-standard rule shapes.
   *
   * @param $line Raw (untrimmed) line of the filter list.
   * @param gorhill Public suffix list used to tell real domains apart from
   * domain-looking resource names such as `foo.js`.
   * @returns `null` when the line should be ignored; otherwise a tuple of
   * `[hostname, flag]`. When `flag` is `ParseType.ErrorMessage` the first slot
   * holds a warning message instead of a hostname.
   */
  function parse($line: string, gorhill: PublicSuffixList): null | [hostname: string, flag: ParseType] {
    if (
      // doesn't include
      !$line.includes('.') // rule without a dot can not be a domain
      // includes
      || $line.includes('!')
      || $line.includes('?')
      || $line.includes('*')
      || $line.includes('[')
      || $line.includes('(')
      || $line.includes(']')
      || $line.includes(')')
      || $line.includes(',')
      || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test($line)
    ) {
      return null;
    }

    const line = $line.trim();

    /** @example line.length */
    const len = line.length;
    if (len === 0) {
      return null;
    }

    const firstCharCode = line.charCodeAt(0);
    const lastCharCode = line.charCodeAt(len - 1);

    if (
      firstCharCode === 47 // 47 `/`
      // ends with
      || lastCharCode === 46 // 46 `.`, line.endsWith('.')
      || lastCharCode === 45 // 45 `-`, line.endsWith('-')
      || lastCharCode === 95 // 95 `_`, line.endsWith('_')
      // special modifier
      || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
    ) {
      return null;
    }

    // A `/` or `:` is only tolerated as part of a `://` scheme separator.
    if ((line.includes('/') || line.includes(':')) && !line.includes('://')) {
      return null;
    }

    const filter = NetworkFilter.parse(line);
    if (filter) {
      if (
        // filter.isCosmeticFilter() // always false
        // filter.isNetworkFilter() // always true
        filter.isElemHide()
        || filter.isGenericHide()
        || filter.isSpecificHide()
        || filter.isRedirect()
        || filter.isRedirectRule()
        || filter.hasDomains()
        || filter.isCSP() // must not be csp rule
        || (!filter.fromAny() && !filter.fromDocument())
      ) {
        // not supported type
        return null;
      }

      if (
        filter.hostname // filter.hasHostname() // must have
        && filter.isPlain() // isPlain() === !isRegex()
        && (!filter.isFullRegex())
      ) {
        const hostname = normalizeDomain(filter.hostname);
        if (!hostname) {
          return null;
        }

        // |: filter.isHostnameAnchor(),
        // |: filter.isLeftAnchor(),
        // |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
        const isIncludeAllSubDomain = filter.isHostnameAnchor();

        if (filter.isException() || filter.isBadFilter()) {
          return [hostname, isIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute];
        }

        const _1p = filter.firstParty();
        const _3p = filter.thirdParty();

        // Only rules that apply to first-party AND third-party requests are kept.
        if (_1p) {
          if (_1p === _3p) {
            return [hostname, isIncludeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute];
          }
          return null;
        }
        if (_3p) {
          return null;
        }
      }
    }

    // Reaching this point means NetworkFilter could not give us a usable plain
    // hostname filter. We now need to "salvage" the line as much as possible.

    /*
     * From now on, we are mostly facing non-standard domain rules (some are regex like)
     * We first skip third-party and frame rules, as Surge / Clash can't handle them
     *
     * `.sharecounter.$third-party`
     * `.bbelements.com^$third-party`
     * `://o0e.ru^$third-party`
     * `.1.1.1.l80.js^$third-party`
     */
    if (line.includes('$third-party') || line.includes('$frame')) {
      return null;
    }

    /** @example line.endsWith('^') */
    const lineEndsWithCaret = lastCharCode === 94; // lastChar === '^';
    /** @example line.endsWith('^|') */
    const lineEndsWithCaretVerticalBar = (lastCharCode === 124 /** lastChar === '|' */) && line[len - 2] === '^';
    /** @example line.endsWith('^') || line.endsWith('^|') */
    const lineEndsWithCaretOrCaretVerticalBar = lineEndsWithCaret || lineEndsWithCaretVerticalBar;
    /**
     * Negative `slice` end that strips the trailing `^` (-1) or `^|` (-2);
     * `undefined` when the line ends with neither.
     */
    const caretSliceEnd = lineEndsWithCaret
      ? -1
      : (lineEndsWithCaretVerticalBar ? -2 : undefined);

    // whitelist (exception)
    if (
      firstCharCode === 64 // 64 `@`
      && line[1] === '@'
    ) {
      let whiteIncludeAllSubDomain = true;

      /**
       * Some "malformed" regex-based filters can not be parsed by NetworkFilter
       * "$genericblock`" is also not supported by NetworkFilter, see:
       * https://github.com/ghostery/adblocker/blob/62caf7786ba10ef03beffecd8cd4eec111bcd5ec/packages/adblocker/test/parsing.test.ts#L950
       *
       * `@@||cmechina.net^$genericblock`
       * `@@|ftp.bmp.ovh^|`
       * `@@|adsterra.com^|`
       * `@@.atlassian.net$document`
       * `@@||ad.alimama.com^$genericblock`
       */
      let sliceStart = 0;
      let sliceEnd: number | undefined;

      if (line[2] === '|') { // line.startsWith('@@|')
        sliceStart = 3;
        whiteIncludeAllSubDomain = false;

        if (line[3] === '|') { // line.startsWith('@@||')
          sliceStart = 4;
          whiteIncludeAllSubDomain = true;
        }
      } else if (line[2] === '.') { // line.startsWith('@@.')
        sliceStart = 3;
        whiteIncludeAllSubDomain = true;
      } else if (
        /**
         * line.startsWith('@@://')
         *
         * `@@://googleadservices.com^|`
         * `@@://www.googleadservices.com^|`
         */
        line[2] === ':' && line[3] === '/' && line[4] === '/'
      ) {
        whiteIncludeAllSubDomain = false;
        sliceStart = 5;
      }

      if (caretSliceEnd !== undefined) {
        // Strip exactly the trailing marker: one char for `^`, two for `^|`.
        // (Always cutting two would eat the last character of the domain
        // for lines that end with a bare `^`.)
        sliceEnd = caretSliceEnd;
      } else if (line.endsWith('$genericblock')) {
        sliceEnd = -13;
        if (line[len - 14] === '^') { // line.endsWith('^$genericblock')
          sliceEnd = -14;
        }
      } else if (line.endsWith('$document')) {
        sliceEnd = -9;
        if (line[len - 10] === '^') { // line.endsWith('^$document')
          sliceEnd = -10;
        }
      }

      if (sliceStart !== 0 || sliceEnd !== undefined) {
        const sliced = line.slice(sliceStart, sliceEnd);
        const domain = normalizeDomain(sliced);
        if (domain) {
          return [domain, whiteIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute];
        }

        return [
          `[parse-filter E0001] (white) invalid domain: ${JSON.stringify({
            line, sliced, sliceStart, sliceEnd
          })}`,
          ParseType.ErrorMessage
        ];
      }

      return [
        `[parse-filter E0006] (white) failed to parse: ${JSON.stringify({
          line, sliceStart, sliceEnd
        })}`,
        ParseType.ErrorMessage
      ];
    }

    if (
      // 124 `|`
      // line.startsWith('|')
      firstCharCode === 124
      && lineEndsWithCaretOrCaretVerticalBar
    ) {
      /**
       * Some malformed filters can not be parsed by NetworkFilter:
       *
       * `||smetrics.teambeachbody.com^.com^`
       * `||solutions.|pages.indigovision.com^`
       * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
       * `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
       */
      const includeAllSubDomain = line[1] === '|';

      const sliceStart = includeAllSubDomain ? 2 : 1;

      const sliced = line.slice(sliceStart, caretSliceEnd); // we already make sure line startsWith "|"

      const domain = normalizeDomain(sliced);
      if (domain) {
        return [domain, includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute];
      }

      return [
        `[parse-filter E0002] (black) invalid domain: ${sliced}`,
        ParseType.ErrorMessage
      ];
    }

    const lineStartsWithSingleDot = firstCharCode === 46; // 46 `.`
    if (
      lineStartsWithSingleDot
      && lineEndsWithCaretOrCaretVerticalBar
    ) {
      /**
       * `.ay.delivery^`
       * `.m.bookben.com^`
       * `.wap.x4399.com^`
       */
      const sliced = line.slice(
        1, // remove prefix dot
        caretSliceEnd // remove the trailing `^` / `^|`
      );

      const suffix = gorhill.getPublicSuffix(sliced);
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This exclude domain-like resource like `1.1.4.514.js`
        return null;
      }

      const domain = normalizeDomain(sliced);
      if (domain) {
        return [domain, ParseType.BlackIncludeSubdomain];
      }

      return [
        `[parse-filter E0003] (black) invalid domain: ${sliced}`,
        ParseType.ErrorMessage
      ];
    }

    /**
     * `://mine.torrent.pw^`
     * `://say.ac^`
     *
     * Lines starting with `|` (e.g. `|http://x.o2.pl^`) never get here: they
     * are fully handled by the `|` branch above.
     */
    if (lineEndsWithCaretOrCaretVerticalBar) {
      let sliceStart = 0;
      const sliceEnd = caretSliceEnd;

      if (line.startsWith('://')) {
        sliceStart = 3;
      } else if (line.startsWith('http://')) {
        sliceStart = 7;
      } else if (line.startsWith('https://')) {
        sliceStart = 8;
      }

      const sliced = line.slice(sliceStart, sliceEnd);
      const domain = normalizeDomain(sliced);
      if (domain) {
        return [domain, ParseType.BlackIncludeSubdomain];
      }

      return [
        `[parse-filter E0004] (black) invalid domain: ${JSON.stringify({
          line, sliced, sliceStart, sliceEnd
        })}`,
        ParseType.ErrorMessage
      ];
    }

    /**
     * `_vmind.qqvideo.tc.qq.com^`
     * `arketing.indianadunes.com^`
     * `charlestownwyllie.oaklawnnonantum.com^`
     * `-telemetry.officeapps.live.com^`
     * `-tracker.biliapi.net`
     * `-logging.nextmedia.com`
     * `_social_tracking.js^`
     *
     * NOTE(review): as written this branch can not be reached. Every line that
     * ends with `^` has already returned from the branch above, so the `^`
     * examples listed here are reported through E0004 / BlackIncludeSubdomain
     * instead. Kept unchanged pending a decision on the intended behavior.
     */
    if (
      firstCharCode !== 124 // 124 `|`
      && lastCharCode === 94 // 94 `^`
    ) {
      const _domain = line.slice(0, -1);

      const suffix = gorhill.getPublicSuffix(_domain);
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This exclude domain-like resource like `_social_tracking.js^`
        return null;
      }

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, ParseType.BlackAbsolute];
      }

      return [
        `[parse-filter E0005] (black) invalid domain: ${_domain}`,
        ParseType.ErrorMessage
      ];
    }

    // Possibly that entire rule is domain

    /**
     * lineStartsWithSingleDot:
     *
     * `.cookielaw.js`
     * `.content_tracking.js`
     * `.ads.css`
     *
     * else:
     *
     * `_prebid.js`
     * `t.yesware.com`
     * `ubmcmm.baidustatic.com`
     * `://www.smfg-card.$document`
     * `portal.librus.pl$$advertisement-module`
     * `@@-ds.metric.gstatic.com^|`
     * `://gom.ge/cookie.js`
     * `://accout-update-smba.jp.$document`
     * `_200x250.png`
     * `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
     */
    let sliceStart = 0;
    let sliceEnd: number | undefined;

    if (lineStartsWithSingleDot) {
      sliceStart = 1;
    }
    if (line.endsWith('^$all')) { // This salvage line `thepiratebay3.com^$all`
      sliceEnd = -5;
    } else if (
      // Try to salvage line like `://account.smba.$document`
      // For this specific line, it will fail anyway though.
      line.endsWith('$document')
    ) {
      sliceEnd = -9;
    }

    const sliced = (sliceStart !== 0 || sliceEnd !== undefined) ? line.slice(sliceStart, sliceEnd) : line;
    const suffix = gorhill.getPublicSuffix(sliced);
    /**
     * Fast exclude definitely not domain-like resource
     *
     * `.gatracking.js`, suffix is `js`,
     * `.ads.css`, suffix is `css`,
     * `-cpm-ads.$badfilter`, suffix is `$badfilter`,
     * `portal.librus.pl$$advertisement-module`, suffix is `pl$$advertisement-module`
     */
    if (!suffix || !gorhill.suffixInPSL(suffix)) {
      // This exclude domain-like resource like `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
      return null;
    }

    const tryNormalizeDomain = normalizeDomain(sliced);
    if (tryNormalizeDomain === sliced) {
      // the entire rule is domain
      return [sliced, ParseType.BlackIncludeSubdomain];
    }

    return [
      `[parse-filter E0010] can not parse: ${line}`,
      ParseType.ErrorMessage
    ];
  }