parse-filter.ts 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568
  1. import { NetworkFilter } from '@ghostery/adblocker';
  2. import { processLine } from './process-line';
  3. import tldts from 'tldts-experimental';
  4. import picocolors from 'picocolors';
  5. import { normalizeDomain } from './normalize-domain';
  6. import type { Span } from '../trace';
  7. import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie';
  8. import { looseTldtsOpt } from '../constants/loose-tldts-opt';
  9. import { DEBUG_DOMAIN_TO_FIND } from '../constants/reject-data-source';
  10. import { noop } from 'foxts/noop';
  11. import { fetchAssetsWithout304 } from './fetch-assets';
  /** Flipped to `true` as soon as any reported line contains `DEBUG_DOMAIN_TO_FIND`. */
  let foundDebugDomain = false;

  /**
   * Creates a reporter that, for every line containing `needle`, prints the
   * source (`meta`, in red), the given `tag` and the line with each occurrence
   * of `needle` in bold, and then raises the module-level `foundDebugDomain` flag.
   * Lines that do not contain `needle` are ignored.
   */
  function createDebugDomainReporter(needle: string, tag: string) {
    return (line: string, meta: string) => {
      if (!line.includes(needle)) {
        return;
      }
      console.warn(picocolors.red(meta), tag, line.replaceAll(needle, picocolors.bold(needle)));
      foundDebugDomain = true;
    };
  }

  /** Reporter for blacklist hits; a no-op when `DEBUG_DOMAIN_TO_FIND` is falsy. */
  const onBlackFound = DEBUG_DOMAIN_TO_FIND
    ? createDebugDomainReporter(DEBUG_DOMAIN_TO_FIND, '(black)')
    : noop;

  /** Reporter for whitelist hits; a no-op when `DEBUG_DOMAIN_TO_FIND` is falsy. */
  const onWhiteFound = DEBUG_DOMAIN_TO_FIND
    ? createDebugDomainReporter(DEBUG_DOMAIN_TO_FIND, '(white)')
    : noop;
  /**
   * Handles one raw line of a plain domain list.
   *
   * The line is passed through `processLine` (a falsy result skips the line),
   * lower-cased and normalized with `normalizeDomain`. Only lines that are
   * already in normalized form are accepted; anything that normalizes to a
   * different string is logged and dropped.
   *
   * @param l raw line from the downloaded list
   * @param set output array; accepted domains are appended to it
   * @param includeAllSubDomain when `true`, the pushed entry is prefixed with `.`
   * @param meta label for log/debug output (callers pass the list URL)
   */
  function domainListLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) {
    const processed = processLine(l);
    if (!processed) return;

    const candidate = processed.toLowerCase();
    const normalized = normalizeDomain(candidate);
    if (!normalized) return;

    if (normalized === candidate) {
      onBlackFound(normalized, meta);
      set.push(includeAllSubDomain ? `.${candidate}` : candidate);
      return;
    }

    // The line is not a clean domain: report it and skip it
    console.log(
      picocolors.red('[process domain list]'),
      picocolors.gray(`line: ${candidate}`),
      picocolors.gray(`domain: ${normalized}`),
      picocolors.gray(meta)
    );
  }
  /**
   * Downloads a plain domain list and collects the domains it contains.
   *
   * The work runs inside a `process domainlist: <url>` child span with two
   * nested steps: `download` (the fetch) and `parse domain list` (the per-line
   * processing done by `domainListLineCb`).
   *
   * @param span parent trace span
   * @param domainListsUrl URL of the domain list; also used as the log label
   * @param mirrors fallback URLs handed to `fetchAssetsWithout304`, or `null`
   * @param includeAllSubDomain when `true`, every collected entry is prefixed with `.`
   * @returns the result of the traced callback, which resolves to the collected domains
   */
  export function processDomainLists(
    span: Span,
    domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false
  ) {
    return span.traceChildAsync(`process domainlist: ${domainListsUrl}`, async (span) => {
      // Labelled 'download' (as in processHosts) rather than repeating the
      // parent span's name, so the two levels can be told apart in the trace.
      const text = await span.traceChildAsync('download', () => fetchAssetsWithout304(
        domainListsUrl,
        mirrors
      ));

      const domainSets: string[] = [];
      const filterRules = text.split('\n');

      span.traceChildSync('parse domain list', () => {
        for (let i = 0, len = filterRules.length; i < len; i++) {
          domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl);
        }
      });

      return domainSets;
    });
  }
  /**
   * Handles one raw line of a hosts-format file (`<address> <hostname> ...`).
   *
   * The line is passed through `processLine` (a falsy result skips the line),
   * the second whitespace-separated field is taken as the hostname and
   * normalized with `normalizeDomain`. Lines without a second field, or whose
   * hostname does not normalize, are ignored.
   *
   * @param l raw line from the downloaded hosts file
   * @param set output array; accepted domains are appended to it
   * @param includeAllSubDomain when `true`, the pushed entry is prefixed with `.`
   * @param meta label for debug output (callers pass the hosts URL)
   */
  function hostsLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) {
    const line = processLine(l);
    if (!line) {
      return;
    }

    // Fields in a hosts file may be separated by any number of blanks and/or
    // tabs (e.g. column-aligned `127.0.0.1       example.com`). Splitting on a
    // single whitespace character would yield empty fields for such lines and
    // silently drop the entry, so split on whitespace runs instead.
    const _domain = line.trim().split(/\s+/)[1];
    if (!_domain) {
      return;
    }

    const domain = normalizeDomain(_domain);
    if (!domain) {
      return;
    }

    onBlackFound(domain, meta);
    set.push(includeAllSubDomain ? `.${domain}` : domain);
  }
  /**
   * Downloads a hosts-format file and collects the hostnames it lists.
   *
   * Runs inside a `process hosts: <url>` child span with two nested steps:
   * `download` and `parse hosts`.
   *
   * @param span parent trace span
   * @param hostsUrl URL of the hosts file; also used as the debug label
   * @param mirrors fallback URLs handed to `fetchAssetsWithout304`, or `null`
   * @param includeAllSubDomain when `true`, every collected entry is prefixed with `.`
   * @returns the result of the traced callback, which resolves to the collected domains
   */
  export function processHosts(
    span: Span,
    hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false
  ) {
    return span.traceChildAsync(`process hosts: ${hostsUrl}`, async (childSpan) => {
      const downloadSpan = childSpan.traceChild('download');
      const text = await downloadSpan.traceAsyncFn(() => fetchAssetsWithout304(hostsUrl, mirrors));

      const collected: string[] = [];
      const rawLines = text.split('\n');

      childSpan.traceChild('parse hosts').traceSyncFn(() => {
        for (const rawLine of rawLines) {
          hostsLineCb(rawLine, collected, includeAllSubDomain, hostsUrl);
        }
      });

      return collected;
    });
  }
  /**
   * Outcome flag that `parse` writes into slot `[1]` of its result tuple.
   * Members are listed in ascending numeric order.
   */
  const enum ParseType {
    /** Whitelist exactly this hostname. */
    WhiteAbsolute = -1,
    /** Whitelist this hostname together with its subdomains. */
    WhiteIncludeSubdomain = 0,
    /** Blacklist exactly this hostname. */
    BlackAbsolute = 1,
    /** Blacklist this hostname together with its subdomains. */
    BlackIncludeSubdomain = 2,
    /** Slot `[0]` of the result holds a warning message instead of a hostname. */
    ErrorMessage = 10,
    /** The line yields nothing and is skipped. */
    Null = 1000,
    /** Initial sentinel of the shared result tuple, before any line was parsed. */
    NotParsed = 2000
  }

  export { type ParseType };
  /**
   * Downloads an adblock-style filter list and splits the hostnames it yields
   * into a whitelist and a blacklist.
   *
   * Each line is fed to `parse`; entries flagged as "include subdomain" are
   * stored with a leading `.`. Messages produced by `parse` are printed as
   * warnings once parsing is done, followed by a one-line summary.
   *
   * @param parentSpan parent trace span
   * @param filterRulesUrl URL of the filter list; also used as the log label
   * @param fallbackUrls fallback URLs handed to `fetchAssetsWithout304`
   * @param allowThirdParty forwarded to `parse`
   * @returns de-duplicated white and black entries plus the module-level
   *   `foundDebugDomain` flag
   * @throws Error when `parse` leaves a line flagged as `ParseType.NotParsed`
   */
  export async function processFilterRules(
    parentSpan: Span,
    filterRulesUrl: string,
    fallbackUrls?: string[] | null,
    allowThirdParty = false
  ): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
    const [white, black, warningMessages] = await parentSpan
      .traceChild(`process filter rules: ${filterRulesUrl}`)
      .traceAsyncFn(async (span) => {
        const text = await fetchAssetsWithout304(filterRulesUrl, fallbackUrls);

        const whiteEntries = new Set<string>();
        const blackEntries = new Set<string>();
        const warnings: string[] = [];

        // One tuple is reused for every line; `parse` overwrites it in place
        const sharedResult: [string, ParseType] = ['', ParseType.NotParsed];

        /** Ensures the entry starts with exactly the `.` that marks "include subdomains" */
        const withLeadingDot = (hostname: string) => (hostname[0] === '.' ? hostname : `.${hostname}`);

        const handleLine = (line: string) => {
          const parsed = parse(line, sharedResult, allowThirdParty);
          const flag = parsed[1];

          if (flag === ParseType.NotParsed) {
            throw new Error(`Didn't parse line: ${line}`);
          }
          if (flag === ParseType.Null) {
            return;
          }

          const hostname = parsed[0];

          // Debug reporting happens before the entry is stored
          if (flag !== ParseType.WhiteIncludeSubdomain && flag !== ParseType.WhiteAbsolute) {
            onBlackFound(hostname, filterRulesUrl);
          } else {
            onWhiteFound(hostname, filterRulesUrl);
          }

          if (flag === ParseType.WhiteIncludeSubdomain) {
            whiteEntries.add(withLeadingDot(hostname));
          } else if (flag === ParseType.WhiteAbsolute) {
            whiteEntries.add(hostname);
          } else if (flag === ParseType.BlackIncludeSubdomain) {
            blackEntries.add(withLeadingDot(hostname));
          } else if (flag === ParseType.BlackAbsolute) {
            blackEntries.add(hostname);
          } else if (flag === ParseType.ErrorMessage) {
            // For this flag the first slot carries the message text
            warnings.push(hostname);
          }
        };

        const rawLines = text.split('\n');
        span.traceChild('parse adguard filter').traceSyncFn(() => {
          for (const rawLine of rawLines) {
            handleLine(rawLine);
          }
        });

        return [
          Array.from(whiteEntries),
          Array.from(blackEntries),
          warnings
        ] as const;
      });

    for (const message of warningMessages) {
      console.warn(
        picocolors.yellow(message),
        picocolors.gray(picocolors.underline(filterRulesUrl))
      );
    }

    console.log(
      picocolors.gray('[process filter]'),
      picocolors.gray(filterRulesUrl),
      picocolors.gray(`white: ${white.length}`),
      picocolors.gray(`black: ${black.length}`)
    );

    return { white, black, foundDebugDomain };
  }
  /**
   * Keyword filter used by `parse` as its first rejection step: a line for
   * which it returns a truthy value is flagged as `ParseType.Null`.
   *
   * About `$cname`: cname exception filters can not be parsed by NetworkFilter,
   * and Surge / Clash can't handle CNAME either, so such rules are simply ignored.
   */
  const kwfilter = createKeywordFilter([
    // single characters
    '!', '?', '*',
    '[', '(', ']', ')',
    ',', '#', '%', '&', '=', '~',
    // special modifier
    '$popup', '$removeparam', '$popunder',
    '$cname', '$frame', '$domain',
    // some bad syntax
    '^popup'
  ]);
  /**
   * Parses one line of an adblock-style filter list into a hostname plus a
   * `ParseType` flag.
   *
   * The outcome is written into the caller-supplied `result` tuple, which is
   * also the return value (no new array is allocated):
   * - `ParseType.Null`: the line is unsupported or irrelevant; `result[0]` is left untouched
   * - `ParseType.White*` / `ParseType.Black*`: `result[0]` holds the hostname
   * - `ParseType.ErrorMessage`: `result[0]` holds a diagnostic message
   *
   * The line is first offered to `NetworkFilter.parse`. When that does not
   * produce a verdict, a manual "salvage" pass strips well-known prefixes
   * (`@@`, `|`, `||`, `.`, `://`, `http(s)://`), the `$modifier` tail and
   * trailing anchors (`^`, `|`, `^|`, `.`), and validates what is left.
   *
   * @param $line raw line of the filter list
   * @param result tuple that is mutated and returned
   * @param allowThirdParty when `false`, third-party-only rules are dropped
   * @returns the same array instance as `result`
   */
  export function parse($line: string, result: [string, ParseType], allowThirdParty: boolean): [hostname: string, flag: ParseType] {
    if (
      // doesn't include
      !$line.includes('.') // a rule without a dot can not be a domain
      // includes
      || kwfilter($line)
    ) {
      result[1] = ParseType.Null;
      return result;
    }

    const line = $line.trim();

    if (line.length === 0) {
      result[1] = ParseType.Null;
      return result;
    }

    const firstCharCode = line.charCodeAt(0);
    const lastCharCode = line.charCodeAt(line.length - 1);

    if (
      firstCharCode === 47 // 47 `/`
      // ends with
      // _160-600.
      // -detect-adblock.
      // _web-advert.
      || lastCharCode === 46 // 46 `.`, line.endsWith('.')
      || lastCharCode === 45 // 45 `-`, line.endsWith('-')
      || lastCharCode === 95 // 95 `_`, line.endsWith('_')
    ) {
      result[1] = ParseType.Null;
      return result;
    }

    // A path or a port without a scheme separator can not be reduced to a hostname
    if ((line.includes('/') || line.includes(':')) && !line.includes('://')) {
      result[1] = ParseType.Null;
      return result;
    }

    const filter = NetworkFilter.parse(line);
    if (filter) {
      if (
        // filter.isCosmeticFilter() // always false
        // filter.isNetworkFilter() // always true
        filter.isElemHide()
        || filter.isGenericHide()
        || filter.isSpecificHide()
        || filter.isRedirect()
        || filter.isRedirectRule()
        || filter.hasDomains()
        || filter.isCSP() // must not be csp rule
        || (!filter.fromHttp() && !filter.fromHttps())
      ) {
        // not supported type
        result[1] = ParseType.Null;
        return result;
      }

      if (
        !filter.fromAny()
        // $image, $websocket, $xhr this are all non-any
        && !filter.fromDocument() // $document, $doc
        // && !filter.fromSubdocument() // $subdocument, $subdoc
      ) {
        result[1] = ParseType.Null;
        return result;
      }

      if (
        filter.hostname // filter.hasHostname() // must have
        && filter.isPlain() // isPlain() === !isRegex()
        && (!filter.isFullRegex())
      ) {
        const hostname = normalizeDomain(filter.hostname);
        if (!hostname) {
          result[1] = ParseType.Null;
          return result;
        }

        // |: filter.isHostnameAnchor(),
        // |: filter.isLeftAnchor(),
        // |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
        const isIncludeAllSubDomain = filter.isHostnameAnchor();

        if (filter.isException() || filter.isBadFilter()) {
          result[0] = hostname;
          result[1] = isIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute;
          return result;
        }

        const _1p = filter.firstParty();
        const _3p = filter.thirdParty();

        if (_1p) { // first party is true
          if (_3p) { // third party is also true
            result[0] = hostname;
            result[1] = isIncludeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
            return result;
          }
          // first-party only rule
          result[1] = ParseType.Null;
          return result;
        }
        if (_3p) {
          // third-party only rule
          if (allowThirdParty) {
            result[0] = hostname;
            result[1] = isIncludeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
            return result;
          }
          result[1] = ParseType.Null;
          return result;
        }
      }
    }

    /**
     * From now on, we are mostly facing non-standard domain rules (some are regex like):
     * NetworkFilter either could not parse the line or did not lead to a verdict.
     *
     * We can still salvage some of them by removing prefixes, modifiers and anchors.
     */
    let sliceStart = 0; // index of the first kept character
    let sliceEnd = 0; // 0 (keep everything) or a negative offset from the end of the line

    let white = false;
    let includeAllSubDomain = false;

    if (
      firstCharCode === 64 // 64 `@`
      && line.charCodeAt(1) === 64 // 64 `@`
    ) {
      sliceStart += 2;
      white = true;
      includeAllSubDomain = true;
    }

    /**
     * Some "malformed" regex-based filters can not be parsed by NetworkFilter
     * "$genericblock`" is also not supported by NetworkFilter, see:
     * https://github.com/ghostery/adblocker/blob/62caf7786ba10ef03beffecd8cd4eec111bcd5ec/packages/adblocker/test/parsing.test.ts#L950
     *
     * `@@||cmechina.net^$genericblock`
     * `@@|ftp.bmp.ovh^|`
     * `@@|adsterra.com^|`
     * `@@.atlassian.net$document`
     * `@@||ad.alimama.com^$genericblock`
     */
    switch (line.charCodeAt(sliceStart)) {
      case 124: /** | */
        // line.startsWith('@@|') || line.startsWith('|')
        sliceStart += 1;
        includeAllSubDomain = false;

        if (line[sliceStart] === '|') { // line.startsWith('@@||') || line.startsWith('||')
          sliceStart += 1;
          includeAllSubDomain = true;
        }
        break;
      case 46: { /** . */ // line.startsWith('@@.') || line.startsWith('.')
        /**
         * `.ay.delivery^`
         * `.m.bookben.com^`
         * `.wap.x4399.com^`
         */
        sliceStart += 1;
        includeAllSubDomain = true;
        break;
      }
      default:
        break;
    }

    switch (line.charCodeAt(sliceStart)) {
      case 58: { /** : */
        /**
         * `@@://googleadservices.com^|`
         * `@@://www.googleadservices.com^|`
         * `://mine.torrent.pw^`
         * `://say.ac^`
         */
        if (line[sliceStart + 1] === '/' && line[sliceStart + 2] === '/') {
          includeAllSubDomain = false;
          sliceStart += 3;
        }
        break;
      }
      case 104: { /** h */
        /** |http://x.o2.pl^ */
        if (line.startsWith('http://', sliceStart)) {
          includeAllSubDomain = false;
          sliceStart += 7;
        } else if (line.startsWith('https://', sliceStart)) {
          includeAllSubDomain = false;
          sliceStart += 8;
        }
        break;
      }
      default:
        break;
    }

    const indexOfDollar = line.indexOf('$', sliceStart);
    if (indexOfDollar > -1) {
      // Cut the `$modifier` tail off
      sliceEnd = indexOfDollar - line.length;

      // Modifiers are only looked for when a `$` is actually present.
      // Without this guard `indexOfDollar + 1` would be 0 and the checks below
      // would scan the hostname itself, so a domain that merely contains
      // "3p", "third-party", "badfilter" or "all" would be misclassified.
      const modifierStart = indexOfDollar + 1;

      /*
       * We skip third-party and frame rules, as Surge / Clash can't handle them
       *
       * `.sharecounter.$third-party`
       * `.bbelements.com^$third-party`
       * `://o0e.ru^$third-party`
       * `.1.1.1.l80.js^$third-party`
       */
      if (
        !allowThirdParty
        && (
          line.includes('third-party', modifierStart)
          || line.includes('3p', modifierStart)
        )
      ) {
        result[1] = ParseType.Null;
        return result;
      }

      if (line.includes('badfilter', modifierStart)) {
        white = true;
      }
      if (line.includes('all', modifierStart)) {
        includeAllSubDomain = true;
      }
    }

    /**
     * `_vmind.qqvideo.tc.qq.com^`
     * `arketing.indianadunes.com^`
     * `charlestownwyllie.oaklawnnonantum.com^`
     * `-telemetry.officeapps.live.com^`
     * `-tracker.biliapi.net`
     * `-logging.nextmedia.com`
     * `_social_tracking.js^`
     */
    if (line.charCodeAt(line.length + sliceEnd - 1) === 94) { // 94 `^`
      /** line.endsWith('^') */
      sliceEnd -= 1;
    } else if (line.charCodeAt(line.length + sliceEnd - 1) === 124) { // 124 `|`
      /** line.endsWith('|') */
      sliceEnd -= 1;

      if (line.charCodeAt(line.length + sliceEnd - 1) === 94) { // 94 `^`
        /** line.endsWith('^|') */
        sliceEnd -= 1;
      }
    } else if (line.charCodeAt(line.length + sliceEnd - 1) === 46) { // 46 `.`
      /** line.endsWith('.') */
      sliceEnd -= 1;
    }

    const sliced = (sliceStart > 0 || sliceEnd < 0)
      ? line.slice(sliceStart, sliceEnd === 0 ? undefined : sliceEnd)
      : line;

    if (sliced.charCodeAt(0) === 45 /* - */) {
      // line.startsWith('-') is not a valid domain
      result[1] = ParseType.ErrorMessage;
      result[0] = `[parse-filter E0001] (${white ? 'white' : 'black'}) invalid domain: ${JSON.stringify({
        line, sliced, sliceStart, sliceEnd
      })}`;
      return result;
    }

    const suffix = tldts.getPublicSuffix(sliced, looseTldtsOpt);
    if (!suffix) {
      // This excludes domain-like resources like `_social_tracking.js^`
      result[1] = ParseType.Null;
      return result;
    }

    const domain = normalizeDomain(sliced);

    if (domain && domain === sliced) {
      result[0] = domain;

      if (white) {
        result[1] = includeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute;
      } else {
        result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
      }
      return result;
    }

    result[0] = `[parse-filter E0010] (${white ? 'white' : 'black'}) invalid domain: ${JSON.stringify({
      line, domain, suffix, sliced, sliceStart, sliceEnd
    })}`;
    result[1] = ParseType.ErrorMessage;
    return result;
  }