parse-filter.js 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577
  1. // @ts-check
  2. const { fetchWithRetry } = require('./fetch-retry');
  3. const tldts = require('./cached-tld-parse');
  4. const { fetchRemoteTextAndCreateReadlineInterface } = require('./fetch-remote-text-by-line');
  5. const { NetworkFilter } = require('@cliqz/adblocker');
  6. const { processLine } = require('./process-line');
  7. const { performance } = require('perf_hooks');
  8. const { getGorhillPublicSuffixPromise } = require('./get-gorhill-publicsuffix');
  /**
   * Domain to trace through the list processors in this file, or `null` to
   * disable tracing.
   */
  const DEBUG_DOMAIN_TO_FIND = null; // example.com | null
  /** Set to `true` by the processors in this file when the traced domain shows up. */
  let foundDebugDomain = false;
  /** Keys (url + list colour) for which a warning has already been printed. */
  const warnOnceUrl = new Set();
  /**
   * Print a warning for a (url, list colour) pair the first time it is seen
   * and stay silent on every later call with the same pair.
   *
   * @param {string} url - source list the warning is about
   * @param {boolean} isWhite - true for the whitelist, false for the blacklist
   * @param {...unknown} message - extra values forwarded to `console.warn`
   */
  const warnOnce = (url, isWhite, ...message) => {
    const listColour = isWhite ? 'white' : 'black';
    const dedupeKey = `${url}${listColour}`;
    if (!warnOnceUrl.has(dedupeKey)) {
      warnOnceUrl.add(dedupeKey);
      console.warn(url, `(${listColour})`, ...message);
    }
  };
  /**
   * Reduce a candidate host to a plain hostname.
   *
   * Returns `null` for empty input, for IP addresses, and for anything the
   * TLD parser does not flag as ICANN or private. A single leading dot on the
   * parsed hostname is stripped.
   *
   * @param {string | null | undefined} domain
   * @returns {string | null}
   */
  const normalizeDomain = (domain) => {
    if (!domain) {
      return null;
    }

    const { isIp, isIcann, isPrivate, hostname } = tldts.parse(domain);
    if (isIp || !(isIcann || isPrivate)) {
      return null;
    }

    if (hostname?.[0] === '.') {
      return hostname.slice(1);
    }
    return hostname;
  };
  /**
   * Stream a remote plain domain list and collect its entries.
   *
   * Lines starting with `!` are skipped; every other line goes through
   * `processLine`, and falsy results are dropped.
   *
   * @param {string | URL} domainListsUrl
   * @returns {Promise<Set<string>>}
   */
  async function processDomainLists(domainListsUrl) {
    const listUrl = typeof domainListsUrl === 'string'
      ? new URL(domainListsUrl)
      : domainListsUrl;

    /** @type Set<string> */
    const domainSets = new Set();

    const remoteLines = await fetchRemoteTextAndCreateReadlineInterface(listUrl);
    for await (const rawLine of remoteLines) {
      const domainToAdd = rawLine[0] === '!' ? null : processLine(rawLine);
      if (domainToAdd) {
        if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
          warnOnce(listUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
          foundDebugDomain = true;
        }
        domainSets.add(domainToAdd);
      }
    }

    return domainSets;
  }
  /**
   * Stream a remote hosts-format list (`<address> <hostname>` per line) and
   * collect the normalised host names.
   *
   * Each line goes through `processLine`; falsy results are skipped. The
   * first whitespace-separated field (the address) is discarded and the
   * remainder is handed to `normalizeDomain`; entries it rejects are dropped.
   *
   * @param {string | URL} hostsUrl
   * @param {boolean} [includeAllSubDomain] - when true every entry is stored
   *   with a leading dot (`.example.com`) instead of the bare host name
   * @returns {Promise<Set<string>>}
   */
  async function processHosts(hostsUrl, includeAllSubDomain = false) {
    // Build the label exactly once. `console.timeEnd` only stops a timer whose
    // label matches the `console.time` label byte for byte, and a string such
    // as `https://host` stringifies differently once it has become a URL
    // (`https://host/`).
    const timerLabel = ` - processHosts: ${hostsUrl}`;
    console.time(timerLabel);

    const url = typeof hostsUrl === 'string' ? new URL(hostsUrl) : hostsUrl;

    /** @type Set<string> */
    const domainSets = new Set();

    for await (const l of await fetchRemoteTextAndCreateReadlineInterface(url)) {
      const line = processLine(l);
      if (!line) {
        continue;
      }

      // hosts files separate the address from the name with spaces *or* tabs
      const [, ...domains] = line.split(/\s+/);
      const _domain = domains.join(' ').trim();

      if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
        warnOnce(url.href, false, DEBUG_DOMAIN_TO_FIND);
        foundDebugDomain = true;
      }

      const domain = normalizeDomain(_domain);
      if (domain) {
        domainSets.add(includeAllSubDomain ? `.${domain}` : domain);
      }
    }

    console.timeEnd(timerLabel);
    return domainSets;
  }
  /**
   * Download an adblock-style filter list and split the host rules it
   * contains into a whitelist and a blacklist.
   *
   * Without `fallbackUrls` the list is streamed line by line. With fallbacks,
   * the primary URL and every fallback are fetched concurrently, the first
   * response that resolves wins, and the shared AbortController is aborted
   * for the rest.
   *
   * Every line is handed to `parse`; the flag it returns selects the set:
   *   0  -> whitelist, stored with a leading dot
   *   -1 -> whitelist, stored as is
   *   1  -> blacklist, stored as is
   *   2  -> blacklist, stored with a leading dot
   *
   * @param {string | URL} filterRulesUrl
   * @param {readonly (string | URL)[] | undefined} [fallbackUrls]
   * @returns {Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }>}
   * @throws {Error} when `parse` yields an unknown flag, or (rethrown) when
   *   every download in the fallback branch fails
   */
  async function processFilterRules(filterRulesUrl, fallbackUrls) {
    const runStart = performance.now();

    /** @type Set<string> */
    const whitelistDomainSets = new Set();
    /** @type Set<string> */
    const blacklistDomainSets = new Set();

    /**
     * @param {string} domainToBeAddedToBlack
     * @param {boolean} isSubDomain
     */
    const addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
      if (isSubDomain && domainToBeAddedToBlack[0] !== '.') {
        blacklistDomainSets.add(`.${domainToBeAddedToBlack}`);
      } else {
        blacklistDomainSets.add(domainToBeAddedToBlack);
      }
    };

    /**
     * @param {string} domainToBeAddedToWhite
     * @param {boolean} [isSubDomain]
     */
    const addToWhiteList = (domainToBeAddedToWhite, isSubDomain = true) => {
      if (isSubDomain && domainToBeAddedToWhite[0] !== '.') {
        whitelistDomainSets.add(`.${domainToBeAddedToWhite}`);
      } else {
        whitelistDomainSets.add(domainToBeAddedToWhite);
      }
    };

    let downloadTime = 0;
    const gorhill = await getGorhillPublicSuffixPromise();

    /** Parse one raw rule line and file the result into the matching set. */
    const lineCb = (line) => {
      const result = parse(line, gorhill);
      if (!result) {
        return;
      }

      const [hostname, flag] = result;

      if (DEBUG_DOMAIN_TO_FIND && hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
        warnOnce(filterRulesUrl.toString(), flag === 0 || flag === -1, DEBUG_DOMAIN_TO_FIND);
        foundDebugDomain = true;
        console.log({ result, flag });
      }

      switch (flag) {
        case 0:
          addToWhiteList(hostname, true);
          break;
        case -1:
          addToWhiteList(hostname, false);
          break;
        case 1:
          addToBlackList(hostname, false);
          break;
        case 2:
          addToBlackList(hostname, true);
          break;
        default:
          throw new Error(`Unknown flag: ${flag}`);
      }
    };

    if (!fallbackUrls || fallbackUrls.length === 0) {
      let last = performance.now();
      for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) {
        // time spent waiting for this line to arrive
        downloadTime += performance.now() - last;
        // don't trim here
        lineCb(line);
        // restart the clock only after parsing, so parse time is not
        // reported as download time
        last = performance.now();
      }
    } else {
      /** @type string[] */
      let filterRules;
      const downloadStart = performance.now();
      try {
        const controller = new AbortController();
        const firstText = await Promise.any(
          [filterRulesUrl, ...fallbackUrls].map(async (url) => {
            const text = await fetchWithRetry(url, { signal: controller.signal }).then(r => r.text());
            // first body received: cancel the mirrors that are still in flight
            controller.abort();
            return text;
          })
        );
        filterRules = firstText.split('\n');
      } catch (e) {
        console.log(`Download Rule for [${filterRulesUrl}] failed`);
        throw e;
      }
      downloadTime = performance.now() - downloadStart;

      for (let i = 0, len = filterRules.length; i < len; i++) {
        lineCb(filterRules[i]);
      }
    }

    console.log(` ┬ processFilterRules (${filterRulesUrl}): ${(performance.now() - runStart).toFixed(3)}ms`);
    console.log(` └── download time: ${downloadTime.toFixed(3)}ms`);

    return {
      white: whitelistDomainSets,
      black: blacklistDomainSets,
      foundDebugDomain
    };
  }
  // Any of these characters rules a line out as a plain host rule.
  const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#&%~=]/;
  // Modifiers whose rules are never turned into a domain entry.
  const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;

  /**
   * Try to turn one adblock-style rule into a `[hostname, flag]` pair.
   *
   * The line is first screened with cheap string checks, then offered to
   * `NetworkFilter.parse`. Rules that cannot be handled that way fall through
   * to a series of hand-written shape matchers (`@@…`, `|…^`, `.…^`, `://…^`,
   * `…^`, bare domain).
   *
   * @param {string} $line - raw rule, not yet trimmed
   * @param {import('gorhill-publicsuffixlist').default} gorhill
   * @returns {null | [hostname: string, flag: 0 | 1 | 2 | -1]} - 0 white include subdomain, 1 black absolute, 2 black include subdomain, -1 white
   */
  function parse($line, gorhill) {
    if (
      // doesn't include
      !$line.includes('.') // rule without dot can not be a domain
      // includes
      || $line.includes('!')
      || $line.includes('?')
      || $line.includes('*')
      || $line.includes('[')
      || $line.includes('(')
      || $line.includes(']')
      || $line.includes(')')
      || $line.includes(',')
      || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test($line)
    ) {
      return null;
    }

    const line = $line.trim();
    const len = line.length;
    if (len === 0) {
      return null;
    }

    const firstChar = line[0];
    const lastChar = line[len - 1];

    if (
      firstChar === '/'
      // ends with
      || lastChar === '.'
      || lastChar === '-'
      || lastChar === '_'
      // special modifier
      || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
      // a path or a port, unless it is part of a `scheme://` prefix
      || ((line.includes('/') || line.includes(':')) && !line.includes('://'))
    ) {
      return null;
    }

    const filter = NetworkFilter.parse(line);
    if (filter) {
      if (
        filter.isElemHide()
        || filter.isGenericHide()
        || filter.isSpecificHide()
        || filter.isRedirect()
        || filter.isRedirectRule()
        || filter.hasDomains()
        || filter.isCSP() // must not be csp rule
        || (!filter.fromAny() && !filter.fromDocument())
      ) {
        // not supported type
        return null;
      }

      if (
        filter.hostname // must have
        && filter.isPlain() // isPlain() === !isRegex()
        && (!filter.isFullRegex())
      ) {
        if (!gorhill.getDomain(filter.hostname)) {
          return null;
        }
        const hostname = normalizeDomain(filter.hostname);
        if (!hostname) {
          return null;
        }

        // `||` anchors the rule to the host and its subdomains
        const isIncludeAllSubDomain = filter.isHostnameAnchor();

        if (filter.isException() || filter.isBadFilter()) {
          return [hostname, isIncludeAllSubDomain ? 0 : -1];
        }

        const _1p = filter.firstParty();
        const _3p = filter.thirdParty();
        if (_1p) {
          if (_1p === _3p) {
            return [hostname, isIncludeAllSubDomain ? 2 : 1];
          }
          return null;
        }
        if (_3p) {
          return null;
        }
      }
    }

    /**
     * abnormal filter that can not be parsed by NetworkFilter
     */

    if (line.includes('$third-party') || line.includes('$frame')) {
      /*
       * `.bbelements.com^$third-party`
       * `://o0e.ru^$third-party`
       */
      return null;
    }

    const linedEndsWithCaret = lastChar === '^';
    const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^';
    const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;
    const lineEndsWithCname = line.endsWith('$cname');

    // whitelist (exception)
    if (firstChar === '@' && line[1] === '@') {
      /**
       * cname exceptional filter can not be parsed by NetworkFilter
       *
       * `@@||m.faz.net^$cname`
       *
       * Surge / Clash can't handle CNAME either, so we just ignore them
       */
      if (lineEndsWithCname) {
        return null;
      }

      /**
       * Some "malformed" regex-based filters can not be parsed by NetworkFilter
       * "$genericblock`" is also not supported by NetworkFilter
       *
       * `@@||cmechina.net^$genericblock`
       * `@@|ftp.bmp.ovh^|`
       * `@@|adsterra.com^|`
       */
      if (
        (line[2] === '|' || line[2] === '.')
        && (
          lineEndsWithCaretOrCaretVerticalBar
          || line.endsWith('$genericblock')
          || line.endsWith('$document')
        )
      ) {
        const _domain = line
          .replace('@@||', '')
          .replace('@@|', '')
          .replace('@@.', '')
          .replace('^|', '')
          .replace('^$genericblock', '')
          .replace('$genericblock', '')
          .replace('^$document', '')
          .replace('$document', '')
          .replaceAll('^', '')
          .trim();

        const domain = normalizeDomain(_domain);
        if (domain) {
          return [domain, 0];
        }
        // this branch handles exception rules, so the entry belongs to the whitelist
        console.warn(' * [parse-filter E0001] (white) invalid domain:', _domain);
        return null;
      }
    }

    if (firstChar === '|' && (lineEndsWithCaretOrCaretVerticalBar || lineEndsWithCname)) {
      /**
       * Some malformed filters can not be parsed by NetworkFilter:
       *
       * `||smetrics.teambeachbody.com^.com^`
       * `||solutions.|pages.indigovision.com^`
       * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
       */
      const includeAllSubDomain = line[1] === '|';
      const sliceStart = includeAllSubDomain ? 2 : 1;

      let sliceEnd;
      if (linedEndsWithCaret) {
        sliceEnd = -1; // `^`
      } else if (lineEndsWithCaretVerticalBar) {
        sliceEnd = -2; // `^|`
      } else if (line[len - 7] === '^') {
        sliceEnd = -7; // `^$cname` — the separator must go together with the modifier
      } else {
        sliceEnd = -6; // `$cname`
      }

      const _domain = line
        .slice(sliceStart, sliceEnd) // we already made sure the line starts with `|`
        .trim();

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, includeAllSubDomain ? 2 : 1];
      }
      console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
      return null;
    }

    const lineStartsWithSingleDot = firstChar === '.';
    if (lineStartsWithSingleDot && lineEndsWithCaretOrCaretVerticalBar) {
      /**
       * `.ay.delivery^`
       * `.m.bookben.com^`
       * `.wap.x4399.com^`
       */
      const _domain = line
        .slice(1, linedEndsWithCaret ? -1 : -2) // drop the prefix dot and the `^` / `^|` tail
        .replace('^|', '')
        .replaceAll('^', '')
        .trim();

      const suffix = gorhill.getPublicSuffix(_domain);
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This excludes domain-like resources such as `1.1.4.514.js`
        return null;
      }

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, 2];
      }
      console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
      return null;
    }

    /**
     * `|http://x.o2.pl^`
     * `://mine.torrent.pw^`
     * `://say.ac^`
     */
    if (
      (
        line.startsWith('://')
        || line.startsWith('http://')
        || line.startsWith('https://')
        || line.startsWith('|http://')
        || line.startsWith('|https://')
      )
      && lineEndsWithCaretOrCaretVerticalBar
    ) {
      const _domain = line
        .replace('|https://', '')
        .replace('https://', '')
        .replace('|http://', '')
        .replace('http://', '')
        .replace('://', '')
        .replace('^|', '')
        .replaceAll('^', '')
        .trim();

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, 1];
      }
      console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
      return null;
    }

    /**
     * `_vmind.qqvideo.tc.qq.com^`
     * `arketing.indianadunes.com^`
     * `charlestownwyllie.oaklawnnonantum.com^`
     * `-telemetry.officeapps.live.com^`
     * `-tracker.biliapi.net`
     * `_social_tracking.js^`
     */
    if (firstChar !== '|' && lastChar === '^') {
      const _domain = line.slice(0, -1);

      const suffix = gorhill.getPublicSuffix(_domain);
      if (!suffix || !gorhill.suffixInPSL(suffix)) {
        // This excludes domain-like resources such as `_social_tracking.js^`
        return null;
      }

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, 1];
      }
      console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
      return null;
    }

    /**
     * `.3.n.2.2.l30.js`
     * `_prebid.js`
     * `t.yesware.com`
     * `ubmcmm.baidustatic.com`
     * `portal.librus.pl$$advertisement-module`
     * `@@-ds.metric.gstatic.com^|`
     * `://gom.ge/cookie.js`
     * `://accout-update-smba.jp.$document`
     * `@@://googleadservices.com^|`
     */
    const tryNormalizeDomain = normalizeDomain(line);
    if (tryNormalizeDomain) {
      if (tryNormalizeDomain === line) {
        // the entire rule is a domain
        return [line, 2];
      }
      if (lineStartsWithSingleDot && tryNormalizeDomain === line.slice(1)) {
        // only the prefix dot was stripped by normalisation
        return [line, 2];
      }
    }

    if (!line.endsWith('.js') && !line.endsWith('.css')) {
      console.warn(' * [parse-filter E0010] can not parse:', line);
    }
    return null;
  }
  497. module.exports.processDomainLists = processDomainLists;
  498. module.exports.processHosts = processHosts;
  499. module.exports.processFilterRules = processFilterRules;