parse-filter.js 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461
  1. // @ts-check
  2. const { fetchWithRetry } = require('./fetch-retry');
  3. const { fetchRemoteTextAndCreateReadlineInterface } = require('./fetch-remote-text-by-line');
  4. const { NetworkFilter } = require('@cliqz/adblocker');
  5. const { normalizeDomain } = require('./is-domain-loose');
  6. const { processLine } = require('./process-line');
  7. const { performance } = require('perf_hooks');
  8. const DEBUG_DOMAIN_TO_FIND = null; // example.com | null
  9. let foundDebugDomain = false;
  10. const warnOnceUrl = new Set();
  11. const warnOnce = (url, isWhite, ...message) => {
  12. const key = `${url}${isWhite ? 'white' : 'black'}`;
  13. if (warnOnceUrl.has(key)) {
  14. return;
  15. }
  16. warnOnceUrl.add(key);
  17. console.warn(url, isWhite ? '(white)' : '(black)', ...message);
  18. };
  19. /**
  20. * @param {string | URL} domainListsUrl
  21. */
  22. async function processDomainLists(domainListsUrl) {
  23. if (typeof domainListsUrl === 'string') {
  24. domainListsUrl = new URL(domainListsUrl);
  25. }
  26. /** @type Set<string> */
  27. const domainSets = new Set();
  28. for await (const line of await fetchRemoteTextAndCreateReadlineInterface(domainListsUrl)) {
  29. if (line[0] === '!') {
  30. continue;
  31. }
  32. const domainToAdd = processLine(line);
  33. if (!domainToAdd) {
  34. continue;
  35. }
  36. if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
  37. warnOnce(domainListsUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  38. foundDebugDomain = true;
  39. }
  40. domainSets.add(domainToAdd);
  41. }
  42. return domainSets;
  43. }
  44. /**
  45. * @param {string | URL} hostsUrl
  46. */
  47. async function processHosts(hostsUrl, includeAllSubDomain = false) {
  48. console.time(` - processHosts: ${hostsUrl}`);
  49. if (typeof hostsUrl === 'string') {
  50. hostsUrl = new URL(hostsUrl);
  51. }
  52. /** @type Set<string> */
  53. const domainSets = new Set();
  54. for await (const l of await fetchRemoteTextAndCreateReadlineInterface(hostsUrl)) {
  55. const line = processLine(l);
  56. if (!line) {
  57. continue;
  58. }
  59. const [, ...domains] = line.split(' ');
  60. const _domain = domains.join(' ').trim();
  61. if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
  62. warnOnce(hostsUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  63. foundDebugDomain = true;
  64. }
  65. const domain = normalizeDomain(_domain);
  66. if (domain) {
  67. if (includeAllSubDomain) {
  68. domainSets.add(`.${domain}`);
  69. } else {
  70. domainSets.add(domain);
  71. }
  72. }
  73. }
  74. console.timeEnd(` - processHosts: ${hostsUrl}`);
  75. return domainSets;
  76. }
// parse() returns null for any line containing one of these characters: # & % ~ =
const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#&%~=]/;
// parse() returns null for any line carrying one of these modifiers: $popup, $removeparam, $popunder
const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;
  79. /**
  80. * @param {string | URL} filterRulesUrl
  81. * @param {readonly (string | URL)[] | undefined} [fallbackUrls]
  82. * @returns {Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }>}
  83. */
  84. async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdParties = false) {
  85. const runStart = performance.now();
  86. /** @type Set<string> */
  87. const whitelistDomainSets = new Set();
  88. /** @type Set<string> */
  89. const blacklistDomainSets = new Set();
  90. const __addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
  91. if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
  92. warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  93. foundDebugDomain = true;
  94. }
  95. if (isSubDomain && domainToBeAddedToBlack[0] !== '.') {
  96. blacklistDomainSets.add(`.${domainToBeAddedToBlack}`);
  97. } else {
  98. blacklistDomainSets.add(domainToBeAddedToBlack);
  99. }
  100. };
  101. const addToBlackList = DEBUG_DOMAIN_TO_FIND == null
  102. ? __addToBlackList
  103. : (domainToBeAddedToBlack, isSubDomain) => {
  104. if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
  105. warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  106. foundDebugDomain = true;
  107. }
  108. __addToBlackList(domainToBeAddedToBlack, isSubDomain);
  109. };
  110. const __addToWhiteList = (domainToBeAddedToWhite) => {
  111. whitelistDomainSets.add(domainToBeAddedToWhite);
  112. };
  113. const addToWhiteList = DEBUG_DOMAIN_TO_FIND == null
  114. ? __addToWhiteList
  115. : (domainToBeAddedToWhite) => {
  116. if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToWhite.includes(DEBUG_DOMAIN_TO_FIND)) {
  117. warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND);
  118. foundDebugDomain = true;
  119. }
  120. __addToWhiteList(domainToBeAddedToWhite);
  121. };
  122. let downloadTime = 0;
  123. const lineCb = (line) => {
  124. const result = parse(line, includeThirdParties);
  125. if (result) {
  126. const flag = result[1];
  127. const hostname = result[0];
  128. switch (flag) {
  129. case 0:
  130. addToWhiteList(hostname);
  131. break;
  132. case 1:
  133. addToBlackList(hostname, false);
  134. break;
  135. case 2:
  136. addToBlackList(hostname, true);
  137. break;
  138. default:
  139. throw new Error(`Unknown flag: ${flag}`);
  140. }
  141. }
  142. };
  143. if (!fallbackUrls || fallbackUrls.length === 0) {
  144. const downloadStart = performance.now();
  145. for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) {
  146. lineCb(line.trim());
  147. }
  148. downloadTime = performance.now() - downloadStart;
  149. } else {
  150. let filterRules;
  151. const downloadStart = performance.now();
  152. try {
  153. const controller = new AbortController();
  154. const signal = controller.signal;
  155. /** @type string[] */
  156. filterRules = (
  157. await Promise.any(
  158. [filterRulesUrl, ...(fallbackUrls || [])].map(
  159. url => fetchWithRetry(url, { signal })
  160. .then(r => r.text())
  161. .then(text => {
  162. controller.abort();
  163. return text;
  164. })
  165. )
  166. )
  167. ).split('\n').map(line => line.trim());
  168. } catch (e) {
  169. console.log(`Download Rule for [${filterRulesUrl}] failed`);
  170. throw e;
  171. }
  172. downloadTime = performance.now() - downloadStart;
  173. for (let i = 0, len = filterRules.length; i < len; i++) {
  174. const line = filterRules[i].trim();
  175. lineCb(line);
  176. }
  177. }
  178. console.log(` ┬ processFilterRules (${filterRulesUrl}): ${(performance.now() - runStart).toFixed(3)}ms`);
  179. console.log(` └── download time: ${downloadTime.toFixed(3)}ms`);
  180. return {
  181. white: whitelistDomainSets,
  182. black: blacklistDomainSets,
  183. foundDebugDomain
  184. };
  185. }
  186. /**
  187. * @param {string} $line
  188. * @param {boolean} includeThirdParties
  189. * @returns {null | [string, 0 | 1 | 2]} - 0 white, 1 black abosulte, 2 black include subdomain
  190. */
  191. function parse($line, includeThirdParties) {
  192. const line = $line.trim();
  193. if (
  194. line === ''
  195. || line[0] === '/'
  196. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test(line)
  197. // doesn't include
  198. || !line.includes('.') // rule with out dot can not be a domain
  199. // includes
  200. // || line.includes('#')
  201. || line.includes('!')
  202. || line.includes('?')
  203. || line.includes('*')
  204. // || line.includes('=')
  205. || line.includes('[')
  206. || line.includes('(')
  207. || line.includes(']')
  208. || line.includes(')')
  209. || line.includes(',')
  210. // || line.includes('~')
  211. // || line.includes('&')
  212. // || line.includes('%')
  213. // ends with
  214. || line.endsWith('.')
  215. || line.endsWith('-')
  216. || line.endsWith('_')
  217. // special modifier
  218. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
  219. || ((line.includes('/') || line.includes(':')) && !line.includes('://'))
  220. // || line.includes('$popup')
  221. // || line.includes('$removeparam')
  222. // || line.includes('$popunder')
  223. ) {
  224. return null;
  225. }
  226. const filter = NetworkFilter.parse(line);
  227. if (filter) {
  228. if (
  229. filter.isElemHide()
  230. || filter.isGenericHide()
  231. || filter.isSpecificHide()
  232. || filter.isRedirect()
  233. || filter.isRedirectRule()
  234. || filter.hasDomains()
  235. || filter.isCSP() // must not be csp rule
  236. || (!filter.fromAny() && !filter.fromDocument())
  237. ) {
  238. // not supported type
  239. return null;
  240. }
  241. if (
  242. filter.hasHostname() // must have
  243. && filter.isPlain()
  244. && (!filter.isRegex())
  245. && (!filter.isFullRegex())
  246. ) {
  247. const hostname = normalizeDomain(filter.getHostname());
  248. if (hostname) {
  249. if (filter.isException() || filter.isBadFilter()) {
  250. return [hostname, 0];
  251. }
  252. if (filter.firstParty() === filter.thirdParty()) {
  253. return [hostname, 2];
  254. }
  255. if (filter.thirdParty()) {
  256. if (includeThirdParties) {
  257. return [hostname, 2];
  258. }
  259. return null;
  260. }
  261. if (filter.firstParty()) {
  262. return null;
  263. }
  264. } else {
  265. return null;
  266. }
  267. }
  268. }
  269. if (line.includes('$third-party') || line.includes('$frame')) {
  270. return null;
  271. }
  272. const lineEndsWithCaret = line.endsWith('^');
  273. const lineEndsWithCaretVerticalBar = line.endsWith('^|');
  274. if (line[0] === '@' && line[1] === '@') {
  275. if (line.endsWith('$cname')) {
  276. return null;
  277. }
  278. if (
  279. // (line.startsWith('@@|') || line.startsWith('@@.'))
  280. (
  281. line[2] === '|'
  282. || line[2] === '.'
  283. )
  284. && (
  285. lineEndsWithCaret
  286. || lineEndsWithCaretVerticalBar
  287. || line.endsWith('$genericblock')
  288. || line.endsWith('$document')
  289. )
  290. ) {
  291. const _domain = line
  292. .replace('@@||', '')
  293. .replace('@@|', '')
  294. .replace('@@.', '')
  295. .replace('^|', '')
  296. .replace('^$genericblock', '')
  297. .replace('$genericblock', '')
  298. .replace('^$document', '')
  299. .replace('$document', '')
  300. .replaceAll('^', '')
  301. .trim();
  302. const domain = normalizeDomain(_domain);
  303. if (domain) {
  304. return [domain, 0];
  305. }
  306. console.warn(' * [parse-filter E0001] (black) invalid domain:', _domain);
  307. return null;
  308. }
  309. }
  310. if (
  311. line.startsWith('||')
  312. && (
  313. lineEndsWithCaret
  314. || lineEndsWithCaretVerticalBar
  315. || line.endsWith('$cname')
  316. )
  317. ) {
  318. const _domain = line
  319. .replace('||', '')
  320. .replace('^|', '')
  321. .replace('$cname', '')
  322. .replaceAll('^', '')
  323. .trim();
  324. const domain = normalizeDomain(_domain);
  325. if (domain) {
  326. return [domain, 2];
  327. }
  328. console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
  329. return null;
  330. }
  331. const lineStartsWithSingleDot = line.startsWith('.');
  332. if (
  333. lineStartsWithSingleDot
  334. && (
  335. lineEndsWithCaret
  336. || lineEndsWithCaretVerticalBar
  337. )
  338. ) {
  339. const _domain = line
  340. .replace('^|', '')
  341. .replaceAll('^', '')
  342. .slice(1)
  343. .trim();
  344. const domain = normalizeDomain(_domain);
  345. if (domain) {
  346. return [domain, 2];
  347. }
  348. console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
  349. return null;
  350. }
  351. if (
  352. (
  353. line.startsWith('://')
  354. || line.startsWith('http://')
  355. || line.startsWith('https://')
  356. || line.startsWith('|http://')
  357. || line.startsWith('|https://')
  358. )
  359. && (
  360. lineEndsWithCaret
  361. || lineEndsWithCaretVerticalBar
  362. )
  363. ) {
  364. const _domain = line
  365. .replace('|https://', '')
  366. .replace('https://', '')
  367. .replace('|http://', '')
  368. .replace('http://', '')
  369. .replace('://', '')
  370. .replace('^|', '')
  371. .replaceAll('^', '')
  372. .trim();
  373. const domain = normalizeDomain(_domain);
  374. if (domain) {
  375. return [domain, 1];
  376. }
  377. console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
  378. return null;
  379. }
  380. if (line[0] !== '|' && lineEndsWithCaret) {
  381. const _domain = line.slice(0, -1);
  382. const domain = normalizeDomain(_domain);
  383. if (domain) {
  384. return [domain, 1];
  385. }
  386. console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
  387. return null;
  388. }
  389. const tryNormalizeDomain = normalizeDomain(lineStartsWithSingleDot ? line.slice(1) : line);
  390. if (
  391. tryNormalizeDomain
  392. && (
  393. lineStartsWithSingleDot
  394. ? tryNormalizeDomain.length === line.length - 1
  395. : tryNormalizeDomain === line
  396. )
  397. ) {
  398. return [line, 2];
  399. }
  400. if (!line.endsWith('.js')) {
  401. console.warn(' * [parse-filter E0010] can not parse:', line);
  402. }
  403. return null;
  404. }
// Public API: the three list processors. `parse` and `warnOnce` are not exported.
module.exports.processDomainLists = processDomainLists;
module.exports.processHosts = processHosts;
module.exports.processFilterRules = processFilterRules;