parse-filter.js 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454
  1. // @ts-check
  2. const { fetchWithRetry } = require('./fetch-retry');
  3. const { fetchRemoteTextAndCreateReadlineInterface } = require('./fetch-remote-text-by-line');
  4. const { NetworkFilter } = require('@cliqz/adblocker');
  5. const { normalizeDomain } = require('./is-domain-loose');
  6. const { processLine } = require('./process-line');
  7. const { performance } = require('perf_hooks');
  // Debugging aid: set this to a domain substring (e.g. 'example.com') and every
  // list that contributes a matching entry is reported through warnOnce().
  // Leave it as null to disable the checks.
  const DEBUG_DOMAIN_TO_FIND = null; // example.com | null
  // Set to true by the processors in this file once an entry containing
  // DEBUG_DOMAIN_TO_FIND has been seen; processFilterRules() returns its current value.
  let foundDebugDomain = false;
  /** Keys (url followed by 'white' or 'black') that have already been reported. */
  const warnOnceUrl = new Set();

  /**
   * Emits a console warning at most once per (url, list colour) pair.
   *
   * @param {string} url - source list the warning is about; printed first
   * @param {boolean} isWhite - true for the white list, false for the black list
   * @param {...any} message - extra values forwarded to console.warn
   */
  const warnOnce = (url, isWhite, ...message) => {
    const colour = isWhite ? 'white' : 'black';
    const seenKey = `${url}${colour}`;

    if (!warnOnceUrl.has(seenKey)) {
      warnOnceUrl.add(seenKey);
      console.warn(url, `(${colour})`, ...message);
    }
  };
  /**
   * Downloads a line-based domain list and collects its entries.
   *
   * Lines whose first character is `!` are skipped outright; every other line is
   * handed to processLine() and kept only when that returns a truthy value.
   *
   * @param {string | URL} domainListsUrl - list location; strings are converted to URL
   * @returns {Promise<Set<string>>} the collected entries
   */
  async function processDomainLists(domainListsUrl) {
    const listUrl = typeof domainListsUrl === 'string'
      ? new URL(domainListsUrl)
      : domainListsUrl;

    /** @type Set<string> */
    const collected = new Set();

    const remoteLines = await fetchRemoteTextAndCreateReadlineInterface(listUrl);
    for await (const rawLine of remoteLines) {
      const entry = rawLine[0] === '!' ? null : processLine(rawLine);

      if (entry) {
        // Optional debugging: report which list the watched domain came from.
        if (DEBUG_DOMAIN_TO_FIND && entry.includes(DEBUG_DOMAIN_TO_FIND)) {
          warnOnce(listUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
          foundDebugDomain = true;
        }
        collected.add(entry);
      }
    }

    return collected;
  }
  /**
   * Downloads a hosts-style file (`<address> <hostname>` per line) and collects
   * the hostnames, timing the whole run with console.time/console.timeEnd.
   *
   * Each line first goes through processLine(); lines it rejects are skipped.
   * The first whitespace-separated field is discarded and the remainder is passed
   * to normalizeDomain(); only truthy results are kept.
   *
   * @param {string | URL} hostsUrl - file location; strings are converted to URL
   * @param {boolean} [includeAllSubDomain=false] - when true every entry is stored
   *   with a leading dot (`.example.com`) instead of the bare hostname
   * @returns {Promise<Set<string>>} the collected hostnames
   */
  async function processHosts(hostsUrl, includeAllSubDomain = false) {
    const sourceUrl = typeof hostsUrl === 'string' ? new URL(hostsUrl) : hostsUrl;

    // Build the label once, from the normalised URL. Deriving it separately before
    // and after the string -> URL conversion can yield two different labels
    // (e.g. 'http://h.test' vs 'http://h.test/'), leaving the timer unterminated.
    const timerLabel = ` - processHosts: ${sourceUrl}`;
    console.time(timerLabel);

    /** @type Set<string> */
    const domainSets = new Set();

    for await (const l of await fetchRemoteTextAndCreateReadlineInterface(sourceUrl)) {
      const line = processLine(l);
      if (!line) {
        continue;
      }

      // Hosts files separate the address from the hostname with spaces OR tabs,
      // so split on any whitespace run rather than on a single space character.
      const [, ...domains] = line.trim().split(/\s+/);
      const _domain = domains.join(' ').trim();

      if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
        warnOnce(sourceUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
        foundDebugDomain = true;
      }

      const domain = normalizeDomain(_domain);
      if (domain) {
        domainSets.add(includeAllSubDomain ? `.${domain}` : domain);
      }
    }

    console.timeEnd(timerLabel);
    return domainSets;
  }
  // parse() returns null right away for any line containing one of these
  // characters: # & % ~ =
  const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#&%~=]/;
  // parse() also returns null right away for lines carrying one of these
  // options: $popup, $removeparam, $popunder
  const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;
  /**
   * Downloads a filter list, runs every line through parse() and sorts the
   * resulting hostnames into a white set and a black set.
   *
   * Without fallback URLs the list is consumed line by line as it is read. With
   * fallback URLs, the primary and all fallbacks are requested at once; the first
   * response to arrive wins and the shared AbortController is then aborted.
   * Elapsed total and download times are logged to the console.
   *
   * @param {string | URL} filterRulesUrl
   * @param {readonly (string | URL)[] | undefined} [fallbackUrls]
   * @param {boolean} [includeThirdParties=false] - forwarded unchanged to parse()
   * @returns {Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }>}
   * @throws {Error} when parse() yields a flag other than 0, 1 or 2, or when every
   *   download in the fallback path fails (the original error is rethrown)
   */
  async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdParties = false) {
    const runStart = performance.now();

    /** @type Set<string> */
    const white = new Set();
    /** @type Set<string> */
    const black = new Set();

    // Reports (once per list and colour) that the watched debug domain showed up.
    const reportDebugHit = (domain, isWhite) => {
      if (DEBUG_DOMAIN_TO_FIND && domain.includes(DEBUG_DOMAIN_TO_FIND)) {
        warnOnce(filterRulesUrl.toString(), isWhite, DEBUG_DOMAIN_TO_FIND);
        foundDebugDomain = true;
      }
    };

    // Sub-domain rules are stored with a leading dot, added here if missing.
    const pushBlack = (domain, isSubDomain) => {
      reportDebugHit(domain, false);
      const needsDot = isSubDomain && domain[0] !== '.';
      black.add(needsDot ? `.${domain}` : domain);
    };
    const pushWhite = (domain) => {
      white.add(domain);
    };

    // With debugging switched on, the sinks are wrapped with an extra probe.
    const addToBlackList = DEBUG_DOMAIN_TO_FIND == null
      ? pushBlack
      : (domain, isSubDomain) => {
        reportDebugHit(domain, false);
        pushBlack(domain, isSubDomain);
      };
    const addToWhiteList = DEBUG_DOMAIN_TO_FIND == null
      ? pushWhite
      : (domain) => {
        reportDebugHit(domain, true);
        pushWhite(domain);
      };

    // Routes one parsed line to the matching set: 0 white, 1 black, 2 black + subdomains.
    const handleLine = (line) => {
      const parsed = parse(line, includeThirdParties);
      if (!parsed) {
        return;
      }
      const [hostname, flag] = parsed;
      if (flag === 0) {
        addToWhiteList(hostname);
      } else if (flag === 1) {
        addToBlackList(hostname, false);
      } else if (flag === 2) {
        addToBlackList(hostname, true);
      } else {
        throw new Error(`Unknown flag: ${flag}`);
      }
    };

    let downloadTime = 0;

    if (!fallbackUrls || fallbackUrls.length === 0) {
      // Single source: handle each line as soon as it is read.
      const downloadStart = performance.now();
      const remoteLines = await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl);
      for await (const rawLine of remoteLines) {
        handleLine(rawLine.trim());
      }
      downloadTime = performance.now() - downloadStart;
    } else {
      // Several mirrors: race them and keep whichever body arrives first.
      const downloadStart = performance.now();
      /** @type string[] */
      let filterRules;
      try {
        const controller = new AbortController();
        const candidates = [filterRulesUrl, ...(fallbackUrls || [])];
        const winner = await Promise.any(
          candidates.map(async (url) => {
            const response = await fetchWithRetry(url, { signal: controller.signal });
            const text = await response.text();
            controller.abort();
            return text;
          })
        );
        filterRules = winner.split('\n').map((ruleLine) => ruleLine.trim());
      } catch (e) {
        console.log(`Download Rule for [${filterRulesUrl}] failed`);
        throw e;
      }
      downloadTime = performance.now() - downloadStart;

      for (const ruleLine of filterRules) {
        handleLine(ruleLine.trim());
      }
    }

    console.log(` ┬ processFilterRules (${filterRulesUrl}): ${(performance.now() - runStart).toFixed(3)}ms`);
    console.log(` └── download time: ${downloadTime.toFixed(3)}ms`);

    return {
      white,
      black,
      foundDebugDomain
    };
  }
  /**
   * Tries to turn one filter-list line into a hostname rule.
   *
   * Stages, in order:
   *  1. cheap textual checks discard lines that cannot be a bare hostname rule;
   *  2. NetworkFilter.parse() is consulted; if it yields a plain hostname filter
   *     the answer is derived from the filter's own flags;
   *  3. otherwise a handful of hand-written patterns (`@@||host^`, `||host^`,
   *     `.host^`, `http(s)://host^`, `host^`, bare `host`) are tried.
   *
   * Lines that match a pattern but whose hostname is rejected by normalizeDomain()
   * produce a console warning and yield null.
   *
   * @param {string} $line
   * @param {boolean} includeThirdParties - when true, third-party-only filters
   *   from stage 2 are accepted as black rules instead of being dropped
   * @returns {null | [string, 0 | 1 | 2]} - 0 white, 1 black absolute, 2 black include subdomain
   */
  function parse($line, includeThirdParties) {
    const line = $line.trim();

    // Stage 1: quick rejections.
    if (
      line === ''
      || line[0] === '/'
      // contains one of: # & % ~ =
      || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test(line)
      // doesn't include
      || !line.includes('.') // a rule without a dot can not be a domain
      // includes
      || line.includes('!')
      || line.includes('?')
      || line.includes('*')
      || line.includes('[')
      || line.includes('(')
      || line.includes(']')
      || line.includes(')')
      || line.includes(',')
      // ends with
      || line.endsWith('.')
      || line.endsWith('-')
      || line.endsWith('_')
      // special modifier: $popup, $removeparam, $popunder
      || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
      // a slash or colon is only tolerated as part of a scheme separator
      || ((line.includes('/') || line.includes(':')) && !line.includes('://'))
    ) {
      return null;
    }

    // Stage 2: let the adblocker library interpret the rule.
    const filter = NetworkFilter.parse(line);
    if (filter) {
      if (
        filter.isElemHide()
        || filter.isGenericHide()
        || filter.isSpecificHide()
        || filter.isRedirect()
        || filter.isRedirectRule()
        || filter.hasDomains()
        || filter.isCSP() // must not be csp rule
        || (!filter.fromAny() && !filter.fromDocument())
      ) {
        // not supported type
        return null;
      }

      if (
        filter.hasHostname() // must have
        && filter.isPlain()
        && (!filter.isRegex())
        && (!filter.isFullRegex())
      ) {
        const hostname = normalizeDomain(filter.getHostname());
        if (!hostname) {
          return null;
        }
        if (filter.isException() || filter.isBadFilter()) {
          return [hostname, 0];
        }
        // Both flags agree: the rule is not restricted to one party.
        if (filter.firstParty() === filter.thirdParty()) {
          return [hostname, 2];
        }
        if (filter.thirdParty()) {
          if (includeThirdParties) {
            return [hostname, 2];
          }
          return null;
        }
        if (filter.firstParty()) {
          return null;
        }
      }
    }

    // Stage 3: hand-written patterns.
    if (line.includes('$third-party') || line.includes('$frame')) {
      return null;
    }

    const lineEndsWithCaret = line.endsWith('^');
    const lineEndsWithCaretVerticalBar = line.endsWith('^|');

    // Exception rules: @@||host^, @@|host^, @@.host^ (optionally $genericblock / $document).
    if (line[0] === '@' && line[1] === '@') {
      if (line.endsWith('$cname')) {
        return null;
      }

      if (
        (line[2] === '|' || line[2] === '.')
        && (
          lineEndsWithCaret
          || lineEndsWithCaretVerticalBar
          || line.endsWith('$genericblock')
          || line.endsWith('$document')
        )
      ) {
        const _domain = line
          .replace('@@||', '')
          .replace('@@|', '')
          .replace('@@.', '')
          .replace('^|', '')
          .replace('^$genericblock', '')
          .replace('$genericblock', '')
          .replace('^$document', '')
          .replace('$document', '')
          .replaceAll('^', '')
          .trim();

        const domain = normalizeDomain(_domain);
        if (domain) {
          return [domain, 0];
        }
        // This branch handles whitelist (exception) rules, so label it as such.
        console.warn(' * [parse-filter E0001] (white) invalid domain:', _domain);
        return null;
      }
    }

    // ||host^, ||host^|, ||host$cname
    if (
      line.startsWith('||')
      && (
        lineEndsWithCaret
        || lineEndsWithCaretVerticalBar
        || line.endsWith('$cname')
      )
    ) {
      const _domain = line
        .replace('||', '')
        .replace('^|', '')
        .replace('$cname', '')
        .replaceAll('^', '')
        .trim();

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, 2];
      }
      console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
      return null;
    }

    // .host^ and .host^|
    const lineStartsWithSingleDot = line[0] === '.';
    if (
      lineStartsWithSingleDot
      && (
        lineEndsWithCaret
        || lineEndsWithCaretVerticalBar
      )
    ) {
      const _domain = line
        .replace('^|', '')
        .replaceAll('^', '')
        .slice(1)
        .trim();

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, 2];
      }
      console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
      return null;
    }

    // ://host^, http(s)://host^, |http(s)://host^ - exact host only.
    if (
      (
        line.startsWith('://')
        || line.startsWith('http://')
        || line.startsWith('https://')
        || line.startsWith('|http://')
        || line.startsWith('|https://')
      )
      && (
        lineEndsWithCaret
        || lineEndsWithCaretVerticalBar
      )
    ) {
      const _domain = line
        .replace('|https://', '')
        .replace('https://', '')
        .replace('|http://', '')
        .replace('http://', '')
        .replace('://', '')
        .replace('^|', '')
        .replaceAll('^', '')
        .trim();

      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, 1];
      }
      console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
      return null;
    }

    // host^ - exact host only.
    if (line[0] !== '|' && lineEndsWithCaret) {
      const _domain = line.slice(0, -1);
      const domain = normalizeDomain(_domain);
      if (domain) {
        return [domain, 1];
      }
      console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
      return null;
    }

    // Bare hostname (optionally with one leading dot): accept only when
    // normalisation leaves it unchanged apart from that dot.
    const tryNormalizeDomain = normalizeDomain(line);
    if (
      tryNormalizeDomain
      && (
        lineStartsWithSingleDot
          ? tryNormalizeDomain.length === line.length - 1
          : tryNormalizeDomain === line
      )
    ) {
      return [line, 2];
    }

    // Lines ending in .js are dropped silently; anything else is reported.
    if (!line.endsWith('.js')) {
      console.warn(' * [parse-filter E0010] can not parse:', line);
    }
    return null;
  }
  // Public API: the three list processors.
  Object.assign(module.exports, {
    processDomainLists,
    processHosts,
    processFilterRules
  });