parse-filter.js 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453
  1. // @ts-check
  2. const { fetchWithRetry } = require('./fetch-retry');
  3. const { fetchRemoteTextAndCreateReadlineInterface } = require('./fetch-remote-text-by-line');
  4. const { NetworkFilter } = require('@cliqz/adblocker');
  5. const { normalizeDomain } = require('./is-domain-loose');
  6. const { processLine } = require('./process-line');
  7. const { performance } = require('perf_hooks');
  8. const DEBUG_DOMAIN_TO_FIND = null; // example.com | null
  9. let foundDebugDomain = false;
  10. const warnOnceUrl = new Set();
  11. const warnOnce = (url, isWhite, ...message) => {
  12. const key = `${url}${isWhite ? 'white' : 'black'}`;
  13. if (warnOnceUrl.has(key)) {
  14. return;
  15. }
  16. warnOnceUrl.add(key);
  17. console.warn(url, isWhite ? '(white)' : '(black)', ...message);
  18. };
  19. /**
  20. * @param {string | URL} domainListsUrl
  21. */
  22. async function processDomainLists(domainListsUrl) {
  23. if (typeof domainListsUrl === 'string') {
  24. domainListsUrl = new URL(domainListsUrl);
  25. }
  26. /** @type Set<string> */
  27. const domainSets = new Set();
  28. for await (const line of await fetchRemoteTextAndCreateReadlineInterface(domainListsUrl)) {
  29. if (line[0] === '!') {
  30. continue;
  31. }
  32. const domainToAdd = processLine(line);
  33. if (!domainToAdd) {
  34. continue;
  35. }
  36. if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
  37. warnOnce(domainListsUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  38. foundDebugDomain = true;
  39. }
  40. domainSets.add(domainToAdd);
  41. }
  42. return domainSets;
  43. }
  44. /**
  45. * @param {string | URL} hostsUrl
  46. */
  47. async function processHosts(hostsUrl, includeAllSubDomain = false) {
  48. console.time(` - processHosts: ${hostsUrl}`);
  49. if (typeof hostsUrl === 'string') {
  50. hostsUrl = new URL(hostsUrl);
  51. }
  52. /** @type Set<string> */
  53. const domainSets = new Set();
  54. for await (const l of await fetchRemoteTextAndCreateReadlineInterface(hostsUrl)) {
  55. const line = processLine(l);
  56. if (!line) {
  57. continue;
  58. }
  59. const [, ...domains] = line.split(' ');
  60. const _domain = domains.join(' ').trim();
  61. if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
  62. warnOnce(hostsUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  63. foundDebugDomain = true;
  64. }
  65. const domain = normalizeDomain(_domain);
  66. if (domain) {
  67. if (includeAllSubDomain) {
  68. domainSets.add(`.${domain}`);
  69. } else {
  70. domainSets.add(domain);
  71. }
  72. }
  73. }
  74. console.timeEnd(` - processHosts: ${hostsUrl}`);
  75. return domainSets;
  76. }
  77. /**
  78. * @param {string | URL} filterRulesUrl
  79. * @param {readonly (string | URL)[] | undefined} [fallbackUrls]
  80. * @returns {Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }>}
  81. */
  82. async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdParties = false) {
  83. const runStart = performance.now();
  84. /** @type Set<string> */
  85. const whitelistDomainSets = new Set();
  86. /** @type Set<string> */
  87. const blacklistDomainSets = new Set();
  88. const __addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
  89. if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
  90. warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  91. foundDebugDomain = true;
  92. }
  93. if (isSubDomain && domainToBeAddedToBlack[0] !== '.') {
  94. blacklistDomainSets.add(`.${domainToBeAddedToBlack}`);
  95. } else {
  96. blacklistDomainSets.add(domainToBeAddedToBlack);
  97. }
  98. };
  99. const addToBlackList = DEBUG_DOMAIN_TO_FIND == null
  100. ? __addToBlackList
  101. : (domainToBeAddedToBlack, isSubDomain) => {
  102. if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
  103. warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  104. foundDebugDomain = true;
  105. }
  106. __addToBlackList(domainToBeAddedToBlack, isSubDomain);
  107. };
  108. const __addToWhiteList = (domainToBeAddedToWhite) => {
  109. whitelistDomainSets.add(domainToBeAddedToWhite);
  110. };
  111. const addToWhiteList = DEBUG_DOMAIN_TO_FIND == null
  112. ? __addToWhiteList
  113. : (domainToBeAddedToWhite) => {
  114. if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToWhite.includes(DEBUG_DOMAIN_TO_FIND)) {
  115. warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND);
  116. foundDebugDomain = true;
  117. }
  118. __addToWhiteList(domainToBeAddedToWhite);
  119. };
  120. let downloadTime = 0;
  121. const lineCb = (line) => {
  122. const result = parse(line, includeThirdParties);
  123. if (result) {
  124. const flag = result[1];
  125. const hostname = result[0];
  126. switch (flag) {
  127. case 0:
  128. addToWhiteList(hostname);
  129. break;
  130. case 1:
  131. addToBlackList(hostname, false);
  132. break;
  133. case 2:
  134. addToBlackList(hostname, true);
  135. break;
  136. default:
  137. throw new Error(`Unknown flag: ${flag}`);
  138. }
  139. }
  140. };
  141. if (!fallbackUrls || fallbackUrls.length === 0) {
  142. const downloadStart = performance.now();
  143. for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) {
  144. lineCb(line.trim());
  145. }
  146. downloadTime = performance.now() - downloadStart;
  147. } else {
  148. let filterRules;
  149. const downloadStart = performance.now();
  150. try {
  151. const controller = new AbortController();
  152. /** @type string[] */
  153. filterRules = (
  154. await Promise.any(
  155. [filterRulesUrl, ...(fallbackUrls || [])].map(async url => {
  156. const text = await fetchWithRetry(url, { signal: controller.signal }).then(r => r.text());
  157. controller.abort();
  158. return text;
  159. })
  160. )
  161. ).split('\n').map(line => line.trim());
  162. } catch (e) {
  163. console.log(`Download Rule for [${filterRulesUrl}] failed`);
  164. throw e;
  165. }
  166. downloadTime = performance.now() - downloadStart;
  167. for (let i = 0, len = filterRules.length; i < len; i++) {
  168. lineCb(filterRules[i].trim());
  169. }
  170. }
  171. console.log(` ┬ processFilterRules (${filterRulesUrl}): ${(performance.now() - runStart).toFixed(3)}ms`);
  172. console.log(` └── download time: ${downloadTime.toFixed(3)}ms`);
  173. return {
  174. white: whitelistDomainSets,
  175. black: blacklistDomainSets,
  176. foundDebugDomain
  177. };
  178. }
  179. const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#&%~=]/;
  180. const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;
  181. /**
  182. * @param {string} $line
  183. * @param {boolean} includeThirdParties
  184. * @returns {null | [string, 0 | 1 | 2]} - 0 white, 1 black abosulte, 2 black include subdomain
  185. */
  186. function parse($line, includeThirdParties) {
  187. const line = $line.trim();
  188. if (
  189. line === ''
  190. || line[0] === '/'
  191. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test(line)
  192. // doesn't include
  193. || !line.includes('.') // rule with out dot can not be a domain
  194. // includes
  195. // || line.includes('#')
  196. || line.includes('!')
  197. || line.includes('?')
  198. || line.includes('*')
  199. // || line.includes('=')
  200. || line.includes('[')
  201. || line.includes('(')
  202. || line.includes(']')
  203. || line.includes(')')
  204. || line.includes(',')
  205. // || line.includes('~')
  206. // || line.includes('&')
  207. // || line.includes('%')
  208. // ends with
  209. || line.endsWith('.')
  210. || line.endsWith('-')
  211. || line.endsWith('_')
  212. // special modifier
  213. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
  214. || ((line.includes('/') || line.includes(':')) && !line.includes('://'))
  215. // || line.includes('$popup')
  216. // || line.includes('$removeparam')
  217. // || line.includes('$popunder')
  218. ) {
  219. return null;
  220. }
  221. const filter = NetworkFilter.parse(line);
  222. if (filter) {
  223. if (
  224. filter.isElemHide()
  225. || filter.isGenericHide()
  226. || filter.isSpecificHide()
  227. || filter.isRedirect()
  228. || filter.isRedirectRule()
  229. || filter.hasDomains()
  230. || filter.isCSP() // must not be csp rule
  231. || (!filter.fromAny() && !filter.fromDocument())
  232. ) {
  233. // not supported type
  234. return null;
  235. }
  236. if (
  237. filter.hasHostname() // must have
  238. && filter.isPlain()
  239. && (!filter.isRegex())
  240. && (!filter.isFullRegex())
  241. ) {
  242. const hostname = normalizeDomain(filter.getHostname());
  243. if (hostname) {
  244. if (filter.isException() || filter.isBadFilter()) {
  245. return [hostname, 0];
  246. }
  247. if (filter.firstParty() === filter.thirdParty()) {
  248. return [hostname, 2];
  249. }
  250. if (filter.thirdParty()) {
  251. if (includeThirdParties) {
  252. return [hostname, 2];
  253. }
  254. return null;
  255. }
  256. if (filter.firstParty()) {
  257. return null;
  258. }
  259. } else {
  260. return null;
  261. }
  262. }
  263. }
  264. if (line.includes('$third-party') || line.includes('$frame')) {
  265. return null;
  266. }
  267. const lineEndsWithCaret = line.endsWith('^');
  268. const lineEndsWithCaretVerticalBar = line.endsWith('^|');
  269. if (line[0] === '@' && line[1] === '@') {
  270. if (line.endsWith('$cname')) {
  271. return null;
  272. }
  273. if (
  274. // (line.startsWith('@@|') || line.startsWith('@@.'))
  275. (line[2] === '|' || line[2] === '.')
  276. && (
  277. lineEndsWithCaret
  278. || lineEndsWithCaretVerticalBar
  279. || line.endsWith('$genericblock')
  280. || line.endsWith('$document')
  281. )
  282. ) {
  283. const _domain = line
  284. .replace('@@||', '')
  285. .replace('@@|', '')
  286. .replace('@@.', '')
  287. .replace('^|', '')
  288. .replace('^$genericblock', '')
  289. .replace('$genericblock', '')
  290. .replace('^$document', '')
  291. .replace('$document', '')
  292. .replaceAll('^', '')
  293. .trim();
  294. const domain = normalizeDomain(_domain);
  295. if (domain) {
  296. return [domain, 0];
  297. }
  298. console.warn(' * [parse-filter E0001] (black) invalid domain:', _domain);
  299. return null;
  300. }
  301. }
  302. if (
  303. line.startsWith('||')
  304. && (
  305. lineEndsWithCaret
  306. || lineEndsWithCaretVerticalBar
  307. || line.endsWith('$cname')
  308. )
  309. ) {
  310. const _domain = line
  311. .replace('||', '')
  312. .replace('^|', '')
  313. .replace('$cname', '')
  314. .replaceAll('^', '')
  315. .trim();
  316. const domain = normalizeDomain(_domain);
  317. if (domain) {
  318. return [domain, 2];
  319. }
  320. console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
  321. return null;
  322. }
  323. const lineStartsWithSingleDot = line[0] === '.';
  324. if (
  325. lineStartsWithSingleDot
  326. && (
  327. lineEndsWithCaret
  328. || lineEndsWithCaretVerticalBar
  329. )
  330. ) {
  331. const _domain = line
  332. .replace('^|', '')
  333. .replaceAll('^', '')
  334. .slice(1)
  335. .trim();
  336. const domain = normalizeDomain(_domain);
  337. if (domain) {
  338. return [domain, 2];
  339. }
  340. console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
  341. return null;
  342. }
  343. if (
  344. (
  345. line.startsWith('://')
  346. || line.startsWith('http://')
  347. || line.startsWith('https://')
  348. || line.startsWith('|http://')
  349. || line.startsWith('|https://')
  350. )
  351. && (
  352. lineEndsWithCaret
  353. || lineEndsWithCaretVerticalBar
  354. )
  355. ) {
  356. const _domain = line
  357. .replace('|https://', '')
  358. .replace('https://', '')
  359. .replace('|http://', '')
  360. .replace('http://', '')
  361. .replace('://', '')
  362. .replace('^|', '')
  363. .replaceAll('^', '')
  364. .trim();
  365. const domain = normalizeDomain(_domain);
  366. if (domain) {
  367. return [domain, 1];
  368. }
  369. console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
  370. return null;
  371. }
  372. if (line[0] !== '|' && lineEndsWithCaret) {
  373. const _domain = line.slice(0, -1);
  374. const domain = normalizeDomain(_domain);
  375. if (domain) {
  376. return [domain, 1];
  377. }
  378. console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
  379. return null;
  380. }
  381. const tryNormalizeDomain = normalizeDomain(line);
  382. if (
  383. tryNormalizeDomain
  384. && (
  385. lineStartsWithSingleDot
  386. ? tryNormalizeDomain.length === line.length - 1
  387. : tryNormalizeDomain === line
  388. )
  389. ) {
  390. return [line, 2];
  391. }
  392. if (!line.endsWith('.js')) {
  393. console.warn(' * [parse-filter E0010] can not parse:', line);
  394. }
  395. return null;
  396. }
  397. module.exports.processDomainLists = processDomainLists;
  398. module.exports.processHosts = processHosts;
  399. module.exports.processFilterRules = processFilterRules;