parse-filter.js 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417
  1. // @ts-check
  2. const { fetchWithRetry } = require('./fetch-retry');
  3. const { fetchRemoteTextAndCreateReadlineInterface } = require('./fetch-remote-text-by-line');
  4. const { NetworkFilter } = require('@cliqz/adblocker');
  5. const { normalizeDomain } = require('./is-domain-loose');
  6. const { processLine } = require('./process-line');
// Debugging aid: set this to a domain string (e.g. 'example.com') to have the
// parsers in this file warn whenever an entry containing it is encountered.
// Leave it as null to disable those checks.
const DEBUG_DOMAIN_TO_FIND = null; // example.com | null
// Flipped to true by the parsers in this file once DEBUG_DOMAIN_TO_FIND has been
// seen. It is module-level state, so it stays true for the rest of the process.
let foundDebugDomain = false;
  9. const warnOnceUrl = new Set();
  10. const warnOnce = (url, isWhite, ...message) => {
  11. const key = `${url}${isWhite ? 'white' : 'black'}`;
  12. if (warnOnceUrl.has(key)) {
  13. return;
  14. }
  15. warnOnceUrl.add(key);
  16. console.warn(url, isWhite ? '(white)' : '(black)', ...message);
  17. };
  18. /**
  19. * @param {string | URL} domainListsUrl
  20. */
  21. async function processDomainLists(domainListsUrl) {
  22. if (typeof domainListsUrl === 'string') {
  23. domainListsUrl = new URL(domainListsUrl);
  24. }
  25. /** @type Set<string> */
  26. const domainSets = new Set();
  27. const rl = await fetchRemoteTextAndCreateReadlineInterface(domainListsUrl);
  28. for await (const line of rl) {
  29. if (line.startsWith('!')) {
  30. continue;
  31. }
  32. const domainToAdd = processLine(line);
  33. if (!domainToAdd) {
  34. continue;
  35. }
  36. if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
  37. warnOnce(domainListsUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  38. foundDebugDomain = true;
  39. }
  40. domainSets.add(domainToAdd);
  41. }
  42. return domainSets;
  43. }
  44. /**
  45. * @param {string | URL} hostsUrl
  46. */
  47. async function processHosts(hostsUrl, includeAllSubDomain = false) {
  48. console.time(` - processHosts: ${hostsUrl}`);
  49. if (typeof hostsUrl === 'string') {
  50. hostsUrl = new URL(hostsUrl);
  51. }
  52. /** @type Set<string> */
  53. const domainSets = new Set();
  54. const rl = await fetchRemoteTextAndCreateReadlineInterface(hostsUrl);
  55. for await (const _line of rl) {
  56. const line = processLine(_line);
  57. if (!line) {
  58. continue;
  59. }
  60. const [, ...domains] = line.split(' ');
  61. const _domain = domains.join(' ').trim();
  62. if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
  63. warnOnce(hostsUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  64. foundDebugDomain = true;
  65. }
  66. const domain = normalizeDomain(_domain);
  67. if (domain) {
  68. if (includeAllSubDomain) {
  69. domainSets.add(`.${domain}`);
  70. } else {
  71. domainSets.add(domain);
  72. }
  73. }
  74. }
  75. console.timeEnd(` - processHosts: ${hostsUrl}`);
  76. return domainSets;
  77. }
// Any line containing one of these characters (# & % ~ =) is skipped by
// processFilterRules before NetworkFilter.parse is attempted.
const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#&%~=]/;
// Lines carrying one of these options ($popup, $removeparam, $popunder) are
// skipped by processFilterRules as well.
const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;
  80. /**
  81. * @param {string | URL} filterRulesUrl
  82. * @param {readonly (string | URL)[] | undefined} [fallbackUrls]
  83. * @returns {Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean, parseFailed: boolean }>}
  84. */
  85. async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdParties = false) {
  86. console.time(` - processFilterRules: ${filterRulesUrl}`);
  87. /** @type Set<string> */
  88. const whitelistDomainSets = new Set();
  89. /** @type Set<string> */
  90. const blacklistDomainSets = new Set();
  91. const addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
  92. if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
  93. warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  94. foundDebugDomain = true;
  95. }
  96. if (isSubDomain && !domainToBeAddedToBlack.startsWith('.')) {
  97. blacklistDomainSets.add(`.${domainToBeAddedToBlack}`);
  98. } else {
  99. blacklistDomainSets.add(domainToBeAddedToBlack);
  100. }
  101. };
  102. const addToWhiteList = (domainToBeAddedToWhite) => {
  103. if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToWhite.includes(DEBUG_DOMAIN_TO_FIND)) {
  104. warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND);
  105. foundDebugDomain = true;
  106. }
  107. whitelistDomainSets.add(domainToBeAddedToWhite);
  108. };
  109. let filterRules;
  110. try {
  111. const controller = new AbortController();
  112. const signal = controller.signal;
  113. /** @type string[] */
  114. filterRules = (
  115. await Promise.any(
  116. [filterRulesUrl, ...(fallbackUrls || [])].map(
  117. url => fetchWithRetry(url, { signal })
  118. .then(r => r.text())
  119. .then(text => {
  120. controller.abort();
  121. return text;
  122. })
  123. )
  124. )
  125. ).split('\n').map(line => line.trim());
  126. } catch (e) {
  127. console.log(`Download Rule for [${filterRulesUrl}] failed`);
  128. throw e;
  129. }
  130. let hasParseFailed = false;
  131. for (let i = 0, len = filterRules.length; i < len; i++) {
  132. const line = filterRules[i].trim();
  133. if (
  134. line === ''
  135. || line.startsWith('/')
  136. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test(line)
  137. // doesn't include
  138. || !line.includes('.') // rule with out dot can not be a domain
  139. // includes
  140. // || line.includes('#')
  141. || line.includes('!')
  142. || line.includes('?')
  143. || line.includes('*')
  144. // || line.includes('=')
  145. || line.includes('[')
  146. || line.includes('(')
  147. || line.includes(']')
  148. || line.includes(')')
  149. || line.includes(',')
  150. // || line.includes('~')
  151. // || line.includes('&')
  152. // || line.includes('%')
  153. || ((line.includes('/') || line.includes(':')) && !line.includes('://'))
  154. // ends with
  155. || line.endsWith('.')
  156. || line.endsWith('-')
  157. || line.endsWith('_')
  158. // special modifier
  159. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
  160. // || line.includes('$popup')
  161. // || line.includes('$removeparam')
  162. // || line.includes('$popunder')
  163. ) {
  164. continue;
  165. }
  166. const filter = NetworkFilter.parse(line);
  167. if (filter) {
  168. if (
  169. filter.isElemHide()
  170. || filter.isGenericHide()
  171. || filter.isSpecificHide()
  172. || filter.isRedirect()
  173. || filter.isRedirectRule()
  174. || filter.hasDomains()
  175. || filter.isCSP() // must not be csp rule
  176. || (!filter.fromAny() && !filter.fromDocument())
  177. ) {
  178. // not supported type
  179. continue;
  180. }
  181. if (
  182. filter.hasHostname() // must have
  183. && filter.isPlain()
  184. && (!filter.isRegex())
  185. && (!filter.isFullRegex())
  186. ) {
  187. const hostname = normalizeDomain(filter.getHostname());
  188. if (hostname) {
  189. if (filter.isException() || filter.isBadFilter()) {
  190. addToWhiteList(hostname);
  191. continue;
  192. }
  193. if (filter.firstParty() === filter.thirdParty()) {
  194. addToBlackList(hostname, true);
  195. continue;
  196. }
  197. if (filter.thirdParty()) {
  198. if (includeThirdParties) {
  199. addToBlackList(hostname, true);
  200. }
  201. continue;
  202. }
  203. if (filter.firstParty()) {
  204. continue;
  205. }
  206. } else {
  207. continue;
  208. }
  209. }
  210. }
  211. if (line.includes('$third-party') || line.includes('$frame')) {
  212. continue;
  213. }
  214. const lineEndsWithCaret = line.endsWith('^');
  215. const lineEndsWithCaretVerticalBar = line.endsWith('^|');
  216. if (line.startsWith('@@')) {
  217. if (line.endsWith('$cname')) {
  218. continue;
  219. }
  220. if (
  221. (line.startsWith('@@|') || line.startsWith('@@.'))
  222. && (
  223. lineEndsWithCaret
  224. || lineEndsWithCaretVerticalBar
  225. || line.endsWith('$genericblock')
  226. || line.endsWith('$document')
  227. )
  228. ) {
  229. const _domain = line
  230. .replace('@@||', '')
  231. .replace('@@|', '')
  232. .replace('@@.', '')
  233. .replace('^|', '')
  234. .replace('^$genericblock', '')
  235. .replace('$genericblock', '')
  236. .replace('^$document', '')
  237. .replace('$document', '')
  238. .replaceAll('^', '')
  239. .trim();
  240. const domain = normalizeDomain(_domain);
  241. if (domain) {
  242. addToWhiteList(domain);
  243. } else {
  244. console.warn(' * [parse-filter E0001] (black) invalid domain:', _domain);
  245. }
  246. continue;
  247. }
  248. }
  249. if (
  250. line.startsWith('||')
  251. && (
  252. lineEndsWithCaret
  253. || lineEndsWithCaretVerticalBar
  254. || line.endsWith('$cname')
  255. )
  256. ) {
  257. const _domain = line
  258. .replace('||', '')
  259. .replace('^|', '')
  260. .replace('$cname', '')
  261. .replaceAll('^', '')
  262. .trim();
  263. const domain = normalizeDomain(_domain);
  264. if (domain) {
  265. addToBlackList(domain, true);
  266. } else {
  267. console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
  268. }
  269. continue;
  270. }
  271. const lineStartsWithSingleDot = line.startsWith('.');
  272. if (
  273. lineStartsWithSingleDot
  274. && (
  275. lineEndsWithCaret
  276. || lineEndsWithCaretVerticalBar
  277. )
  278. ) {
  279. const _domain = line
  280. .replace('^|', '')
  281. .replaceAll('^', '')
  282. .slice(1)
  283. .trim();
  284. const domain = normalizeDomain(_domain);
  285. if (domain) {
  286. addToBlackList(domain, true);
  287. } else {
  288. console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
  289. }
  290. continue;
  291. }
  292. if (
  293. (
  294. line.startsWith('://')
  295. || line.startsWith('http://')
  296. || line.startsWith('https://')
  297. || line.startsWith('|http://')
  298. || line.startsWith('|https://')
  299. )
  300. && (
  301. lineEndsWithCaret
  302. || lineEndsWithCaretVerticalBar
  303. )
  304. ) {
  305. const _domain = line
  306. .replace('|https://', '')
  307. .replace('https://', '')
  308. .replace('|http://', '')
  309. .replace('http://', '')
  310. .replace('://', '')
  311. .replace('^|', '')
  312. .replaceAll('^', '')
  313. .trim();
  314. const domain = normalizeDomain(_domain);
  315. if (domain) {
  316. addToBlackList(domain, false);
  317. } else {
  318. console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
  319. }
  320. continue;
  321. }
  322. if (!line.startsWith('|') && lineEndsWithCaret) {
  323. const _domain = line.slice(0, -1);
  324. const domain = normalizeDomain(_domain);
  325. if (domain) {
  326. addToBlackList(domain, false);
  327. } else {
  328. console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
  329. }
  330. continue;
  331. }
  332. const tryNormalizeDomain = normalizeDomain(lineStartsWithSingleDot ? line.slice(1) : line);
  333. if (
  334. tryNormalizeDomain
  335. && (
  336. lineStartsWithSingleDot
  337. ? tryNormalizeDomain.length === line.length - 1
  338. : tryNormalizeDomain === line
  339. )
  340. ) {
  341. addToBlackList(line, true);
  342. continue;
  343. }
  344. if (
  345. !line.endsWith('.js')
  346. ) {
  347. hasParseFailed = true;
  348. console.warn(' * [parse-filter E0010] can not parse:', line);
  349. }
  350. }
  351. console.timeEnd(` - processFilterRules: ${filterRulesUrl}`);
  352. return {
  353. white: whitelistDomainSets,
  354. black: blacklistDomainSets,
  355. foundDebugDomain,
  356. parseFailed: hasParseFailed
  357. };
  358. }
  359. /**
  360. * @param {string[]} data
  361. */
  362. function preprocessFullDomainSetBeforeUsedAsWorkerData(data) {
  363. return data
  364. .filter(domain => domain[0] === '.')
  365. .sort((a, b) => a.length - b.length);
  366. }
// CommonJS public API of this module: the three list parsers and the
// worker-data helper. Each name is assigned individually on module.exports.
module.exports.processDomainLists = processDomainLists;
module.exports.processHosts = processHosts;
module.exports.processFilterRules = processFilterRules;
module.exports.preprocessFullDomainSetBeforeUsedAsWorkerData = preprocessFullDomainSetBeforeUsedAsWorkerData;