parse-filter.js 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409
  1. // @ts-check
  2. const { fetchWithRetry } = require('./fetch-retry');
  3. const { fetchRemoteTextAndCreateReadlineInterface } = require('./fetch-remote-text-by-line');
  4. const { NetworkFilter } = require('@cliqz/adblocker');
  5. const { normalizeDomain } = require('./is-domain-loose');
  6. const { processLine } = require('./process-line');
  7. const DEBUG_DOMAIN_TO_FIND = null; // example.com | null
  8. let foundDebugDomain = false;
  9. const warnOnceUrl = new Set();
  10. const warnOnce = (url, isWhite, ...message) => {
  11. const key = `${url}${isWhite ? 'white' : 'black'}`;
  12. if (warnOnceUrl.has(key)) {
  13. return;
  14. }
  15. warnOnceUrl.add(key);
  16. console.warn(url, isWhite ? '(white)' : '(black)', ...message);
  17. };
  18. /**
  19. * @param {string | URL} domainListsUrl
  20. */
  21. async function processDomainLists(domainListsUrl) {
  22. if (typeof domainListsUrl === 'string') {
  23. domainListsUrl = new URL(domainListsUrl);
  24. }
  25. /** @type Set<string> */
  26. const domainSets = new Set();
  27. const rl = await fetchRemoteTextAndCreateReadlineInterface(domainListsUrl);
  28. for await (const line of rl) {
  29. if (line.startsWith('!')) {
  30. continue;
  31. }
  32. const domainToAdd = processLine(line);
  33. if (!domainToAdd) {
  34. continue;
  35. }
  36. if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
  37. warnOnce(domainListsUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  38. foundDebugDomain = true;
  39. }
  40. domainSets.add(domainToAdd);
  41. }
  42. return domainSets;
  43. }
  44. /**
  45. * @param {string | URL} hostsUrl
  46. */
  47. async function processHosts(hostsUrl, includeAllSubDomain = false) {
  48. console.time(` - processHosts: ${hostsUrl}`);
  49. if (typeof hostsUrl === 'string') {
  50. hostsUrl = new URL(hostsUrl);
  51. }
  52. /** @type Set<string> */
  53. const domainSets = new Set();
  54. const rl = await fetchRemoteTextAndCreateReadlineInterface(hostsUrl);
  55. for await (const _line of rl) {
  56. const line = processLine(_line);
  57. if (!line) {
  58. continue;
  59. }
  60. const [, ...domains] = line.split(' ');
  61. const _domain = domains.join(' ').trim();
  62. if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
  63. warnOnce(hostsUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  64. foundDebugDomain = true;
  65. }
  66. const domain = normalizeDomain(_domain);
  67. if (domain) {
  68. if (includeAllSubDomain) {
  69. domainSets.add(`.${domain}`);
  70. } else {
  71. domainSets.add(domain);
  72. }
  73. }
  74. }
  75. console.timeEnd(` - processHosts: ${hostsUrl}`);
  76. return domainSets;
  77. }
// A rule line containing any of these characters is skipped by processFilterRules
// before it is handed to the filter parser.
const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#&%~=]/;
// Rule lines carrying the $popup, $removeparam or $popunder modifier are skipped
// by processFilterRules as well.
const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;
  80. /**
  81. * @param {string | URL} filterRulesUrl
  82. * @param {readonly (string | URL)[] | undefined} [fallbackUrls]
  83. * @returns {Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean, parseFailed: boolean }>}
  84. */
  85. async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdParties = false) {
  86. console.time(` - processFilterRules: ${filterRulesUrl}`);
  87. /** @type Set<string> */
  88. const whitelistDomainSets = new Set();
  89. /** @type Set<string> */
  90. const blacklistDomainSets = new Set();
  91. const addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
  92. if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
  93. warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
  94. foundDebugDomain = true;
  95. }
  96. if (isSubDomain && !domainToBeAddedToBlack.startsWith('.')) {
  97. blacklistDomainSets.add(`.${domainToBeAddedToBlack}`);
  98. } else {
  99. blacklistDomainSets.add(domainToBeAddedToBlack);
  100. }
  101. };
  102. const addToWhiteList = (domainToBeAddedToWhite) => {
  103. if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToWhite.includes(DEBUG_DOMAIN_TO_FIND)) {
  104. warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND);
  105. foundDebugDomain = true;
  106. }
  107. whitelistDomainSets.add(domainToBeAddedToWhite);
  108. };
  109. let filterRules;
  110. try {
  111. /** @type string[] */
  112. filterRules = (
  113. await Promise.any(
  114. [filterRulesUrl, ...(fallbackUrls || [])].map(
  115. async url => (await fetchWithRetry(url)).text()
  116. )
  117. )
  118. ).split('\n').map(line => line.trim());
  119. } catch (e) {
  120. console.log(`Download Rule for [${filterRulesUrl}] failed`);
  121. throw e;
  122. }
  123. let hasParseFailed = false;
  124. for (let i = 0, len = filterRules.length; i < len; i++) {
  125. const line = filterRules[i].trim();
  126. if (
  127. line === ''
  128. || line.startsWith('/')
  129. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test(line)
  130. // doesn't include
  131. || !line.includes('.') // rule with out dot can not be a domain
  132. // includes
  133. // || line.includes('#')
  134. || line.includes('!')
  135. || line.includes('?')
  136. || line.includes('*')
  137. // || line.includes('=')
  138. || line.includes('[')
  139. || line.includes('(')
  140. || line.includes(']')
  141. || line.includes(')')
  142. || line.includes(',')
  143. // || line.includes('~')
  144. // || line.includes('&')
  145. // || line.includes('%')
  146. || ((line.includes('/') || line.includes(':')) && !line.includes('://'))
  147. // ends with
  148. || line.endsWith('.')
  149. || line.endsWith('-')
  150. || line.endsWith('_')
  151. // special modifier
  152. || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
  153. // || line.includes('$popup')
  154. // || line.includes('$removeparam')
  155. // || line.includes('$popunder')
  156. ) {
  157. continue;
  158. }
  159. const filter = NetworkFilter.parse(line);
  160. if (filter) {
  161. if (
  162. filter.isElemHide()
  163. || filter.isGenericHide()
  164. || filter.isSpecificHide()
  165. || filter.isRedirect()
  166. || filter.isRedirectRule()
  167. || filter.hasDomains()
  168. || filter.isCSP() // must not be csp rule
  169. || (!filter.fromAny() && !filter.fromDocument())
  170. ) {
  171. // not supported type
  172. continue;
  173. }
  174. if (
  175. filter.hasHostname() // must have
  176. && filter.isPlain()
  177. && (!filter.isRegex())
  178. && (!filter.isFullRegex())
  179. ) {
  180. const hostname = normalizeDomain(filter.getHostname());
  181. if (hostname) {
  182. if (filter.isException() || filter.isBadFilter()) {
  183. addToWhiteList(hostname);
  184. continue;
  185. }
  186. if (filter.firstParty() === filter.thirdParty()) {
  187. addToBlackList(hostname, true);
  188. continue;
  189. }
  190. if (filter.thirdParty()) {
  191. if (includeThirdParties) {
  192. addToBlackList(hostname, true);
  193. }
  194. continue;
  195. }
  196. if (filter.firstParty()) {
  197. continue;
  198. }
  199. } else {
  200. continue;
  201. }
  202. }
  203. }
  204. if (line.includes('$third-party') || line.includes('$frame')) {
  205. continue;
  206. }
  207. const lineEndsWithCaret = line.endsWith('^');
  208. const lineEndsWithCaretVerticalBar = line.endsWith('^|');
  209. if (line.startsWith('@@')) {
  210. if (line.endsWith('$cname')) {
  211. continue;
  212. }
  213. if (
  214. (line.startsWith('@@|') || line.startsWith('@@.'))
  215. && (
  216. lineEndsWithCaret
  217. || lineEndsWithCaretVerticalBar
  218. || line.endsWith('$genericblock')
  219. || line.endsWith('$document')
  220. )
  221. ) {
  222. const _domain = line
  223. .replace('@@||', '')
  224. .replace('@@|', '')
  225. .replace('@@.', '')
  226. .replace('^|', '')
  227. .replace('^$genericblock', '')
  228. .replace('$genericblock', '')
  229. .replace('^$document', '')
  230. .replace('$document', '')
  231. .replaceAll('^', '')
  232. .trim();
  233. const domain = normalizeDomain(_domain);
  234. if (domain) {
  235. addToWhiteList(domain);
  236. } else {
  237. console.warn(' * [parse-filter E0001] (black) invalid domain:', _domain);
  238. }
  239. continue;
  240. }
  241. }
  242. if (
  243. line.startsWith('||')
  244. && (
  245. lineEndsWithCaret
  246. || lineEndsWithCaretVerticalBar
  247. || line.endsWith('$cname')
  248. )
  249. ) {
  250. const _domain = line
  251. .replace('||', '')
  252. .replace('^|', '')
  253. .replace('$cname', '')
  254. .replaceAll('^', '')
  255. .trim();
  256. const domain = normalizeDomain(_domain);
  257. if (domain) {
  258. addToBlackList(domain, true);
  259. } else {
  260. console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
  261. }
  262. continue;
  263. }
  264. const lineStartsWithSingleDot = line.startsWith('.');
  265. if (
  266. lineStartsWithSingleDot
  267. && (
  268. lineEndsWithCaret
  269. || lineEndsWithCaretVerticalBar
  270. )
  271. ) {
  272. const _domain = line
  273. .replace('^|', '')
  274. .replaceAll('^', '')
  275. .slice(1)
  276. .trim();
  277. const domain = normalizeDomain(_domain);
  278. if (domain) {
  279. addToBlackList(domain, true);
  280. } else {
  281. console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
  282. }
  283. continue;
  284. }
  285. if (
  286. (
  287. line.startsWith('://')
  288. || line.startsWith('http://')
  289. || line.startsWith('https://')
  290. || line.startsWith('|http://')
  291. || line.startsWith('|https://')
  292. )
  293. && (
  294. lineEndsWithCaret
  295. || lineEndsWithCaretVerticalBar
  296. )
  297. ) {
  298. const _domain = line
  299. .replace('|https://', '')
  300. .replace('https://', '')
  301. .replace('|http://', '')
  302. .replace('http://', '')
  303. .replace('://', '')
  304. .replace('^|', '')
  305. .replaceAll('^', '')
  306. .trim();
  307. const domain = normalizeDomain(_domain);
  308. if (domain) {
  309. addToBlackList(domain, false);
  310. } else {
  311. console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
  312. }
  313. continue;
  314. }
  315. if (!line.startsWith('|') && lineEndsWithCaret) {
  316. const _domain = line.slice(0, -1);
  317. const domain = normalizeDomain(_domain);
  318. if (domain) {
  319. addToBlackList(domain, false);
  320. } else {
  321. console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
  322. }
  323. continue;
  324. }
  325. const tryNormalizeDomain = normalizeDomain(lineStartsWithSingleDot ? line.slice(1) : line);
  326. if (
  327. tryNormalizeDomain
  328. && (
  329. lineStartsWithSingleDot
  330. ? tryNormalizeDomain.length === line.length - 1
  331. : tryNormalizeDomain === line
  332. )
  333. ) {
  334. addToBlackList(line, true);
  335. continue;
  336. }
  337. if (
  338. !line.endsWith('.js')
  339. ) {
  340. hasParseFailed = true;
  341. console.warn(' * [parse-filter E0010] can not parse:', line);
  342. }
  343. }
  344. console.timeEnd(` - processFilterRules: ${filterRulesUrl}`);
  345. return {
  346. white: whitelistDomainSets,
  347. black: blacklistDomainSets,
  348. foundDebugDomain,
  349. parseFailed: hasParseFailed
  350. };
  351. }
  352. /**
  353. * @param {string[]} data
  354. */
  355. function preprocessFullDomainSetBeforeUsedAsWorkerData(data) {
  356. return data
  357. .filter(domain => domain[0] === '.')
  358. .sort((a, b) => a.length - b.length);
  359. }
// Public CommonJS API of this module: the three list parsers plus the helper
// that filters and sorts a domain array.
module.exports.processDomainLists = processDomainLists;
module.exports.processHosts = processHosts;
module.exports.processFilterRules = processFilterRules;
module.exports.preprocessFullDomainSetBeforeUsedAsWorkerData = preprocessFullDomainSetBeforeUsedAsWorkerData;