// parse-filter.js
  const { fetchWithRetry } = require('./fetch-retry');
  const { NetworkFilter } = require('@cliqz/adblocker');
  const { normalizeDomain } = require('./is-domain-loose');
  /**
   * Debug aid: set to a domain string (e.g. 'example.com') to have the list
   * processors in this file report which source list contains it; `null`
   * disables the tracing.
   * @type {string | null}
   */
  const DEBUG_DOMAIN_TO_FIND = null; // example.com | null
  /** Set to `true` by the processors once DEBUG_DOMAIN_TO_FIND has been seen. */
  let foundDebugDomain = false;
  /** Keys (url + 'white' | 'black') for which a warning was already printed. */
  const warnOnceUrl = new Set();

  /**
   * Print a warning through `console.warn`, but only the first time a given
   * (url, white/black) combination is reported.
   *
   * @param {string} url - source list URL; printed as the first argument
   * @param {boolean} isWhite - truthy tags the warning "(white)", otherwise "(black)"
   * @param {...unknown} message - extra values forwarded to `console.warn`
   * @returns {void}
   */
  const warnOnce = (url, isWhite, ...message) => {
    const key = `${url}${isWhite ? 'white' : 'black'}`;
    if (warnOnceUrl.has(key)) {
      return;
    }
    warnOnceUrl.add(key);
    console.warn(url, isWhite ? '(white)' : '(black)', ...message);
  };
  /**
   * Download a plain-text domain list (one entry per line) and return its
   * unique entries in first-seen order.
   *
   * A line is ignored when it is empty, starts with `#` or `!`, or starts
   * with a space or carriage return. Every other line is trimmed and kept
   * as-is (no domain validation happens here).
   *
   * @param {string | URL} domainListsUrl - location of the list; a string is converted to a `URL`
   * @returns {Promise<string[]>} de-duplicated entries
   */
  async function processDomainLists (domainListsUrl) {
    const listUrl = typeof domainListsUrl === 'string'
      ? new URL(domainListsUrl)
      : domainListsUrl;

    /** @type {Set<string>} */
    const domainSets = new Set();

    const response = await fetchWithRetry(listUrl);
    const lines = (await response.text()).split('\n');

    for (const line of lines) {
      if (
        line === ''
        || line.startsWith('#')
        || line.startsWith('!')
        || line.startsWith(' ')
        || line.startsWith('\r')
      ) {
        continue;
      }

      const domainToAdd = line.trim();
      if (domainToAdd === '') {
        // Whitespace-only line (e.g. a lone tab): nothing to add.
        continue;
      }

      if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
        warnOnce(listUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
        foundDebugDomain = true;
      }

      domainSets.add(domainToAdd);
    }

    return [...domainSets];
  }
  /**
   * Download a hosts-style list ("<address> <host>" per line) and return the
   * unique hosts that `normalizeDomain` accepts, in first-seen order.
   *
   * A line is ignored when it contains `#` anywhere, starts with a space or
   * carriage return, or is blank. The first whitespace-separated field (the
   * address) is dropped; the remainder is handed to `normalizeDomain`.
   *
   * @param {string | URL} hostsUrl - location of the list; a string is converted to a `URL`
   * @param {boolean} [includeAllSubDomain=false] - when true every result is prefixed with `.`
   * @returns {Promise<string[]>} de-duplicated hosts
   */
  async function processHosts (hostsUrl, includeAllSubDomain = false) {
    // Build the label once: `new URL()` may normalise the string (for example
    // by appending a trailing slash), and console.timeEnd() only stops a timer
    // whose label matches the one given to console.time() exactly.
    const timerLabel = ` - processHosts: ${hostsUrl}`;
    console.time(timerLabel);

    const sourceUrl = typeof hostsUrl === 'string' ? new URL(hostsUrl) : hostsUrl;

    /** @type {Set<string>} */
    const domainSets = new Set();

    const response = await fetchWithRetry(sourceUrl);
    const hosts = (await response.text()).split('\n');

    for (const line of hosts) {
      if (line.includes('#')) {
        continue;
      }
      if (line.startsWith(' ') || line.startsWith('\r') || line.trim() === '') {
        continue;
      }

      // Fields may be separated by spaces and/or tabs; drop the address field.
      const [, ...domains] = line.trim().split(/\s+/);
      const _domain = domains.join(' ');

      if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
        warnOnce(sourceUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
        foundDebugDomain = true;
      }

      const domain = normalizeDomain(_domain);
      if (domain) {
        domainSets.add(includeAllSubDomain ? `.${domain}` : domain);
      }
    }

    console.timeEnd(timerLabel);
    return [...domainSets];
  }
  /**
   * Download an adblock-style filter list and extract the plain domains it
   * blocks (black) and allows (white).
   *
   * The list is fetched from `filterRulesUrl` and every fallback URL at once;
   * the first download that succeeds is used. Each line is pre-filtered by
   * cheap string checks, then offered to `NetworkFilter.parse`, and finally
   * matched against a series of hand-written string patterns. Lines that match
   * nothing are reported with E0010 (unless they end with `.js`).
   *
   * @param {string | URL} filterRulesUrl - primary location of the list
   * @param {(string | URL)[] | undefined} fallbackUrls - alternative locations raced against the primary one
   * @param {boolean} [includeThirdParties=false] - also blacklist hostnames of rules that `NetworkFilter` reports as third-party only
   * @returns {Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean, parseFailed: boolean }>}
   * @throws rethrows the `Promise.any` rejection when every download fails
   */
  async function processFilterRules (filterRulesUrl, fallbackUrls, includeThirdParties = false) {
    const timerLabel = ` - processFilterRules: ${filterRulesUrl}`;
    console.time(timerLabel);

    /** @type {Set<string>} */
    const whitelistDomainSets = new Set();
    /** @type {Set<string>} */
    const blacklistDomainSets = new Set();

    // Record a blocked domain; `isSubDomain` stores it with a leading dot.
    const addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
      if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
        warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
        foundDebugDomain = true;
      }
      if (isSubDomain && !domainToBeAddedToBlack.startsWith('.')) {
        blacklistDomainSets.add(`.${domainToBeAddedToBlack}`);
      } else {
        blacklistDomainSets.add(domainToBeAddedToBlack);
      }
    };

    // Record an allowed domain.
    const addToWhiteList = (domainToBeAddedToWhite) => {
      if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToWhite.includes(DEBUG_DOMAIN_TO_FIND)) {
        // `true`: the hit comes from the whitelist, so it must be tagged "(white)".
        warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND);
        foundDebugDomain = true;
      }
      whitelistDomainSets.add(domainToBeAddedToWhite);
    };

    /** @type {string[]} */
    let filterRules;
    try {
      const listBody = await Promise.any(
        [filterRulesUrl, ...(fallbackUrls || [])].map(
          async url => (await fetchWithRetry(url)).text()
        )
      );
      filterRules = listBody.split('\n').map(line => line.trim());
    } catch (e) {
      console.log(`Download Rule for [${filterRulesUrl}] failed`);
      throw e;
    }

    let hasParseFailed = false;

    // Every entry of filterRules is already trimmed.
    for (const line of filterRules) {
      // Cheap string pre-filter: skip lines that cannot be a bare domain rule.
      if (
        line === ''
        // doesn't include
        || !line.includes('.') // rule with out dot can not be a domain
        // includes
        || line.includes('#')
        || line.includes('!')
        || line.includes('?')
        || line.includes('*')
        || line.includes('=')
        || line.includes('[')
        || line.includes('(')
        || line.includes(']')
        || line.includes(')')
        || line.includes(',')
        || line.includes('~')
        || line.includes('&')
        || line.includes('%')
        || ((line.includes('/') || line.includes(':')) && !line.includes('://'))
        // ends with
        || line.endsWith('.')
        || line.endsWith('-')
        || line.endsWith('_')
        // special modifier
        || line.includes('$popup')
        || line.includes('$removeparam')
        || line.includes('$popunder')
      ) {
        continue;
      }

      // First attempt: let the adblocker library classify the rule.
      const filter = NetworkFilter.parse(line);
      if (filter) {
        if (
          filter.isElemHide()
          || filter.isGenericHide()
          || filter.isSpecificHide()
          || filter.isRedirect()
          || filter.isRedirectRule()
          || filter.hasDomains()
          || filter.isCSP() // must not be csp rule
          || (!filter.fromAny() && !filter.fromDocument())
        ) {
          // not supported type
          continue;
        }

        if (
          filter.hasHostname() // must have
          && filter.isPlain()
          && (!filter.isRegex())
          && (!filter.isFullRegex())
        ) {
          const hostname = filter.getHostname();
          if (hostname) {
            if (filter.isException() || filter.isBadFilter()) {
              addToWhiteList(hostname);
              continue;
            }
            // Both party flags agree: treated as not party-restricted
            // (NOTE(review): confirm against the NetworkFilter documentation).
            if (filter.firstParty() === filter.thirdParty()) {
              addToBlackList(hostname, true);
              continue;
            }
            if (filter.thirdParty()) {
              if (includeThirdParties) {
                addToBlackList(hostname, true);
              }
              continue;
            }
            if (filter.firstParty()) {
              continue;
            }
          }
        }
      }

      // Fallback: hand-written pattern matching on the raw line.
      if (line.includes('$third-party') || line.includes('$frame')) {
        continue;
      }

      const lineEndsWithCaret = line.endsWith('^');
      const lineEndsWithCaretVerticalBar = line.endsWith('^|');

      // Exception rules: "@@||domain^", "@@|domain^", "@@.domain^" and the
      // $genericblock / $document variants.
      if (line.startsWith('@@')) {
        if (line.endsWith('$cname')) {
          continue;
        }

        if (
          (line.startsWith('@@|') || line.startsWith('@@.'))
          && (
            lineEndsWithCaret
            || lineEndsWithCaretVerticalBar
            || line.endsWith('$genericblock')
            || line.endsWith('$document')
          )
        ) {
          const _domain = line
            .replace('@@||', '')
            .replace('@@|', '')
            .replace('@@.', '')
            .replace('^|', '')
            .replace('^$genericblock', '')
            .replace('$genericblock', '')
            .replace('^$document', '')
            .replace('$document', '')
            .replaceAll('^', '')
            .trim();

          const domain = normalizeDomain(_domain);
          if (domain) {
            addToWhiteList(domain);
          } else {
            // This branch handles whitelist rules, hence "(white)".
            console.warn(' * [parse-filter E0001] (white) invalid domain:', _domain);
          }
          continue;
        }
      }

      // "||domain^", "||domain^|", "||domain$cname"
      if (
        line.startsWith('||')
        && (
          lineEndsWithCaret
          || lineEndsWithCaretVerticalBar
          || line.endsWith('$cname')
        )
      ) {
        const _domain = line
          .replace('||', '')
          .replace('^|', '')
          .replace('$cname', '')
          .replaceAll('^', '')
          .trim();

        const domain = normalizeDomain(_domain);
        if (domain) {
          addToBlackList(domain, true);
        } else {
          console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
        }
        continue;
      }

      // ".domain^", ".domain^|"
      const lineStartsWithSingleDot = line.startsWith('.');
      if (
        lineStartsWithSingleDot
        && (
          lineEndsWithCaret
          || lineEndsWithCaretVerticalBar
        )
      ) {
        const _domain = line
          .replace('^|', '')
          .replaceAll('^', '')
          .slice(1)
          .trim();

        const domain = normalizeDomain(_domain);
        if (domain) {
          addToBlackList(domain, true);
        } else {
          console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
        }
        continue;
      }

      // "://domain^", "http(s)://domain^", "|http(s)://domain^": exact host only.
      if (
        (
          line.startsWith('://')
          || line.startsWith('http://')
          || line.startsWith('https://')
          || line.startsWith('|http://')
          || line.startsWith('|https://')
        )
        && (
          lineEndsWithCaret
          || lineEndsWithCaretVerticalBar
        )
      ) {
        const _domain = line
          .replace('|https://', '')
          .replace('https://', '')
          .replace('|http://', '')
          .replace('http://', '')
          .replace('://', '')
          .replace('^|', '')
          .replaceAll('^', '')
          .trim();

        const domain = normalizeDomain(_domain);
        if (domain) {
          addToBlackList(domain, false);
        } else {
          console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
        }
        continue;
      }

      // "domain^": exact host only.
      if (!line.startsWith('|') && lineEndsWithCaret) {
        const _domain = line.slice(0, -1);
        const domain = normalizeDomain(_domain);
        if (domain) {
          addToBlackList(domain, false);
        } else {
          console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
        }
        continue;
      }

      // Bare "domain" or ".domain": accepted only when normalizeDomain keeps
      // the text unchanged (same string, or same length for the dotted form).
      const tryNormalizeDomain = normalizeDomain(lineStartsWithSingleDot ? line.slice(1) : line);
      if (
        tryNormalizeDomain
        && (
          lineStartsWithSingleDot
            ? tryNormalizeDomain.length === line.length - 1
            : tryNormalizeDomain === line
        )
      ) {
        addToBlackList(line, true);
        continue;
      }

      // Lines ending in ".js" are dropped silently; anything else is reported.
      if (!line.endsWith('.js')) {
        hasParseFailed = true;
        console.warn(' * [parse-filter E0010] can not parse:', line);
      }
    }

    console.timeEnd(timerLabel);

    return {
      white: whitelistDomainSets,
      black: blacklistDomainSets,
      foundDebugDomain,
      parseFailed: hasParseFailed
    };
  }
  /**
   * Keep only the entries whose first character is a dot (`.`).
   *
   * @param {string[]} data - domain entries
   * @returns {string[]} a new array holding the dot-prefixed entries, in order
   */
  function preprocessFullDomainSetBeforeUsedAsWorkerData (data) {
    const DOT_CHAR_CODE = 46; // '.'.charCodeAt(0)
    return data.filter(domain => domain.charCodeAt(0) === DOT_CHAR_CODE);
  }
  // Public API of this module.
  module.exports.processDomainLists = processDomainLists;
  module.exports.processHosts = processHosts;
  module.exports.processFilterRules = processFilterRules;
  module.exports.preprocessFullDomainSetBeforeUsedAsWorkerData = preprocessFullDomainSetBeforeUsedAsWorkerData;