Browse Source

Make AdGuard parse faster / Make reject stats correct

SukkaW 2 years ago
parent
commit
cbaa4d51f5
3 changed files with 202 additions and 123 deletions
  1. 4 3
      Build/build-phishing-domainset.js
  2. 6 6
      Build/build-reject-domainset.js
  3. 192 114
      Build/lib/parse-filter.js

+ 4 - 3
Build/build-phishing-domainset.js

@@ -69,10 +69,11 @@ const buildPhishingDomainSet = task(__filename, async () => {
     processFilterRules(
       'https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt',
       [
-        'https://malware-filter.gitlab.io/phishing-filter/phishing-filter-agh.txt',
-        'https://malware-filter.pages.dev/phishing-filter-agh.txt',
         'https://phishing-filter.pages.dev/phishing-filter-agh.txt'
-      ]
+        // Prefer mirror, since malware-filter.gitlab.io has not been updated for a while
+        // 'https://malware-filter.gitlab.io/malware-filter/phishing-filter-agh.txt'
+      ],
+      false
     ),
     getGorhillPublicSuffixPromise()
   ]);

+ 6 - 6
Build/build-reject-domainset.js

@@ -14,7 +14,7 @@ const { readFileByLine } = require('./lib/fetch-remote-text-by-line');
 const { createDomainSorter } = require('./lib/stable-sort-domain');
 const { traceSync, task } = require('./lib/trace-runner');
 const { getGorhillPublicSuffixPromise } = require('./lib/get-gorhill-publicsuffix');
-const { createCachedGorhillGetDomain } = require('./lib/cached-tld-parse');
+const tldts = require('tldts');
 
 /** Whitelists */
 const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST);
@@ -131,8 +131,6 @@ const buildRejectDomainSet = task(__filename, async () => {
   console.log(`Start deduping from black keywords/suffixes! (${previousSize})`);
   console.time('* Dedupe from black keywords/suffixes');
 
-  const kwfilter = createKeywordFilter(domainKeywordsSet);
-
   const trie1 = Trie.from(domainSets);
   domainSuffixSet.forEach(suffix => {
     trie1.find(suffix, true).forEach(f => domainSets.delete(f));
@@ -141,6 +139,9 @@ const buildRejectDomainSet = task(__filename, async () => {
     trie1.find(suffix, true).forEach(f => domainSets.delete(f));
   });
 
+  // remove pre-defined enforced blacklist from whitelist
+  const kwfilter = createKeywordFilter(domainKeywordsSet);
+
   // Build whitelist trie, to handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`)
   const trieWhite = Trie.from(filterRuleWhitelistDomainSets);
   for (const domain of domainSets) {
@@ -171,19 +172,18 @@ const buildRejectDomainSet = task(__filename, async () => {
   console.log(`Deduped ${previousSize - dudupedDominArray.length} rules!`);
 
   // Create reject stats
-  const getDomain = createCachedGorhillGetDomain(gorhill);
   /** @type {[string, number][]} */
   const rejectDomainsStats = traceSync(
     '* Collect reject domain stats',
     () => Object.entries(
       dudupedDominArray.reduce((acc, cur) => {
-        const suffix = getDomain(cur);
+        const suffix = tldts.getDomain(cur, { allowPrivateDomains: false });
         if (suffix) {
           acc[suffix] = (acc[suffix] ?? 0) + 1;
         }
         return acc;
       }, {})
-    ).filter(a => a[1] > 2).sort((a, b) => {
+    ).filter(a => a[1] > 10).sort((a, b) => {
       const t = b[1] - a[1];
       if (t !== 0) {
         return t;

+ 192 - 114
Build/lib/parse-filter.js

@@ -5,6 +5,7 @@ const { fetchRemoteTextAndCreateReadlineInterface } = require('./fetch-remote-te
 const { NetworkFilter } = require('@cliqz/adblocker');
 const { processLine } = require('./process-line');
 const { performance } = require('perf_hooks');
+const { getGorhillPublicSuffixPromise } = require('./get-gorhill-publicsuffix');
 
 const DEBUG_DOMAIN_TO_FIND = null; // example.com | null
 let foundDebugDomain = false;
@@ -22,14 +23,12 @@ const warnOnce = (url, isWhite, ...message) => {
 const normalizeDomain = (domain) => {
   if (!domain) return null;
 
-  const { isIcann, isPrivate, hostname, isIp } = tldts.parse(domain);
-  if (isIp) return null;
+  const parsed = tldts.parse(domain);
+  if (parsed.isIp) return null;
 
-  if (isIcann || isPrivate) {
-    if (hostname?.[0] === '.') {
-      return hostname.slice(1);
-    }
-    return hostname;
+  if (parsed.isIcann || parsed.isPrivate) {
+    const h = parsed.hostname;
+    return h?.[0] === '.' ? h.slice(1) : h;
   }
 
   return null;
@@ -122,51 +121,51 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
   /** @type Set<string> */
   const blacklistDomainSets = new Set();
 
-  const __addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
-    if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
-      warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
-      foundDebugDomain = true;
-    }
-
+  /**
+   * @param {string} domainToBeAddedToBlack
+   * @param {boolean} isSubDomain
+   */
+  const addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
     if (isSubDomain && domainToBeAddedToBlack[0] !== '.') {
       blacklistDomainSets.add(`.${domainToBeAddedToBlack}`);
     } else {
       blacklistDomainSets.add(domainToBeAddedToBlack);
     }
   };
-  const addToBlackList = DEBUG_DOMAIN_TO_FIND == null
-    ? __addToBlackList
-    : (domainToBeAddedToBlack, isSubDomain) => {
-      if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
-        warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
-        foundDebugDomain = true;
-      }
-      __addToBlackList(domainToBeAddedToBlack, isSubDomain);
-    };
-
-  const __addToWhiteList = (domainToBeAddedToWhite) => {
-    whitelistDomainSets.add(domainToBeAddedToWhite);
+  /**
+   * @param {string} domainToBeAddedToWhite
+   * @param {boolean} [isSubDomain]
+   */
+  const addToWhiteList = (domainToBeAddedToWhite, isSubDomain = true) => {
+    if (isSubDomain && domainToBeAddedToWhite[0] !== '.') {
+      blacklistDomainSets.add(`.${domainToBeAddedToWhite}`);
+    } else {
+      blacklistDomainSets.add(domainToBeAddedToWhite);
+    }
   };
-  const addToWhiteList = DEBUG_DOMAIN_TO_FIND == null
-    ? __addToWhiteList
-    : (domainToBeAddedToWhite) => {
-      if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToWhite.includes(DEBUG_DOMAIN_TO_FIND)) {
-        warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND);
-        foundDebugDomain = true;
-      }
-      __addToWhiteList(domainToBeAddedToWhite);
-    };
 
   let downloadTime = 0;
+  const gorhill = await getGorhillPublicSuffixPromise();
 
   const lineCb = (line) => {
-    const result = parse(line, includeThirdParties);
+    const result = parse(line, includeThirdParties, gorhill);
     if (result) {
       const flag = result[1];
       const hostname = result[0];
+
+      if (DEBUG_DOMAIN_TO_FIND) {
+        if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
+          warnOnce(filterRulesUrl.toString(), flag === 0 || flag === -1, DEBUG_DOMAIN_TO_FIND);
+          foundDebugDomain = true;
+        }
+      }
+
       switch (flag) {
         case 0:
-          addToWhiteList(hostname);
+          addToWhiteList(hostname, true);
+          break;
+        case -1:
+          addToWhiteList(hostname, false);
           break;
         case 1:
           addToBlackList(hostname, false);
@@ -183,7 +182,8 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
   if (!fallbackUrls || fallbackUrls.length === 0) {
     const downloadStart = performance.now();
     for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) {
-      lineCb(line.trim());
+      // don't trim here
+      lineCb(line);
     }
     downloadTime = performance.now() - downloadStart;
   } else {
@@ -202,7 +202,7 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
             return text;
           })
         )
-      ).split('\n').map(line => line.trim());
+      ).split('\n');
     } catch (e) {
       console.log(`Download Rule for [${filterRulesUrl}] failed`);
       throw e;
@@ -210,7 +210,7 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
     downloadTime = performance.now() - downloadStart;
 
     for (let i = 0, len = filterRules.length; i < len; i++) {
-      lineCb(filterRules[i].trim());
+      lineCb(filterRules[i]);
     }
   }
 
@@ -230,35 +230,44 @@ const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)
 /**
  * @param {string} $line
  * @param {boolean} includeThirdParties
- * @returns {null | [string, 0 | 1 | 2]} - 0 white, 1 black abosulte, 2 black include subdomain
+ * @param {import('gorhill-publicsuffixlist').default} gorhill
+ * @returns {null | [string, 0 | 1 | 2 | -1]} - 0 white include subdomain, 1 black abosulte, 2 black include subdomain, -1 white
  */
-function parse($line, includeThirdParties) {
-  const line = $line.trim();
-
+function parse($line, includeThirdParties, gorhill) {
   if (
-    line === ''
-    || line[0] === '/'
-    || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test(line)
     // doesn't include
-    || !line.includes('.') // rule with out dot can not be a domain
+    !$line.includes('.') // rule with out dot can not be a domain
     // includes
-    // || line.includes('#')
-    || line.includes('!')
-    || line.includes('?')
-    || line.includes('*')
-    // || line.includes('=')
-    || line.includes('[')
-    || line.includes('(')
-    || line.includes(']')
-    || line.includes(')')
-    || line.includes(',')
-    // || line.includes('~')
-    // || line.includes('&')
-    // || line.includes('%')
+    || $line.includes('!')
+    || $line.includes('?')
+    || $line.includes('*')
+    || $line.includes('[')
+    || $line.includes('(')
+    || $line.includes(']')
+    || $line.includes(')')
+    || $line.includes(',')
+    || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test($line)
+  ) {
+    return null;
+  }
+
+  const line = $line.trim();
+
+  const len = line.length;
+  if (len === 0) {
+    return null;
+  }
+
+  const firstChar = line[0];
+  const lastChar = line[len - 1];
+
+  if (
+    len === 0
+    || firstChar === '/'
     // ends with
-    || line.endsWith('.')
-    || line.endsWith('-')
-    || line.endsWith('_')
+    || lastChar === '.' // || line.endsWith('.')
+    || lastChar === '-' // || line.endsWith('-')
+    || lastChar === '_' // || line.endsWith('_')
     // special modifier
     || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
     || ((line.includes('/') || line.includes(':')) && !line.includes('://'))
@@ -286,52 +295,82 @@ function parse($line, includeThirdParties) {
     }
 
     if (
-      filter.hasHostname() // must have
+      filter.hostname // filter.hasHostname() // must have
       && filter.isPlain()
       && (!filter.isRegex())
       && (!filter.isFullRegex())
     ) {
-      const hostname = normalizeDomain(filter.getHostname());
-      if (hostname) {
-        if (filter.isException() || filter.isBadFilter()) {
-          return [hostname, 0];
-        }
-        if (filter.firstParty() === filter.thirdParty()) {
+      if (!gorhill.getDomain(filter.hostname)) {
+        return null;
+      }
+      const hostname = normalizeDomain(filter.hostname);
+      if (!hostname) {
+        return null;
+      }
+      if (filter.isException() || filter.isBadFilter()) {
+        return [hostname, 0];
+      }
+
+      const _1p = filter.firstParty();
+      const _3p = filter.thirdParty();
+      if (_1p === _3p) {
+        return [hostname, 2];
+      }
+      if (_3p) {
+        if (includeThirdParties) {
           return [hostname, 2];
         }
-        if (filter.thirdParty()) {
-          if (includeThirdParties) {
-            return [hostname, 2];
-          }
-          return null;
-        }
-        if (filter.firstParty()) {
-          return null;
-        }
-      } else {
+        return null;
+      }
+      if (_1p) {
         return null;
       }
     }
   }
 
+  /**
+   * abnormal filter that can not be parsed by NetworkFilter
+   */
+
   if (line.includes('$third-party') || line.includes('$frame')) {
+    /*
+     * `.bbelements.com^$third-party`
+     * `://o0e.ru^$third-party`
+     */
     return null;
   }
 
-  const lineEndsWithCaret = line.endsWith('^');
-  const lineEndsWithCaretVerticalBar = line.endsWith('^|');
-
-  if (line[0] === '@' && line[1] === '@') {
+  const lineEndsWithCaretOrCaretVerticalBar = (
+    lastChar === '^'
+    || (lastChar === '|' && line[len - 2] === '^')
+  );
+
+  // whitelist (exception)
+  if (firstChar === '@' && line[1] === '@') {
+    /**
+     * cname exceptional filter can not be parsed by NetworkFilter
+     *
+     * `@@||m.faz.net^$cname`
+     *
+     * Surge / Clash can't handle CNAME either, so we just ignore them
+     */
     if (line.endsWith('$cname')) {
       return null;
     }
 
+    /**
+     * Some "malformed" regex-based filters can not be parsed by NetworkFilter
+     * "$genericblock`" is also not supported by NetworkFilter
+     *
+     * `@@||cmechina.net^$genericblock`
+     * `@@|ftp.bmp.ovh^|`
+     * `@@|adsterra.com^|`
+     */
     if (
       // (line.startsWith('@@|') || line.startsWith('@@.'))
       (line[2] === '|' || line[2] === '.')
       && (
-        lineEndsWithCaret
-        || lineEndsWithCaretVerticalBar
+        lineEndsWithCaretOrCaretVerticalBar
         || line.endsWith('$genericblock')
         || line.endsWith('$document')
       )
@@ -352,22 +391,29 @@ function parse($line, includeThirdParties) {
       if (domain) {
         return [domain, 0];
       }
-      console.warn('      * [parse-filter E0001] (black) invalid domain:', _domain);
 
+      console.warn('      * [parse-filter E0001] (black) invalid domain:', _domain);
       return null;
     }
   }
 
   if (
-    line.startsWith('||')
+    firstChar === '|' && line[1] === '|'
     && (
-      lineEndsWithCaret
-      || lineEndsWithCaretVerticalBar
+      lineEndsWithCaretOrCaretVerticalBar
       || line.endsWith('$cname')
     )
   ) {
+    /**
+     * Some malformed filters can not be parsed by NetworkFilter:
+     *
+     * `||smetrics.teambeachbody.com^.com^`
+     * `||solutions.|pages.indigovision.com^`
+     * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
+     */
     const _domain = line
-      .replace('||', '')
+      // .replace('||', '')
+      .slice(2) // we already make sure line startsWith ||
       .replace('^|', '')
       .replace('$cname', '')
       .replaceAll('^', '')
@@ -382,20 +428,28 @@ function parse($line, includeThirdParties) {
     return null;
   }
 
-  const lineStartsWithSingleDot = line[0] === '.';
+  const lineStartsWithSingleDot = firstChar === '.';
   if (
     lineStartsWithSingleDot
-    && (
-      lineEndsWithCaret
-      || lineEndsWithCaretVerticalBar
-    )
+    && lineEndsWithCaretOrCaretVerticalBar
   ) {
+    /**
+     * `.ay.delivery^`
+     * `.m.bookben.com^`
+     * `.wap.x4399.com^`
+     */
     const _domain = line
+      .slice(1) // remove prefix dot
       .replace('^|', '')
       .replaceAll('^', '')
-      .slice(1)
       .trim();
 
+    const suffix = gorhill.getPublicSuffix(_domain);
+    if (!gorhill.suffixInPSL(suffix)) {
+      // This exclude domain-like resource like `1.1.4.514.js`
+      return null;
+    }
+
     const domain = normalizeDomain(_domain);
     if (domain) {
       return [domain, 2];
@@ -404,6 +458,12 @@ function parse($line, includeThirdParties) {
 
     return null;
   }
+
+  /**
+   * `|http://x.o2.pl^`
+   * `://mine.torrent.pw^`
+   * `://say.ac^`
+   */
   if (
     (
       line.startsWith('://')
@@ -412,10 +472,7 @@ function parse($line, includeThirdParties) {
       || line.startsWith('|http://')
       || line.startsWith('|https://')
     )
-    && (
-      lineEndsWithCaret
-      || lineEndsWithCaretVerticalBar
-    )
+    && lineEndsWithCaretOrCaretVerticalBar
   ) {
     const _domain = line
       .replace('|https://', '')
@@ -431,33 +488,54 @@ function parse($line, includeThirdParties) {
     if (domain) {
       return [domain, 1];
     }
-    console.warn('      * [parse-filter E0004] (black) invalid domain:', _domain);
 
+    console.warn('      * [parse-filter E0004] (black) invalid domain:', _domain);
     return null;
   }
-  if (line[0] !== '|' && lineEndsWithCaret) {
+
+  /**
+   * `_vmind.qqvideo.tc.qq.com^`
+   * `arketing.indianadunes.com^`
+   * `charlestownwyllie.oaklawnnonantum.com^`
+   * `-telemetry.officeapps.live.com^`
+   * `-tracker.biliapi.net`
+   * `_social_tracking.js^`
+   */
+  if (firstChar !== '|' && lastChar === '^') {
     const _domain = line.slice(0, -1);
     const domain = normalizeDomain(_domain);
     if (domain) {
       return [domain, 1];
     }
-    console.warn('      * [parse-filter E0005] (black) invalid domain:', _domain);
 
+    console.warn('      * [parse-filter E0005] (black) invalid domain:', _domain);
     return null;
   }
+
+  /**
+   * `.3.n.2.2.l30.js`
+   * `_prebid.js`
+   * `t.yesware.com`
+   * `ubmcmm.baidustatic.com`
+   * `portal.librus.pl$$advertisement-module`
+   * `@@-ds.metric.gstatic.com^|`
+   * `://gom.ge/cookie.js`
+   * `://accout-update-smba.jp.$document`
+   * `@@://googleadservices.com^|`
+   */
   const tryNormalizeDomain = normalizeDomain(line);
-  if (
-    tryNormalizeDomain
-    && (
-      lineStartsWithSingleDot
-        ? tryNormalizeDomain.length === line.length - 1
-        : tryNormalizeDomain === line
-    )
-  ) {
-    return [line, 2];
+  if (tryNormalizeDomain) {
+    if (tryNormalizeDomain === line) {
+      // the entire rule is domain
+      return [line, 2];
+    }
+    if (lineStartsWithSingleDot && tryNormalizeDomain === line.slice(1)) {
+      // dot prefixed line has stripped
+      return [line, 2];
+    }
   }
 
-  if (!line.endsWith('.js')) {
+  if (!line.endsWith('.js') && !line.endsWith('.css')) {
     console.warn('      * [parse-filter E0010] can not parse:', line);
   }