浏览代码

Refactor/Perf: rewrite how rules are salvaged

SukkaW 1 年之前
父节点
当前提交
2643903b24
共有 2 个文件被更改，包括 139 次插入和 310 次删除
  1. 131 310
      Build/lib/parse-filter.ts
  2. 8 0
      Source/non_ip/reject.conf

+ 131 - 310
Build/lib/parse-filter.ts

@@ -300,7 +300,7 @@ export function parse($line: string, result: [string, ParseType], allowThirdPart
     return result;
   }
 
-  let line = $line.trim();
+  const line = $line.trim();
 
   if (line.length === 0) {
     result[1] = ParseType.Null;
@@ -308,11 +308,14 @@ export function parse($line: string, result: [string, ParseType], allowThirdPart
   }
 
   const firstCharCode = line.charCodeAt(0);
-  let lastCharCode = line.charCodeAt(line.length - 1);
+  const lastCharCode = line.charCodeAt(line.length - 1);
 
   if (
     firstCharCode === 47 // 47 `/`
     // ends with
+    // _160-600.
+    // -detect-adblock.
+    // _web-advert.
     || lastCharCode === 46 // 46 `.`, line.endsWith('.')
     || lastCharCode === 45 // 45 `-`, line.endsWith('-')
     || lastCharCode === 95 // 95 `_`, line.endsWith('_')
@@ -405,263 +408,130 @@ export function parse($line: string, result: [string, ParseType], allowThirdPart
     }
   }
 
-  // After NetworkFilter.parse, it means the line can not be parsed by cliqz NetworkFilter
-  // We now need to "salvage" the line as much as possible
-
-  /*
+  /**
    * From now on, we are mostly facing non-standard domain rules (some are regex like)
-   * We first skip third-party and frame rules, as Surge / Clash can't handle them
    *
-   * `.sharecounter.$third-party`
-   * `.bbelements.com^$third-party`
-   * `://o0e.ru^$third-party`
-   * `.1.1.1.l80.js^$third-party`
+   * We can still salvage some of them by removing modifiers
    */
-  if (line.includes('$third-party')) {
-    if (!allowThirdParty) {
-      result[1] = ParseType.Null;
-      return result;
-    }
 
-    line = line
-      .replace('$third-party,', '$')
-      .replace('$third-party', '');
-  }
+  let sliceStart = 0;
+  let sliceEnd = 0;
 
-  lastCharCode = line.charCodeAt(line.length - 1);
+  // Reaching this point means the line could not be parsed by cliqz NetworkFilter (NetworkFilter.parse)
+  // We now need to "salvage" as much of the line as possible
 
-  /** @example line.endsWith('^') */
-  const lineEndsWithCaret = lastCharCode === 94; // lastChar === '^';
-  /** @example line.endsWith('|') */
-  const lineEndsWithVerticalBar = lastCharCode === 124; // lastChar === '|';
-  /** @example line.endsWith('^|') */
-  const lineEndsWithCaretVerticalBar = lineEndsWithVerticalBar && line[line.length - 2] === '^';
-  /** @example line.endsWith('^') || line.endsWith('^|') */
-  const lineEndsWithCaretOrCaretVerticalBar = lineEndsWithCaret || lineEndsWithCaretVerticalBar;
+  let white = false;
+  let includeAllSubDomain = false;
 
-  // whitelist (exception)
   if (
     firstCharCode === 64 // 64 `@`
-    && line[1] === '@'
+    && line.charCodeAt(1) === 64 // 64 `@`
   ) {
-    let whiteIncludeAllSubDomain = true;
-
-    /**
-     * Some "malformed" regex-based filters can not be parsed by NetworkFilter
-     * "$genericblock`" is also not supported by NetworkFilter, see:
-     *  https://github.com/ghostery/adblocker/blob/62caf7786ba10ef03beffecd8cd4eec111bcd5ec/packages/adblocker/test/parsing.test.ts#L950
-     *
-     * `@@||cmechina.net^$genericblock`
-     * `@@|ftp.bmp.ovh^|`
-     * `@@|adsterra.com^|`
-     * `@@.atlassian.net$document`
-     * `@@||ad.alimama.com^$genericblock`
-     */
-
-    let sliceStart = 0;
-    let sliceEnd: number | undefined;
-
-    switch (line[2]) {
-      case '|':
-        // line.startsWith('@@|')
-        sliceStart = 3;
-        whiteIncludeAllSubDomain = false;
-
-        if (line[3] === '|') { // line.startsWith('@@||')
-          sliceStart = 4;
-          whiteIncludeAllSubDomain = true;
-        }
+    sliceStart += 2;
+    white = true;
+    includeAllSubDomain = true;
+  }
 
-        break;
+  /**
+   * Some "malformed" regex-based filters can not be parsed by NetworkFilter
+   * `$genericblock` is also not supported by NetworkFilter, see:
+   *  https://github.com/ghostery/adblocker/blob/62caf7786ba10ef03beffecd8cd4eec111bcd5ec/packages/adblocker/test/parsing.test.ts#L950
+   *
+   * `@@||cmechina.net^$genericblock`
+   * `@@|ftp.bmp.ovh^|`
+   * `@@|adsterra.com^|`
+   * `@@.atlassian.net$document`
+   * `@@||ad.alimama.com^$genericblock`
+   */
 
-      case '.': { // line.startsWith('@@.')
-        sliceStart = 3;
-        whiteIncludeAllSubDomain = true;
-        break;
-      }
+  switch (line.charCodeAt(sliceStart)) {
+    case 124: /** | */
+      // line.startsWith('@@|') || line.startsWith('|')
+      sliceStart += 1;
+      includeAllSubDomain = false;
 
-      case ':': {
-        /**
-         * line.startsWith('@@://')
-         *
-         * `@@://googleadservices.com^|`
-         * `@@://www.googleadservices.com^|`
-         */
-        if (line[3] === '/' && line[4] === '/') {
-          whiteIncludeAllSubDomain = false;
-          sliceStart = 5;
-        }
-        break;
+      if (line[sliceStart] === '|') { // line.startsWith('@@||') || line.startsWith('||')
+        sliceStart += 1;
+        includeAllSubDomain = true;
       }
 
-      default:
-        break;
+      break;
+
+    case 46: { /** . */ // line.startsWith('@@.') || line.startsWith('.')
+      /**
+       * `.ay.delivery^`
+       * `.m.bookben.com^`
+       * `.wap.x4399.com^`
+       */
+      sliceStart += 1;
+      includeAllSubDomain = true;
+      break;
     }
 
-    if (lineEndsWithCaret) {
-      sliceEnd = -1;
-    } else if (lineEndsWithVerticalBar) {
-      // It is possible that a whitelist filter ends with '|' without '^|'
-      // @@|www.auslogics.com|
-      sliceEnd = lineEndsWithCaretVerticalBar ? -2 : -1;
-    } else if (line.endsWith('$genericblock')) {
-      sliceEnd = -13;
-      if (line[line.length - 14] === '^') { // line.endsWith('^$genericblock')
-        sliceEnd = -14;
-      }
-    } else if (line.endsWith('$document')) {
-      sliceEnd = -9;
-      if (line[line.length - 10] === '^') { // line.endsWith('^$document')
-        sliceEnd = -10;
+    default:
+      break;
+  }
+
+  switch (line.charCodeAt(sliceStart)) {
+    case 58: { /** : */
+      /**
+       * `@@://googleadservices.com^|`
+       * `@@://www.googleadservices.com^|`
+       * `://mine.torrent.pw^`
+       * `://say.ac^`
+       */
+      if (line[sliceStart + 1] === '/' && line[sliceStart + 2] === '/') {
+        includeAllSubDomain = false;
+        sliceStart += 3;
       }
+      break;
     }
 
-    if (sliceStart !== 0 || sliceEnd !== undefined) {
-      const sliced = line.slice(sliceStart, sliceEnd);
-      const domain = normalizeDomain(sliced);
-      if (domain) {
-        result[0] = domain;
-        result[1] = whiteIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute;
-        return result;
+    case 104: { /** h */
+      /** |http://x.o2.pl^ */
+      if (line.startsWith('http://', sliceStart)) {
+        sliceStart += 7;
+      } else if (line.startsWith('https://', sliceStart)) {
+        sliceStart += 8;
       }
-
-      result[0] = `[parse-filter E0001] (white) invalid domain: ${JSON.stringify({
-        line, sliced, sliceStart, sliceEnd, domain
-      })}`;
-      result[1] = ParseType.ErrorMessage;
-      return result;
+      break;
     }
 
-    result[0] = `[parse-filter E0006] (white) failed to parse: ${JSON.stringify({
-      line, sliceStart, sliceEnd
-    })}`;
-    result[1] = ParseType.ErrorMessage;
-    return result;
+    default:
+      break;
   }
 
-  if (
-    // 124 `|`
-    // line.startsWith('|')
-    firstCharCode === 124
-    && lineEndsWithCaretOrCaretVerticalBar
-  ) {
-    /**
-     * Some malformed filters can not be parsed by NetworkFilter:
-     *
-     * `||smetrics.teambeachbody.com^.com^`
-     * `||solutions.|pages.indigovision.com^`
-     * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
-     * `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
-     */
-
-    const includeAllSubDomain = line[1] === '|';
-
-    const sliceStart = includeAllSubDomain ? 2 : 1;
-    const sliceEnd = lineEndsWithCaret
-      ? -1
-      : (lineEndsWithCaretVerticalBar ? -2 : undefined);
-
-    const sliced = line.slice(sliceStart, sliceEnd); // we already make sure line startsWith "|"
-
-    const domain = normalizeDomain(sliced);
-    if (domain) {
-      result[0] = domain;
-      result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
-      return result;
-    }
-
-    result[0] = `[parse-filter E0002] (black) invalid domain: ${sliced}`;
-    result[1] = ParseType.ErrorMessage;
-    return result;
+  const indexOfDollar = line.indexOf('$', sliceStart);
+  if (indexOfDollar > -1) {
+    sliceEnd = indexOfDollar - line.length;
   }
 
-  // if (line.endsWith('$image')) {
-  //   /**
-  //    * Some $image filters are not NetworkFilter:
-  //    *
-  //    * `app.site123.com$image`
-  //    * `t.signaux$image`
-  //    * `track.customer.io$image`
-  //    */
-  // }
-  const lineStartsWithSingleDot = firstCharCode === 46; // 46 `.`
-
+  /*
+   * We skip third-party and frame rules, as Surge / Clash can't handle them
+   *
+   * `.sharecounter.$third-party`
+   * `.bbelements.com^$third-party`
+   * `://o0e.ru^$third-party`
+   * `.1.1.1.l80.js^$third-party`
+   */
   if (
-    lineStartsWithSingleDot
-    && lineEndsWithCaretOrCaretVerticalBar
+    !allowThirdParty
+    && (
+      line.includes('third-party', indexOfDollar + 1)
+      || line.includes('3p', indexOfDollar + 1)
+    )
   ) {
-    /**
-     * `.ay.delivery^`
-     * `.m.bookben.com^`
-     * `.wap.x4399.com^`
-     */
-    const sliced = line.slice(
-      1, // remove prefix dot
-      lineEndsWithCaret // replaceAll('^', '')
-        ? -1
-        : (lineEndsWithCaretVerticalBar ? -2 : undefined) // replace('^|', '')
-    );
-
-    const suffix = tldts.getPublicSuffix(sliced, looseTldtsOpt);
-    if (!suffix) {
-      // This exclude domain-like resource like `1.1.4.514.js`
-      result[1] = ParseType.Null;
-      return result;
-    }
-
-    const domain = normalizeDomain(sliced);
-    if (domain) {
-      result[0] = domain;
-      result[1] = ParseType.BlackIncludeSubdomain;
-      return result;
-    }
-
-    result[0] = `[parse-filter E0003] (black) invalid domain: ${JSON.stringify({ sliced, domain })}`;
-    result[1] = ParseType.ErrorMessage;
+    result[1] = ParseType.Null;
     return result;
   }
 
-  /**
-   * `|http://x.o2.pl^`
-   * `://mine.torrent.pw^`
-   * `://say.ac^`
-   */
-  if (lineEndsWithCaretOrCaretVerticalBar) {
-    let sliceStart = 0;
-    let sliceEnd;
-    if (lineEndsWithCaret) { // line.endsWith('^')
-      sliceEnd = -1;
-    } else if (lineEndsWithCaretVerticalBar) { // line.endsWith('^|')
-      sliceEnd = -2;
-    }
-    if (line.startsWith('://')) {
-      sliceStart = 3;
-    } else if (line.startsWith('http://')) {
-      sliceStart = 7;
-    } else if (line.startsWith('https://')) {
-      sliceStart = 8;
-    } else if (line.startsWith('|http://')) {
-      sliceStart = 8;
-    } else if (line.startsWith('|https://')) {
-      sliceStart = 9;
-    }
-
-    if (sliceStart !== 0 || sliceEnd !== undefined) {
-      const sliced = line.slice(sliceStart, sliceEnd);
-      const domain = normalizeDomain(sliced);
-      if (domain) {
-        result[0] = domain;
-        result[1] = ParseType.BlackIncludeSubdomain;
-        return result;
-      }
-
-      result[0] = `[parse-filter E0004] (black) invalid domain: ${JSON.stringify({
-        line, sliced, sliceStart, sliceEnd, domain
-      })}`;
-      result[1] = ParseType.ErrorMessage;
-      return result;
-    }
+  if (line.includes('badfilter', indexOfDollar + 1)) {
+    white = true;
+  }
+  if (line.includes('all', indexOfDollar + 1)) {
+    includeAllSubDomain = true;
   }
+
   /**
    * `_vmind.qqvideo.tc.qq.com^`
    * `arketing.indianadunes.com^`
@@ -671,103 +541,54 @@ export function parse($line: string, result: [string, ParseType], allowThirdPart
    * `-logging.nextmedia.com`
    * `_social_tracking.js^`
    */
-  if (
-    firstCharCode !== 124 // 124 `|`
-    && lastCharCode === 94 // 94 `^`
-  ) {
-    const _domain = line.slice(0, -1);
-
-    const suffix = tldts.getPublicSuffix(_domain, looseTldtsOpt);
-    if (!suffix) {
-      // This exclude domain-like resource like `_social_tracking.js^`
-      result[1] = ParseType.Null;
-      return result;
-    }
+  if (line.charCodeAt(line.length + sliceEnd - 1) === 94) { // 94 `^`
+    /** line.endsWith('^') */
+    sliceEnd -= 1;
+  } else if (line.charCodeAt(line.length + sliceEnd - 1) === 124) { // 124 `|`
+    /** line.endsWith('|') */
+    sliceEnd -= 1;
 
-    const domain = normalizeDomain(_domain);
-    if (domain) {
-      result[0] = domain;
-      result[1] = ParseType.BlackAbsolute;
-      return result;
+    if (line.charCodeAt(line.length + sliceEnd - 1) === 94) { // 94 `^`
+      /** line.endsWith('^|') */
+      sliceEnd -= 1;
     }
+  } else if (line.charCodeAt(line.length + sliceEnd - 1) === 46) { // 46 `.`
+    /** line.endsWith('.') */
+    sliceEnd -= 1;
+  }
 
-    result[0] = `[parse-filter E0005] (black) invalid domain: ${_domain}`;
+  const sliced = (sliceStart > 0 || sliceEnd < 0) ? line.slice(sliceStart, sliceEnd === 0 ? undefined : sliceEnd) : line;
+  if (sliced.charCodeAt(0) === 45 /* - */) {
+    // a sliced value that starts with '-' is not a valid domain
     result[1] = ParseType.ErrorMessage;
+    result[0] = `[parse-filter E0001] (${white ? 'white' : 'black'}) invalid domain: ${JSON.stringify({
+      line, sliced, sliceStart, sliceEnd
+    })}`;
     return result;
   }
 
-  // Possibly that entire rule is domain
-
-  /**
-   * lineStartsWithSingleDot:
-   *
-   * `.cookielaw.js`
-   * `.content_tracking.js`
-   * `.ads.css`
-   *
-   * else:
-   *
-   * `_prebid.js`
-   * `t.yesware.com`
-   * `ubmcmm.baidustatic.com`
-   * `://www.smfg-card.$document`
-   * `portal.librus.pl$$advertisement-module`
-   * `@@-ds.metric.gstatic.com^|`
-   * `://gom.ge/cookie.js`
-   * `://accout-update-smba.jp.$document`
-   * `_200x250.png`
-   * `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
-   */
-  let sliceStart = 0;
-  let sliceEnd = line.length;
-  let isWhieList = false;
-
-  if (lineStartsWithSingleDot) {
-    // .usercentrics.eu^
-    sliceStart = 1;
-  } else if (firstCharCode === 58 /** : */ && line.startsWith('://')) {
-    // ://backcb.one^$all
-    sliceStart = 3;
-  }
-
-  if (line.endsWith('$all')) {
-    sliceEnd -= 4;
-  } else if (line.endsWith('$document')) {
-    sliceEnd -= 9;
-  } else if (line.endsWith('$badfilter')) {
-    isWhieList = true;
-    sliceEnd -= 10;
-  }
-
-  const charBeforeModifier = line.charCodeAt(sliceEnd - 1);
-  if (
-    charBeforeModifier === 94 /** ^$all, ^$document, etc. */
-    || charBeforeModifier === 46 /** .$all */
-  ) {
-    sliceEnd -= 1;
+  const suffix = tldts.getPublicSuffix(sliced, looseTldtsOpt);
+  if (!suffix) {
+    // This excludes domain-like resources such as `_social_tracking.js^`
+    result[1] = ParseType.Null;
+    return result;
   }
 
-  const sliced = (sliceStart !== 0 || sliceEnd !== line.length) ? line.slice(sliceStart, sliceEnd) : line;
-
-  const tryNormalizeDomain = normalizeDomain(sliced);
-  if (tryNormalizeDomain === sliced) {
-    // the entire rule is domain
-    result[0] = sliced;
-    result[1] = isWhieList
-      ? ParseType.WhiteIncludeSubdomain
-      : ParseType.BlackIncludeSubdomain;
+  const domain = normalizeDomain(sliced);
+  if (domain) {
+    result[0] = domain;
 
+    if (white) {
+      result[1] = includeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute;
+    } else {
+      result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
+    }
     return result;
   }
 
-  console.log({
-    line,
-    lineEndsWithCaret,
-    lineEndsWithCaretOrCaretVerticalBar,
-    lineEndsWithCaretVerticalBar
-  });
-
-  result[0] = `[parse-filter ${tryNormalizeDomain === null ? 'E0010' : 'E0011'}] can not parse: ${JSON.stringify({ line, tryNormalizeDomain, sliced, sliceStart, sliceEnd })}`;
+  result[0] = `[parse-filter E0010] (${white ? 'white' : 'black'}) invalid domain: ${JSON.stringify({
+    line, domain, suffix, sliced, sliceStart, sliceEnd
+  })}`;
   result[1] = ParseType.ErrorMessage;
   return result;
 }

+ 8 - 0
Source/non_ip/reject.conf

@@ -67,6 +67,14 @@ DOMAIN-KEYWORD,-attr.appsflyersdk.com
 DOMAIN-KEYWORD,-s2s.sensic.net
 DOMAIN-KEYWORD,-rtb.gravite.net
 
+# >> Migrate from EasyPrivacy
+DOMAIN-KEYWORD,analytics-cdn.
+DOMAIN-KEYWORD,backstory.ebay.
+DOMAIN-KEYWORD,click.rum.
+DOMAIN-KEYWORD,cmpworker.
+DOMAIN-KEYWORD,insights-collector.
+DOMAIN-KEYWORD,track.opentable.
+
 DOMAIN-WILDCARD,f-log*.grammarly.io
 DOMAIN-WILDCARD,*.ad.*.prod.hosts.ooklaserver.net