ソースを参照

Perf: make reject parsing faster & more robust

SukkaW 2 年 前
コミット
91df00f7f3
1 ファイル変更159 行追加127 行削除
  1. 159 127
      Build/lib/parse-filter.ts

+ 159 - 127
Build/lib/parse-filter.ts

@@ -5,7 +5,7 @@ import { processLine } from './process-line';
 import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
 import type { PublicSuffixList } from '@gorhill/publicsuffixlist';
 
-import { traceAsync } from './trace-runner';
+import { traceAsync, traceSync } from './trace-runner';
 import picocolors from 'picocolors';
 import { normalizeDomain } from './normalize-domain';
 import { fetchAssets } from './fetch-assets';
@@ -156,9 +156,13 @@ export async function processFilterRules(
         () => fetchAssets(filterRulesUrl, fallbackUrls),
         picocolors.gray
       )).split('\n');
+
+      const key = picocolors.gray(`- parse adguard filter ${filterRulesUrl}`);
+      console.time(key);
       for (let i = 0, len = filterRules.length; i < len; i++) {
         lineCb(filterRules[i]);
       }
+      console.timeEnd(key);
     }
   });
 
@@ -305,17 +309,19 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
   }
 
   /** @example line.endsWith('^') */
-  const linedEndsWithCaret = lastCharCode === 94; // lastChar === '^';
+  const lineEndsWithCaret = lastCharCode === 94; // lastChar === '^';
   /** @example line.endsWith('^|') */
   const lineEndsWithCaretVerticalBar = (lastCharCode === 124 /** lastChar === '|' */) && line[len - 2] === '^';
   /** @example line.endsWith('^') || line.endsWith('^|') */
-  const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;
+  const lineEndsWithCaretOrCaretVerticalBar = lineEndsWithCaret || lineEndsWithCaretVerticalBar;
 
   // whitelist (exception)
   if (
     firstCharCode === 64 // 64 `@`
     && line[1] === '@'
   ) {
+    let whiteIncludeAllSubDomain = true;
+
     /**
      * Some "malformed" regex-based filters can not be parsed by NetworkFilter
      * "$genericblock`" is also not supported by NetworkFilter, see:
@@ -331,22 +337,27 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
     let sliceStart = 0;
     let sliceEnd: number | undefined;
 
-    // line.startsWith('@@|') || line.startsWith('@@.')
-    if (line[2] === '|' || line[2] === '.') {
+    if (line[2] === '|') { // line.startsWith('@@|')
       sliceStart = 3;
-      // line.startsWith('@@||')
-      if (line[3] === '|') {
+      whiteIncludeAllSubDomain = false;
+
+      if (line[3] === '|') { // line.startsWith('@@||')
         sliceStart = 4;
+        whiteIncludeAllSubDomain = true;
       }
-    }
-
-    /**
-     * line.startsWith('@@://')
-     *
-     * `@@://googleadservices.com^|`
-     * `@@://www.googleadservices.com^|`
-     */
-    if (line[2] === ':' && line[3] === '/' && line[4] === '/') {
+    } else if (line[2] === '.') { // line.startsWith('@@.')
+      sliceStart = 3;
+      whiteIncludeAllSubDomain = true;
+    } else if (
+      /**
+       * line.startsWith('@@://')
+       *
+       * `@@://googleadservices.com^|`
+       * `@@://www.googleadservices.com^|`
+       */
+      line[2] === ':' && line[3] === '/' && line[4] === '/'
+    ) {
+      whiteIncludeAllSubDomain = false;
       sliceStart = 5;
     }
 
@@ -368,7 +379,7 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
       const sliced = line.slice(sliceStart, sliceEnd);
       const domain = normalizeDomain(sliced);
       if (domain) {
-        return [domain, ParseType.WhiteIncludeSubdomain];
+        return [domain, whiteIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute];
       }
       return [
         `[parse-filter E0001] (white) invalid domain: ${JSON.stringify({
@@ -386,40 +397,39 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
     ];
   }
 
-  if (firstCharCode === 124) { // 124 `|`
-    if (lineEndsWithCaretOrCaretVerticalBar) {
-      /**
-       * Some malformed filters can not be parsed by NetworkFilter:
-       *
-       * `||smetrics.teambeachbody.com^.com^`
-       * `||solutions.|pages.indigovision.com^`
-       * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
-       * `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
-       */
-
-      const includeAllSubDomain = line[1] === '|';
+  if (
+    // 124 `|`
+    // line.startsWith('|')
+    firstCharCode === 124
+    && lineEndsWithCaretOrCaretVerticalBar
+  ) {
+    /**
+     * Some malformed filters can not be parsed by NetworkFilter:
+     *
+     * `||smetrics.teambeachbody.com^.com^`
+     * `||solutions.|pages.indigovision.com^`
+     * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
+     * `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
+     */
 
-      const sliceStart = includeAllSubDomain ? 2 : 1;
-      const sliceEnd = lastCharCode === 94 // lastChar === '^'
-        ? -1
-        : (lineEndsWithCaretVerticalBar
-          ? -2
-          : undefined);
+    const includeAllSubDomain = line[1] === '|';
 
-      const _domain = line
-        .slice(sliceStart, sliceEnd) // we already make sure line startsWith "|"
-        .trim();
+    const sliceStart = includeAllSubDomain ? 2 : 1;
+    const sliceEnd = lineEndsWithCaret
+      ? -1
+      : (lineEndsWithCaretVerticalBar ? -2 : undefined);
 
-      const domain = normalizeDomain(_domain);
-      if (domain) {
-        return [domain, includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute];
-      }
+    const sliced = line.slice(sliceStart, sliceEnd); // we already make sure line startsWith "|"
 
-      return [
-        `[parse-filter E0002] (black) invalid domain: ${_domain}`,
-        ParseType.ErrorMessage
-      ];
+    const domain = normalizeDomain(sliced);
+    if (domain) {
+      return [domain, includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute];
     }
+
+    return [
+      `[parse-filter E0002] (black) invalid domain: ${sliced}`,
+      ParseType.ErrorMessage
+    ];
   }
 
   const lineStartsWithSingleDot = firstCharCode === 46; // 46 `.`
@@ -432,75 +442,78 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
      * `.m.bookben.com^`
      * `.wap.x4399.com^`
      */
-    const _domain = line.slice(
+    const sliced = line.slice(
       1, // remove prefix dot
-      linedEndsWithCaret // replaceAll('^', '')
+      lineEndsWithCaret // replaceAll('^', '')
         ? -1
-        : (lineEndsWithCaretVerticalBar ? -2 : 0) // replace('^|', '')
+        : (lineEndsWithCaretVerticalBar ? -2 : undefined) // replace('^|', '')
     );
 
-    const suffix = gorhill.getPublicSuffix(_domain);
+    const suffix = gorhill.getPublicSuffix(sliced);
     if (!gorhill.suffixInPSL(suffix)) {
       // This exclude domain-like resource like `1.1.4.514.js`
       return null;
     }
 
-    const domain = normalizeDomain(_domain);
+    const domain = normalizeDomain(sliced);
     if (domain) {
       return [domain, ParseType.BlackIncludeSubdomain];
     }
 
     return [
-      `[paparse-filter E0003] (black) invalid domain: ${_domain}`,
+      `[paparse-filter E0003] (black) invalid domain: ${sliced}`,
       ParseType.ErrorMessage
     ];
   }
 
   /**
- * `|http://x.o2.pl^`
- * `://mine.torrent.pw^`
- * `://say.ac^`
- */
-  if (
-    (
-      line.startsWith('://')
-      || line.startsWith('http://')
-      || line.startsWith('https://')
-      || line.startsWith('|http://')
-      || line.startsWith('|https://')
-    )
-    && lineEndsWithCaretOrCaretVerticalBar
-  ) {
-    const _domain = line
-      .replace('|https://', '')
-      .replace('https://', '')
-      .replace('|http://', '')
-      .replace('http://', '')
-      .replace('://', '')
-      .replace('^|', '')
-      .replaceAll('^', '')
-      .trim();
-
-    const domain = normalizeDomain(_domain);
-    if (domain) {
-      return [domain, ParseType.BlackAbsolute];
+   * `|http://x.o2.pl^`
+   * `://mine.torrent.pw^`
+   * `://say.ac^`
+   */
+  if (lineEndsWithCaretOrCaretVerticalBar) {
+    let sliceStart = 0;
+    let sliceEnd;
+    if (lineEndsWithCaret) { // line.endsWith('^')
+      sliceEnd = -1;
+    } else if (lineEndsWithCaretVerticalBar) { // line.endsWith('^|')
+      sliceEnd = -2;
+    }
+    if (line.startsWith('://')) {
+      sliceStart = 3;
+    } else if (line.startsWith('http://')) {
+      sliceStart = 7;
+    } else if (line.startsWith('https://')) {
+      sliceStart = 8;
+    } else if (line.startsWith('|http://')) {
+      sliceStart = 8;
+    } else if (line.startsWith('|https://')) {
+      sliceStart = 9;
     }
 
-    return [
-      `[parse-filter E0004] (black) invalid domain: ${_domain}`,
-      ParseType.ErrorMessage
-    ];
+    if (sliceStart !== 0 || sliceEnd !== undefined) {
+      const sliced = line.slice(sliceStart, sliceEnd);
+      const domain = normalizeDomain(sliced);
+      if (domain) {
+        return [domain, ParseType.BlackIncludeSubdomain];
+      }
+      return [
+        `[parse-filter E0004] (black) invalid domain: ${JSON.stringify({
+          line, sliced, sliceStart, sliceEnd
+        })}`,
+        ParseType.ErrorMessage
+      ];
+    }
   }
-
   /**
- * `_vmind.qqvideo.tc.qq.com^`
- * `arketing.indianadunes.com^`
- * `charlestownwyllie.oaklawnnonantum.com^`
- * `-telemetry.officeapps.live.com^`
- * `-tracker.biliapi.net`
- * `-logging.nextmedia.com`
- * `_social_tracking.js^`
- */
+   * `_vmind.qqvideo.tc.qq.com^`
+   * `arketing.indianadunes.com^`
+   * `charlestownwyllie.oaklawnnonantum.com^`
+   * `-telemetry.officeapps.live.com^`
+   * `-tracker.biliapi.net`
+   * `-logging.nextmedia.com`
+   * `_social_tracking.js^`
+   */
   if (
     firstCharCode !== 124 // 124 `|`
     && lastCharCode === 94 // 94 `^`
@@ -524,43 +537,62 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
     ];
   }
 
-  if (lineStartsWithSingleDot) {
-    /**
-     * `.cookielaw.js`
-     * `.content_tracking.js`
-     * `.ads.css`
-     */
-    const _domain = line.slice(1);
+  // Possibly that entire rule is domain
 
-    const suffix = gorhill.getPublicSuffix(_domain);
-    if (!suffix || !gorhill.suffixInPSL(suffix)) {
-      // This exclude domain-like resource like `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
-      return null;
-    }
+  /**
+   * lineStartsWithSingleDot:
+   *
+   * `.cookielaw.js`
+   * `.content_tracking.js`
+   * `.ads.css`
+   *
+   * else:
+   *
+   * `_prebid.js`
+   * `t.yesware.com`
+   * `ubmcmm.baidustatic.com`
+   * `://www.smfg-card.$document`
+   * `portal.librus.pl$$advertisement-module`
+   * `@@-ds.metric.gstatic.com^|`
+   * `://gom.ge/cookie.js`
+   * `://accout-update-smba.jp.$document`
+   * `_200x250.png`
+   * `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
+   */
+  let sliceStart = 0;
+  let sliceEnd: number | undefined;
+  if (lineStartsWithSingleDot) {
+    sliceStart = 1;
+  }
+  if (line.endsWith('^$all')) { // This salvage line `thepiratebay3.com^$all`
+    sliceEnd = -5;
+  } else if (
+    // Try to salvage line like `://account.smba.$document`
+    // For this specific line, it will fail anyway though.
+    line.endsWith('$document')
+  ) {
+    sliceEnd = -9;
+  }
+  const sliced = (sliceStart !== 0 || sliceEnd !== undefined) ? line.slice(sliceStart, sliceEnd) : line;
+  const suffix = gorhill.getPublicSuffix(sliced);
+  /**
+   * Fast exclude definitely not domain-like resource
+   *
+   * `.gatracking.js`, suffix is `js`,
+   * `.ads.css`, suffix is `css`,
+   * `-cpm-ads.$badfilter`, suffix is `$badfilter`,
+   * `portal.librus.pl$$advertisement-module`, suffix is `pl$$advertisement-module`
+   */
+  if (!suffix || !gorhill.suffixInPSL(suffix)) {
+    // This exclude domain-like resource like `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
+    console.log({ line, suffix });
+    return null;
+  }
 
-    const tryNormalizeDomain = normalizeDomain(_domain);
-    if (tryNormalizeDomain === _domain) {
-      // the entire rule is domain
-      return [line, ParseType.BlackIncludeSubdomain];
-    }
-  } else {
-    /**
-     * `_prebid.js`
-     * `t.yesware.com`
-     * `ubmcmm.baidustatic.com`
-     * `://www.smfg-card.$document`
-     * `portal.librus.pl$$advertisement-module`
-     * `@@-ds.metric.gstatic.com^|`
-     * `://gom.ge/cookie.js`
-     * `://accout-update-smba.jp.$document`
-     * `_200x250.png`
-     * `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
-     */
-    const tryNormalizeDomain = normalizeDomain(line);
-    if (tryNormalizeDomain === line) {
-      // the entire rule is domain
-      return [line, ParseType.BlackIncludeSubdomain];
-    }
+  const tryNormalizeDomain = normalizeDomain(sliced);
+  if (tryNormalizeDomain === sliced) {
+    // the entire rule is domain
+    return [sliced, ParseType.BlackIncludeSubdomain];
   }
 
   return [