Browse Source

Refactor: speed up reject parsing

SukkaW 2 years ago
parent
commit
16a08bd07d

+ 2 - 2
Build/build-internal-cdn-rules.ts

@@ -58,7 +58,7 @@ export const buildInternalCDNDomains = task(import.meta.path, async () => {
     }
   };
 
-  const [gorhill] = await Promise.all([
+  const gorhill = (await Promise.all([
     getGorhillPublicSuffixPromise(),
     processLocalRuleSet(path.resolve(import.meta.dir, '../List/non_ip/cdn.conf')),
     processLocalRuleSet(path.resolve(import.meta.dir, '../List/non_ip/global.conf')),
@@ -70,7 +70,7 @@ export const buildInternalCDNDomains = task(import.meta.path, async () => {
     processLocalDomainSet(path.resolve(import.meta.dir, '../List/domainset/download.conf')),
 
     fsp.mkdir(path.resolve(import.meta.dir, '../List/internal'), { recursive: true })
-  ]);
+  ]))[0];
 
   return compareAndWriteFile(
     [

+ 2 - 2
Build/build-internal-chn-domains.ts

@@ -5,10 +5,10 @@ import { task } from './lib/trace-runner';
 import { compareAndWriteFile } from './lib/create-file';
 
 export const buildInternalChnDomains = task(import.meta.path, async () => {
-  const [result] = await Promise.all([
+  const result = (await Promise.all([
     parseFelixDnsmasq('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf'),
     fsp.mkdir(path.resolve(import.meta.dir, '../List/internal'), { recursive: true })
-  ]);
+  ]))[0];
 
   return compareAndWriteFile(
     result.map(line => `SUFFIX,${line}`),

+ 2 - 2
Build/build-internal-reverse-chn-cidr.ts

@@ -25,10 +25,10 @@ const RESERVED_IPV4_CIDR = [
 ];
 
 export const buildInternalReverseChnCIDR = task(import.meta.path, async () => {
-  const [cidr] = await Promise.all([
+  const cidr = (await Promise.all([
     processLineFromReadline(await fetchRemoteTextAndReadByLine('https://raw.githubusercontent.com/misakaio/chnroutes2/master/chnroutes.txt')),
     fsp.mkdir(path.resolve(import.meta.dir, '../List/internal'), { recursive: true })
-  ]);
+  ]))[0];
 
   const reversedCidr = exclude(
     [

+ 2 - 2
Build/build-speedtest-domainset.ts

@@ -16,10 +16,10 @@ const latestTopUserAgentsPromise = fetchWithRetry('https://unpkg.com/top-user-ag
   .then(res => res.json() as Promise<string[]>);
 
 const querySpeedtestApi = async (keyword: string): Promise<Array<string | null>> => {
-  const [topUserAgents] = await Promise.all([
+  const topUserAgents = (await Promise.all([
     latestTopUserAgentsPromise,
     s.acquire()
-  ]);
+  ]))[0];
 
   const randomUserAgent = topUserAgents[Math.floor(Math.random() * topUserAgents.length)];
 

+ 4 - 4
Build/download-previous-build.ts

@@ -53,10 +53,10 @@ export const downloadPreviousBuild = task(import.meta.path, async () => {
   await traceAsync(
     'Download and extract previous build',
     async () => {
-      const [resp] = await Promise.all([
+      const resp = (await Promise.all([
         fetchWithRetry('https://codeload.github.com/sukkalab/ruleset.skk.moe/tar.gz/master', defaultRequestInit),
         fsp.mkdir(extractedPath, { recursive: true })
-      ]);
+      ]))[0];
 
       const extract = tarStream.extract();
       Readable.fromWeb(resp.body!).pipe(zlib.createGunzip()).pipe(extract);
@@ -88,10 +88,10 @@ export const downloadPublicSuffixList = task(import.meta.path, async () => {
   const publicSuffixDir = path.resolve(import.meta.dir, '../node_modules/.cache');
   const publicSuffixPath = path.join(publicSuffixDir, 'public_suffix_list_dat.txt');
 
-  const [resp] = await Promise.all([
+  const resp = (await Promise.all([
     fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit),
     fsp.mkdir(publicSuffixDir, { recursive: true })
-  ]);
+  ]))[0];
 
   return Bun.write(publicSuffixPath, resp as Response);
 }, 'download-publicsuffixlist');

+ 1 - 1
Build/lib/cached-tld-parse.ts

@@ -1,6 +1,6 @@
 import * as tldts from 'tldts';
 import { createCache } from './cache-apply';
-import type { PublicSuffixList } from 'gorhill-publicsuffixlist';
+import type { PublicSuffixList } from '@gorhill/publicsuffixlist';
 
 const cache = createCache('cached-tld-parse', true);
 

+ 61 - 0
Build/lib/fetch-assets.ts

@@ -0,0 +1,61 @@
+import picocolors from 'picocolors';
+import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
+
+class CustomAbortError extends Error {
+  public readonly name = 'AbortError';
+  public readonly digest = 'AbortError';
+}
+
+const sleepWithAbort = (ms: number, signal: AbortSignal) => new Promise<void>((resolve, reject) => {
+  signal.throwIfAborted();
+  signal.addEventListener('abort', stop);
+  Bun.sleep(ms).then(done).catch(doReject);
+
+  function done() {
+    signal.removeEventListener('abort', stop);
+    resolve();
+  }
+  function stop(this: AbortSignal) {
+    reject(this.reason);
+  }
+  function doReject(reason: unknown) {
+    signal.removeEventListener('abort', stop);
+    reject(reason);
+  }
+});
+
+export async function fetchAssets(url: string, fallbackUrls: string[] | readonly string[]) {
+  const controller = new AbortController();
+
+  const fetchMainPromise = fetchWithRetry(url, { signal: controller.signal, ...defaultRequestInit })
+    .then(r => r.text())
+    .then(text => {
+      controller.abort();
+      return text;
+    });
+  const createFetchFallbackPromise = async (url: string, index: number) => {
+    // Most assets can be downloaded within 250ms. To avoid wasting bandwidth, each fallback waits at least 500ms (plus a 20ms stagger per fallback index) before it starts downloading.
+    try {
+      await sleepWithAbort(500 + (index + 1) * 20, controller.signal);
+    } catch {
+      console.log(picocolors.gray('[fetch cancelled early]'), picocolors.gray(url));
+      throw new CustomAbortError();
+    }
+    if (controller.signal.aborted) {
+      console.log(picocolors.gray('[fetch cancelled]'), picocolors.gray(url));
+      throw new CustomAbortError();
+    }
+    const res = await fetchWithRetry(url, { signal: controller.signal, ...defaultRequestInit });
+    const text = await res.text();
+    controller.abort();
+    return text;
+  };
+
+  return Promise.any([
+    fetchMainPromise,
+    ...fallbackUrls.map(createFetchFallbackPromise)
+  ]).catch(e => {
+    console.log(`Download Rule for [${url}] failed`);
+    throw e;
+  });
+}

+ 2 - 2
Build/lib/get-gorhill-publicsuffix.ts

@@ -2,7 +2,7 @@ import { toASCII } from 'punycode';
 import path from 'path';
 import { traceAsync } from './trace-runner';
 import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
-import type { PublicSuffixList } from 'gorhill-publicsuffixlist';
+import type { PublicSuffixList } from '@gorhill/publicsuffixlist';
 
 const publicSuffixPath = path.resolve(import.meta.dir, '../../node_modules/.cache/public_suffix_list_dat.txt');
 
@@ -18,7 +18,7 @@ const getGorhillPublicSuffix = () => traceAsync('create gorhill public suffix in
         console.log('public_suffix_list.dat not found, fetch directly from remote.');
         return r.text();
       }),
-    import('gorhill-publicsuffixlist')
+    import('@gorhill/publicsuffixlist')
   ]);
 
   gorhill.parse(publicSuffixListDat, toASCII);

+ 95 - 140
Build/lib/parse-filter.ts

@@ -1,21 +1,20 @@
 // @ts-check
-import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
-
 import { fetchRemoteTextAndReadByLine } from './fetch-text-by-line';
 import { NetworkFilter } from '@cliqz/adblocker';
 import { processLine } from './process-line';
 import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
-import type { PublicSuffixList } from 'gorhill-publicsuffixlist';
+import type { PublicSuffixList } from '@gorhill/publicsuffixlist';
 
 import { traceAsync } from './trace-runner';
 import picocolors from 'picocolors';
 import { normalizeDomain } from './normalize-domain';
+import { fetchAssets } from './fetch-assets';
 
 const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
 let foundDebugDomain = false;
 
 const warnOnceUrl = new Set<string>();
-const warnOnce = (url: string, isWhite: boolean, ...message: any[]) => {
+const warnOnce = (url: string, isWhite: boolean, ...message: string[]) => {
   const key = `${url}${isWhite ? 'white' : 'black'}`;
   if (warnOnceUrl.has(key)) {
     return;
@@ -54,7 +53,7 @@ export function processHosts(hostsUrl: string, includeAllSubDomain = false, skip
         continue;
       }
 
-      const [, domain] = line.split(/\s/);
+      const domain = line.split(/\s/)[1];
       if (!domain) {
         continue;
       }
@@ -185,7 +184,9 @@ export async function processFilterRules(
 }
 
 const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#%&=~]/;
-const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;
+const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder|\$cname)/;
+// cname exceptional filter can not be parsed by NetworkFilter
+// Surge / Clash can't handle CNAME either, so we just ignore them
 
 function parse($line: string, gorhill: PublicSuffixList): null | [hostname: string, flag: ParseType] {
   if (
@@ -213,15 +214,15 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
     return null;
   }
 
-  const firstChar = line[0];
-  const lastChar = line[len - 1];
+  const firstCharCode = line[0].charCodeAt(0);
+  const lastCharCode = line[len - 1].charCodeAt(0);
 
   if (
-    firstChar === '/'
+    firstCharCode === 47 // 47 `/`
     // ends with
-    || lastChar === '.' // || line.endsWith('.')
-    || lastChar === '-' // || line.endsWith('-')
-    || lastChar === '_' // || line.endsWith('_')
+    || lastCharCode === 46 // 46 `.`, line.endsWith('.')
+    || lastCharCode === 45 // 45 `-`, line.endsWith('-')
+    || lastCharCode === 95 // 95 `_`, line.endsWith('_')
     // special modifier
     || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
     // || line.includes('$popup')
@@ -238,6 +239,8 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
   const filter = NetworkFilter.parse(line);
   if (filter) {
     if (
+      // filter.isCosmeticFilter() // always false
+      // filter.isNetworkFilter() // always true
       filter.isElemHide()
       || filter.isGenericHide()
       || filter.isSpecificHide()
@@ -253,8 +256,7 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
 
     if (
       filter.hostname // filter.hasHostname() // must have
-      && filter.isPlain()
-      // && (!filter.isRegex()) // isPlain() === !isRegex()
+      && filter.isPlain() // isPlain() === !isRegex()
       && (!filter.isFullRegex())
     ) {
       const hostname = normalizeDomain(filter.hostname);
@@ -286,95 +288,106 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
     }
   }
 
-  /**
-   * abnormal filter that can not be parsed by NetworkFilter
+  // Reaching this point means the line could not be handled through cliqz NetworkFilter.parse above.
+  // We now need to "salvage" as much of the line as possible.
+
+  /*
+   * From now on, we are mostly facing non-standard domain rules (some are regex like)
+   * We first skip third-party and frame rules, as Surge / Clash can't handle them
+   *
+   * `.sharecounter.$third-party`
+   * `.bbelements.com^$third-party`
+   * `://o0e.ru^$third-party`
+   * `.1.1.1.l80.js^$third-party`
    */
-
   if (line.includes('$third-party') || line.includes('$frame')) {
-    /*
-     * `.bbelements.com^$third-party`
-     * `://o0e.ru^$third-party`
-     */
     return null;
   }
 
   /** @example line.endsWith('^') */
-  const linedEndsWithCaret = lastChar === '^';
+  const linedEndsWithCaret = lastCharCode === 94; // lastChar === '^';
   /** @example line.endsWith('^|') */
-  const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^';
+  const lineEndsWithCaretVerticalBar = (lastCharCode === 124 /** lastChar === '|' */) && line[len - 2] === '^';
   /** @example line.endsWith('^') || line.endsWith('^|') */
   const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;
 
   // whitelist (exception)
-  if (firstChar === '@' && line[1] === '@') {
+  if (
+    firstCharCode === 64 // 64 `@`
+    && line[1] === '@'
+  ) {
     /**
-     * cname exceptional filter can not be parsed by NetworkFilter
-     *
-     * `@@||m.faz.net^$cname`
+     * Some "malformed" regex-based filters can not be parsed by NetworkFilter
+     * "$genericblock" is also not supported by NetworkFilter, see:
+     *  https://github.com/ghostery/adblocker/blob/62caf7786ba10ef03beffecd8cd4eec111bcd5ec/packages/adblocker/test/parsing.test.ts#L950
      *
-     * Surge / Clash can't handle CNAME either, so we just ignore them
+     * `@@||cmechina.net^$genericblock`
+     * `@@|ftp.bmp.ovh^|`
+     * `@@|adsterra.com^|`
+     * `@@.atlassian.net$document`
+     * `@@||ad.alimama.com^$genericblock`
      */
-    if (line.endsWith('$cname')) {
-      return null;
+
+    let sliceStart = 0;
+    let sliceEnd: number | undefined;
+
+    // line.startsWith('@@|') || line.startsWith('@@.')
+    if (line[2] === '|' || line[2] === '.') {
+      sliceStart = 3;
+      // line.startsWith('@@||')
+      if (line[3] === '|') {
+        sliceStart = 4;
+      }
     }
 
     /**
-     * Some "malformed" regex-based filters can not be parsed by NetworkFilter
-     * "$genericblock`" is also not supported by NetworkFilter
+     * line.startsWith('@@://')
      *
-     * `@@||cmechina.net^$genericblock`
-     * `@@|ftp.bmp.ovh^|`
-     * `@@|adsterra.com^|`
+     * `@@://googleadservices.com^|`
+     * `@@://www.googleadservices.com^|`
      */
-    if (
-      (
-        // line.startsWith('@@|')
-        line[2] === '|'
-        // line.startsWith('@@.')
-        || line[2] === '.'
-        /**
-         * line.startsWith('@@://')
-         *
-         * `@@://googleadservices.com^|`
-         * `@@://www.googleadservices.com^|`
-         */
-        || (line[2] === ':' && line[3] === '/' && line[4] === '/')
-      )
-      && (
-        lineEndsWithCaretOrCaretVerticalBar
-        || line.endsWith('$genericblock')
-        || line.endsWith('$document')
-      )
-    ) {
-      const _domain = line
-        .replace('@@||', '')
-        .replace('@@://', '')
-        .replace('@@|', '')
-        .replace('@@.', '')
-        .replace('^|', '')
-        .replace('^$genericblock', '')
-        .replace('$genericblock', '')
-        .replace('^$document', '')
-        .replace('$document', '')
-        .replaceAll('^', '')
-        .trim();
+    if (line[2] === ':' && line[3] === '/' && line[4] === '/') {
+      sliceStart = 5;
+    }
 
-      const domain = normalizeDomain(_domain);
+    if (lineEndsWithCaretOrCaretVerticalBar) {
+      sliceEnd = -2;
+    } else if (line.endsWith('$genericblock')) {
+      sliceEnd = -13;
+      if (line[len - 14] === '^') { // line.endsWith('^$genericblock')
+        sliceEnd = -14;
+      }
+    } else if (line.endsWith('$document')) {
+      sliceEnd = -9;
+      if (line[len - 10] === '^') { // line.endsWith('^$document')
+        sliceEnd = -10;
+      }
+    }
+
+    if (sliceStart !== 0 || sliceEnd !== undefined) {
+      const sliced = line.slice(sliceStart, sliceEnd);
+      const domain = normalizeDomain(sliced);
       if (domain) {
         return [domain, ParseType.WhiteIncludeSubdomain];
       }
-
       return [
-        `[parse-filter E0001] (white) invalid domain: ${_domain}`,
+        `[parse-filter E0001] (white) invalid domain: ${JSON.stringify({
+          line, sliced, sliceStart, sliceEnd
+        })}`,
         ParseType.ErrorMessage
       ];
     }
-  }
 
-  if (firstChar === '|') {
-    const lineEndswithCname = line.endsWith('$cname');
+    return [
+      `[parse-filter E0006] (white) failed to parse: ${JSON.stringify({
+        line, sliceStart, sliceEnd
+      })}`,
+      ParseType.ErrorMessage
+    ];
+  }
 
-    if (lineEndsWithCaretOrCaretVerticalBar || lineEndswithCname) {
+  if (firstCharCode === 124) { // 124 `|`
+    if (lineEndsWithCaretOrCaretVerticalBar) {
       /**
        * Some malformed filters can not be parsed by NetworkFilter:
        *
@@ -387,12 +400,11 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
       const includeAllSubDomain = line[1] === '|';
 
       const sliceStart = includeAllSubDomain ? 2 : 1;
-      const sliceEnd = lastChar === '^'
+      const sliceEnd = lastCharCode === 94 // lastChar === '^'
         ? -1
-        : lineEndsWithCaretOrCaretVerticalBar
+        : (lineEndsWithCaretVerticalBar
           ? -2
-          // eslint-disable-next-line sukka/unicorn/no-nested-ternary -- speed
-          : (lineEndswithCname ? -6 : 0);
+          : undefined);
 
       const _domain = line
         .slice(sliceStart, sliceEnd) // we already make sure line startsWith "|"
@@ -410,7 +422,7 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
     }
   }
 
-  const lineStartsWithSingleDot = firstChar === '.';
+  const lineStartsWithSingleDot = firstCharCode === 46; // 46 `.`
   if (
     lineStartsWithSingleDot
     && lineEndsWithCaretOrCaretVerticalBar
@@ -489,7 +501,10 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
  * `-logging.nextmedia.com`
  * `_social_tracking.js^`
  */
-  if (firstChar !== '|' && lastChar === '^') {
+  if (
+    firstCharCode !== 124 // 124 `|`
+    && lastCharCode === 94 // 94 `^`
+  ) {
     const _domain = line.slice(0, -1);
 
     const suffix = gorhill.getPublicSuffix(_domain);
@@ -553,63 +568,3 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
     ParseType.ErrorMessage
   ];
 }
-
-class CustomAbortError extends Error {
-  public readonly name = 'AbortError';
-  public readonly digest = 'AbortError';
-}
-
-const sleepWithAbort = (ms: number, signal: AbortSignal) => new Promise<void>((resolve, reject) => {
-  signal.throwIfAborted();
-  signal.addEventListener('abort', stop);
-  Bun.sleep(ms).then(done).catch(doReject);
-
-  function done() {
-    signal.removeEventListener('abort', stop);
-    resolve();
-  }
-  function stop(this: AbortSignal) {
-    reject(this.reason);
-  }
-  function doReject(reason: unknown) {
-    signal.removeEventListener('abort', stop);
-    reject(reason);
-  }
-});
-
-async function fetchAssets(url: string, fallbackUrls: string[] | readonly string[]) {
-  const controller = new AbortController();
-
-  const fetchMainPromise = fetchWithRetry(url, { signal: controller.signal, ...defaultRequestInit })
-    .then(r => r.text())
-    .then(text => {
-      console.log(picocolors.gray('[fetch finish]'), picocolors.gray(url));
-      controller.abort();
-      return text;
-    });
-  const createFetchFallbackPromise = async (url: string, index: number) => {
-    // Most assets can be downloaded within 250ms. To avoid wasting bandwidth, we will wait for 350ms before downloading from the fallback URL.
-    try {
-      await sleepWithAbort(300 + (index + 1) * 20, controller.signal);
-    } catch {
-      console.log(picocolors.gray('[fetch cancelled early]'), picocolors.gray(url));
-      throw new CustomAbortError();
-    }
-    if (controller.signal.aborted) {
-      console.log(picocolors.gray('[fetch cancelled]'), picocolors.gray(url));
-      throw new CustomAbortError();
-    }
-    const res = await fetchWithRetry(url, { signal: controller.signal, ...defaultRequestInit });
-    const text = await res.text();
-    controller.abort();
-    return text;
-  };
-
-  return Promise.any([
-    fetchMainPromise,
-    ...fallbackUrls.map(createFetchFallbackPromise)
-  ]).catch(e => {
-    console.log(`Download Rule for [${url}] failed`);
-    throw e;
-  });
-}

+ 1 - 1
Build/lib/stable-sort-domain.ts

@@ -1,4 +1,4 @@
-import type { PublicSuffixList } from 'gorhill-publicsuffixlist';
+import type { PublicSuffixList } from '@gorhill/publicsuffixlist';
 import { createCachedGorhillGetDomain } from './cached-tld-parse';
 
 const compare = (a: string | null, b: string | null) => {

+ 1 - 1
Build/mod.d.ts

@@ -1,4 +1,4 @@
-declare module 'gorhill-publicsuffixlist' {
+declare module '@gorhill/publicsuffixlist' {
   type Selfie =
     | string
     | {

BIN
bun.lockb


+ 1 - 1
package.json

@@ -15,13 +15,13 @@
   "license": "ISC",
   "dependencies": {
     "@cliqz/adblocker": "^1.26.12",
+    "@gorhill/publicsuffixlist": "^3.0.1",
     "@sukka/listdir": "^0.3.1",
     "async-retry": "^1.3.3",
     "async-sema": "^3.1.1",
     "ci-info": "^4.0.0",
     "csv-parse": "^5.5.3",
     "fast-cidr-tools": "^0.2.2",
-    "gorhill-publicsuffixlist": "github:gorhill/publicsuffixlist.js",
     "mnemonist": "^0.39.6",
     "path-scurry": "^1.10.1",
     "picocolors": "^1.0.0",