浏览代码

Perf: faster adguard filter syntax parsing

SukkaW 1 年之前
父节点
当前提交
05ccd9fa50
共有 4 个文件被更改，包括 42 次插入和 29 次删除
  1. 5 0
      Build/constants/loose-tldts-opt.ts
  2. 17 11
      Build/lib/normalize-domain.ts
  3. 10 2
      Build/lib/parse-filter.test.ts
  4. 10 16
      Build/lib/parse-filter.ts

+ 5 - 0
Build/constants/loose-tldts-opt.ts

@@ -12,3 +12,8 @@ export const loosTldOptWithPrivateDomains: Parameters<typeof tldts.getSubdomain>
   ...looseTldtsOpt,
   ...looseTldtsOpt,
   allowPrivateDomains: true
   allowPrivateDomains: true
 };
 };
+
+export const normalizeTldtsOpt: Parameters<typeof tldts.getSubdomain>[1] = {
+  allowPrivateDomains: true
+  // detectIp: true
+};

+ 17 - 11
Build/lib/normalize-domain.ts

@@ -1,26 +1,32 @@
 // https://github.com/remusao/tldts/issues/2121
 // https://github.com/remusao/tldts/issues/2121
 // import tldts from 'tldts-experimental';
 // import tldts from 'tldts-experimental';
 import tldts from 'tldts';
 import tldts from 'tldts';
-export const normalizeDomain = (domain: string) => {
-  if (!domain) return null;
+import { normalizeTldtsOpt } from '../constants/loose-tldts-opt';
+
+type TldTsParsed = ReturnType<typeof tldts.parse>;
+
+export const normalizeDomain = (domain: string, parsed: TldTsParsed | null = null) => {
+  if (domain.length === 0) return null;
+
+  parsed ??= tldts.parse(domain, normalizeTldtsOpt);
 
 
-  const parsed = tldts.parse(domain, { allowPrivateDomains: true, allowIcannDomains: true, detectIp: true });
   if (parsed.isIp) return null;
   if (parsed.isIp) return null;
-  if (!parsed.hostname) return null;
-  // Private invalid domain (things like .tor, .dn42, etc)
-  if (!parsed.isIcann && !parsed.isPrivate) return null;
 
 
   let h = parsed.hostname;
   let h = parsed.hostname;
+  if (h === null) return null;
+  // Private invalid domain (things like .tor, .dn42, etc)
+  if (!parsed.isIcann && !parsed.isPrivate) return null;
 
 
-  let sliceStart: number | undefined;
-  let sliceEnd: number | undefined;
+  let sliceStart = 0;
+  let sliceEnd = 0;
 
 
   if (h[0] === '.') sliceStart = 1;
   if (h[0] === '.') sliceStart = 1;
-  if (h.endsWith('.')) sliceEnd = -1;
+  // eslint-disable-next-line sukka/string/prefer-string-starts-ends-with -- performance
+  if (h[h.length - 1] === '.') sliceEnd = -1;
 
 
-  if (sliceStart !== undefined || sliceEnd !== undefined) {
+  if (sliceStart !== 0 || sliceEnd !== 0) {
     h = h.slice(sliceStart, sliceEnd);
     h = h.slice(sliceStart, sliceEnd);
   }
   }
 
 
-  return h || null;
+  return h.length > 0 ? h : null;
 };
 };

+ 10 - 2
Build/lib/parse-filter.test.ts

@@ -1,12 +1,20 @@
 import { describe, it } from 'mocha';
 import { describe, it } from 'mocha';
 
 
-import { processFilterRules } from './parse-filter';
+import { parse, processFilterRules, type ParseType } from './parse-filter';
 import { createCacheKey } from './cache-filesystem';
 import { createCacheKey } from './cache-filesystem';
 import { createSpan } from '../trace';
 import { createSpan } from '../trace';
 
 
 const cacheKey = createCacheKey(__filename);
 const cacheKey = createCacheKey(__filename);
 
 
-describe('processFilterRules', () => {
+describe('parse', () => {
+  const MUTABLE_PARSE_LINE_RESULT: [string, ParseType] = ['', 1000];
+
+  it('||top.mail.ru^$badfilter', () => {
+    console.log(parse('||top.mail.ru^$badfilter', MUTABLE_PARSE_LINE_RESULT));
+  });
+});
+
+describe.skip('processFilterRules', () => {
   it('https://filters.adtidy.org/extension/ublock/filters/18_optimized.txt', () => {
   it('https://filters.adtidy.org/extension/ublock/filters/18_optimized.txt', () => {
     console.log(processFilterRules(
     console.log(processFilterRules(
       createSpan('noop'),
       createSpan('noop'),

+ 10 - 16
Build/lib/parse-filter.ts

@@ -143,6 +143,8 @@ const enum ParseType {
   Null = 1000
   Null = 1000
 }
 }
 
 
+export { type ParseType };
+
 export async function processFilterRules(
 export async function processFilterRules(
   parentSpan: Span,
   parentSpan: Span,
   filterRulesUrl: string,
   filterRulesUrl: string,
@@ -289,10 +291,12 @@ const kwfilter = createKeywordFilter([
   '$popup',
   '$popup',
   '$removeparam',
   '$removeparam',
   '$popunder',
   '$popunder',
-  '$cname'
+  '$cname',
+  // some bad syntax
+  '^popup'
 ]);
 ]);
 
 
-function parse($line: string, result: [string, ParseType]): [hostname: string, flag: ParseType] {
+export function parse($line: string, result: [string, ParseType]): [hostname: string, flag: ParseType] {
   if (
   if (
     // doesn't include
     // doesn't include
     !$line.includes('.') // rule with out dot can not be a domain
     !$line.includes('.') // rule with out dot can not be a domain
@@ -685,6 +689,7 @@ function parse($line: string, result: [string, ParseType]): [hostname: string, f
    */
    */
   let sliceStart = 0;
   let sliceStart = 0;
   let sliceEnd: number | undefined;
   let sliceEnd: number | undefined;
+
   if (lineStartsWithSingleDot) {
   if (lineStartsWithSingleDot) {
     sliceStart = 1;
     sliceStart = 1;
   }
   }
@@ -696,28 +701,17 @@ function parse($line: string, result: [string, ParseType]): [hostname: string, f
     line.endsWith('$document')
     line.endsWith('$document')
   ) {
   ) {
     sliceEnd = -9;
     sliceEnd = -9;
+  } else if (line.endsWith('$badfilter')) {
+    sliceEnd = -10;
   }
   }
   const sliced = (sliceStart !== 0 || sliceEnd !== undefined) ? line.slice(sliceStart, sliceEnd) : line;
   const sliced = (sliceStart !== 0 || sliceEnd !== undefined) ? line.slice(sliceStart, sliceEnd) : line;
-  const suffix = tldts.getPublicSuffix(sliced, looseTldtsOpt);
-  /**
-   * Fast exclude definitely not domain-like resource
-   *
-   * `.gatracking.js`, suffix is `js`,
-   * `.ads.css`, suffix is `css`,
-   * `-cpm-ads.$badfilter`, suffix is `$badfilter`,
-   * `portal.librus.pl$$advertisement-module`, suffix is `pl$$advertisement-module`
-   */
-  if (!suffix) {
-    // This exclude domain-like resource like `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
-    result[1] = ParseType.Null;
-    return result;
-  }
 
 
   const tryNormalizeDomain = normalizeDomain(sliced);
   const tryNormalizeDomain = normalizeDomain(sliced);
   if (tryNormalizeDomain === sliced) {
   if (tryNormalizeDomain === sliced) {
     // the entire rule is domain
     // the entire rule is domain
     result[0] = sliced;
     result[0] = sliced;
     result[1] = ParseType.BlackIncludeSubdomain;
     result[1] = ParseType.BlackIncludeSubdomain;
+
     return result;
     return result;
   }
   }