ソースを参照

Improve parsing (ready for reject ip and reject wildcard)

SukkaW 9 ヶ月 前
コミット
172e4efd8a
2 ファイル変更76 行追加24 行削除
  1. 2 2
      Build/constants/loose-tldts-opt.ts
  2. 74 22
      Build/lib/parse-filter/filters.ts

+ 2 - 2
Build/constants/loose-tldts-opt.ts

@@ -3,9 +3,9 @@ import type * as tldts from 'tldts';
 export const looseTldtsOpt: NonNullable<Parameters<typeof tldts.getSubdomain>[1]> = {
   allowPrivateDomains: false,
   extractHostname: false,
+  mixedInputs: false,
   validateHostname: false,
-  detectIp: false,
-  mixedInputs: false
+  detectIp: false
 };
 
 export const loosTldOptWithPrivateDomains: NonNullable<Parameters<typeof tldts.getSubdomain>[1]> = {

+ 74 - 22
Build/lib/parse-filter/filters.ts

@@ -4,9 +4,8 @@ import { fetchAssets } from '../fetch-assets';
 import { onBlackFound, onWhiteFound } from './shared';
 import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie';
 import { looseTldtsOpt } from '../../constants/loose-tldts-opt';
-import tldts from 'tldts-experimental';
+import tldts from 'tldts';
 import { NetworkFilter } from '@ghostery/adblocker';
-import { fastNormalizeDomain, fastNormalizeDomainWithoutWww, fastNormalizeDomainWithoutWwwNoIP } from '../normalize-domain';
 import { isProbablyIpv4, isProbablyIpv6 } from 'foxts/is-probably-ip';
 
 const enum ParseType {
@@ -232,16 +231,31 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
       && filter.isPlain() // isPlain() === !isRegex()
       && (!filter.isFullRegex())
     ) {
+      const white = filter.isException() || filter.isBadFilter();
+
       // We don't want tldts to call its own "extractHostname" on ip, bail out ip first.
       // Now ip has been bailed out, we can safely set normalizeTldtsOpt.detectIp to false.
       if (isProbablyIpv4(filter.hostname) || isProbablyIpv6(filter.hostname)) {
+        if (white) {
+          // We do not support whitelist IP anyway.
+          result[1] = ParseType.Null;
+          return result;
+        }
         result[0] = filter.hostname;
         result[1] = ParseType.BlackIP;
         return result;
       }
 
-      const hostname = fastNormalizeDomainWithoutWwwNoIP(filter.hostname);
-      if (!hostname) {
+      const parsed = tldts.parse(filter.hostname, looseTldtsOpt);
+
+      /**
+       * We can exclude wildcard in TLD
+       *
+       * ||example.*
+       *
+       * This also exclude non standard TLD like `.tor`, `.onion`, `.dn42`, etc.
+       */
+      if (!parsed.publicSuffix || !parsed.isIcann || !parsed.hostname || !parsed.domain) {
         result[1] = ParseType.Null;
         return result;
       }
@@ -251,12 +265,23 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
       //  |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
       const isIncludeAllSubDomain = filter.isHostnameAnchor();
 
-      if (filter.isException() || filter.isBadFilter()) {
-        result[0] = hostname;
+      let hostname = parsed.hostname;
+      if (white) {
+        result[0] = filter.hostname;
         result[1] = isIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute;
         return result;
       }
 
+      // we only strip www when it is blacklist
+      if (parsed.subdomain) {
+        if (parsed.subdomain === 'www' || parsed.subdomain === 'xml-v4') {
+          hostname = parsed.domain;
+        }
+        if (parsed.subdomain.startsWith('www.')) {
+          hostname = parsed.subdomain.slice(4) + '.' + parsed.domain;
+        }
+      }
+
       const _1p = filter.firstParty();
       const _3p = filter.thirdParty();
 
@@ -439,38 +464,65 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
     return result;
   }
 
-  if (sliced.charCodeAt(0) === 45 /* - */) {
-    // line.startsWith('-') is not a valid domain
-    result[1] = ParseType.ErrorMessage;
-    result[0] = `[parse-filter E0001] (${white ? 'white' : 'black'}) invalid domain: ${JSON.stringify({
+  // We don't want tldts to call its own "extractHostname" on ip, bail out ip first.
+  // Now ip has been bailed out, we can safely set normalizeTldtsOpt.detectIp to false.
+  if (isProbablyIpv4(sliced) || isProbablyIpv6(sliced)) {
+    // TODO: we might want to implements reject ip in the future
+    result[0] = `[parse-filter E0002] (${white ? 'white' : 'black'}) ip: ${JSON.stringify({
       line, sliced, sliceStart, sliceEnd
     })}`;
+    result[1] = ParseType.ErrorMessage;
     return result;
   }
 
-  const suffix = tldts.getPublicSuffix(sliced, looseTldtsOpt);
-  if (!suffix) {
-    // This exclude domain-like resource like `_social_tracking.js^`
+  const parsed = tldts.parse(sliced, looseTldtsOpt);
+
+  /**
+   * We can exclude wildcard in TLD
+   *
+   * ||example.*
+   *
+   * We can also exclude URL path pattern like this, since TLD and file extension don't overlapped
+   *
+   * -ad.css
+   * -ad.js
+   *
+   * This also exclude non standard TLD like `.tor`, `.onion`, `.dn42`, etc.
+   */
+  if (!parsed.publicSuffix || !parsed.isIcann || !parsed.hostname || !parsed.domain) {
     result[1] = ParseType.Null;
     return result;
   }
 
-  const normalizer = white ? fastNormalizeDomain : fastNormalizeDomainWithoutWww;
-  const domain = normalizer(sliced);
-
-  if (domain) {
-    result[0] = domain;
-
+  // no wildcard, we can safely normalize it˝
+  if (!parsed.hostname.includes('*')) {
     if (white) {
+      result[0] = parsed.hostname;
       result[1] = includeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute;
-    } else {
-      result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
+      return result;
     }
+
+    // blacklist, we can strip www from subdomain
+    if (parsed.subdomain) {
+      if (parsed.subdomain === 'www' || parsed.subdomain === 'xml-v4') {
+        result[0] = parsed.domain;
+        result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
+        return result;
+      }
+      if (parsed.subdomain.startsWith('www.')) {
+        result[0] = parsed.subdomain.slice(4) + '.' + parsed.domain;
+        result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
+        return result;
+      }
+    }
+
+    result[0] = parsed.hostname;
+    result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
     return result;
   }
 
   result[0] = `[parse-filter E0010] (${white ? 'white' : 'black'}) invalid domain: ${JSON.stringify({
-    line, domain, suffix, sliced, sliceStart, sliceEnd
+    line, sliced, sliceStart, sliceEnd, parsed
   })}`;
   result[1] = ParseType.ErrorMessage;
   return result;