Browse Source

Refactor: rework reject hosts parser

SukkaW 2 years ago
parent
commit
065eeff91f

+ 112 - 75
Build/lib/parse-filter.js

@@ -20,6 +20,9 @@ const warnOnce = (url, isWhite, ...message) => {
   console.warn(url, isWhite ? '(white)' : '(black)', ...message);
   console.warn(url, isWhite ? '(white)' : '(black)', ...message);
 };
 };
 
 
+/**
+ * @param {string} domain
+ */
 const normalizeDomain = (domain) => {
 const normalizeDomain = (domain) => {
   if (!domain) return null;
   if (!domain) return null;
 
 
@@ -28,7 +31,10 @@ const normalizeDomain = (domain) => {
 
 
   if (parsed.isIcann || parsed.isPrivate) {
   if (parsed.isIcann || parsed.isPrivate) {
     const h = parsed.hostname;
     const h = parsed.hostname;
-    return h?.[0] === '.' ? h.slice(1) : h;
+
+    if (h === null) return null;
+
+    return h[0] === '.' ? h.slice(1) : h;
   }
   }
 
 
   return null;
   return null;
@@ -259,6 +265,7 @@ function parse($line, gorhill) {
 
 
   const line = $line.trim();
   const line = $line.trim();
 
 
+  /** @example line.length */
   const len = line.length;
   const len = line.length;
   if (len === 0) {
   if (len === 0) {
     return null;
     return null;
@@ -268,15 +275,13 @@ function parse($line, gorhill) {
   const lastChar = line[len - 1];
   const lastChar = line[len - 1];
 
 
   if (
   if (
-    len === 0
-    || firstChar === '/'
+    firstChar === '/'
     // ends with
     // ends with
     || lastChar === '.' // || line.endsWith('.')
     || lastChar === '.' // || line.endsWith('.')
     || lastChar === '-' // || line.endsWith('-')
     || lastChar === '-' // || line.endsWith('-')
     || lastChar === '_' // || line.endsWith('_')
     || lastChar === '_' // || line.endsWith('_')
     // special modifier
     // special modifier
     || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
     || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
-    || ((line.includes('/') || line.includes(':')) && !line.includes('://'))
     // || line.includes('$popup')
     // || line.includes('$popup')
     // || line.includes('$removeparam')
     // || line.includes('$removeparam')
     // || line.includes('$popunder')
     // || line.includes('$popunder')
@@ -284,6 +289,10 @@ function parse($line, gorhill) {
     return null;
     return null;
   }
   }
 
 
+  if ((line.includes('/') || line.includes(':')) && !line.includes('://')) {
+    return null;
+  }
+
   const filter = NetworkFilter.parse(line);
   const filter = NetworkFilter.parse(line);
   if (filter) {
   if (filter) {
     if (
     if (
@@ -352,9 +361,11 @@ function parse($line, gorhill) {
     return null;
     return null;
   }
   }
 
 
+  /** @example line.endsWith('^') */
   const linedEndsWithCaret = lastChar === '^';
   const linedEndsWithCaret = lastChar === '^';
+  /** @example line.endsWith('^|') */
   const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^';
   const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^';
-
+  /** @example line.endsWith('^') || line.endsWith('^|') */
   const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;
   const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;
 
 
   // whitelist (exception)
   // whitelist (exception)
@@ -379,8 +390,19 @@ function parse($line, gorhill) {
      * `@@|adsterra.com^|`
      * `@@|adsterra.com^|`
      */
      */
     if (
     if (
-      // (line.startsWith('@@|') || line.startsWith('@@.'))
-      (line[2] === '|' || line[2] === '.')
+      (
+        // line.startsWith('@@|')
+        line[2] === '|'
+        // line.startsWith('@@.')
+        || line[2] === '.'
+        /**
+         * line.startsWith('@@://')
+         *
+         * `@@://googleadservices.com^|`
+         * `@@://www.googleadservices.com^|`
+         */
+        || (line[2] === ':' && line[3] === '/' && line[4] === '/')
+      )
       && (
       && (
         lineEndsWithCaretOrCaretVerticalBar
         lineEndsWithCaretOrCaretVerticalBar
         || line.endsWith('$genericblock')
         || line.endsWith('$genericblock')
@@ -389,6 +411,7 @@ function parse($line, gorhill) {
     ) {
     ) {
       const _domain = line
       const _domain = line
         .replace('@@||', '')
         .replace('@@||', '')
+        .replace('@@://', '')
         .replace('@@|', '')
         .replace('@@|', '')
         .replace('@@.', '')
         .replace('@@.', '')
         .replace('^|', '')
         .replace('^|', '')
@@ -409,37 +432,41 @@ function parse($line, gorhill) {
     }
     }
   }
   }
 
 
-  if (firstChar === '|' && (lineEndsWithCaretOrCaretVerticalBar || line.endsWith('$cname'))) {
-    /**
-     * Some malformed filters can not be parsed by NetworkFilter:
-     *
-     * `||smetrics.teambeachbody.com^.com^`
-     * `||solutions.|pages.indigovision.com^`
-     * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
-     */
-
-    const includeAllSubDomain = line[1] === '|';
+  if (firstChar === '|') {
+    const lineEndswithCname = line.endsWith('$cname');
+
+    if (lineEndsWithCaretOrCaretVerticalBar || lineEndswithCname) {
+      /**
+       * Some malformed filters can not be parsed by NetworkFilter:
+       *
+       * `||smetrics.teambeachbody.com^.com^`
+       * `||solutions.|pages.indigovision.com^`
+       * `||vystar..0rg@client.iebetanialaargentina.edu.co^`
+       * `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
+       */
+
+      const includeAllSubDomain = line[1] === '|';
+
+      const sliceStart = includeAllSubDomain ? 2 : 1;
+      const sliceEnd = lastChar === '^'
+        ? -1
+        : lineEndsWithCaretOrCaretVerticalBar
+          ? -2
+          // eslint-disable-next-line sukka/unicorn/no-nested-ternary -- speed
+          : (lineEndswithCname ? -6 : 0);
 
 
-    const sliceStart = includeAllSubDomain ? 2 : 1;
-    const sliceEnd = lastChar === '^'
-      ? -1
-      : lineEndsWithCaretOrCaretVerticalBar
-        ? -2
-        // eslint-disable-next-line sukka/unicorn/no-nested-ternary -- speed
-        : (line.endsWith('$cname') ? -6 : 0);
+      const _domain = line
+        .slice(sliceStart, sliceEnd) // we already make sure line startsWith "|"
+        .trim();
 
 
-    const _domain = line
-      // .replace('||', '')
-      .slice(sliceStart, sliceEnd) // we already make sure line startsWith ||
-      .trim();
+      const domain = normalizeDomain(_domain);
+      if (domain) {
+        return [domain, includeAllSubDomain ? 2 : 1];
+      }
+      console.warn('      * [parse-filter E0002] (black) invalid domain:', _domain);
 
 
-    const domain = normalizeDomain(_domain);
-    if (domain) {
-      return [domain, includeAllSubDomain ? 2 : 1];
+      return null;
     }
     }
-    console.warn('      * [parse-filter E0002] (black) invalid domain:', _domain);
-
-    return null;
   }
   }
 
 
   const lineStartsWithSingleDot = firstChar === '.';
   const lineStartsWithSingleDot = firstChar === '.';
@@ -452,16 +479,12 @@ function parse($line, gorhill) {
      * `.m.bookben.com^`
      * `.m.bookben.com^`
      * `.wap.x4399.com^`
      * `.wap.x4399.com^`
      */
      */
-    const _domain = line
-      .slice(
-        1,
-        linedEndsWithCaret
-          ? -1
-          : (lineEndsWithCaretVerticalBar ? -2 : 0)
-      ) // remove prefix dot
-      .replace('^|', '')
-      .replaceAll('^', '')
-      .trim();
+    const _domain = line.slice(
+      1, // remove prefix dot
+      linedEndsWithCaret // replaceAll('^', '')
+        ? -1
+        : (lineEndsWithCaretVerticalBar ? -2 : 0) // replace('^|', '')
+    );
 
 
     const suffix = gorhill.getPublicSuffix(_domain);
     const suffix = gorhill.getPublicSuffix(_domain);
     if (!gorhill.suffixInPSL(suffix)) {
     if (!gorhill.suffixInPSL(suffix)) {
@@ -479,10 +502,10 @@ function parse($line, gorhill) {
   }
   }
 
 
   /**
   /**
-   * `|http://x.o2.pl^`
-   * `://mine.torrent.pw^`
-   * `://say.ac^`
-   */
+ * `|http://x.o2.pl^`
+ * `://mine.torrent.pw^`
+ * `://say.ac^`
+ */
   if (
   if (
     (
     (
       line.startsWith('://')
       line.startsWith('://')
@@ -513,13 +536,14 @@ function parse($line, gorhill) {
   }
   }
 
 
   /**
   /**
-   * `_vmind.qqvideo.tc.qq.com^`
-   * `arketing.indianadunes.com^`
-   * `charlestownwyllie.oaklawnnonantum.com^`
-   * `-telemetry.officeapps.live.com^`
-   * `-tracker.biliapi.net`
-   * `_social_tracking.js^`
-   */
+ * `_vmind.qqvideo.tc.qq.com^`
+ * `arketing.indianadunes.com^`
+ * `charlestownwyllie.oaklawnnonantum.com^`
+ * `-telemetry.officeapps.live.com^`
+ * `-tracker.biliapi.net`
+ * `-logging.nextmedia.com`
+ * `_social_tracking.js^`
+ */
   if (firstChar !== '|' && lastChar === '^') {
   if (firstChar !== '|' && lastChar === '^') {
     const _domain = line.slice(0, -1);
     const _domain = line.slice(0, -1);
 
 
@@ -538,35 +562,48 @@ function parse($line, gorhill) {
     return null;
     return null;
   }
   }
 
 
-  /**
-   * `.3.n.2.2.l30.js`
-   * `_prebid.js`
-   * `t.yesware.com`
-   * `ubmcmm.baidustatic.com`
-   * `portal.librus.pl$$advertisement-module`
-   * `@@-ds.metric.gstatic.com^|`
-   * `://gom.ge/cookie.js`
-   * `://accout-update-smba.jp.$document`
-   * `@@://googleadservices.com^|`
-   */
-  const tryNormalizeDomain = normalizeDomain(line);
-  if (tryNormalizeDomain) {
-    if (tryNormalizeDomain === line) {
+  if (lineStartsWithSingleDot) {
+    /**
+     * `.cookielaw.js`
+     * `.content_tracking.js`
+     * `.ads.css`
+     */
+    const _domain = line.slice(1);
+
+    const suffix = gorhill.getPublicSuffix(_domain);
+    if (!suffix || !gorhill.suffixInPSL(suffix)) {
+      // This excludes domain-like resources such as `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
+      return null;
+    }
+
+    const tryNormalizeDomain = normalizeDomain(_domain);
+    if (tryNormalizeDomain === _domain) {
       // the entire rule is domain
       // the entire rule is domain
       return [line, 2];
       return [line, 2];
     }
     }
-    if (lineStartsWithSingleDot && tryNormalizeDomain === line.slice(1)) {
-      // dot prefixed line has stripped
+  } else {
+    /**
+     * `_prebid.js`
+     * `t.yesware.com`
+     * `ubmcmm.baidustatic.com`
+     * `://www.smfg-card.$document`
+     * `portal.librus.pl$$advertisement-module`
+     * `@@-ds.metric.gstatic.com^|`
+     * `://gom.ge/cookie.js`
+     * `://accout-update-smba.jp.$document`
+     * `_200x250.png`
+     * `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
+     */
+    const tryNormalizeDomain = normalizeDomain(line);
+    if (tryNormalizeDomain === line) {
+      // the entire rule is domain
       return [line, 2];
       return [line, 2];
     }
     }
   }
   }
 
 
-  if (!line.endsWith('.js') && !line.endsWith('.css')) {
-    console.warn('      * [parse-filter E0010] can not parse:', line);
-  }
+  console.warn('      * [parse-filter E0010] can not parse:', line);
 
 
   return null;
   return null;
-  /* eslint-enable no-nested-ternary */
 }
 }
 
 
 module.exports.processDomainLists = processDomainLists;
 module.exports.processDomainLists = processDomainLists;

+ 24 - 52
Build/lib/reject-data-source.js

@@ -27,7 +27,9 @@ const ADGUARD_FILTERS = /** @type {const} */([
     [
     [
       'https://secure.fanboy.co.nz/easyprivacy.txt',
       'https://secure.fanboy.co.nz/easyprivacy.txt',
       'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easyprivacy.txt',
       'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easyprivacy.txt',
-      'https://easylist-downloads.adblockplus.org/easyprivacy.txt'
+      'https://easylist-downloads.adblockplus.org/easyprivacy.txt',
+      'https://ublockorigin.github.io/uAssets/thirdparties/easyprivacy.txt',
+      'https://ublockorigin.pages.dev/thirdparties/easyprivacy.txt'
     ]
     ]
   ],
   ],
   // AdGuard DNS Filter
   // AdGuard DNS Filter
@@ -45,70 +47,42 @@ const ADGUARD_FILTERS = /** @type {const} */([
   'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers.txt',
   'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers.txt',
   // uBlock Origin Filter List
   // uBlock Origin Filter List
   [
   [
-    'https://ublockorigin.github.io/uAssets/filters/filters.txt',
+    'https://ublockorigin.github.io/uAssets/filters/filters.min.txt',
     [
     [
-      'https://ublockorigin.github.io/uAssetsCDN/filters/filters.txt',
-      'https://ublockorigin.pages.dev/filters/filters.txt'
-    ]
-  ],
-  [
-    'https://ublockorigin.github.io/uAssets/filters/filters-2020.txt',
-    [
-      'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2020.txt',
-      'https://ublockorigin.pages.dev/filters/filters-2020.txt'
-    ]
-  ],
-  [
-    'https://ublockorigin.github.io/uAssets/filters/filters-2021.txt',
-    [
-      'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2021.txt',
-      'https://ublockorigin.pages.dev/filters/filters-2021.txt'
-    ]
-  ],
-  [
-    'https://ublockorigin.github.io/uAssets/filters/filters-2022.txt',
-    [
-      'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2022.txt',
-      'https://ublockorigin.pages.dev/filters/filters-2022.txt'
-    ]
-  ],
-  [
-    'https://ublockorigin.github.io/uAssets/filters/filters-2023.txt',
-    [
-      'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2023.txt',
-      'https://ublockorigin.pages.dev/filters/filters-2023.txt'
+      'https://ublockorigin.github.io/uAssetsCDN/filters/filters.min.txt',
+      'https://ublockorigin.pages.dev/filters/filters.min.txt'
     ]
     ]
   ],
   ],
   // uBlock Origin Badware Risk List
   // uBlock Origin Badware Risk List
   [
   [
-    'https://ublockorigin.github.io/uAssets/filters/badware.txt',
+    'https://ublockorigin.github.io/uAssets/filters/badware.min.txt',
     [
     [
-      'https://ublockorigin.github.io/uAssetsCDN/filters/badware.txt',
-      'https://ublockorigin.pages.dev/filters/badware.txt'
+      'https://ublockorigin.github.io/uAssetsCDN/filters/badware.min.txt',
+      'https://ublockorigin.pages.dev/filters/badware.min.txt'
     ]
     ]
   ],
   ],
   // uBlock Origin Privacy List
   // uBlock Origin Privacy List
   [
   [
-    'https://ublockorigin.github.io/uAssets/filters/privacy.txt',
-    [
-      'https://ublockorigin.github.io/uAssetsCDN/filters/privacy.txt',
-      'https://ublockorigin.pages.dev/filters/privacy.txt'
-    ]
-  ],
-  // uBlock Origin Resource Abuse
-  [
-    'https://ublockorigin.github.io/uAssets/filters/resource-abuse.txt',
+    'https://ublockorigin.github.io/uAssets/filters/privacy.min.txt',
     [
     [
-      'https://ublockorigin.github.io/uAssetsCDN/filters/resource-abuse.txt',
-      'https://ublockorigin.pages.dev/filters/resource-abuse.txt'
+      'https://ublockorigin.github.io/uAssetsCDN/filters/privacy.min.txt',
+      'https://ublockorigin.pages.dev/filters/privacy.min.txt'
     ]
     ]
   ],
   ],
+  // uBlock Origin Resource Abuse: merged in uBlock Origin Privacy List
+  // [
+  //   'https://ublockorigin.github.io/uAssets/filters/resource-abuse.txt',
+  //   [
+  //     'https://ublockorigin.github.io/uAssetsCDN/filters/resource-abuse.txt',
+  //     'https://ublockorigin.pages.dev/filters/resource-abuse.txt'
+  //   ]
+  // ],
   // uBlock Origin Unbreak
   // uBlock Origin Unbreak
   [
   [
-    'https://ublockorigin.github.io/uAssets/filters/unbreak.txt',
+    'https://ublockorigin.github.io/uAssets/filters/unbreak.min.txt',
     [
     [
-      'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.txt',
-      'https://ublockorigin.pages.dev/filters/unbreak.txt'
+      'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.min.txt',
+      'https://ublockorigin.pages.dev/filters/unbreak.min.txt'
     ]
     ]
   ],
   ],
   // AdGuard Base Filter
   // AdGuard Base Filter
@@ -171,9 +145,7 @@ const ADGUARD_FILTERS = /** @type {const} */([
   // BarbBlock
   // BarbBlock
   'https://paulgb.github.io/BarbBlock/blacklists/ublock-origin.txt',
   'https://paulgb.github.io/BarbBlock/blacklists/ublock-origin.txt',
   // Brave First Party & First Party CNAME
   // Brave First Party & First Party CNAME
-  'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt',
-  'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty-cname.txt',
-  'https://raw.githubusercontent.com/brave/adblock-lists/master/coin-miners.txt'
+  'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt'
 ]);
 ]);
 
 
 const PREDEFINED_WHITELIST = [
 const PREDEFINED_WHITELIST = [

+ 2 - 0
Source/domainset/cdn.conf

@@ -340,6 +340,8 @@ th.bing.com
 images.ecomm.microsoft.com
 images.ecomm.microsoft.com
 .yammerusercontent.com
 .yammerusercontent.com
 .assets-yammer.com
 .assets-yammer.com
+.edgestorewebcdn.azureedge.net
+img-s-msn-com.akamaized.net
 # Microsoft Cookie Consent
 # Microsoft Cookie Consent
 wcpstatic.microsoft.com
 wcpstatic.microsoft.com
 # Xbox
 # Xbox

+ 7 - 1
Source/domainset/reject_sukka.conf

@@ -519,6 +519,11 @@ www.kuguopush.com
 .adcast.deviantart.com
 .adcast.deviantart.com
 .inside.bitcomet.com
 .inside.bitcomet.com
 .cdnads.com
 .cdnads.com
+.stats.esomniture.com
+.adalyser.com
+.tradedoubler.com
+.xiti.com
+.cjt1.net
 
 
 .youxiaoad.com
 .youxiaoad.com
 .iteye.com
 .iteye.com
@@ -1749,7 +1754,6 @@ ntp.msn.cn
 assets.msn.cn
 assets.msn.cn
 api.msn.com
 api.msn.com
 browser.events.data.msn.com
 browser.events.data.msn.com
-img-s-msn-com.akamaized.net
 
 
 # >> OPPO
 # >> OPPO
 adsfs.oppomobile.com
 adsfs.oppomobile.com
@@ -1820,6 +1824,7 @@ adserve2.tom.com
 .discovery.tom.com
 .discovery.tom.com
 
 
 # brightdata (luminati) SDK
 # brightdata (luminati) SDK
+.l-err.biz
 .lum-sdk.io
 .lum-sdk.io
 .luminatinet.com
 .luminatinet.com
 .luminati.io
 .luminati.io
@@ -1829,3 +1834,4 @@ adserve2.tom.com
 .hola.org
 .hola.org
 .h-vpn.org
 .h-vpn.org
 .holashop.org
 .holashop.org
+.svd-cdn.com

+ 3 - 1
Source/non_ip/reject.conf

@@ -83,7 +83,7 @@ DOMAIN-SUFFIX,tw1.ru
 # >> General
 # >> General
 
 
 DOMAIN-KEYWORD,track.tiara
 DOMAIN-KEYWORD,track.tiara
-DOMAIN-KEYWORD,adservice
+# DOMAIN-KEYWORD,adservice # conflicts with @@://www.googleadservices.com^|
 DOMAIN-KEYWORD,umeng
 DOMAIN-KEYWORD,umeng
 DOMAIN-KEYWORD,adsby
 DOMAIN-KEYWORD,adsby
 DOMAIN-KEYWORD,adsdk
 DOMAIN-KEYWORD,adsdk
@@ -108,6 +108,8 @@ DOMAIN-KEYWORD,_vmind.qqvideo.tc.qq.com
 DOMAIN-KEYWORD,-logging.nextmedia.com
 DOMAIN-KEYWORD,-logging.nextmedia.com
 DOMAIN-KEYWORD,-spiky.clevertap-prod.com
 DOMAIN-KEYWORD,-spiky.clevertap-prod.com
 DOMAIN-KEYWORD,.engage.3m.
 DOMAIN-KEYWORD,.engage.3m.
+# -telemetry.officeapps.live.com.mcas.ms
+# -telemetry.officeapps.live.com
 DOMAIN-KEYWORD,telemetry.officeapps.live.com
 DOMAIN-KEYWORD,telemetry.officeapps.live.com
 DOMAIN-KEYWORD,-launches.appsflyersdk.com
 DOMAIN-KEYWORD,-launches.appsflyersdk.com