Browse Source

Refactor: separate modules

SukkaW 1 year ago
parent
commit
29410eb1c3

+ 42 - 55
Build/build-reject-domainset.ts

@@ -2,7 +2,9 @@
 import path from 'node:path';
 import process from 'node:process';
 
-import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter';
+import { processHosts } from './lib/parse-filter/hosts';
+import { processDomainLists } from './lib/parse-filter/domainlists';
+import { processFilterRules } from './lib/parse-filter/filters';
 
 import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_WHITELIST } from './constants/reject-data-source';
 import { compareAndWriteFile } from './lib/create-file';
@@ -18,6 +20,7 @@ import { addArrayElementsToSet } from 'foxts/add-array-elements-to-set';
 import { appendArrayInPlace } from './lib/append-array-in-place';
 import { OUTPUT_INTERNAL_DIR, SOURCE_DIR } from './constants/dir';
 import { DomainsetOutput } from './lib/create-file';
+import { foundDebugDomain } from './lib/parse-filter/shared';
 
 const readLocalRejectDomainsetPromise = readFileIntoProcessedArray(path.join(SOURCE_DIR, 'domainset/reject_sukka.conf'));
 const readLocalRejectExtraDomainsetPromise = readFileIntoProcessedArray(path.join(SOURCE_DIR, 'domainset/reject_sukka_extra.conf'));
@@ -63,65 +66,49 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as
   const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST);
 
   // Parse from AdGuard Filters
-  const shouldStop = await span
+  await span
     .traceChild('download and process hosts / adblock filter rules')
-    .traceAsyncFn(async (childSpan) => {
-      // eslint-disable-next-line sukka/no-single-return -- not single return
-      let shouldStop = false;
-      await Promise.all([
-        // Parse from remote hosts & domain lists
-        HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToRejectOutput)),
-        HOSTS_EXTRA.map(entry => processHosts(childSpan, ...entry).then(appendArrayToRejectExtraOutput)),
-
-        DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToRejectOutput)),
-        DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToRejectExtraOutput)),
-
-        ADGUARD_FILTERS.map(
-          entry => processFilterRules(childSpan, ...entry)
-            .then(({ white, black, foundDebugDomain }) => {
-              if (foundDebugDomain) {
-                // eslint-disable-next-line sukka/no-single-return -- not single return
-                shouldStop = true;
-                // we should not break here, as we want to see full matches from all data source
-              }
-              addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
-              appendArrayToRejectOutput(black);
-            })
-        ),
-        ADGUARD_FILTERS_EXTRA.map(
-          entry => processFilterRules(childSpan, ...entry)
-            .then(({ white, black, foundDebugDomain }) => {
-              if (foundDebugDomain) {
-                // eslint-disable-next-line sukka/no-single-return -- not single return
-                shouldStop = true;
-                // we should not break here, as we want to see full matches from all data source
-              }
-              addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
-              appendArrayToRejectExtraOutput(black);
-            })
-        ),
-        ADGUARD_FILTERS_WHITELIST.map(entry => processFilterRules(childSpan, ...entry).then(({ white, black }) => {
-          addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
-          addArrayElementsToSet(filterRuleWhitelistDomainSets, black);
-        })),
-        getPhishingDomains(childSpan).then(appendArrayToRejectExtraOutput),
-        readLocalRejectDomainsetPromise.then(appendArrayToRejectOutput),
-        readLocalRejectDomainsetPromise.then(appendArrayToRejectExtraOutput),
-        readLocalRejectExtraDomainsetPromise.then(appendArrayToRejectExtraOutput),
-        // Dedupe domainSets
-        // span.traceChildAsync('collect black keywords/suffixes', async () =>
-        /**
+    .traceAsyncFn((childSpan) => Promise.all([
+      // Parse from remote hosts & domain lists
+      HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToRejectOutput)),
+      HOSTS_EXTRA.map(entry => processHosts(childSpan, ...entry).then(appendArrayToRejectExtraOutput)),
+
+      DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToRejectOutput)),
+      DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToRejectExtraOutput)),
+
+      ADGUARD_FILTERS.map(
+        entry => processFilterRules(childSpan, ...entry)
+          .then(({ white, black }) => {
+            addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
+            appendArrayToRejectOutput(black);
+          })
+      ),
+      ADGUARD_FILTERS_EXTRA.map(
+        entry => processFilterRules(childSpan, ...entry)
+          .then(({ white, black }) => {
+            addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
+            appendArrayToRejectExtraOutput(black);
+          })
+      ),
+      ADGUARD_FILTERS_WHITELIST.map(entry => processFilterRules(childSpan, ...entry).then(({ white, black }) => {
+        addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
+        addArrayElementsToSet(filterRuleWhitelistDomainSets, black);
+      })),
+      getPhishingDomains(childSpan).then(appendArrayToRejectExtraOutput),
+      readLocalRejectDomainsetPromise.then(appendArrayToRejectOutput),
+      readLocalRejectDomainsetPromise.then(appendArrayToRejectExtraOutput),
+      readLocalRejectExtraDomainsetPromise.then(appendArrayToRejectExtraOutput),
+      // Dedupe domainSets
+      // span.traceChildAsync('collect black keywords/suffixes', async () =>
+      /**
          * Collect DOMAIN, DOMAIN-SUFFIX, and DOMAIN-KEYWORD from non_ip/reject.conf for deduplication
          * DOMAIN-WILDCARD is not really useful for deduplication, it is only included in AdGuardHome output
         */
-        rejectOutput.addFromRuleset(readLocalRejectRulesetPromise),
-        rejectExtraOutput.addFromRuleset(readLocalRejectRulesetPromise)
-      ].flat());
-      // eslint-disable-next-line sukka/no-single-return -- not single return
-      return shouldStop;
-    });
+      rejectOutput.addFromRuleset(readLocalRejectRulesetPromise),
+      rejectExtraOutput.addFromRuleset(readLocalRejectRulesetPromise)
+    ].flat()));
 
-  if (shouldStop) {
+  if (foundDebugDomain.value) {
     process.exit(1);
   }
 

+ 3 - 1
Build/lib/get-phishing-domains.ts

@@ -1,4 +1,6 @@
-import { processDomainLists, processHosts } from './parse-filter';
+import { processHosts } from './parse-filter/hosts';
+import { processDomainLists } from './parse-filter/domainlists';
+
 import * as tldts from 'tldts-experimental';
 
 import { dummySpan, printTraceResult } from '../trace';

+ 3 - 4
Build/lib/parse-filter.test.ts

@@ -1,7 +1,7 @@
 import { describe, it } from 'mocha';
 
-import { parse, processFilterRules } from './parse-filter';
-import type { ParseType } from './parse-filter';
+import { parse, processFilterRules } from './parse-filter/filters';
+import type { ParseType } from './parse-filter/filters';
 import { createCacheKey } from './cache-filesystem';
 import { createSpan } from '../trace';
 
@@ -20,8 +20,7 @@ describe.skip('processFilterRules', () => {
     console.log(processFilterRules(
       createSpan('noop'),
       cacheKey('https://filters.adtidy.org/extension/ublock/filters/18_optimized.txt'),
-      [],
-      7_200_000
+      []
     ));
   });
 });

+ 51 - 0
Build/lib/parse-filter/domainlists.ts

@@ -0,0 +1,51 @@
+import picocolors from 'picocolors';
+import { normalizeDomain } from '../normalize-domain';
+import { processLine } from '../process-line';
+import { onBlackFound } from './shared';
+import { fetchAssetsWithout304 } from '../fetch-assets';
+import type { Span } from '../../trace';
+
+function domainListLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) { // Handle one raw domain-list line: validate it and push the accepted entry onto `set`; `meta` is only used to label log output.
+  let line = processLine(l);
+  if (!line) return; // processLine produced nothing usable for this line
+  line = line.toLowerCase();
+
+  const domain = normalizeDomain(line);
+  if (!domain) return; // normalizeDomain rejected the value
+  if (domain !== line) { // normalization altered the text; such lines are logged and skipped instead of being added
+    console.log(
+      picocolors.red('[process domain list]'),
+      picocolors.gray(`line: ${line}`),
+      picocolors.gray(`domain: ${domain}`),
+      picocolors.gray(meta)
+    );
+
+    return;
+  }
+
+  onBlackFound(domain, meta); // debug hook imported from ./shared
+
+  set.push(includeAllSubDomain ? `.${line}` : line); // the entry gets a leading `.` when includeAllSubDomain is true
+}
+
+export function processDomainLists( // Fetch a plain domain list and collect its accepted entries; the traced callback resolves to that string[].
+  span: Span,
+  domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false // url and mirrors are forwarded to fetchAssetsWithout304; includeAllSubDomain is forwarded to the per-line callback
+) {
+  return span.traceChildAsync(`process domainlist: ${domainListsUrl}`, async (span) => {
+    const text = await span.traceChildAsync('download', () => fetchAssetsWithout304( // child span is named 'download' (as processHosts does) rather than repeating the parent span's name
+      domainListsUrl,
+      mirrors
+    ));
+    const domainSets: string[] = [];
+    const filterRules = text.split('\n'); // one candidate entry per line
+
+    span.traceChildSync('parse domain list', () => {
+      for (let i = 0, len = filterRules.length; i < len; i++) {
+        domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl); // the list URL doubles as the log label
+      }
+    });
+
+    return domainSets;
+  });
+}

+ 9 - 119
Build/lib/parse-filter.ts → Build/lib/parse-filter/filters.ts

@@ -1,121 +1,12 @@
-import { NetworkFilter } from '@ghostery/adblocker';
-import { processLine } from './process-line';
-import tldts from 'tldts-experimental';
-
 import picocolors from 'picocolors';
-import { normalizeDomain } from './normalize-domain';
-import type { Span } from '../trace';
+import type { Span } from '../../trace';
+import { fetchAssetsWithout304 } from '../fetch-assets';
+import { onBlackFound, onWhiteFound } from './shared';
 import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie';
-import { looseTldtsOpt } from '../constants/loose-tldts-opt';
-import { DEBUG_DOMAIN_TO_FIND } from '../constants/reject-data-source';
-import { noop } from 'foxts/noop';
-import { fetchAssetsWithout304 } from './fetch-assets';
-
-let foundDebugDomain = false;
-
-const onBlackFound = DEBUG_DOMAIN_TO_FIND
-  ? (line: string, meta: string) => {
-    if (line.includes(DEBUG_DOMAIN_TO_FIND!)) {
-      console.warn(picocolors.red(meta), '(black)', line.replaceAll(DEBUG_DOMAIN_TO_FIND!, picocolors.bold(DEBUG_DOMAIN_TO_FIND)));
-      foundDebugDomain = true;
-    }
-  }
-  : noop;
-
-const onWhiteFound = DEBUG_DOMAIN_TO_FIND
-  ? (line: string, meta: string) => {
-    if (line.includes(DEBUG_DOMAIN_TO_FIND!)) {
-      console.warn(picocolors.red(meta), '(white)', line.replaceAll(DEBUG_DOMAIN_TO_FIND!, picocolors.bold(DEBUG_DOMAIN_TO_FIND)));
-      foundDebugDomain = true;
-    }
-  }
-  : noop;
-
-function domainListLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) {
-  let line = processLine(l);
-  if (!line) return;
-  line = line.toLowerCase();
-
-  const domain = normalizeDomain(line);
-  if (!domain) return;
-  if (domain !== line) {
-    console.log(
-      picocolors.red('[process domain list]'),
-      picocolors.gray(`line: ${line}`),
-      picocolors.gray(`domain: ${domain}`),
-      picocolors.gray(meta)
-    );
-
-    return;
-  }
-
-  onBlackFound(domain, meta);
-
-  set.push(includeAllSubDomain ? `.${line}` : line);
-}
-
-export function processDomainLists(
-  span: Span,
-  domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false
-) {
-  return span.traceChildAsync(`process domainlist: ${domainListsUrl}`, async (span) => {
-    const text = await span.traceChildAsync(`process domainlist: ${domainListsUrl}`, () => fetchAssetsWithout304(
-      domainListsUrl,
-      mirrors
-    ));
-    const domainSets: string[] = [];
-    const filterRules = text.split('\n');
-
-    span.traceChildSync('parse domain list', () => {
-      for (let i = 0, len = filterRules.length; i < len; i++) {
-        domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl);
-      }
-    });
-
-    return domainSets;
-  });
-}
-
-function hostsLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) {
-  const line = processLine(l);
-  if (!line) {
-    return;
-  }
-
-  const _domain = line.split(/\s/)[1]?.trim();
-  if (!_domain) {
-    return;
-  }
-  const domain = normalizeDomain(_domain);
-  if (!domain) {
-    return;
-  }
-
-  onBlackFound(domain, meta);
-
-  set.push(includeAllSubDomain ? `.${domain}` : domain);
-}
-
-export function processHosts(
-  span: Span,
-  hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false
-) {
-  return span.traceChildAsync(`process hosts: ${hostsUrl}`, async (span) => {
-    const text = await span.traceChild('download').traceAsyncFn(() => fetchAssetsWithout304(hostsUrl, mirrors));
-
-    const domainSets: string[] = [];
-
-    const filterRules = text.split('\n');
-
-    span.traceChild('parse hosts').traceSyncFn(() => {
-      for (let i = 0, len = filterRules.length; i < len; i++) {
-        hostsLineCb(filterRules[i], domainSets, includeAllSubDomain, hostsUrl);
-      }
-    });
-
-    return domainSets;
-  });
-}
+import { normalizeDomain } from '../normalize-domain';
+import { looseTldtsOpt } from '../../constants/loose-tldts-opt';
+import tldts from 'tldts-experimental';
+import { NetworkFilter } from '@ghostery/adblocker';
 
 const enum ParseType {
   WhiteIncludeSubdomain = 0,
@@ -134,7 +25,7 @@ export async function processFilterRules(
   filterRulesUrl: string,
   fallbackUrls?: string[] | null,
   allowThirdParty = false
-): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
+): Promise<{ white: string[], black: string[] }> {
   const [white, black, warningMessages] = await parentSpan.traceChild(`process filter rules: ${filterRulesUrl}`).traceAsyncFn(async (span) => {
     const text = await fetchAssetsWithout304(filterRulesUrl, fallbackUrls);
 
@@ -226,8 +117,7 @@ export async function processFilterRules(
 
   return {
     white,
-    black,
-    foundDebugDomain
+    black
   };
 }
 

+ 46 - 0
Build/lib/parse-filter/hosts.ts

@@ -0,0 +1,46 @@
+import type { Span } from '../../trace';
+import { fetchAssetsWithout304 } from '../fetch-assets';
+import { normalizeDomain } from '../normalize-domain';
+import { processLine } from '../process-line';
+import { onBlackFound } from './shared';
+
+function hostsLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) { // Handle one raw hosts-file line: take its second whitespace-separated field as the host and push the normalized entry onto `set`; `meta` labels debug output.
+  const line = processLine(l);
+  if (!line) {
+    return; // processLine produced nothing usable for this line
+  }
+
+  const _domain = line.split(/\s+/)[1]?.trim(); // split on whitespace runs: a single-character /\s/ split yields an empty field 1 when the columns are separated by several spaces or tabs, silently dropping the entry
+  if (!_domain) {
+    return; // the line has no second field
+  }
+  const domain = normalizeDomain(_domain);
+  if (!domain) {
+    return; // normalizeDomain rejected the value
+  }
+
+  onBlackFound(domain, meta); // debug hook imported from ./shared
+
+  set.push(includeAllSubDomain ? `.${domain}` : domain); // the entry gets a leading `.` when includeAllSubDomain is true
+}
+
+export function processHosts( // Fetch a hosts file and collect the entries accepted by hostsLineCb; the traced callback resolves to that string[].
+  span: Span,
+  hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false // url and mirrors are forwarded to fetchAssetsWithout304; includeAllSubDomain is forwarded to the per-line callback
+) {
+  return span.traceChildAsync(`process hosts: ${hostsUrl}`, async (span) => {
+    const text = await span.traceChild('download').traceAsyncFn(() => fetchAssetsWithout304(hostsUrl, mirrors)); // the fetch is traced under its own 'download' child span
+
+    const domainSets: string[] = [];
+
+    const filterRules = text.split('\n'); // one candidate entry per line
+
+    span.traceChild('parse hosts').traceSyncFn(() => {
+      for (let i = 0, len = filterRules.length; i < len; i++) {
+        hostsLineCb(filterRules[i], domainSets, includeAllSubDomain, hostsUrl); // the hosts URL doubles as the log label
+      }
+    });
+
+    return domainSets;
+  });
+}

+ 23 - 0
Build/lib/parse-filter/shared.ts

@@ -0,0 +1,23 @@
+import picocolors from 'picocolors';
+import { DEBUG_DOMAIN_TO_FIND } from '../../constants/reject-data-source';
+import { noop } from 'foxts/noop';
+
+export const foundDebugDomain = { value: false }; // Shared flag, held in an object so it is updated through `.value` rather than by reassigning the export; the debug hooks in this module set it to true on a match.
+
+export const onBlackFound = DEBUG_DOMAIN_TO_FIND // Debug hook that reports entries labelled '(black)'; it is `noop` when DEBUG_DOMAIN_TO_FIND is falsy.
+  ? (line: string, meta: string) => {
+    if (line.includes(DEBUG_DOMAIN_TO_FIND!)) { // substring match, not an exact comparison
+      console.warn(picocolors.red(meta), '(black)', line.replaceAll(DEBUG_DOMAIN_TO_FIND!, picocolors.bold(DEBUG_DOMAIN_TO_FIND))); // print the label in red and the entry with every match in bold
+      foundDebugDomain.value = true; // record the hit on the shared flag
+    }
+  }
+  : noop;
+
+export const onWhiteFound = DEBUG_DOMAIN_TO_FIND // Debug hook that reports entries labelled '(white)'; it is `noop` when DEBUG_DOMAIN_TO_FIND is falsy.
+  ? (line: string, meta: string) => {
+    if (line.includes(DEBUG_DOMAIN_TO_FIND!)) { // substring match, not an exact comparison
+      console.warn(picocolors.red(meta), '(white)', line.replaceAll(DEBUG_DOMAIN_TO_FIND!, picocolors.bold(DEBUG_DOMAIN_TO_FIND))); // print the label in red and the entry with every match in bold
+      foundDebugDomain.value = true; // record the hit on the shared flag
+    }
+  }
+  : noop;