Browse Source

Chore: new util run against source file

SukkaW 11 months ago
parent
commit
2d706f4775

+ 61 - 0
Build/lib/run-against-source-file.ts

@@ -0,0 +1,61 @@
+import { never } from 'foxts/guard';
+import { readFileByLine } from './fetch-text-by-line';
+import { processLine } from './process-line';
+
+/**
+ * Stream a source file and report every plain domain entry found in it.
+ *
+ * Every line goes through `processLine`; lines for which it returns a falsy value
+ * are skipped. The remaining lines are read in one of two formats:
+ *
+ * - `'ruleset'`: comma separated rules (`TYPE,value[,...]`). Only `DOMAIN` and
+ *   `DOMAIN-SUFFIX` rules are reported, all other rule types are ignored.
+ * - `'domainset'`: one domain per line, where a leading `.` marks an entry that
+ *   covers all subdomains; the dot is stripped before the callback is invoked.
+ *
+ * @param filePath - file handed to `readFileByLine`
+ * @param callback - invoked as `(domain, includeAllSubDomain)` for each entry.
+ *   Its return value is ignored, so an async callback has to keep track of its
+ *   own promises if the caller needs to wait for them.
+ * @param type - file format. When omitted it is detected once, from the first
+ *   non-skipped line (a comma means `'ruleset'`), and reused for the whole file.
+ */
+export default async function runAgainstSourceFile(
+  filePath: string,
+  callback: (domain: string, includeAllSubDomain: boolean) => void,
+  type?: 'ruleset' | 'domainset'
+) {
+  for await (const line of readFileByLine(filePath)) {
+    const l = processLine(line);
+    if (!l) {
+      continue;
+    }
+    if (type == null) {
+      // Detect lazily, so that skipped lines never take part in the detection.
+      type = l.includes(',') ? 'ruleset' : 'domainset';
+    }
+
+    if (type === 'ruleset') {
+      const [ruleType, domain] = l.split(',', 3);
+      // A malformed rule such as `DOMAIN,` (or a bare `DOMAIN` once the format is
+      // known) carries no value; never hand '' or undefined to the callback.
+      if (!domain) {
+        continue;
+      }
+      if (ruleType === 'DOMAIN') {
+        callback(domain, false);
+      } else if (ruleType === 'DOMAIN-SUFFIX') {
+        callback(domain, true);
+      }
+    // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition -- exhaustive options
+    } else if (type === 'domainset') {
+      if (l[0] === '.') {
+        callback(l.slice(1), true);
+      } else {
+        callback(l, false);
+      }
+    } else {
+      never(type);
+    }
+  }
+}

+ 8 - 19
Build/tools-lum-apex-domains.ts

@@ -1,9 +1,9 @@
-import { fetchRemoteTextByLine, readFileByLine } from './lib/fetch-text-by-line';
+import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
 import tldts from 'tldts';
 import { HostnameSmolTrie } from './lib/trie';
 import path from 'node:path';
 import { SOURCE_DIR } from './constants/dir';
-import { processLine } from './lib/process-line';
+import runAgainstSourceFile from './lib/run-against-source-file';
 
 (async () => {
   const lines1 = await Array.fromAsync(await fetchRemoteTextByLine('https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true));
@@ -31,23 +31,12 @@ import { processLine } from './lib/process-line';
     });
   }
 
-  for await (const line of readFileByLine(path.join(SOURCE_DIR, 'domainset', 'reject.conf'))) {
-    const l = processLine(line);
-    if (l) {
-      trie.whitelist(l);
-    }
-  }
-  for await (const line of readFileByLine(path.join(SOURCE_DIR, 'non_ip', 'reject.conf'))) {
-    const l = processLine(line);
-    if (l) {
-      const [type, domain] = l.split(',', 3);
-      if (type === 'DOMAIN') {
-        trie.whitelist(domain, false);
-      } else if (type === 'DOMAIN-SUFFIX') {
-        trie.whitelist(domain, true);
-      }
-    }
-  }
+  await runAgainstSourceFile(path.join(SOURCE_DIR, 'domainset', 'reject.conf'), (domain, includeAllSubDomain) => {
+    trie.whitelist(domain, includeAllSubDomain);
+  }, 'domainset');
+  await runAgainstSourceFile(path.join(SOURCE_DIR, 'non_ip', 'reject.conf'), (domain, includeAllSubDomain) => {
+    trie.whitelist(domain, includeAllSubDomain);
+  }, 'ruleset');
 
   console.log(trie.dump().map(i => '.' + i).join('\n'));
 })();

+ 7 - 22
Build/tools-migrate-domains.ts

@@ -1,20 +1,23 @@
 import path from 'node:path';
-import { readFileByLine } from './lib/fetch-text-by-line';
 import { processFilterRulesWithPreload } from './lib/parse-filter/filters';
 import { processHosts } from './lib/parse-filter/hosts';
-import { processLine } from './lib/process-line';
 import { HostnameSmolTrie } from './lib/trie';
 import { dummySpan } from './trace';
 import { SOURCE_DIR } from './constants/dir';
 import { PREDEFINED_WHITELIST } from './constants/reject-data-source';
+import runAgainstSourceFile from './lib/run-against-source-file';
 
 (async () => {
   const trie = new HostnameSmolTrie();
 
   await writeHostsToTrie(trie, 'https://cdn.jsdelivr.net/gh/jerryn70/GoodbyeAds@master/Extension/GoodbyeAds-Xiaomi-Extension.txt', true);
 
-  await runWhiteOnSource(path.join(SOURCE_DIR, 'domainset', 'reject.conf'), trie);
-  await runWhiteOnSource(path.join(SOURCE_DIR, 'non_ip', 'reject.conf'), trie);
+  const callback = (domain: string, includeAllSubDomain: boolean) => {
+    trie.whitelist(domain, includeAllSubDomain);
+  };
+
+  await runAgainstSourceFile(path.join(SOURCE_DIR, 'domainset', 'reject.conf'), callback, 'domainset');
+  await runAgainstSourceFile(path.join(SOURCE_DIR, 'non_ip', 'reject.conf'), callback, 'ruleset');
 
   for (let i = 0, len = PREDEFINED_WHITELIST.length; i < len; i++) {
     trie.whitelist(PREDEFINED_WHITELIST[i]);
@@ -25,24 +28,6 @@ import { PREDEFINED_WHITELIST } from './constants/reject-data-source';
   console.log('---------------------------');
 })();
 
-async function runWhiteOnSource(sourceFile: string, trie: HostnameSmolTrie) {
-  for await (const line of readFileByLine(sourceFile)) {
-    const l = processLine(line);
-    if (l) {
-      if (l.includes(',')) {
-        const [type, domain] = l.split(',', 3);
-        if (type === 'DOMAIN') {
-          trie.whitelist(domain, false);
-        } else if (type === 'DOMAIN-SUFFIX') {
-          trie.whitelist(domain, true);
-        }
-      } else {
-        trie.whitelist(l);
-      }
-    }
-  }
-}
-
 async function writeHostsToTrie(trie: HostnameSmolTrie, hostsUrl: string, includeAllSubDomain = false) {
   const hosts = await processHosts(dummySpan, hostsUrl, [], includeAllSubDomain);
 

+ 25 - 41
Build/validate-domain-alive.ts

@@ -1,11 +1,9 @@
-import { readFileByLine } from './lib/fetch-text-by-line';
-import { processLine } from './lib/process-line';
-
 import { SOURCE_DIR } from './constants/dir';
 import path from 'node:path';
 import { newQueue } from '@henrygd/queue';
 import { isDomainAlive, keyedAsyncMutexWithQueue } from './lib/is-domain-alive';
 import { fdir as Fdir } from 'fdir';
+import runAgainstSourceFile from './lib/run-against-source-file';
 
 const queue = newQueue(24);
 
@@ -19,10 +17,20 @@ function onDomain(args: [string, boolean]) {
 (async () => {
   const domainSets = await new Fdir()
     .withFullPaths()
+    .filter((filePath, isDirectory) => {
+      if (isDirectory) return false;
+      const extname = path.extname(filePath);
+      return extname === '.txt' || extname === '.conf';
+    })
     .crawl(SOURCE_DIR + path.sep + 'domainset')
     .withPromise();
   const domainRules = await new Fdir()
     .withFullPaths()
+    .filter((filePath, isDirectory) => {
+      if (isDirectory) return false;
+      const extname = path.extname(filePath);
+      return extname === '.txt' || extname === '.conf';
+    })
     .crawl(SOURCE_DIR + path.sep + 'non_ip')
     .withPromise();
 
@@ -37,53 +45,29 @@ function onDomain(args: [string, boolean]) {
 })();
 
 export async function runAgainstRuleset(filepath: string) {
-  const extname = path.extname(filepath);
-  if (extname !== '.conf') {
-    console.log('[skip]', filepath);
-    return;
-  }
-
   const promises: Array<Promise<void>> = [];
-
-  for await (const l of readFileByLine(filepath)) {
-    const line = processLine(l);
-    if (!line) continue;
-    const [type, domain] = line.split(',');
-    switch (type) {
-      case 'DOMAIN-SUFFIX':
-      case 'DOMAIN': {
-        promises.push(
-          queue.add(() => keyedAsyncMutexWithQueue(domain, () => isDomainAlive(domain, type === 'DOMAIN-SUFFIX')))
-            .then(onDomain)
-        );
-        break;
-      }
-      // no default
-    }
-  }
+  await runAgainstSourceFile(filepath, (domain: string, includeAllSubdomain: boolean) => {
+    // The helper ignores our return value: record the queued check so Promise.all below waits for it.
+    promises.push(queue.add(() => keyedAsyncMutexWithQueue(
+      domain,
+      () => isDomainAlive(domain, includeAllSubdomain)
+    ).then(onDomain)));
+  });
 
   await Promise.all(promises);
   console.log('[done]', filepath);
 }
 
 export async function runAgainstDomainset(filepath: string) {
-  const extname = path.extname(filepath);
-  if (extname !== '.conf') {
-    console.log('[skip]', filepath);
-    return;
-  }
-
   const promises: Array<Promise<void>> = [];
 
-  for await (const l of readFileByLine(filepath)) {
-    const line = processLine(l);
-    if (!line) continue;
-    promises.push(
-      queue.add(() => keyedAsyncMutexWithQueue(line, () => isDomainAlive(line, line[0] === '.')))
-        .then(onDomain)
-    );
-  }
-
+  await runAgainstSourceFile(filepath, (domain: string, includeAllSubdomain: boolean) => {
+    // The helper ignores our return value: record the queued check so Promise.all below waits for it.
+    promises.push(queue.add(() => keyedAsyncMutexWithQueue(
+      domain,
+      () => isDomainAlive(domain, includeAllSubdomain)
+    ).then(onDomain)));
+  });
   await Promise.all(promises);
   console.log('[done]', filepath);
 }

+ 14 - 18
Build/validate-domestic.ts

@@ -1,11 +1,10 @@
-import { readFileByLine } from './lib/fetch-text-by-line';
 import { parse } from 'csv-parse/sync';
 import { HostnameSmolTrie } from './lib/trie';
 import path from 'node:path';
-import { processLine } from './lib/process-line';
 import { SOURCE_DIR } from './constants/dir';
 import { parseFelixDnsmasqFromResp } from './lib/parse-dnsmasq';
 import { $$fetch } from './lib/fetch-retry';
+import runAgainstSourceFile from './lib/run-against-source-file';
 
 export async function parseDomesticList() {
   const trie = new HostnameSmolTrie(await parseFelixDnsmasqFromResp(await $$fetch('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')));
@@ -36,27 +35,24 @@ export async function parseDomesticList() {
 
   const notIncludedDomestic = new Set<string>(top5000);
 
-  const runAgainstRuleset = async (ruleset: string) => {
-    for await (const l of readFileByLine(ruleset)) {
-      const line = processLine(l);
-      if (!line) continue;
-      const [type, domain] = line.split(',');
-      if (type === 'DOMAIN-SUFFIX') {
+  // await Promise.all([
+  await runAgainstSourceFile(
+    path.resolve(SOURCE_DIR, 'non_ip/domestic.conf'),
+    (domain, includeAllSubdomain) => {
+      if (includeAllSubdomain) {
         if (top5000.has(domain)) {
           notIncludedDomestic.delete(domain);
         }
-      } else if (type === 'DOMAIN-KEYWORD') {
-        for (const d of top5000) {
-          if (d.includes(domain)) {
-            notIncludedDomestic.delete(d);
-          }
-        }
+      } else {
+        // noop, DOMAIN-KEYWORD handling
+        // for (const d of top5000) {
+        //   if (d.includes(domain)) {
+        //     notIncludedDomestic.delete(d);
+        //   }
+        // }
       }
     }
-  };
-
-  // await Promise.all([
-  await runAgainstRuleset(path.resolve(SOURCE_DIR, 'non_ip/domestic.conf'));
+  );
   // ]);
 
   console.log(notIncludedDomestic.size, notIncludedDomestic);

+ 13 - 38
Build/validate-gfwlist.ts

@@ -3,11 +3,12 @@ import { fastNormalizeDomain } from './lib/normalize-domain';
 import { HostnameSmolTrie } from './lib/trie';
 // import { Readable } from 'stream';
 import { parse } from 'csv-parse/sync';
-import { fetchRemoteTextByLine, readFileByLine } from './lib/fetch-text-by-line';
+import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
 import path from 'node:path';
 import { OUTPUT_SURGE_DIR } from './constants/dir';
 import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie';
 import { $$fetch } from './lib/fetch-retry';
+import runAgainstSourceFile from './lib/run-against-source-file';
 
 export async function parseGfwList() {
   const whiteSet = new Set<string>();
@@ -77,46 +78,20 @@ export async function parseGfwList() {
 
   const keywordSet = new Set<string>();
 
-  const runAgainstRuleset = async (ruleset: string) => {
-    for await (const l of readFileByLine(ruleset)) {
-      const line = processLine(l);
-      if (!line) continue;
-      const [type, domain] = line.split(',');
-      switch (type) {
-        case 'DOMAIN-SUFFIX': {
-          trie.whitelist('.' + domain);
-          break;
-        }
-        case 'DOMAIN': {
-          trie.whitelist(domain);
-          break;
-        }
-        case 'DOMAIN-KEYWORD': {
-          keywordSet.add(domain);
-          break;
-        }
-        // no default
-      }
-    }
+  const callback = (domain: string, includeAllSubdomain: boolean) => {
+    trie.whitelist(domain, includeAllSubdomain);
   };
 
-  const runAgainstDomainset = async (ruleset: string) => {
-    for await (const l of readFileByLine(ruleset)) {
-      const line = processLine(l);
-      if (!line) continue;
-      trie.whitelist(line);
-    }
-  };
   await Promise.all([
-    runAgainstRuleset(path.join(OUTPUT_SURGE_DIR, 'non_ip/global.conf')),
-    runAgainstRuleset(path.join(OUTPUT_SURGE_DIR, 'non_ip/reject.conf')),
-    runAgainstRuleset(path.join(OUTPUT_SURGE_DIR, 'non_ip/telegram.conf')),
-    runAgainstRuleset(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/stream.conf')),
-    runAgainstRuleset(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/ai.conf')),
-    runAgainstRuleset(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/microsoft.conf')),
-    runAgainstDomainset(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject.conf')),
-    runAgainstDomainset(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject_extra.conf')),
-    runAgainstDomainset(path.resolve(OUTPUT_SURGE_DIR, 'domainset/cdn.conf'))
+    runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip/global.conf'), callback, 'ruleset'),
+    runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip/reject.conf'), callback, 'ruleset'),
+    runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip/telegram.conf'), callback, 'ruleset'),
+    runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/stream.conf'), callback, 'ruleset'),
+    runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/ai.conf'), callback, 'ruleset'),
+    runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/microsoft.conf'), callback, 'ruleset'),
+    runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject.conf'), callback, 'domainset'),
+    runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject_extra.conf'), callback, 'domainset'),
+    runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/cdn.conf'), callback, 'domainset')
   ]);
 
   whiteSet.forEach(domain => trie.whitelist(domain));

+ 8 - 22
Build/validate-global-tld.ts

@@ -1,42 +1,28 @@
 import path from 'node:path';
-import { readFileByLine } from './lib/fetch-text-by-line';
 import { HostnameSmolTrie } from './lib/trie';
-import { OUTPUT_SURGE_DIR, SOURCE_DIR } from './constants/dir';
+import { OUTPUT_SURGE_DIR } from './constants/dir';
 import { ICP_TLD } from './constants/domains';
 import tldts from 'tldts-experimental';
 import { looseTldtsOpt } from './constants/loose-tldts-opt';
+import runAgainstSourceFile from './lib/run-against-source-file';
 
 (async () => {
   const trie = new HostnameSmolTrie();
   const extraWhiteTLDs = new Set<string>();
 
-  for await (const line of readFileByLine(path.join(OUTPUT_SURGE_DIR, 'non_ip', 'domestic.conf'))) {
-    const [type, domain] = line.split(',');
-    if (type !== 'DOMAIN' && type !== 'DOMAIN-SUFFIX') {
-      continue;
-    }
+  await runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip', 'domestic.conf'), (domain) => {
     if (domain === 'this_ruleset_is_made_by_sukkaw.ruleset.skk.moe') {
-      continue;
+      return;
     }
     const tld = tldts.getPublicSuffix(domain, looseTldtsOpt);
     if (tld) {
       extraWhiteTLDs.add(tld);
     }
-  }
+  }, 'ruleset');
 
-  for await (const line of readFileByLine(path.join(SOURCE_DIR, 'non_ip', 'global.conf'))) {
-    const [type, domain] = line.split(',');
-    switch (type) {
-      case 'DOMAIN':
-        trie.add(domain);
-        break;
-      case 'DOMAIN-SUFFIX':
-        trie.add(domain, true);
-        break;
-      default:
-        break;
-    }
-  }
+  await runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip', 'global.conf'), (domain, includeAllSubDomain) => {
+    trie.add(domain, includeAllSubDomain);
+  }, 'ruleset');
 
   ICP_TLD.forEach(tld => trie.whitelist(tld, true));
   extraWhiteTLDs.forEach(tld => trie.whitelist(tld, true));

+ 16 - 17
Build/validate-reject-stats.ts

@@ -1,9 +1,8 @@
 import path from 'node:path';
-import { readFileByLine } from './lib/fetch-text-by-line';
 import { OUTPUT_SURGE_DIR } from './constants/dir';
-import { processLine } from './lib/process-line';
 import tldts from 'tldts';
 import { loosTldOptWithPrivateDomains } from './constants/loose-tldts-opt';
+import runAgainstSourceFile from './lib/run-against-source-file';
 
 (async () => {
   const rejectDomainCountMap = await runAgainstDomainset(new Map<string, number>(), path.join(OUTPUT_SURGE_DIR, 'domainset', 'reject.conf'));
@@ -17,22 +16,22 @@ import { loosTldOptWithPrivateDomains } from './constants/loose-tldts-opt';
 })();
 
 async function runAgainstDomainset(rejectDomainCountMap: Map<string, number>, file: string) {
-  for await (const line of readFileByLine(file)) {
-    if (!processLine(line)) {
-      continue;
-    }
-    const apexDomain = tldts.getDomain(line, loosTldOptWithPrivateDomains);
-    if (!apexDomain) {
-      continue;
-    }
+  await runAgainstSourceFile(
+    file,
+    (domain: string) => {
+      const apexDomain = tldts.getDomain(domain, loosTldOptWithPrivateDomains);
+      if (!apexDomain) {
+        return;
+      }
 
-    rejectDomainCountMap.set(
-      apexDomain,
-      rejectDomainCountMap.has(apexDomain)
-        ? rejectDomainCountMap.get(apexDomain)! + 1
-        : 1
-    );
-  }
+      // Count one more entry for this apex domain; `?? 0` covers the first
+      // sighting without the has()/get()! double lookup and non-null assertion.
+      rejectDomainCountMap.set(
+        apexDomain,
+        (rejectDomainCountMap.get(apexDomain) ?? 0) + 1
+      );
+    }
+  );
 
   return rejectDomainCountMap;
 }