Browse Source

Chore: validate domain tools

SukkaW 8 months ago
parent
commit
ede1b7e25b
2 changed files with 95 additions and 82 deletions
  1. 49 41
      Build/validate-domestic.ts
  2. 46 41
      Build/validate-gfwlist.ts

+ 49 - 41
Build/validate-domestic.ts

@@ -1,59 +1,67 @@
-import { parse } from 'csv-parse/sync';
-import { HostnameSmolTrie } from './lib/trie';
 import path from 'node:path';
 import { SOURCE_DIR } from './constants/dir';
 import { parseFelixDnsmasqFromResp } from './lib/parse-dnsmasq';
 import { $$fetch } from './lib/fetch-retry';
 import runAgainstSourceFile from './lib/run-against-source-file';
+import { getTopOneMillionDomains } from './validate-gfwlist';
+import { HostnameSmolTrie } from './lib/trie';
+import tldts from 'tldts-experimental';
+import { DOMESTICS } from '../Source/non_ip/domestic';
 
 export async function parseDomesticList() {
-  const trie = new HostnameSmolTrie(await parseFelixDnsmasqFromResp(await $$fetch('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')));
-
-  const top5000 = new Set<string>();
-
-  const res = await (await $$fetch('https://radar.cloudflare.com/charts/LargerTopDomainsTable/attachment?id=1077&top=10000', {
-    headers: {
-      accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-      'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6,es;q=0.5',
-      'sec-ch-ua': '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
-      'sec-ch-ua-mobile': '?0',
-      'sec-ch-ua-platform': '"macOS"',
-      'sec-fetch-dest': 'document',
-      'sec-fetch-mode': 'navigate',
-      'sec-fetch-site': 'none',
-      'sec-fetch-user': '?1',
-      'upgrade-insecure-requests': '1'
-    }
-  })).text();
-  const stream = parse(res);
-  for await (const [domain] of stream) {
-    if (trie.has(domain)) {
-      top5000.add(domain);
+  const allChinaDomains = new Set<string>(await parseFelixDnsmasqFromResp(await $$fetch('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')));
+
+  const topDomainTrie = await getTopOneMillionDomains();
+
+  const resultTrie = new HostnameSmolTrie();
+
+  topDomainTrie.dumpWithoutDot((domain) => {
+    const apexDomain = tldts.getDomain(domain);
+
+    if (apexDomain && allChinaDomains.has(apexDomain)) {
+      resultTrie.add(apexDomain, false);
     }
-    console.log({ domain });
-  }
+  });
 
-  const notIncludedDomestic = new Set<string>(top5000);
+  const callback = (domain: string, includeAllSubdomain: boolean) => resultTrie.whitelist(domain, includeAllSubdomain);
 
   // await Promise.all([
   await runAgainstSourceFile(
     path.resolve(SOURCE_DIR, 'non_ip/domestic.conf'),
-    (domain, includeAllSubdomain) => {
-      if (includeAllSubdomain) {
-        if (top5000.has(domain)) {
-          notIncludedDomestic.delete(domain);
+    callback
+  );
+  await runAgainstSourceFile(
+    path.resolve(SOURCE_DIR, 'domainset/reject.conf'),
+    callback
+  );
+
+  Object.values(DOMESTICS).forEach(domestic => {
+    domestic.domains.forEach(domain => {
+      switch (domain[0]) {
+        case '+':
+        case '$': {
+          resultTrie.whitelist(domain.slice(1), true);
+          break;
+        }
+        default: {
+          resultTrie.whitelist(domain, true);
+          break;
         }
-      } else {
-        // noop, DOMAIN-KEYWORD handing
-        // for (const d of top5000) {
-        //   if (d.includes(domain)) {
-        //     notIncludedDomestic.delete(d);
-        //   }
-        // }
       }
-    }
-  );
+    });
+  });
+
+  // noop, DOMAIN-KEYWORD handling
+  // for (const d of top5000) {
+  //   if (d.includes(domain)) {
+  //     notIncludedDomestic.delete(d);
+  //   }
+  // }
   // ]);
 
-  console.log(notIncludedDomestic.size, notIncludedDomestic);
+  console.log(resultTrie.dump().join('\n') + '\n');
+}
+
+if (require.main === module) {
+  parseDomesticList().catch(console.error);
 }

+ 46 - 41
Build/validate-gfwlist.ts

@@ -11,13 +11,52 @@ import runAgainstSourceFile from './lib/run-against-source-file';
 import { nullthrow } from 'foxts/guard';
 import { Buffer } from 'node:buffer';
 
-export async function parseGfwList() {
+export async function getTopOneMillionDomains() {
   const { parse: csvParser } = await import('csv-parse');
 
+  const topDomainTrie = new HostnameSmolTrie();
+  const csvParse = csvParser({ columns: false, skip_empty_lines: true });
+
+  const topDomainsZipBody = await (await $$fetch('https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip', {
+    headers: {
+      accept: '*/*',
+      'user-agent': 'curl/8.12.1'
+    }
+  })).arrayBuffer();
+  let entry: yauzl.Entry | null = null;
+  for await (const e of await yauzl.fromBuffer(Buffer.from(topDomainsZipBody))) {
+    if (e.filename === 'top-1m.csv') {
+      entry = e;
+      break;
+    }
+  }
+
+  const { promise, resolve, reject } = Promise.withResolvers<HostnameSmolTrie>();
+
+  const readable = await nullthrow(entry, 'top-1m.csv entry not found').openReadStream();
+  const parser = readable.pipe(csvParse);
+  parser.on('readable', () => {
+    let record;
+    while ((record = parser.read()) !== null) {
+      topDomainTrie.add(record[1]);
+    }
+  });
+
+  parser.on('end', () => {
+    resolve(topDomainTrie);
+  });
+  parser.on('error', (err) => {
+    reject(err);
+  });
+
+  return promise;
+}
+
+export async function parseGfwList() {
   const whiteSet = new Set<string>();
   const gfwListTrie = new HostnameSmolTrie();
 
-  const excludeGfwList = createKeywordFilter([
+  const gfwlistIgnoreLineKwfilter = createKeywordFilter([
     '.*',
     '*',
     '=',
@@ -31,7 +70,7 @@ export async function parseGfwList() {
     const line = processLine(l);
     if (!line) continue;
 
-    if (excludeGfwList(line)) {
+    if (gfwlistIgnoreLineKwfilter(line)) {
       continue;
     }
     if (line.startsWith('@@||')) {
@@ -71,42 +110,7 @@ export async function parseGfwList() {
     gfwListTrie.add(l);
   }
 
-  const topDomainTrie = new HostnameSmolTrie();
-
-  const csvParse = csvParser({ columns: false, skip_empty_lines: true });
-  const topDomainsZipBody = await (await $$fetch('https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip', {
-    headers: {
-      accept: '*/*',
-      'user-agent': 'curl/8.12.1'
-    }
-  })).arrayBuffer();
-  let entry: yauzl.Entry | null = null;
-  for await (const e of await yauzl.fromBuffer(Buffer.from(topDomainsZipBody))) {
-    if (e.filename === 'top-1m.csv') {
-      entry = e;
-      break;
-    }
-  }
-
-  const { promise, resolve, reject } = Promise.withResolvers<HostnameSmolTrie>();
-
-  const readable = await nullthrow(entry, 'top-1m.csv entry not found').openReadStream();
-  const parser = readable.pipe(csvParse);
-  parser.on('readable', () => {
-    let record;
-    while ((record = parser.read()) !== null) {
-      topDomainTrie.add(record[1]);
-    }
-  });
-
-  parser.on('end', () => {
-    resolve(topDomainTrie);
-  });
-  parser.on('error', (err) => {
-    reject(err);
-  });
-
-  await promise;
+  const topDomainTrie = await getTopOneMillionDomains();
 
   const keywordSet = new Set<string>();
 
@@ -116,18 +120,19 @@ export async function parseGfwList() {
   };
   await Promise.all([
     runAgainstSourceFile(path.join(SOURCE_DIR, 'non_ip/global.conf'), callback, 'ruleset', keywordSet),
-    runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip/domestic.conf'), callback, 'ruleset', keywordSet),
+    // runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip/domestic.conf'), callback, 'ruleset', keywordSet),
     runAgainstSourceFile(path.join(SOURCE_DIR, 'non_ip/reject.conf'), callback, 'ruleset', keywordSet),
     runAgainstSourceFile(path.join(SOURCE_DIR, 'non_ip/telegram.conf'), callback, 'ruleset', keywordSet),
     runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/stream.conf'), callback, 'ruleset', keywordSet),
     runAgainstSourceFile(path.resolve(SOURCE_DIR, 'non_ip/ai.conf'), callback, 'ruleset', keywordSet),
     runAgainstSourceFile(path.resolve(SOURCE_DIR, 'non_ip/microsoft.conf'), callback, 'ruleset', keywordSet),
+    runAgainstSourceFile(path.resolve(SOURCE_DIR, 'non_ip/apple_service.conf'), callback, 'ruleset', keywordSet),
     runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject.conf'), callback, 'domainset'),
     runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject_extra.conf'), callback, 'domainset'),
     runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/cdn.conf'), callback, 'domainset')
   ]);
 
-  whiteSet.forEach(domain => gfwListTrie.whitelist(domain));
+  whiteSet.forEach(domain => gfwListTrie.whitelist(domain, true));
 
   const kwfilter = createKeywordFilter([...keywordSet]);