浏览代码

Chore/CI: use fs cache to save bandwidth

SukkaW 2 年之前
父节点
当前提交
230ac3eb18

+ 6 - 0
.github/workflows/main.yml

@@ -15,6 +15,12 @@ jobs:
         uses: actions/checkout@v4
         with:
           persist-credentials: false
+      - name: Cache cache.db
+        uses: actions/cache@v3
+        with:
+          path: .cache
+          key: ${{ runner.os }}-v1
+
       - uses: oven-sh/setup-bun@v1
         with:
           bun-version: latest

+ 1 - 0
.gitignore

@@ -2,6 +2,7 @@
 node_modules
 .clinic
 .wireit
+.cache
 public
 
 # $ build output

+ 2 - 2
Build/build-anti-bogus-domain.ts

@@ -1,7 +1,7 @@
 // @ts-check
 import path from 'path';
 import { createRuleset } from './lib/create-file';
-import { fetchRemoteTextAndReadByLine, readFileByLine } from './lib/fetch-text-by-line';
+import { fetchRemoteTextByLine, readFileByLine } from './lib/fetch-text-by-line';
 import { processLine } from './lib/process-line';
 import { task } from './lib/trace-runner';
 import { SHARED_DESCRIPTION } from './lib/constants';
@@ -9,7 +9,7 @@ import { isProbablyIpv4, isProbablyIpv6 } from './lib/is-fast-ip';
 
 const getBogusNxDomainIPs = async () => {
   const result: string[] = [];
-  for await (const line of await fetchRemoteTextAndReadByLine('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/bogus-nxdomain.china.conf')) {
+  for await (const line of await fetchRemoteTextByLine('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/bogus-nxdomain.china.conf')) {
     if (line && line.startsWith('bogus-nxdomain=')) {
       const ip = line.slice(15).trim();
       if (isProbablyIpv4(ip)) {

+ 4 - 16
Build/build-cdn-conf.ts

@@ -1,27 +1,15 @@
 import path from 'path';
 import { createRuleset } from './lib/create-file';
-import { fetchRemoteTextAndReadByLine, readFileByLine } from './lib/fetch-text-by-line';
+import { readFileByLine } from './lib/fetch-text-by-line';
 import { createTrie } from './lib/trie';
 import { task } from './lib/trace-runner';
 import { processLine } from './lib/process-line';
 import { SHARED_DESCRIPTION } from './lib/constants';
-
-const publicSuffixPath: string = path.resolve(import.meta.dir, '../node_modules/.cache/public_suffix_list_dat.txt');
-
+import { getPublicSuffixListTextPromise } from './download-publicsuffixlist';
 const getS3OSSDomains = async (): Promise<Set<string>> => {
   const trie = createTrie();
-
-  const publicSuffixFile = Bun.file(publicSuffixPath);
-
-  if (await publicSuffixFile.exists()) {
-    for await (const line of readFileByLine(publicSuffixFile)) {
-      trie.add(line);
-    }
-  } else {
-    console.log('public_suffix_list.dat not found, fetch directly from remote.');
-    for await (const line of await fetchRemoteTextAndReadByLine('https://publicsuffix.org/list/public_suffix_list.dat')) {
-      trie.add(line);
-    }
+  for await (const line of (await getPublicSuffixListTextPromise()).split('\n')) {
+    trie.add(line);
   }
 
   /**

+ 2 - 2
Build/build-chn-cidr.ts

@@ -1,4 +1,4 @@
-import { fetchRemoteTextAndReadByLine } from './lib/fetch-text-by-line';
+import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
 import { resolve as pathResolve } from 'path';
 import { compareAndWriteFile, withBannerArray } from './lib/create-file';
 import { processLineFromReadline } from './lib/process-line';
@@ -21,7 +21,7 @@ const INCLUDE_CIDRS = [
 export const getChnCidrPromise = createMemoizedPromise(async () => {
   const cidr = await traceAsync(
     picocolors.gray('download chnroutes2'),
-    async () => processLineFromReadline(await fetchRemoteTextAndReadByLine('https://raw.githubusercontent.com/misakaio/chnroutes2/master/chnroutes.txt')),
+    async () => processLineFromReadline(await fetchRemoteTextByLine('https://raw.githubusercontent.com/misakaio/chnroutes2/master/chnroutes.txt')),
     picocolors.gray
   );
   return traceSync(

+ 2 - 2
Build/build-internal-reverse-chn-cidr.ts

@@ -1,4 +1,4 @@
-import { fetchRemoteTextAndReadByLine } from './lib/fetch-text-by-line';
+import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
 import { processLineFromReadline } from './lib/process-line';
 import path from 'path';
 import { task } from './lib/trace-runner';
@@ -26,7 +26,7 @@ const RESERVED_IPV4_CIDR = [
 ];
 
 export const buildInternalReverseChnCIDR = task(import.meta.path, async () => {
-  const cidr = await processLineFromReadline(await fetchRemoteTextAndReadByLine('https://raw.githubusercontent.com/misakaio/chnroutes2/master/chnroutes.txt'));
+  const cidr = await processLineFromReadline(await fetchRemoteTextByLine('https://raw.githubusercontent.com/misakaio/chnroutes2/master/chnroutes.txt'));
 
   const reversedCidr = merge(
     exclude(

+ 2 - 2
Build/build-microsoft-cdn.ts

@@ -1,7 +1,7 @@
 import path from 'path';
 import { task, traceAsync } from './lib/trace-runner';
 import { createRuleset } from './lib/create-file';
-import { fetchRemoteTextAndReadByLine } from './lib/fetch-text-by-line';
+import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
 import { createTrie } from './lib/trie';
 import { SHARED_DESCRIPTION } from './lib/constants';
 import { createMemoizedPromise } from './lib/memo-promise';
@@ -22,7 +22,7 @@ const BLACKLIST = [
 export const getMicrosoftCdnRulesetPromise = createMemoizedPromise(async () => {
   const set = await traceAsync('fetch accelerated-domains.china.conf', async () => {
     const trie = createTrie();
-    for await (const line of await fetchRemoteTextAndReadByLine('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')) {
+    for await (const line of await fetchRemoteTextByLine('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')) {
       if (line.startsWith('server=/') && line.endsWith('/114.114.114.114')) {
         const domain = line.slice(8, -16);
         trie.add(domain);

+ 3 - 3
Build/build-reject-domainset.ts

@@ -32,16 +32,16 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
     const [gorhill] = await Promise.all([
       getGorhillPublicSuffixPromise(),
       // Parse from remote hosts & domain lists
-      ...HOSTS.map(entry => processHosts(entry[0], entry[1]).then(hosts => {
+      ...HOSTS.map(entry => processHosts(entry[0], entry[1], entry[2], entry[3]).then(hosts => {
         hosts.forEach(host => {
           domainSets.add(host);
         });
       })),
-      ...DOMAIN_LISTS.map(entry => processDomainLists(entry[0], entry[1])),
+      ...DOMAIN_LISTS.map(entry => processDomainLists(entry[0], entry[1], entry[2])),
       ...ADGUARD_FILTERS.map(input => {
         const promise = typeof input === 'string'
           ? processFilterRules(input)
-          : processFilterRules(input[0], input[1]);
+          : processFilterRules(input[0], input[1], input[2]);
 
         return promise.then(({ white, black, foundDebugDomain }) => {
           if (foundDebugDomain) {

+ 2 - 2
Build/build-speedtest-domainset.ts

@@ -21,9 +21,8 @@ const querySpeedtestApi = async (keyword: string): Promise<Array<string | null>>
     s.acquire()
   ]))[0];
 
-  const randomUserAgent = topUserAgents[Math.floor(Math.random() * topUserAgents.length)];
-
   try {
+    const randomUserAgent = topUserAgents[Math.floor(Math.random() * topUserAgents.length)];
     const key = `fetch speedtest endpoints: ${keyword}`;
     console.time(key);
 
@@ -47,6 +46,7 @@ const querySpeedtestApi = async (keyword: string): Promise<Array<string | null>>
     }
 
     const json = await res.json() as Array<{ url: string }>;
+
     s.release();
 
     console.timeEnd(key);

+ 1 - 12
Build/download-previous-build.ts

@@ -1,7 +1,6 @@
 import fs from 'fs';
 import fsp from 'fs/promises';
 import path from 'path';
-import os from 'os';
 import { Readable } from 'stream';
 import { pipeline } from 'stream/promises';
 import { readFileByLine } from './lib/fetch-text-by-line';
@@ -85,16 +84,6 @@ export const downloadPreviousBuild = task(import.meta.path, async () => {
   );
 });
 
-export const downloadPublicSuffixList = task(import.meta.path, async () => {
-  const publicSuffixPath = path.resolve(import.meta.dir, '../node_modules/.cache/public_suffix_list_dat.txt');
-  const resp = await fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit);
-
-  return Bun.write(publicSuffixPath, resp as Response);
-}, 'download-publicsuffixlist');
-
 if (import.meta.main) {
-  Promise.all([
-    downloadPreviousBuild(),
-    downloadPublicSuffixList()
-  ]);
+  downloadPreviousBuild();
 }

+ 10 - 0
Build/download-publicsuffixlist.ts

@@ -0,0 +1,10 @@
+import { fsCache } from './lib/cache-filesystem';
+import { defaultRequestInit, fetchWithRetry } from './lib/fetch-retry';
+import { createMemoizedPromise } from './lib/memo-promise';
+import { traceAsync } from './lib/trace-runner';
+
+export const getPublicSuffixListTextPromise = createMemoizedPromise(() => traceAsync('obtain public_suffix_list', () => fsCache.apply(
+  'public_suffix_list.dat',
+  () => fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit).then(r => r.text()),
+  { ttl: 24 * 60 * 60 * 1000 }
+)));

+ 3 - 12
Build/index.ts

@@ -1,4 +1,4 @@
-import { downloadPreviousBuild, downloadPublicSuffixList } from './download-previous-build';
+import { downloadPreviousBuild } from './download-previous-build';
 import { buildCommon } from './build-common';
 import { buildAntiBogusDomain } from './build-anti-bogus-domain';
 import { buildAppleCdn } from './build-apple-cdn';
@@ -33,23 +33,15 @@ import type { TaskResult } from './lib/trace-runner';
     // const buildInternalReverseChnCIDRWorker = new Worker(new URL('./workers/build-internal-reverse-chn-cidr-worker.ts', import.meta.url));
 
     const downloadPreviousBuildPromise = downloadPreviousBuild();
-    const downloadPublicSuffixListPromise = downloadPublicSuffixList();
     const buildCommonPromise = downloadPreviousBuildPromise.then(() => buildCommon());
     const buildAntiBogusDomainPromise = downloadPreviousBuildPromise.then(() => buildAntiBogusDomain());
     const buildAppleCdnPromise = downloadPreviousBuildPromise.then(() => buildAppleCdn());
-    const buildCdnConfPromise = Promise.all([
-      downloadPreviousBuildPromise,
-      downloadPublicSuffixListPromise
-    ]).then(() => buildCdnConf());
-    const buildRejectDomainSetPromise = Promise.all([
-      downloadPreviousBuildPromise,
-      downloadPublicSuffixListPromise
-    ]).then(() => buildRejectDomainSet());
+    const buildCdnConfPromise = downloadPreviousBuildPromise.then(() => buildCdnConf());
+    const buildRejectDomainSetPromise = downloadPreviousBuildPromise.then(() => buildRejectDomainSet());
     const buildTelegramCIDRPromise = downloadPreviousBuildPromise.then(() => buildTelegramCIDR());
     const buildChnCidrPromise = downloadPreviousBuildPromise.then(() => buildChnCidr());
     const buildSpeedtestDomainSetPromise = downloadPreviousBuildPromise.then(() => buildSpeedtestDomainSet());
     const buildInternalCDNDomainsPromise = Promise.all([
-      downloadPublicSuffixListPromise,
       buildCommonPromise,
       buildCdnConfPromise
     ]).then(() => buildInternalCDNDomains());
@@ -84,7 +76,6 @@ import type { TaskResult } from './lib/trace-runner';
 
     const stats = await Promise.all([
       downloadPreviousBuildPromise,
-      downloadPublicSuffixListPromise,
       buildCommonPromise,
       buildAntiBogusDomainPromise,
       buildAppleCdnPromise,

+ 131 - 0
Build/lib/cache-filesystem.ts

@@ -0,0 +1,131 @@
+// eslint-disable-next-line import/no-unresolved -- bun built-in module
+import { Database } from 'bun:sqlite';
+import os from 'os';
+import path from 'path';
+import fs from 'fs';
+import picocolors from 'picocolors';
+
+const identity = (x: any) => x;
+
+// eslint-disable-next-line sukka-ts/no-const-enum -- bun is smart, right?
+const enum CacheStatus {
+  Hit = 'hit',
+  Stale = 'stale',
+  Miss = 'miss'
+}
+
+export interface CacheOptions {
+  cachePath?: string,
+  tbd?: number
+}
+
+interface CacheApplyNonStringOption<T> {
+  ttl?: number | null,
+  serializer: (value: T) => string,
+  deserializer: (cached: string) => T,
+  temporaryBypass?: boolean
+}
+
+interface CacheApplyStringOption {
+  ttl?: number | null,
+  temporaryBypass?: boolean
+}
+
+type CacheApplyOption<T> = T extends string ? CacheApplyStringOption : CacheApplyNonStringOption<T>;
+
+export class Cache {
+  db: Database;
+  tbd = 60 * 1000; // time before deletion
+  cachePath: string;
+
+  constructor({ cachePath = path.join(os.tmpdir() || '/tmp', 'hdc'), tbd }: CacheOptions = {}) {
+    this.cachePath = cachePath;
+    fs.mkdirSync(this.cachePath, { recursive: true });
+    if (tbd != null) this.tbd = tbd;
+
+    const db = new Database(path.join(this.cachePath, 'cache.db'));
+    db.exec('PRAGMA journal_mode = WAL');
+
+    db.prepare('CREATE TABLE IF NOT EXISTS cache (key TEXT PRIMARY KEY, value TEXT, ttl REAL NOT NULL);').run();
+    db.prepare('CREATE INDEX IF NOT EXISTS cache_ttl ON cache (ttl);').run();
+
+    // perform purge on startup
+
+    // ttl + tbd < now => ttl < now - tbd
+    const now = Date.now() - this.tbd;
+    db.prepare('DELETE FROM cache WHERE ttl < ?').run(now);
+
+    this.db = db;
+  }
+
+  set(key: string, value: string, ttl = 60 * 1000): void {
+    const insert = this.db.prepare(
+      'INSERT INTO cache (key, value, ttl) VALUES ($key, $value, $valid) ON CONFLICT(key) DO UPDATE SET value = $value, ttl = $valid'
+    );
+
+    insert.run({
+      $key: key,
+      $value: value,
+      $valid: Date.now() + ttl
+    });
+  }
+
+  get(key: string, defaultValue?: string): string | undefined {
+    const rv = this.db.prepare<{ value: string }, string>(
+      'SELECT value FROM cache WHERE key = ?'
+    ).get(key);
+
+    if (!rv) return defaultValue;
+    return rv.value;
+  }
+
+  has(key: string): CacheStatus {
+    const now = Date.now();
+    const rv = this.db.prepare<{ ttl: number }, string>('SELECT ttl FROM cache WHERE key = ?').get(key);
+
+    return !rv ? CacheStatus.Miss : (rv.ttl > now ? CacheStatus.Hit : CacheStatus.Stale);
+  }
+
+  del(key: string): void {
+    this.db.prepare('DELETE FROM cache WHERE key = ?').run(key);
+  }
+
+  async apply<T>(
+    key: string,
+    fn: () => Promise<T>,
+    opt: CacheApplyOption<T>
+  ): Promise<T> {
+    const { ttl, temporaryBypass } = opt;
+
+    if (temporaryBypass) {
+      return fn();
+    }
+    if (ttl === null) {
+      this.del(key);
+      return fn();
+    }
+
+    const cached = this.get(key);
+    let value: T;
+    if (cached == null) {
+      console.log(picocolors.yellow('[cache] miss'), picocolors.gray(key));
+      value = await fn();
+
+      const serializer = 'serializer' in opt ? opt.serializer : identity;
+      this.set(key, serializer(value), ttl);
+    } else {
+      console.log(picocolors.green('[cache] hit'), picocolors.gray(key));
+
+      const deserializer = 'deserializer' in opt ? opt.deserializer : identity;
+      value = deserializer(cached);
+    }
+    return value;
+  }
+}
+
+export const fsCache = new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') });
+
+const separator = String.fromCharCode(0);
+
+export const serializeSet = (set: Set<string>) => Array.from(set).join(separator);
+export const deserializeSet = (str: string) => new Set(str.split(separator));

+ 3 - 1
Build/lib/fetch-text-by-line.ts

@@ -1,5 +1,7 @@
 import type { BunFile } from 'bun';
 import { fetchWithRetry, defaultRequestInit } from './fetch-retry';
+import { fsCache } from './cache-filesystem';
+import picocolors from 'picocolors';
 // import { TextLineStream } from './text-line-transform-stream';
 // import { PolyfillTextDecoderStream } from './text-decoder-stream';
 
@@ -78,6 +80,6 @@ export async function *createReadlineInterfaceFromResponse(resp: Response): Asyn
   }
 }
 
-export function fetchRemoteTextAndReadByLine(url: string | URL) {
+export function fetchRemoteTextByLine(url: string | URL) {
   return fetchWithRetry(url, defaultRequestInit).then(res => createReadlineInterfaceFromResponse(res as Response));
 }

+ 4 - 16
Build/lib/get-gorhill-publicsuffix.ts

@@ -1,23 +1,13 @@
 import { toASCII } from 'punycode';
-import path from 'path';
 import { traceAsync } from './trace-runner';
-import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
 import { createMemoizedPromise } from './memo-promise';
+import { getPublicSuffixListTextPromise } from '../download-publicsuffixlist';
 
-const publicSuffixPath = path.resolve(import.meta.dir, '../../node_modules/.cache/public_suffix_list_dat.txt');
-
-const getGorhillPublicSuffix = () => traceAsync('create gorhill public suffix instance', async () => {
+export const getGorhillPublicSuffixPromise = createMemoizedPromise(() => traceAsync('create gorhill public suffix instance', async () => {
   const customFetch = (url: string | URL) => Promise.resolve(Bun.file(url));
 
-  const publicSuffixFile = Bun.file(publicSuffixPath);
-
   const [publicSuffixListDat, { default: gorhill }] = await Promise.all([
-    await publicSuffixFile.exists()
-      ? publicSuffixFile.text()
-      : fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit).then(r => {
-        console.log('public_suffix_list.dat not found, fetch directly from remote.');
-        return r.text();
-      }),
+    getPublicSuffixListTextPromise(),
     import('@gorhill/publicsuffixlist')
   ]);
 
@@ -25,6 +15,4 @@ const getGorhillPublicSuffix = () => traceAsync('create gorhill public suffix in
   await gorhill.enableWASM({ customFetch });
 
   return gorhill;
-});
-
-export const getGorhillPublicSuffixPromise = createMemoizedPromise(getGorhillPublicSuffix);
+}));

+ 2 - 2
Build/lib/parse-dnsmasq.ts

@@ -1,4 +1,4 @@
-import { fetchRemoteTextAndReadByLine } from './fetch-text-by-line';
+import { fetchRemoteTextByLine } from './fetch-text-by-line';
 import { parse } from 'tldts';
 
 const isDomainLoose = (domain: string): boolean => {
@@ -8,7 +8,7 @@ const isDomainLoose = (domain: string): boolean => {
 
 export const parseFelixDnsmasq = async (url: string | URL): Promise<string[]> => {
   const res: string[] = [];
-  for await (const line of await fetchRemoteTextAndReadByLine(url)) {
+  for await (const line of await fetchRemoteTextByLine(url)) {
     if (line.startsWith('server=/') && line.endsWith('/114.114.114.114')) {
       const domain = line.replace('server=/', '').replace('/114.114.114.114', '');
       if (isDomainLoose(domain)) {

+ 154 - 115
Build/lib/parse-filter.ts

@@ -1,5 +1,5 @@
 // @ts-check
-import { fetchRemoteTextAndReadByLine } from './fetch-text-by-line';
+import { fetchRemoteTextByLine } from './fetch-text-by-line';
 import { NetworkFilter } from '@cliqz/adblocker';
 import { processLine } from './process-line';
 import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
@@ -9,61 +9,79 @@ import { traceAsync } from './trace-runner';
 import picocolors from 'picocolors';
 import { normalizeDomain } from './normalize-domain';
 import { fetchAssets } from './fetch-assets';
+import { deserializeSet, fsCache, serializeSet } from './cache-filesystem';
 
 const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
 let foundDebugDomain = false;
 
-export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false) {
-  return traceAsync(`- processDomainLists: ${domainListsUrl}`, async () => {
-    const domainSets = new Set<string>();
+export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) {
+  return traceAsync(`- processDomainLists: ${domainListsUrl}`, () => fsCache.apply(
+    domainListsUrl,
+    async () => {
+      const domainSets = new Set<string>();
 
-    for await (const line of await fetchRemoteTextAndReadByLine(domainListsUrl)) {
-      const domainToAdd = processLine(line);
-      if (!domainToAdd) continue;
+      for await (const line of await fetchRemoteTextByLine(domainListsUrl)) {
+        const domainToAdd = processLine(line);
+        if (!domainToAdd) continue;
 
-      if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
-        console.warn(picocolors.red(domainListsUrl), '(black)', picocolors.bold(DEBUG_DOMAIN_TO_FIND));
-        foundDebugDomain = true;
+        if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
+          console.warn(picocolors.red(domainListsUrl), '(black)', picocolors.bold(DEBUG_DOMAIN_TO_FIND));
+          foundDebugDomain = true;
+        }
+
+        domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
       }
 
-      domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
+      return domainSets;
+    },
+    {
+      ttl,
+      temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
+      serializer: serializeSet,
+      deserializer: deserializeSet
     }
-
-    return domainSets;
-  });
+  ));
 }
+export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, ttl: number | null = null) {
+  return traceAsync(`- processHosts: ${hostsUrl}`, () => fsCache.apply(
+    hostsUrl,
+    async () => {
+      const domainSets = new Set<string>();
+
+      for await (const l of await fetchRemoteTextByLine(hostsUrl)) {
+        const line = processLine(l);
+        if (!line) {
+          continue;
+        }
 
-export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false) {
-  return traceAsync(`- processHosts: ${hostsUrl}`, async () => {
-    const domainSets = new Set<string>();
+        const domain = line.split(/\s/)[1];
+        if (!domain) {
+          continue;
+        }
+        const _domain = domain.trim();
 
-    for await (const l of await fetchRemoteTextAndReadByLine(hostsUrl)) {
-      const line = processLine(l);
-      if (!line) {
-        continue;
-      }
+        if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
+          console.warn(picocolors.red(hostsUrl), '(black)', picocolors.bold(DEBUG_DOMAIN_TO_FIND));
+          foundDebugDomain = true;
+        }
 
-      const domain = line.split(/\s/)[1];
-      if (!domain) {
-        continue;
+        const domainToAdd = skipDomainCheck ? _domain : normalizeDomain(_domain);
+        if (domainToAdd) {
+          domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
+        }
       }
-      const _domain = domain.trim();
 
-      if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
-        console.warn(picocolors.red(hostsUrl), '(black)', picocolors.bold(DEBUG_DOMAIN_TO_FIND));
-        foundDebugDomain = true;
-      }
+      console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size));
 
-      const domainToAdd = skipDomainCheck ? _domain : normalizeDomain(_domain);
-      if (domainToAdd) {
-        domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
-      }
+      return domainSets;
+    },
+    {
+      ttl,
+      temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
+      serializer: serializeSet,
+      deserializer: deserializeSet
     }
-
-    console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size));
-
-    return domainSets;
-  });
+  ));
 }
 
 // eslint-disable-next-line sukka-ts/no-const-enum -- bun bundler is smart, maybe?
@@ -77,90 +95,111 @@ const enum ParseType {
 
 export async function processFilterRules(
   filterRulesUrl: string,
-  fallbackUrls?: readonly string[] | undefined
-): Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }> {
-  const whitelistDomainSets = new Set<string>();
-  const blacklistDomainSets = new Set<string>();
-
-  const warningMessages: string[] = [];
+  fallbackUrls?: readonly string[] | undefined | null,
+  ttl: number | null = null
+): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
+  const [white, black, warningMessages] = await traceAsync(`- processFilterRules: ${filterRulesUrl}`, () => fsCache.apply<[
+    white: string[],
+    black: string[],
+    warningMessages: string[]
+  ]>(
+    filterRulesUrl,
+    async () => {
+      const whitelistDomainSets = new Set<string>();
+      const blacklistDomainSets = new Set<string>();
+
+      const warningMessages: string[] = [];
+
+      const gorhill = await getGorhillPublicSuffixPromise();
 
-  await traceAsync(`- processFilterRules: ${filterRulesUrl}`, async () => {
-    const gorhill = await getGorhillPublicSuffixPromise();
-
-    /**
+      /**
      * @param {string} line
      */
-    const lineCb = (line: string) => {
-      const result = parse(line, gorhill);
-      if (!result) {
-        return;
-      }
+      const lineCb = (line: string) => {
+        const result = parse(line, gorhill);
+        if (!result) {
+          return;
+        }
 
-      const flag = result[1];
-      const hostname = result[0];
-
-      if (DEBUG_DOMAIN_TO_FIND) {
-        if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
-          console.warn(
-            picocolors.red(filterRulesUrl),
-            flag === ParseType.WhiteIncludeSubdomain || flag === ParseType.WhiteAbsolute
-              ? '(white)'
-              : '(black)',
-            picocolors.bold(DEBUG_DOMAIN_TO_FIND)
-          );
-          foundDebugDomain = true;
+        const flag = result[1];
+        const hostname = result[0];
+
+        if (DEBUG_DOMAIN_TO_FIND) {
+          if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
+            console.warn(
+              picocolors.red(filterRulesUrl),
+              flag === ParseType.WhiteIncludeSubdomain || flag === ParseType.WhiteAbsolute
+                ? '(white)'
+                : '(black)',
+              picocolors.bold(DEBUG_DOMAIN_TO_FIND)
+            );
+            foundDebugDomain = true;
+          }
         }
-      }
 
-      switch (flag) {
-        case ParseType.WhiteIncludeSubdomain:
-          if (hostname[0] !== '.') {
-            whitelistDomainSets.add(`.${hostname}`);
-          } else {
+        switch (flag) {
+          case ParseType.WhiteIncludeSubdomain:
+            if (hostname[0] !== '.') {
+              whitelistDomainSets.add(`.${hostname}`);
+            } else {
+              whitelistDomainSets.add(hostname);
+            }
+            break;
+          case ParseType.WhiteAbsolute:
             whitelistDomainSets.add(hostname);
-          }
-          break;
-        case ParseType.WhiteAbsolute:
-          whitelistDomainSets.add(hostname);
-          break;
-        case ParseType.BlackAbsolute:
-          blacklistDomainSets.add(hostname);
-          break;
-        case ParseType.BlackIncludeSubdomain:
-          if (hostname[0] !== '.') {
-            blacklistDomainSets.add(`.${hostname}`);
-          } else {
+            break;
+          case ParseType.BlackAbsolute:
             blacklistDomainSets.add(hostname);
-          }
-          break;
-        case ParseType.ErrorMessage:
-          warningMessages.push(hostname);
-          break;
-        default:
-          break;
-      }
-    };
+            break;
+          case ParseType.BlackIncludeSubdomain:
+            if (hostname[0] !== '.') {
+              blacklistDomainSets.add(`.${hostname}`);
+            } else {
+              blacklistDomainSets.add(hostname);
+            }
+            break;
+          case ParseType.ErrorMessage:
+            warningMessages.push(hostname);
+            break;
+          default:
+            break;
+        }
+      };
 
-    if (!fallbackUrls || fallbackUrls.length === 0) {
-      for await (const line of await fetchRemoteTextAndReadByLine(filterRulesUrl)) {
+      // TODO-SUKKA: add cache here
+      if (!fallbackUrls || fallbackUrls.length === 0) {
+        for await (const line of await fetchRemoteTextByLine(filterRulesUrl)) {
         // don't trim here
-        lineCb(line);
-      }
-    } else {
-      const filterRules = (await traceAsync(
-        picocolors.gray(`- download ${filterRulesUrl}`),
-        () => fetchAssets(filterRulesUrl, fallbackUrls),
-        picocolors.gray
-      )).split('\n');
-
-      const key = picocolors.gray(`- parse adguard filter ${filterRulesUrl}`);
-      console.time(key);
-      for (let i = 0, len = filterRules.length; i < len; i++) {
-        lineCb(filterRules[i]);
+          lineCb(line);
+        }
+      } else {
+        const filterRules = (await traceAsync(
+          picocolors.gray(`- download ${filterRulesUrl}`),
+          () => fetchAssets(filterRulesUrl, fallbackUrls),
+          picocolors.gray
+        )).split('\n');
+
+        const key = picocolors.gray(`- parse adguard filter ${filterRulesUrl}`);
+        console.time(key);
+        for (let i = 0, len = filterRules.length; i < len; i++) {
+          lineCb(filterRules[i]);
+        }
+        console.timeEnd(key);
       }
-      console.timeEnd(key);
+
+      return [
+        Array.from(whitelistDomainSets),
+        Array.from(blacklistDomainSets),
+        warningMessages
+      ];
+    },
+    {
+      ttl,
+      temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
+      serializer: JSON.stringify,
+      deserializer: JSON.parse
     }
-  });
+  ));
 
   warningMessages.forEach(msg => {
     console.warn(
@@ -172,13 +211,13 @@ export async function processFilterRules(
   console.log(
     picocolors.gray('[process filter]'),
     picocolors.gray(filterRulesUrl),
-    picocolors.gray(`white: ${whitelistDomainSets.size}`),
-    picocolors.gray(`black: ${blacklistDomainSets.size}`)
+    picocolors.gray(`white: ${white.length}`),
+    picocolors.gray(`black: ${black.length}`)
   );
 
   return {
-    white: whitelistDomainSets,
-    black: blacklistDomainSets,
+    white,
+    black,
     foundDebugDomain
   };
 }

+ 1 - 1
Build/lib/process-line.ts

@@ -4,7 +4,7 @@ export const processLine = (line: string): string | null => {
   }
 
   const trimmed: string = line.trim();
-  if (trimmed === '') {
+  if (trimmed.length === 0) {
     return null;
   }
 

+ 19 - 13
Build/lib/reject-data-source.ts

@@ -7,11 +7,11 @@ export const HOSTS = [
   ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', false],
   ['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true],
   // CoinBlockerList
-  ['https://zerodot1.gitlab.io/CoinBlockerLists/hosts_browser', true, true],
  // Although the hosts file is still actively maintained, the hosts_browser file has not been updated since 2021-07, so we set a 10-day cache TTL
+  ['https://zerodot1.gitlab.io/CoinBlockerLists/hosts_browser', true, true, 10 * 24 * 60 * 60 * 1000],
   // Curben's UrlHaus Malicious URL Blocklist
   // 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-agh-online.txt',
   // 'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt',
-  // 'https://ublockorigin.github.io/uAssetsCDN/thirdparties/urlhaus-filter/urlhaus-filter-online.txt',
   ['https://curbengh.github.io/urlhaus-filter/urlhaus-filter-hosts.txt', true, true],
   // Curben's Phishing URL Blocklist
   // Covered by lib/get-phishing-domains.ts
@@ -21,14 +21,24 @@ export const HOSTS = [
   // Curben's PUP Domains Blocklist
   // 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
   // 'https://pup-filter.pages.dev/pup-filter-agh.txt'
-  ['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true],
+  // The PUP filter has been paused since 2023-05, so we set a 7-day cache TTL
+  ['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true, 7 * 24 * 60 * 60 * 1000],
   // BarbBlock
-  ['https://paulgb.github.io/BarbBlock/blacklists/hosts-file.txt', true, true]
+  // The BarbBlock list has not been updated since 2019-05, so we set a 10-day cache TTL
+  ['https://paulgb.github.io/BarbBlock/blacklists/hosts-file.txt', true, true, 10 * 24 * 60 * 60 * 1000]
 ] as const;
 
 export const DOMAIN_LISTS = [
   // DigitalSide Threat-Intel - OSINT Hub
-  ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true]
+  ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true],
+
+  // AdGuard CNAME Filter Combined
+  // Updated on a 7-day basis, so we add a 36-hour cache TTL
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, 36 * 60 * 60 * 1000],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, 36 * 60 * 60 * 1000],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, 36 * 60 * 60 * 1000],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, 36 * 60 * 60 * 1000],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, 36 * 60 * 60 * 1000]
 ] as const;
 
 export const ADGUARD_FILTERS = [
@@ -41,7 +51,8 @@ export const ADGUARD_FILTERS = [
       'https://secure.fanboy.co.nz/easylist.txt',
       'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easylist.txt',
       'https://ublockorigin.pages.dev/thirdparties/easylist.txt'
-    ]
+    ],
+    12 * 60 * 60 * 1000
   ],
   // EasyPrivacy
   [
@@ -52,7 +63,8 @@ export const ADGUARD_FILTERS = [
       'https://easylist-downloads.adblockplus.org/easyprivacy.txt',
       'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easyprivacy.txt',
       'https://ublockorigin.pages.dev/thirdparties/easyprivacy.txt'
-    ]
+    ],
+    12 * 60 * 60 * 1000
   ],
   // AdGuard DNS Filter
   [
@@ -62,12 +74,6 @@ export const ADGUARD_FILTERS = [
       'https://adguardteam.github.io/HostlistsRegistry/assets/filter_1.txt'
     ]
   ],
-  // AdGuard CNAME Filter Combined
-  'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads.txt',
-  'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers.txt',
-  'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs.txt',
-  'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites.txt',
-  'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers.txt',
   // uBlock Origin Filter List
   [
     'https://ublockorigin.github.io/uAssetsCDN/filters/filters.min.txt',

+ 2 - 2
Build/validate-domestic.ts

@@ -1,4 +1,4 @@
-import { fetchRemoteTextAndReadByLine, readFileByLine } from './lib/fetch-text-by-line';
+import { fetchRemoteTextByLine, readFileByLine } from './lib/fetch-text-by-line';
 import { Readable } from 'stream';
 import { parse } from 'csv-parse';
 import { createTrie } from './lib/trie';
@@ -7,7 +7,7 @@ import { processLine } from './lib/process-line';
 
 export const parseDomesticList = async () => {
   const set = new Set<string>();
-  for await (const line of await fetchRemoteTextAndReadByLine('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')) {
+  for await (const line of await fetchRemoteTextByLine('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')) {
     if (line.startsWith('server=/') && line.endsWith('/114.114.114.114')) {
       const domain = line.slice(8, -16);
       set.add(domain);

+ 1 - 0
Source/domainset/cdn.conf

@@ -2266,3 +2266,4 @@ ocecdn.oraclecloud.com
 assets.humix.com
 .nelreports.net
 static.mediafire.com
+player.louisvuitton.com