Browse Source

Feat: implement HTTP 304 with SQLite Cache (#42)

Sukka 1 year ago
parent
commit
07d3fdf05b

+ 6 - 8
Build/build-apple-cdn.ts

@@ -1,18 +1,16 @@
-import { parseFelixDnsmasq } from './lib/parse-dnsmasq';
+import { parseFelixDnsmasqFromResp } from './lib/parse-dnsmasq';
 import { task } from './trace';
 import { SHARED_DESCRIPTION } from './lib/constants';
 import { createMemoizedPromise } from './lib/memo-promise';
-import { TTL, deserializeArray, fsFetchCache, serializeArray, createCacheKey } from './lib/cache-filesystem';
+import { deserializeArray, fsFetchCache, serializeArray, getFileContentHash } from './lib/cache-filesystem';
 import { DomainsetOutput } from './lib/create-file';
 
-const cacheKey = createCacheKey(__filename);
-
 const url = 'https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/apple.china.conf';
-export const getAppleCdnDomainsPromise = createMemoizedPromise(() => fsFetchCache.apply(
-  cacheKey(url),
-  () => parseFelixDnsmasq(url),
+export const getAppleCdnDomainsPromise = createMemoizedPromise(() => fsFetchCache.applyWithHttp304(
+  url,
+  getFileContentHash(__filename),
+  parseFelixDnsmasqFromResp,
   {
-    ttl: TTL.THREE_DAYS(),
     serializer: serializeArray,
     deserializer: deserializeArray
   }

+ 8 - 7
Build/build-reject-ip-list.ts

@@ -1,10 +1,10 @@
 // @ts-check
 import path from 'node:path';
-import { fetchRemoteTextByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
+import { createReadlineInterfaceFromResponse, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
 import { task } from './trace';
 import { SHARED_DESCRIPTION } from './lib/constants';
 import { isProbablyIpv4, isProbablyIpv6 } from './lib/is-fast-ip';
-import { TTL, fsFetchCache, createCacheKey } from './lib/cache-filesystem';
+import { TTL, fsFetchCache, createCacheKey, getFileContentHash } from './lib/cache-filesystem';
 import { fetchAssets } from './lib/fetch-assets';
 import { processLine } from './lib/process-line';
 import { RulesetOutput } from './lib/create-file';
@@ -14,12 +14,14 @@ const cacheKey = createCacheKey(__filename);
 
 const BOGUS_NXDOMAIN_URL = 'https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/bogus-nxdomain.china.conf';
 
-const getBogusNxDomainIPsPromise = fsFetchCache.apply<[ipv4: string[], ipv6: string[]]>(
-  cacheKey(BOGUS_NXDOMAIN_URL),
-  async () => {
+const getBogusNxDomainIPsPromise = fsFetchCache.applyWithHttp304(
+  BOGUS_NXDOMAIN_URL,
+  getFileContentHash(__filename),
+  async (resp) => {
     const ipv4: string[] = [];
     const ipv6: string[] = [];
-    for await (const line of await fetchRemoteTextByLine(BOGUS_NXDOMAIN_URL)) {
+
+    for await (const line of createReadlineInterfaceFromResponse(resp)) {
       if (line.startsWith('bogus-nxdomain=')) {
         const ip = line.slice(15).trim();
         if (isProbablyIpv4(ip)) {
@@ -32,7 +34,6 @@ const getBogusNxDomainIPsPromise = fsFetchCache.apply<[ipv4: string[], ipv6: str
     return [ipv4, ipv6] as const;
   },
   {
-    ttl: TTL.ONE_WEEK(),
     serializer: JSON.stringify,
     deserializer: JSON.parse
   }

+ 7 - 9
Build/build-speedtest-domainset.ts

@@ -139,18 +139,16 @@ const PREDEFINE_DOMAINS = [
 const s = new Sema(2);
 const cacheKey = createCacheKey(__filename);
 
-const latestTopUserAgentsPromise = fsFetchCache.apply(
+const latestTopUserAgentsPromise = fsFetchCache.applyWithHttp304<string[]>( // promise of the filtered user-agent list, cached through applyWithHttp304
+  'https://cdn.jsdelivr.net/npm/top-user-agents@latest/src/desktop.json', // URL that is fetched
   cacheKey('https://cdn.jsdelivr.net/npm/top-user-agents@latest/src/desktop.json'), // NOTE(review): used as the extra cache key, so the URL ends up in the key twice; other call sites in this change pass getFileContentHash(__filename) — confirm this is intended
-  () => fetchWithRetry(
-    'https://cdn.jsdelivr.net/npm/top-user-agents@latest/src/desktop.json',
-    { signal: AbortSignal.timeout(1000 * 60) }
-  )
-    .then(res => res.json() as Promise<string[]>)
-    .then((userAgents) => userAgents.filter(ua => ua.startsWith('Mozilla/5.0 '))),
+  async (res) => { // NOTE(review): the removed call passed a 60s AbortSignal timeout; no requestInit is passed here — confirm the default request options cover that
+    const userAgents = await (res.json() as Promise<string[]>); // body is asserted, not validated, to be string[]
+    return userAgents.filter(ua => ua.startsWith('Mozilla/5.0 ')); // keep only entries beginning with 'Mozilla/5.0 '
+  },
   {
     serializer: serializeArray,
-    deserializer: deserializeArray,
-    ttl: TTL.THREE_DAYS()
+    deserializer: deserializeArray // no ttl here: the option type of applyWithHttp304 omits it
   }
 );
 

+ 75 - 2
Build/lib/cache-filesystem.ts

@@ -4,10 +4,11 @@ import os from 'node:os';
 import path from 'node:path';
 import { mkdirSync } from 'node:fs';
 import picocolors from 'picocolors';
-import { fastStringArrayJoin, identity } from './misc';
+import { fastStringArrayJoin, identity, mergeHeaders } from './misc';
 import { performance } from 'node:perf_hooks';
 import fs from 'node:fs';
 import { stringHash } from './string-hash';
+import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
 
 const enum CacheStatus {
   Hit = 'hit',
@@ -44,6 +45,7 @@ const ONE_HOUR = 60 * 60 * 1000;
 const ONE_DAY = 24 * ONE_HOUR;
 // Add some randomness to the cache ttl to avoid thundering herd
 export const TTL = {
+  useHttp304: Symbol('useHttp304'),
   humanReadable(ttl: number) {
     if (ttl >= ONE_DAY) {
       return `${Math.round(ttl / 24 / 60 / 60 / 1000)}d`;
@@ -56,6 +58,7 @@ export const TTL = {
   THREE_HOURS: () => randomInt(1, 3) * ONE_HOUR,
   TWLVE_HOURS: () => randomInt(8, 12) * ONE_HOUR,
   ONE_DAY: () => randomInt(23, 25) * ONE_HOUR,
+  ONE_WEEK_STATIC: ONE_DAY * 7,
   THREE_DAYS: () => randomInt(1, 3) * ONE_DAY,
   ONE_WEEK: () => randomInt(4, 7) * ONE_DAY,
   TEN_DAYS: () => randomInt(7, 10) * ONE_DAY,
@@ -204,6 +207,75 @@ export class Cache<S = string> {
     return deserializer(cached);
   }
 
+  /**
+   * Fetch `url` and cache the parsed result, revalidating against the stored ETag.
+   *
+   * A request is always sent. When a cached value exists, its stored ETag (if any) is
+   * sent as `If-None-Match`; a `304` response returns the deserialized cached value and
+   * renews the TTL of the cached entries. Any other response is handed to `fn`, and the
+   * result is stored together with the response's ETag when the response carries one.
+   *
+   * @param url - resource to fetch; also the first part of the cache keys
+   * @param extraCacheKey - appended to `url` (separated by `$`) to form the cache keys
+   * @param fn - turns a response into the value to cache
+   * @param opt - cache options; `ttl` and `incrementTtlWhenHit` are not accepted here
+   * @param requestInit - fetch options; `defaultRequestInit` is used when omitted
+   * @returns the value produced by `fn`, or the deserialized cached value on a 304
+   */
+  async applyWithHttp304<T>(
+    url: string,
+    extraCacheKey: string,
+    fn: (resp: Response) => Promise<T>,
+    opt: Omit<CacheApplyOption<T, S>, 'ttl' | 'incrementTtlWhenHit'>,
+    requestInit?: RequestInit
+  ) {
+    const { temporaryBypass } = opt;
+    const init = requestInit ?? defaultRequestInit;
+
+    // Fixed TTL; it is renewed every time the server answers 304.
+    const ttl = TTL.ONE_WEEK_STATIC;
+
+    if (temporaryBypass) {
+      // Bypass: neither read from nor write to the cache.
+      return fn(await fetchWithRetry(url, init));
+    }
+
+    const baseKey = url + '$' + extraCacheKey;
+    const etagKey = baseKey + '$etag';
+    const cachedKey = baseKey + '$cached';
+
+    const onMiss = (resp: Response) => {
+      console.log(picocolors.yellow('[cache] miss'), url, picocolors.gray(`ttl: ${TTL.humanReadable(ttl)}`));
+
+      const serializer = 'serializer' in opt ? opt.serializer : identity as any;
+
+      const etag = resp.headers.get('etag');
+
+      if (!etag) {
+        // Without an ETag the entry could never be revalidated, so nothing is stored.
+        console.log(picocolors.red('[cache] no etag'), picocolors.gray(url));
+        return fn(resp);
+      }
+
+      // Store only after `fn` succeeded, so a failed parse leaves the cache untouched.
+      return fn(resp).then((value) => {
+        this.set(etagKey, etag, ttl);
+        this.set(cachedKey, serializer(value), ttl);
+        return value;
+      });
+    };
+
+    const cached = this.get(cachedKey);
+    if (cached == null) {
+      return onMiss(await fetchWithRetry(url, init));
+    }
+
+    const etag = this.get(etagKey);
+    // Only a non-empty string is usable as an `If-None-Match` value.
+    const ifNoneMatch = (typeof etag === 'string' && etag.length > 0) ? etag : null;
+
+    const resp = await fetchWithRetry(
+      url,
+      {
+        ...init,
+        headers: ifNoneMatch === null
+          ? init.headers
+          : mergeHeaders(init.headers, { 'If-None-Match': ifNoneMatch })
+      }
+    );
+
+    if (resp.status !== 304) {
+      return onMiss(resp);
+    }
+
+    console.log(picocolors.green('[cache] http 304'), picocolors.gray(url));
+    // Renew the ETag together with the payload so both entries keep the same lifetime.
+    this.updateTtl(cachedKey, ttl);
+    if (ifNoneMatch !== null) {
+      this.updateTtl(etagKey, ttl);
+    }
+
+    const deserializer = 'deserializer' in opt ? opt.deserializer : identity as any;
+    return deserializer(cached);
+  }
+
   destroy() {
     this.db.close();
   }
@@ -222,7 +294,8 @@ export const deserializeSet = (str: string) => new Set(str.split(separator));
 export const serializeArray = (arr: string[]) => fastStringArrayJoin(arr, separator);
 export const deserializeArray = (str: string) => str.split(separator);
 
+export const getFileContentHash = (filename: string) => stringHash(fs.readFileSync(filename, 'utf-8')); // stringHash of the file's text, read synchronously as UTF-8
 export const createCacheKey = (filename: string) => { // builds a key function bound to the content hash of `filename`
-  const fileHash = stringHash(fs.readFileSync(filename, 'utf-8'));
+  const fileHash = getFileContentHash(filename); // computed once per createCacheKey call, not per key
   return (key: string) => key + '$' + fileHash + '$'; // key "k" becomes "k$<fileHash>$"
 };

+ 5 - 9
Build/lib/download-publicsuffixlist.ts

@@ -1,18 +1,14 @@
-import { TTL, deserializeArray, fsFetchCache, serializeArray, createCacheKey } from './cache-filesystem';
-import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
+import { deserializeArray, fsFetchCache, getFileContentHash, serializeArray } from './cache-filesystem';
 import { createMemoizedPromise } from './memo-promise';
 
-const cacheKey = createCacheKey(__filename);
-
-export const getPublicSuffixListTextPromise = createMemoizedPromise(() => fsFetchCache.apply(
-  cacheKey('https://publicsuffix.org/list/public_suffix_list.dat'),
-  () => fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit)
-    .then(r => r.text()).then(text => text.split('\n')),
+export const getPublicSuffixListTextPromise = createMemoizedPromise(() => fsFetchCache.applyWithHttp304<string[]>(
+  'https://publicsuffix.org/list/public_suffix_list.dat',
+  getFileContentHash(__filename),
+  (r) => r.text().then(text => text.split('\n')),
   {
     // https://github.com/publicsuffix/list/blob/master/.github/workflows/tld-update.yml
     // Though the action runs every 24 hours, the IANA list is updated every 7 days.
     // The explicit 3 day TTL was dropped: applyWithHttp304 uses a fixed TTL and revalidates with the stored ETag on every call.
-    ttl: TTL.THREE_DAYS(),
     serializer: serializeArray,
     deserializer: deserializeArray
   }

+ 2 - 2
Build/lib/fetch-retry.ts

@@ -89,7 +89,7 @@ function createFetchRetry($fetch: typeof fetch): FetchWithRetry {
             }
             throw new ResponseError(res);
           } else {
-            if (!res.ok && retryOpts.retryOnNon2xx) {
+            if ((!res.ok && res.status !== 304) && retryOpts.retryOnNon2xx) {
               throw new ResponseError(res);
             }
             return res;
@@ -106,7 +106,7 @@ function createFetchRetry($fetch: typeof fetch): FetchWithRetry {
             return bail(err) as never;
           }
 
-          console.log(picocolors.gray('[fetch fail]'), url);
+          console.log(picocolors.gray('[fetch fail]'), url, err);
           throw err;
         }
       }, retryOpts);

+ 27 - 0
Build/lib/misc.ts

@@ -95,3 +95,30 @@ export function withBannerArray(title: string, description: string[] | readonly
     '################## EOF ##################'
   ];
 };
+
+export const mergeHeaders = (headersA: RequestInit['headers'] | undefined, headersB: RequestInit['headers']) => { // returns headersA's entries with headersB's entries set on top
+  if (headersA == null) { // nothing to merge into
+    return headersB; // returned as-is: not copied and not converted to a Headers instance
+  }
+
+  if (Array.isArray(headersB)) { // NOTE(review): the array form is rejected only on this path; when headersA is null an array headersB is returned above
+    throw new TypeError('Array headers is not supported');
+  }
+
+  const result = new Headers(headersA); // fresh Headers built from headersA; the merge is applied to this object
+
+  if (headersB instanceof Headers) {
+    headersB.forEach((value, key) => {
+      result.set(key, value); // set() replaces whatever headersA had under the same name
+    });
+    return result;
+  }
+
+  for (const key in headersB) { // remaining case: plain object form
+    if (Object.hasOwn(headersB, key)) { // own properties only
+      result.set(key, (headersB as Record<string, string>)[key]);
+    }
+  }
+
+  return result;
+};

+ 13 - 6
Build/lib/parse-dnsmasq.ts

@@ -1,5 +1,6 @@
-import { fetchRemoteTextByLine } from './fetch-text-by-line';
+import { createReadlineInterfaceFromResponse } from './fetch-text-by-line';
 import { parse as tldtsParse } from 'tldts';
+import { fetchWithRetry, defaultRequestInit } from './fetch-retry';
 
 const isDomainLoose = (domain: string): boolean => {
   const { isIcann, isPrivate, isIp } = tldtsParse(domain);
@@ -13,14 +14,20 @@ export const extractDomainsFromFelixDnsmasq = (line: string): string | null => {
   return null;
 };
 
-export const parseFelixDnsmasq = async (url: string | URL): Promise<string[]> => {
-  const res: string[] = [];
-  for await (const line of await fetchRemoteTextByLine(url)) {
+export const parseFelixDnsmasqFromResp = async (resp: Response): Promise<string[]> => { // collects domains from an already-fetched response
+  const results: string[] = [];
+
+  for await (const line of createReadlineInterfaceFromResponse(resp)) { // iterate whatever createReadlineInterfaceFromResponse yields for this response
     const domain = extractDomainsFromFelixDnsmasq(line);
     if (domain && isDomainLoose(domain)) {
-      res.push(domain);
+      results.push(domain); // kept only when a domain was extracted and isDomainLoose accepts it
     }
   }
 
-  return res;
+  return results;
+};
+
+export const parseFelixDnsmasq = async (url: string | URL): Promise<string[]> => { // URL-based entry point: fetches with defaultRequestInit, then delegates to parseFelixDnsmasqFromResp
+  const resp = await fetchWithRetry(url, defaultRequestInit);
+  return parseFelixDnsmasqFromResp(resp);
 };

+ 1 - 5
Build/lib/parse-filter.ts

@@ -159,11 +159,7 @@ export async function processFilterRules(
   ttl: number | null = null,
   allowThirdParty = false
 ): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
-  const [white, black, warningMessages] = await parentSpan.traceChild(`process filter rules: ${filterRulesUrl}`).traceAsyncFn((span) => fsFetchCache.apply<Readonly<[
-    white: string[],
-    black: string[],
-    warningMessages: string[]
-  ]>>(
+  const [white, black, warningMessages] = await parentSpan.traceChild(`process filter rules: ${filterRulesUrl}`).traceAsyncFn((span) => fsFetchCache.apply<Readonly<[ white: string[], black: string[], warningMessages: string[] ]>>(
     cacheKey(filterRulesUrl),
     async () => {
       const whitelistDomainSets = new Set<string>();