Browse Source

Perf: use filesystem cache

SukkaW 2 years ago
parent
commit
85801b1b9e

+ 42 - 30
Build/build-speedtest-domainset.ts

@@ -12,17 +12,17 @@ import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix';
 import picocolors from 'picocolors';
 import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
 import { processLine } from './lib/process-line';
+import { TTL, deserializeArray, fsCache, serializeArray } from './lib/cache-filesystem';
 
 const s = new Sema(2);
 
 const latestTopUserAgentsPromise = fetchWithRetry('https://unpkg.com/top-user-agents@latest/index.json')
-  .then(res => res.json<string[]>());
+  .then(res => res.json<string[]>()).then(userAgents => userAgents.filter(ua => ua.startsWith('Mozilla/5.0 ')));
 
 const querySpeedtestApi = async (keyword: string): Promise<Array<string | null>> => {
-  const topUserAgents = (await Promise.all([
-    latestTopUserAgentsPromise,
-    s.acquire()
-  ]))[0];
+  const topUserAgents = await latestTopUserAgentsPromise;
+
+  const url = `https://www.speedtest.net/api/js/servers?engine=js&search=${keyword}&limit=100`;
 
   try {
     const randomUserAgent = topUserAgents[Math.floor(Math.random() * topUserAgents.length)];
@@ -30,39 +30,51 @@ const querySpeedtestApi = async (keyword: string): Promise<Array<string | null>>
     console.log(key);
     console.time(key);
 
-    const res = await fetchWithRetry(`https://www.speedtest.net/api/js/servers?engine=js&search=${keyword}&limit=100`, {
-      headers: {
-        dnt: '1',
-        Referer: 'https://www.speedtest.net/',
-        accept: 'application/json, text/plain, */*',
-        'User-Agent': randomUserAgent,
-        'Accept-Language': 'en-US,en;q=0.9',
-        ...(randomUserAgent.includes('Chrome')
-          ? {
-            'Sec-Ch-Ua-Mobile': '?0',
-            'Sec-Fetch-Dest': 'empty',
-            'Sec-Fetch-Mode': 'cors',
-            'Sec-Fetch-Site': 'same-origin',
-            'Sec-Gpc': '1'
+    const json = await fsCache.apply(
+      url,
+      () => s.acquire().then(() => fetchWithRetry(url, {
+        headers: {
+          dnt: '1',
+          Referer: 'https://www.speedtest.net/',
+          accept: 'application/json, text/plain, */*',
+          'User-Agent': randomUserAgent,
+          'Accept-Language': 'en-US,en;q=0.9',
+          ...(randomUserAgent.includes('Chrome')
+            ? {
+              'Sec-Ch-Ua-Mobile': '?0',
+              'Sec-Fetch-Dest': 'empty',
+              'Sec-Fetch-Mode': 'cors',
+              'Sec-Fetch-Site': 'same-origin',
+              'Sec-Gpc': '1'
+            }
+            : {})
+        },
+        signal: AbortSignal.timeout(1000 * 4),
+        retry: {
+          retries: 2
+        }
+      })).then(r => r.json<Array<{ url: string }>>()).then(data => data.reduce<string[]>(
+        (prev, cur) => {
+          const hn = tldts.getHostname(cur.url, { detectIp: false });
+          if (hn) {
+            prev.push(hn);
           }
-          : {})
-      },
-      signal: AbortSignal.timeout(1000 * 4),
-      retry: {
-        retries: 2
+          return prev;
+        }, []
+      )).finally(() => s.release()),
+      {
+        ttl: TTL.ONE_WEEK(),
+        serializer: serializeArray,
+        deserializer: deserializeArray
       }
-    });
-
-    const json = await res.json<Array<{ url: string }>>();
+    );
 
     console.timeEnd(key);
 
-    return json.map(({ url }) => tldts.getHostname(url, { detectIp: false }));
+    return json;
   } catch (e) {
     console.log(e);
     return [];
-  } finally {
-    s.release();
   }
 };
 

+ 4 - 4
Build/download-mock-assets.ts

@@ -4,10 +4,10 @@ import path from 'path';
 import { fetchWithRetry } from './lib/fetch-retry';
 
 const ASSETS_LIST = {
-  'www-google-analytics-com_ga.js': 'https://unpkg.com/@adguard/scriptlets@1/dist/redirect-files/google-analytics-ga.js',
-  'www-googletagservices-com_gpt.js': 'https://unpkg.com/@adguard/scriptlets@1/dist/redirect-files/googletagservices-gpt.js',
-  'www-google-analytics-com_analytics.js': 'https://unpkg.com/@adguard/scriptlets@1/dist/redirect-files/google-analytics.js',
-  'www-googlesyndication-com_adsbygoogle.js': 'https://unpkg.com/@adguard/scriptlets@1/dist/redirect-files/googlesyndication-adsbygoogle.js'
+  'www-google-analytics-com_ga.js': 'https://raw.githubusercontent.com/AdguardTeam/Scriptlets/master/dist/redirect-files/google-analytics-ga.js',
+  'www-googletagservices-com_gpt.js': 'https://raw.githubusercontent.com/AdguardTeam/Scriptlets/master/dist/redirect-files/googletagservices-gpt.js',
+  'www-google-analytics-com_analytics.js': 'https://raw.githubusercontent.com/AdguardTeam/Scriptlets/master/dist/redirect-files/google-analytics.js',
+  'www-googlesyndication-com_adsbygoogle.js': 'https://raw.githubusercontent.com/AdguardTeam/Scriptlets/master/dist/redirect-files/googlesyndication-adsbygoogle.js'
 } as const;
 
 const mockDir = path.resolve(import.meta.dir, '../Mock');

+ 16 - 1
Build/download-publicsuffixlist.ts

@@ -1,5 +1,20 @@
+import { TTL, fsCache } from './lib/cache-filesystem';
 import { defaultRequestInit, fetchWithRetry } from './lib/fetch-retry';
 import { createMemoizedPromise } from './lib/memo-promise';
 import { traceAsync } from './lib/trace-runner';
 
-export const getPublicSuffixListTextPromise = createMemoizedPromise(() => traceAsync('obtain public_suffix_list', () => fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit).then(r => r.text())));
+export const getPublicSuffixListTextPromise = createMemoizedPromise(
+  () => traceAsync(
+    'obtain public_suffix_list',
+    () => fsCache.apply(
+      'https://publicsuffix.org/list/public_suffix_list.dat',
+      () => fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit).then(r => r.text()),
+      {
+        // https://github.com/publicsuffix/list/blob/master/.github/workflows/tld-update.yml
+        // Though the action runs every 24 hours, the IANA list is updated every 7 days.
+        // So a 3-day TTL should be enough.
+        ttl: TTL.THREE_DAYS()
+      }
+    )
+  )
+);

+ 18 - 2
Build/lib/cache-filesystem.ts

@@ -127,12 +127,28 @@ export class Cache {
   }
 }
 
-// export const fsCache = new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') });
+export const fsCache = new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') });
 // process.on('exit', () => {
 //   fsCache.destroy();
 // });
 
-const separator = String.fromCharCode(0);
+const randomInt = (min: number, max: number) => Math.floor(Math.random() * (max - min + 1)) + min;
+
+// Add some randomness to the cache ttl to avoid thundering herd
+export const TTL = {
+  TWLVE_HOURS: () => randomInt(9, 14) * 60 * 60 * 1000,
+  THREE_DAYS: () => randomInt(2, 4) * 24 * 60 * 60 * 1000,
+  ONE_WEEK: () => randomInt(5, 8) * 24 * 60 * 60 * 1000,
+  TWO_WEEKS: () => randomInt(12, 16) * 24 * 60 * 60 * 1000,
+  TEN_DAYS: () => randomInt(9, 11) * 24 * 60 * 60 * 1000
+};
 
+const separator = String.fromCharCode(0);
+// const textEncoder = new TextEncoder();
+// const textDecoder = new TextDecoder();
+// export const serializeString = (str: string) => textEncoder.encode(str);
+// export const deserializeString = (str: string) => textDecoder.decode(new Uint8Array(str.split(separator).map(Number)));
 export const serializeSet = (set: Set<string>) => Array.from(set).join(separator);
 export const deserializeSet = (str: string) => new Set(str.split(separator));
+export const serializeArray = (arr: string[]) => arr.join(separator);
+export const deserializeArray = (str: string) => str.split(separator);

+ 19 - 18
Build/lib/parse-filter.ts

@@ -9,14 +9,15 @@ import { traceAsync } from './trace-runner';
 import picocolors from 'picocolors';
 import { normalizeDomain } from './normalize-domain';
 import { fetchAssets } from './fetch-assets';
+import { deserializeSet, fsCache, serializeSet } from './cache-filesystem';
 
 const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
 let foundDebugDomain = false;
 
-export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, _ttl: number | null = null) {
-  return traceAsync(`- processDomainLists: ${domainListsUrl}`, /*  () => fsCache.apply(
+export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) {
+  return traceAsync(`- processDomainLists: ${domainListsUrl}`, () => fsCache.apply(
     domainListsUrl,
-     */async () => {
+    async () => {
       const domainSets = new Set<string>();
 
       for await (const line of await fetchRemoteTextByLine(domainListsUrl)) {
@@ -32,19 +33,19 @@ export function processDomainLists(domainListsUrl: string, includeAllSubDomain =
       }
 
       return domainSets;
-    });/* ,
+    },
     {
       ttl,
       temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
       serializer: serializeSet,
       deserializer: deserializeSet
     }
-  )); */
+  ));
 }
-export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, _ttl: number | null = null) {
-  return traceAsync(`- processHosts: ${hostsUrl}`, /* () => fsCache.apply(
+export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, ttl: number | null = null) {
+  return traceAsync(`- processHosts: ${hostsUrl}`, () => fsCache.apply(
     hostsUrl,
-     */async () => {
+    async () => {
       const domainSets = new Set<string>();
 
       for await (const l of await fetchRemoteTextByLine(hostsUrl)) {
@@ -73,14 +74,14 @@ export function processHosts(hostsUrl: string, includeAllSubDomain = false, skip
       console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size));
 
       return domainSets;
-    });
-  /* {
+    },
+    {
       ttl,
       temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
       serializer: serializeSet,
       deserializer: deserializeSet
     }
-  ) */
+  ));
 }
 
 // eslint-disable-next-line sukka-ts/no-const-enum -- bun bundler is smart, maybe?
@@ -95,15 +96,15 @@ const enum ParseType {
 export async function processFilterRules(
   filterRulesUrl: string,
   fallbackUrls?: readonly string[] | undefined | null,
-  _ttl: number | null = null
+  ttl: number | null = null
 ): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
-  const [white, black, warningMessages] = await traceAsync(`- processFilterRules: ${filterRulesUrl}`, /* () => fsCache.apply<[
+  const [white, black, warningMessages] = await traceAsync(`- processFilterRules: ${filterRulesUrl}`, () => fsCache.apply<[
     white: string[],
     black: string[],
     warningMessages: string[]
   ]>(
     filterRulesUrl,
-     */async () => {
+    async () => {
       const whitelistDomainSets = new Set<string>();
       const blacklistDomainSets = new Set<string>();
 
@@ -168,7 +169,7 @@ export async function processFilterRules(
       // TODO-SUKKA: add cache here
       if (!fallbackUrls || fallbackUrls.length === 0) {
         for await (const line of await fetchRemoteTextByLine(filterRulesUrl)) {
-        // don't trim here
+          // don't trim here
           lineCb(line);
         }
       } else {
@@ -191,14 +192,14 @@ export async function processFilterRules(
         Array.from(blacklistDomainSets),
         warningMessages
       ];
-    });
-    /* {
+    },
+    {
       ttl,
       temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
       serializer: JSON.stringify,
       deserializer: JSON.parse
     }
-  ) */
+  ));
 
   warningMessages.forEach(msg => {
     console.warn(

+ 16 - 0
Build/lib/random-int.bench.ts

@@ -0,0 +1,16 @@
+import { bench, group, run } from 'mitata';
+import { randomInt as nativeRandomInt } from 'crypto';
+
+const randomInt = (min: number, max: number) => Math.floor(Math.random() * (max - min + 1)) + min;
+
+group('random-int', () => {
+  bench('crypto.randomInt', () => {
+    nativeRandomInt(3, 7);
+  });
+
+  bench('Math.random', () => {
+    randomInt(3, 7);
+  });
+});
+
+run();

+ 28 - 18
Build/lib/reject-data-source.ts

@@ -1,14 +1,20 @@
+import { TTL } from './cache-filesystem';
+
 export const HOSTS = [
   ['https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext', true],
   ['https://someonewhocares.org/hosts/hosts', true],
-  ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', false],
-  ['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', true],
+  // The nocoin list is no longer actively maintained, but it updated daily while it was maintained, so we set a 3 days cache ttl
+  ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', false, false, TTL.THREE_DAYS()],
+  // has not been updated for more than a year, so we set a 14 days cache ttl
+  ['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', true, false, TTL.TWO_WEEKS()],
   ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Xiaomi-Extension.txt', false],
-  ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', false],
+  ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-Extension.txt', false],
+  // ad-wars is not actively maintained, so we set a 7 days cache ttl
+  ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', false, false, TTL.ONE_WEEK()],
   ['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true],
   // CoinBlockerList
-  // Although the hosts file is still actively maintained, the hosts_browser file is not updated since 2021-07, so we set a 10 days cache ttl
-  ['https://zerodot1.gitlab.io/CoinBlockerLists/hosts_browser', true, true, 10 * 24 * 60 * 60 * 1000],
+  // Although the hosts file is still actively maintained, the hosts_browser file is not updated since 2021-07, so we set a 14 days cache ttl
+  ['https://zerodot1.gitlab.io/CoinBlockerLists/hosts_browser', true, true, TTL.TWO_WEEKS()],
   // Curben's UrlHaus Malicious URL Blocklist
   // 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-agh-online.txt',
   // 'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt',
@@ -21,23 +27,24 @@ export const HOSTS = [
   // Curben's PUP Domains Blocklist
   // 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
   // 'https://pup-filter.pages.dev/pup-filter-agh.txt'
-  // The PUP filter has paused the update since 2023-05, so we set a 7 days cache ttl
-  ['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true, 7 * 24 * 60 * 60 * 1000]
+  // The PUP filter has paused the update since 2023-05, so we set a 14 days cache ttl
+  ['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true, TTL.TWO_WEEKS()]
 ] as const;
 
 export const DOMAIN_LISTS = [
   // BarbBlock
-  // The barbblock list has never been updated since 2019-05, so we set a 10 days cache ttl
-  ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, 10 * 24 * 60 * 60 * 1000],
+  // The barbblock list has not been updated since 2019-05, so we set a 14 days cache ttl
+  ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, TTL.TWO_WEEKS()],
   // DigitalSide Threat-Intel - OSINT Hub
-  ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true],
+  // Update once per day
+  ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true, 24 * 60 * 60 * 1000],
   // AdGuard CNAME Filter Combined
-  // Update on a 7 days basis, so we add a 36 hours cache ttl
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, 36 * 60 * 60 * 1000],
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, 36 * 60 * 60 * 1000],
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, 36 * 60 * 60 * 1000],
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, 36 * 60 * 60 * 1000],
-  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, 36 * 60 * 60 * 1000]
+  // Update on a 7 days basis, so we add a 3 days cache ttl
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, TTL.THREE_DAYS()],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, TTL.THREE_DAYS()]
 ] as const;
 
 export const ADGUARD_FILTERS = [
@@ -130,14 +137,17 @@ export const ADGUARD_FILTERS = [
   // GameConsoleAdblockList
   'https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt',
   // PiHoleBlocklist
+  // Updated roughly once every 3 months, so let's set a 10 days cache ttl
   [
     'https://perflyst.github.io/PiHoleBlocklist/SmartTV-AGH.txt',
     [
       'https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt'
-    ]
+    ],
+    TTL.TEN_DAYS()
   ],
   // Spam404
-  'https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt',
+  // Not actively maintained, let's use a 10 days cache ttl
+  ['https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt', null, TTL.TEN_DAYS()],
   // Brave First Party & First Party CNAME
   'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt'
 ] as const;

BIN
bun.lockb


+ 1 - 0
package.json

@@ -39,6 +39,7 @@
     "eslint": "^8.56.0",
     "eslint-config-sukka": "4.1.10-beta.2",
     "eslint-formatter-sukka": "4.1.9",
+    "mitata": "^0.1.6",
     "typescript": "^5.3.3"
   },
   "resolutions": {