/**
 * trie.ts
 *
 * Hostbane-Optimized Trie based on Mnemonist Trie
 */
import util from 'node:util';
import { domainToASCII } from 'node:url';

import { deleteBit, getBit, missingBit, setBit } from 'foxts/bitwise';
import { fastStringArrayJoin } from 'foxts/fast-string-array-join';
import { fastStringCompare } from 'foxts/fast-string-compare';
import { noop } from 'foxts/noop';
  /** Flag bit marking a node as the start of a stored hostname (i.e. an entry ends here). */
  const START = 1 << 1;
  /** Flag bit marking a stored entry that also covers all of its subdomains (`.example.org`, `||example.com`). */
  const INCLUDE_ALL_SUBDOMAIN = 1 << 2;

  /**
   * A trie node stored as a tuple. Hostnames are inserted label by label from
   * the right-most label, so children of a node are the labels to its left.
   *
   * All members are named: mixing named and unnamed tuple members is rejected
   * by TypeScript versions before 5.2.
   */
  type TrieNode<Meta = any> = [
    /** bit field combining START and INCLUDE_ALL_SUBDOMAIN */ flag: number,
    /** parent node, `null` for the root */ parent: TrieNode | null,
    /** children keyed by hostname label */ children: Map<string, TrieNode>,
    /** the label this node is stored under in its parent */ token: string,
    /** user supplied metadata */ meta: Meta
  ];
  /**
   * Recursively converts a trie node into a plain object for debugging output.
   *
   * The result carries `[start]` and `[subdomain]` (the two flag bits), an
   * optional `[meta]` entry when the node's meta is neither `null` nor
   * `undefined`, and one key per child label holding that child's own dump.
   *
   * @param node the node to serialize
   * @param unpackMeta optional formatter applied to the meta before it is stored under `[meta]`
   * @returns a JSON-serializable description of `node` and everything below it
   */
  function deepTrieNodeToJSON<Meta = unknown>(
    node: TrieNode,
    unpackMeta: ((meta?: Meta) => string) | undefined
  ) {
    const obj: Record<string, unknown> = {};

    obj['[start]'] = getBit(node[0], START);
    obj['[subdomain]'] = getBit(node[0], INCLUDE_ALL_SUBDOMAIN);

    if (node[4] != null) {
      obj['[meta]'] = unpackMeta ? unpackMeta(node[4]) : node[4];
    }

    node[2].forEach((value, key) => {
      obj[key] = deepTrieNodeToJSON(value, unpackMeta);
    });

    return obj;
  }
  /**
   * Creates an empty trie node for `token`.
   *
   * The new node has no children, `null` meta and an initial flag value of `1`,
   * which sets neither the START nor the INCLUDE_ALL_SUBDOMAIN bit.
   *
   * @param token the hostname label the node is stored under
   * @param parent the node that will own the new node, `null` for a root
   */
  const createNode = <Meta = unknown>(token: string, parent: TrieNode | null = null): TrieNode => [
    1,
    parent,
    new Map<string, TrieNode>(),
    token,
    null
  ] as TrieNode<Meta>;
  /**
   * Splits a hostname on `.` and returns its labels in their original
   * (left-to-right) order, starting at label index `hostnameFromIndex`.
   *
   * @param hostname the hostname to split, e.g. `.example.com`
   * @param hostnameFromIndex index of the first label to keep; pass `1` to skip the empty label produced by a leading dot
   * @returns the non-empty labels from `hostnameFromIndex` onwards
   * @throws {TypeError} when any label in the kept range is empty; the message is the JSON of `{ hostname, hostnameFromIndex }`
   */
  function hostnameToTokens(hostname: string, hostnameFromIndex: number): string[] {
    const labels = hostname.split('.');
    const collected: string[] = [];

    for (let idx = hostnameFromIndex; idx < labels.length; idx += 1) {
      const label = labels[idx];
      if (label.length === 0) {
        throw new TypeError(JSON.stringify({ hostname, hostnameFromIndex }, null, 2));
      }
      collected.push(label);
    }

    return collected;
  }
  /**
   * Visits the labels of `hostname` from the right-most one down to label
   * index `hostnameFromIndex`, handing each non-empty label to `onToken`.
   * Empty labels are skipped silently.
   *
   * @param hostname the hostname to walk
   * @param onToken callback invoked per label; returning `null` aborts the walk, a truthy value stops it early
   * @param hostnameFromIndex index of the left-most label that is still visited
   * @returns `null` if the callback aborted, `true` if the callback stopped the walk early, `false` when every label was visited
   */
  function walkHostnameTokens(
    hostname: string,
    onToken: (token: string) => boolean | null,
    hostnameFromIndex: number
  ): boolean | null {
    const labels = hostname.split('.');

    for (let idx = labels.length - 1; idx >= hostnameFromIndex; idx -= 1) {
      const label = labels[idx];
      if (label.length === 0) {
        continue;
      }

      const verdict = onToken(label);
      if (verdict === null) {
        return null;
      }
      // a truthy verdict means the rest of the hostname should be skipped
      if (verdict) {
        return true;
      }
    }

    return false;
  }
  /** Result of walking to a stored hostname while tracking which branch could be pruned. */
  interface FindSingleChildLeafResult<Meta> {
    /** the node reached by the last label of the walk */
    node: TrieNode<Meta>;
    /** the node whose child `tokenToPrune` can be dropped, or `null` when nothing can be pruned */
    toPrune: TrieNode<Meta> | null;
    /** the child label to delete from `toPrune`, or `null` when nothing can be pruned */
    tokenToPrune: string | null;
    /** the parent of `node` */
    parent: TrieNode<Meta>;
  }
  /**
   * Shared implementation of the hostname trie.
   *
   * Hostnames are stored label by label starting from the right-most label,
   * so `blog.example.com` lives under `com -> example -> blog`. Subclasses
   * provide `add`, which decides how entries are inserted.
   */
  abstract class Triebase<Meta = unknown> {
    protected readonly $root: TrieNode<Meta> = createNode('$root');
    protected $size = 0;

    /** The root node of the trie. */
    get root() {
      return this.$root;
    }

    /**
     * @param from optional initial entries; each one is passed to `add`
     */
    constructor(from?: string[] | Set<string> | null) {
      // Actually build trie
      if (Array.isArray(from)) {
        for (let i = 0, l = from.length; i < l; i++) {
          this.add(from[i]);
        }
      } else if (from) {
        from.forEach((value) => this.add(value));
      }
    }

    /**
     * Inserts a hostname into the trie.
     *
     * @param suffix the hostname to insert
     * @param includeAllSubdomain whether the entry also covers all subdomains
     * @param meta metadata stored on the entry's node
     * @param hostnameFromIndex index of the first label of `suffix` to use
     */
    public abstract add(suffix: string, includeAllSubdomain?: boolean, meta?: Meta, hostnameFromIndex?: number): void;

    /**
     * Descends from the root following `tokens` from the last element to the first.
     *
     * @param tokens hostname labels in left-to-right order
     * @param onLoop invoked after each successful step with the node reached, its parent and the label
     * @returns the final node and its parent, or `null` as soon as a label has no matching child
     */
    protected walkIntoLeafWithTokens(
      tokens: string[],
      onLoop: (node: TrieNode, parent: TrieNode, token: string) => void = noop
    ) {
      let node: TrieNode = this.$root;
      let parent: TrieNode = node;
      let token: string;
      let child: Map<string, TrieNode<Meta>> = node[2];

      // reverse lookup from end to start
      for (let i = tokens.length - 1; i >= 0; i--) {
        token = tokens[i];

        parent = node;
        child = node[2];

        if (child.has(token)) {
          node = child.get(token)!;
        } else {
          return null;
        }

        onLoop(node, parent, token);
      }

      return { node, parent };
    }

    /**
     * Same descent as `walkIntoLeafWithTokens`, but splits `suffix` on the fly
     * instead of taking a token array.
     *
     * @param suffix the hostname to follow
     * @param hostnameFromIndex index of the left-most label of `suffix` that is still followed
     * @param onLoop invoked after each successful step with the node reached, its parent and the label
     * @returns the final node and its parent, or `null` as soon as a label has no matching child
     */
    protected walkIntoLeafWithSuffix(
      suffix: string,
      hostnameFromIndex: number,
      onLoop: (node: TrieNode, parent: TrieNode, token: string) => void = noop
    ) {
      let node: TrieNode = this.$root;
      let parent: TrieNode = node;

      const onToken = (token: string) => {
        parent = node;
        const child: Map<string, TrieNode<Meta>> = node[2];

        if (child.has(token)) {
          node = child.get(token)!;
        } else {
          return null;
        }

        onLoop(node, parent, token);
        return false;
      };

      if (walkHostnameTokens(suffix, onToken, hostnameFromIndex) === null) {
        return null;
      }

      return { node, parent };
    }

    /**
     * Checks whether the labels of `suffix` form a path in the trie. Unlike
     * `has`, the final node does not need to carry the START bit.
     *
     * @param suffix the hostname to look up; a leading `.` is skipped
     * @param includeAllSubdomain when true, the final node must also carry the INCLUDE_ALL_SUBDOMAIN bit
     */
    public contains(suffix: string, includeAllSubdomain = suffix[0] === '.'): boolean {
      const hostnameFromIndex = suffix[0] === '.' ? 1 : 0;

      const res = this.walkIntoLeafWithSuffix(suffix, hostnameFromIndex);
      if (!res) return false;
      if (includeAllSubdomain) return getBit(res.node[0], INCLUDE_ALL_SUBDOMAIN);
      return true;
    }

    /** Reused result holder for `dfs` / `dfsWithSort`; overwritten on every step. */
    private static bfsResults: [node: TrieNode | null, suffix: string[]] = [null, []];

    /**
     * Performs one traversal step: pops a node and its labels, pushes every
     * child (with the child's label prepended) and reports the popped pair.
     */
    private static dfs<Meta>(this: void, nodeStack: Array<TrieNode<Meta>>, suffixStack: string[][]) {
      const node = nodeStack.pop()!;
      const suffix = suffixStack.pop()!;

      node[2].forEach((childNode, k) => {
        // Pushing the child node to the stack for next iteration of DFS
        nodeStack.push(childNode);
        suffixStack.push([k, ...suffix]);
      });

      Triebase.bfsResults[0] = node;
      Triebase.bfsResults[1] = suffix;

      return Triebase.bfsResults;
    }

    /** Same as `dfs`, but children are pushed in the order defined by `Triebase.compare`. */
    private static dfsWithSort<Meta>(this: void, nodeStack: Array<TrieNode<Meta>>, suffixStack: string[][]) {
      const node = nodeStack.pop()!;
      const suffix = suffixStack.pop()!;

      const child = node[2];
      if (child.size) {
        const keys = Array.from(child.keys()).sort(Triebase.compare);

        for (let i = 0, l = keys.length; i < l; i++) {
          const key = keys[i];
          const childNode = child.get(key)!;

          // Pushing the child node to the stack for next iteration of DFS
          nodeStack.push(childNode);
          suffixStack.push([key, ...suffix]);
        }
      }

      Triebase.bfsResults[0] = node;
      Triebase.bfsResults[1] = suffix;

      return Triebase.bfsResults;
    }

    /**
     * Traverses the trie below `initialNode` and reports every node that
     * carries the START bit.
     *
     * @param onMatches receives the entry's labels (left-to-right), whether INCLUDE_ALL_SUBDOMAIN is set, and the meta
     * @param withSort whether children are visited via `dfsWithSort` instead of `dfs`
     * @param initialNode node the traversal starts from
     * @param initialSuffix labels that lead to `initialNode`
     */
    private walk(
      onMatches: (suffix: string[], subdomain: boolean, meta: Meta) => void,
      withSort = false,
      initialNode = this.$root,
      initialSuffix: string[] = []
    ) {
      const dfsImpl = withSort ? Triebase.dfsWithSort : Triebase.dfs;

      const nodeStack: Array<TrieNode<Meta>> = [];
      nodeStack.push(initialNode);

      // Resolving initial string (begin the start of the stack)
      const suffixStack: string[][] = [];
      suffixStack.push(initialSuffix);

      let node: TrieNode<Meta> = initialNode;
      let r;

      do {
        r = dfsImpl(nodeStack, suffixStack);
        node = r[0]!;
        const suffix = r[1];

        // If the node is a sentinel, we push the suffix to the results
        if (getBit(node[0], START)) {
          onMatches(suffix, getBit(node[0], INCLUDE_ALL_SUBDOMAIN), node[4]);
        }
      } while (nodeStack.length);
    }

    /** Orders labels by length first, then by `fastStringCompare`. */
    static compare(this: void, a: string, b: string) {
      if (a === b) return 0;
      return (a.length - b.length) || fastStringCompare(a, b);
    }

    /**
     * Walks to the node addressed by `tokens` while tracking the branch that
     * could be pruned if that node were removed.
     *
     * @param tokens hostname labels in left-to-right order
     * @returns the node, its parent and the prune candidate, or `null` when the path does not exist
     */
    protected getSingleChildLeaf(tokens: string[]): FindSingleChildLeafResult<Meta> | null {
      let toPrune: TrieNode | null = null;
      let tokenToPrune: string | null = null;

      const onLoop = (node: TrieNode, parent: TrieNode, token: string) => {
        // Keeping track of a potential branch to prune
        const child = node[2];

        // a node flagged INCLUDE_ALL_SUBDOMAIN counts as having one extra child
        const childSize = child.size + (getBit(node[0], INCLUDE_ALL_SUBDOMAIN) ? 1 : 0);

        if (toPrune !== null) { // the nearest branch that could potentially be pruned
          if (childSize >= 1) {
            // The branch has some children, the branch needs to be retained.
            // And we need to abort pruning that parent branch, so we set it to null
            toPrune = null;
            tokenToPrune = null;
          }
        } else if (childSize < 1) {
          // The node has no child at all, we can prune it safely
          // It is now the top-most branch that could potentially be pruned
          toPrune = parent;
          tokenToPrune = token;
        }
      };

      const res = this.walkIntoLeafWithTokens(tokens, onLoop);
      if (res === null) return null;

      return { node: res.node, toPrune, tokenToPrune, parent: res.parent };
    }

    /**
     * Retrieves every stored entry that ends with the given suffix.
     *
     * @param inputSuffix the suffix to search for
     * @param subdomainOnly when true, the entry equal to the suffix itself is left out unless it is flagged INCLUDE_ALL_SUBDOMAIN
     * @param hostnameFromIndex index of the first label of `inputSuffix` to use
     * @returns matching entries passed through `domainToASCII`; entries flagged INCLUDE_ALL_SUBDOMAIN are prefixed with `.`
     * @throws {TypeError} when `inputSuffix` contains an empty label in the used range
     */
    public find(
      inputSuffix: string,
      subdomainOnly = inputSuffix[0] === '.',
      hostnameFromIndex = inputSuffix[0] === '.' ? 1 : 0
    ): string[] {
      const inputTokens = hostnameToTokens(inputSuffix, hostnameFromIndex);
      const res = this.walkIntoLeafWithTokens(inputTokens);
      if (res === null) return [];

      const results: string[] = [];

      const onMatches = subdomainOnly
        ? (suffix: string[], subdomain: boolean) => { // fast path (default option)
          const d = domainToASCII(fastStringArrayJoin(suffix, '.'));
          // skip the entry that equals the input (compared after the first character of the input)
          if (!subdomain && subStringEqual(inputSuffix, d, 1)) return;

          results.push(subdomain ? '.' + d : d);
        }
        : (suffix: string[], subdomain: boolean) => { // include the entry equal to the input as well
          const d = domainToASCII(fastStringArrayJoin(suffix, '.'));
          results.push(subdomain ? '.' + d : d);
        };

      this.walk(
        onMatches,
        false,
        res.node, // Performing DFS from the node matching the suffix
        inputTokens
      );

      return results;
    }

    /**
     * Deletes a stored hostname from the trie.
     *
     * @param suffix the hostname to delete
     * @returns `true` when an entry was removed, `false` when the hostname was not stored
     * @throws {TypeError} when `suffix` contains an empty label (this includes a leading `.`)
     */
    public remove(suffix: string): boolean {
      const res = this.getSingleChildLeaf(hostnameToTokens(suffix, 0));
      if (res === null) return false;

      if (missingBit(res.node[0], START)) return false;

      this.$size--;
      const { node, toPrune, tokenToPrune } = res;

      if (tokenToPrune && toPrune) {
        toPrune[2].delete(tokenToPrune);
      } else {
        node[0] = deleteBit(node[0], START);
      }

      return true;
    }

    /** Alias of `remove`. */
    // eslint-disable-next-line @typescript-eslint/unbound-method -- safe
    public delete = this.remove;

    /**
     * Checks whether the given hostname is stored in the trie.
     *
     * @param suffix the hostname to look up; a leading `.` is skipped
     * @param includeAllSubdomain when true, the entry must also carry the INCLUDE_ALL_SUBDOMAIN bit
     */
    public has(suffix: string, includeAllSubdomain = suffix[0] === '.'): boolean {
      const hostnameFromIndex = suffix[0] === '.' ? 1 : 0;

      const res = this.walkIntoLeafWithSuffix(suffix, hostnameFromIndex);

      if (res === null) return false;
      if (missingBit(res.node[0], START)) return false;
      if (includeAllSubdomain) return getBit(res.node[0], INCLUDE_ALL_SUBDOMAIN);
      return true;
    }

    /**
     * Reports every stored entry without the leading `.` marker; the
     * INCLUDE_ALL_SUBDOMAIN state is passed as a separate argument.
     *
     * @param onSuffix receives the entry (passed through `domainToASCII`) and its subdomain flag
     * @param withSort whether children are visited in `Triebase.compare` order
     */
    public dumpWithoutDot(onSuffix: (suffix: string, subdomain: boolean) => void, withSort = false) {
      const handleSuffix = (suffix: string[], subdomain: boolean) => {
        onSuffix(domainToASCII(fastStringArrayJoin(suffix, '.')), subdomain);
      };

      this.walk(handleSuffix, withSort);
    }

    /**
     * Reports every stored entry; entries flagged INCLUDE_ALL_SUBDOMAIN are
     * prefixed with `.`. With a callback the entries are handed to it,
     * otherwise they are collected into the returned array.
     */
    public dump(onSuffix: (suffix: string) => void, withSort?: boolean): void;
    public dump(onSuffix?: null, withSort?: boolean): string[];
    public dump(onSuffix?: ((suffix: string) => void) | null, withSort = false): string[] | void {
      const results: string[] = [];

      const handleSuffix = onSuffix
        ? (suffix: string[], subdomain: boolean) => {
          const d = domainToASCII(fastStringArrayJoin(suffix, '.'));
          onSuffix(subdomain ? '.' + d : d);
        }
        : (suffix: string[], subdomain: boolean) => {
          const d = domainToASCII(fastStringArrayJoin(suffix, '.'));
          results.push(subdomain ? '.' + d : d);
        };

      this.walk(handleSuffix, withSort);

      return results;
    }

    /**
     * Reports the meta of every stored entry. With a callback the metas are
     * handed to it, otherwise they are collected into the returned array.
     */
    public dumpMeta(onMeta: (meta: Meta) => void, withSort?: boolean): void;
    public dumpMeta(onMeta?: null, withSort?: boolean): Meta[];
    public dumpMeta(onMeta?: ((meta: Meta) => void) | null, withSort = false): Meta[] | void {
      const results: Meta[] = [];

      const handleMeta = onMeta
        ? (_suffix: string[], _subdomain: boolean, meta: Meta) => onMeta(meta)
        : (_suffix: string[], _subdomain: boolean, meta: Meta) => results.push(meta);

      this.walk(handleMeta, withSort);

      return results;
    }

    /**
     * Reports every stored entry together with its meta. With a callback the
     * pairs are handed to it, otherwise they are collected into the returned array.
     */
    public dumpWithMeta(onSuffix: (suffix: string, meta: Meta | undefined) => void, withSort?: boolean): void;
    public dumpWithMeta(onMeta?: null, withSort?: boolean): Array<[string, Meta | undefined]>;
    public dumpWithMeta(onSuffix?: ((suffix: string, meta: Meta | undefined) => void) | null, withSort = false): Array<[string, Meta | undefined]> | void {
      const results: Array<[string, Meta | undefined]> = [];

      const handleSuffix = onSuffix
        ? (suffix: string[], subdomain: boolean, meta: Meta | undefined) => {
          const d = domainToASCII(fastStringArrayJoin(suffix, '.'));
          return onSuffix(subdomain ? '.' + d : d, meta);
        }
        : (suffix: string[], subdomain: boolean, meta: Meta | undefined) => {
          const d = domainToASCII(fastStringArrayJoin(suffix, '.'));
          results.push([subdomain ? '.' + d : d, meta]);
        };

      this.walk(handleSuffix, withSort);

      return results;
    }

    /**
     * Renders the whole trie as indented JSON for debugging.
     *
     * @param depth number of spaces each output line is indented by
     * @param unpackMeta optional formatter for node metas
     */
    public inspect(depth: number, unpackMeta?: (meta?: Meta) => any) {
      // `util.inspect` may hand a custom inspector a negative, `null` or
      // infinite depth; `String.prototype.repeat` throws a RangeError for
      // negative or infinite counts, so fall back to no indentation.
      const indent = ' '.repeat(Number.isFinite(depth) && depth > 0 ? depth : 0);

      return fastStringArrayJoin(
        JSON.stringify(deepTrieNodeToJSON(this.$root, unpackMeta), null, 2).split('\n').map((line) => indent + line),
        '\n'
      );
    }

    /** Hook used by `util.inspect` / `console.log`. */
    public [util.inspect.custom](depth: number) {
      return this.inspect(depth);
    }

    /**
     * Adds every entry of `trie` to this trie via `add`.
     *
     * @returns this trie, for chaining
     */
    public merge(trie: Triebase<Meta>) {
      const handleSuffix = (suffix: string[], subdomain: boolean, meta: Meta) => {
        this.add(fastStringArrayJoin(suffix, '.'), subdomain, meta);
      };

      trie.walk(handleSuffix);

      return this;
    }
  }
  /**
   * A trie that keeps the stored set minimal: once `.example.com` is stored,
   * more specific entries below it are neither kept nor accepted.
   */
  export class HostnameSmolTrie<Meta = unknown> extends Triebase<Meta> {
    public smolTree = true;

    /**
     * Inserts a hostname, skipping it when an existing entry flagged
     * INCLUDE_ALL_SUBDOMAIN already covers it.
     *
     * @param suffix the hostname to insert
     * @param includeAllSubdomain whether the entry also covers all subdomains; defaults to whether `suffix` starts with `.`
     * @param meta metadata stored on the entry's node
     * @param hostnameFromIndex index of the first label of `suffix` to use; defaults to skipping a leading `.`
     */
    add(suffix: string, includeAllSubdomain = suffix[0] === '.', meta?: Meta, hostnameFromIndex = suffix[0] === '.' ? 1 : 0): void {
      let node: TrieNode<Meta> = this.$root;

      const onToken = (token: string) => {
        const children: Map<string, TrieNode<Meta>> = node[2];

        if (children.has(token)) {
          node = children.get(token)!;

          // During the adding of `[start]blog|.skk.moe` and find out that there is a `[start].skk.moe` in the trie, skip adding the rest of the node
          if (getBit(node[0], INCLUDE_ALL_SUBDOMAIN)) {
            return true;
          }
        } else {
          const newNode = createNode(token, node);
          children.set(token, newNode);
          node = newNode;
        }

        return false;
      };

      // When walkHostnameTokens returns true, we should skip the rest
      if (walkHostnameTokens(suffix, onToken, hostnameFromIndex)) {
        return;
      }

      if (includeAllSubdomain) {
        // Trying to add `[.]sub.example.com` where there is already a `blog.sub.example.com` in the trie:
        // the new entry covers everything below it, so drop all existing child nodes.
        node[2].clear();
        // The START bit of this node is set below, so there is nothing more to do here.
      } else if (getBit(node[0], INCLUDE_ALL_SUBDOMAIN)) {
        // Trying to add `example.com` when there is already a `.example.com` in the trie
        // No need to set the START bit (skip this "new" item)
        return;
      }

      node[0] = setBit(node[0], START);

      if (includeAllSubdomain) {
        node[0] = setBit(node[0], INCLUDE_ALL_SUBDOMAIN);
      } else {
        node[0] = deleteBit(node[0], INCLUDE_ALL_SUBDOMAIN);
      }

      node[4] = meta!;
    }

    /**
     * Removes a hostname (and, with `includeAllSubdomain`, everything stored
     * below it) from the trie, then cleans up nodes left empty by the removal.
     * Does nothing when the path for `suffix` does not exist.
     *
     * @param suffix the hostname to whitelist
     * @param includeAllSubdomain whether entries below `suffix` are removed as well; defaults to whether `suffix` starts with `.`
     * @param hostnameFromIndex index of the first label of `suffix` to use; defaults to skipping a leading `.`
     * @throws {TypeError} when `suffix` contains an empty label in the used range
     */
    public whitelist(suffix: string, includeAllSubdomain = suffix[0] === '.', hostnameFromIndex = suffix[0] === '.' ? 1 : 0) {
      const tokens = hostnameToTokens(suffix, hostnameFromIndex);
      const res = this.getSingleChildLeaf(tokens);

      if (res === null) return;

      const { node, toPrune, tokenToPrune } = res;

      // In both modes the node stops covering its subdomains:
      // - whitelisting `.sub.example.com` removes a stored `.sub.example.com`
      // - whitelisting `example.com` when there is a `.example.com` in the trie
      node[0] = deleteBit(node[0], INCLUDE_ALL_SUBDOMAIN);

      if (includeAllSubdomain) {
        // Trying to whitelist `[start].sub.example.com` where there might already be a `[start]blog.sub.example.com` in the trie:
        // remove all the child nodes by emptying the children
        node[2].clear();
        // and detach the node itself from its parent
        node[1]?.[2].delete(node[3]);
      } else if (missingBit(node[0], START) && node[1]) {
        // the exact hostname is not stored, nothing else to remove
        return;
      }

      if (toPrune && tokenToPrune) {
        toPrune[2].delete(tokenToPrune);
      } else {
        node[0] = deleteBit(node[0], START);
      }

      cleanUpEmptyTrailNode(node);
    }
  }
  /**
   * Detaches `node` from its parent when it no longer serves any purpose,
   * then repeats the check on the parent, walking up towards the root.
   *
   * A node is detached when it carries neither the START nor the
   * INCLUDE_ALL_SUBDOMAIN bit, has no children, and has a parent.
   *
   * @param node the node to start cleaning up from
   */
  function cleanUpEmptyTrailNode(node: TrieNode<unknown>) {
    let current = node;

    while (
      // the current node is not an "end node", a.k.a. not the start of a domain
      missingBit(current[0], START)
      // also no leading "." (no subdomain)
      && missingBit(current[0], INCLUDE_ALL_SUBDOMAIN)
      // child is empty
      && current[2].size === 0
      // has parent: we need to delete the current node from the parent
      && current[1]
    ) {
      const parent = current[1];
      parent[2].delete(current[3]);
      // continue cleaning up with the parent node
      current = parent;
    }
  }
  /**
   * A trie that stores every added hostname as-is and keeps count of the
   * stored entries.
   */
  export class HostnameTrie<Meta = unknown> extends Triebase<Meta> {
    /** Number of entries currently counted as stored. */
    get size() {
      return this.$size;
    }

    /**
     * Inserts a hostname. When the exact hostname is already stored the call
     * is a no-op: flags and meta of the existing entry are left untouched.
     *
     * @param suffix the hostname to insert
     * @param includeAllSubdomain whether the entry also covers all subdomains; defaults to whether `suffix` starts with `.`
     * @param meta metadata stored on the entry's node
     * @param hostnameFromIndex index of the first label of `suffix` to use; defaults to skipping a leading `.`
     */
    add(suffix: string, includeAllSubdomain = suffix[0] === '.', meta?: Meta, hostnameFromIndex = suffix[0] === '.' ? 1 : 0): void {
      let node: TrieNode<Meta> = this.$root;

      const onToken = (token: string) => {
        const children: Map<string, TrieNode<Meta>> = node[2];

        if (children.has(token)) {
          node = children.get(token)!;
        } else {
          const newNode = createNode(token, node);
          children.set(token, newNode);
          node = newNode;
        }

        return false;
      };

      // When walkHostnameTokens returns true, we should skip the rest
      if (walkHostnameTokens(suffix, onToken, hostnameFromIndex)) {
        return;
      }

      // if same entry has been added before, skip
      if (getBit(node[0], START)) {
        return;
      }

      this.$size++;

      node[0] = setBit(node[0], START);
      if (includeAllSubdomain) {
        node[0] = setBit(node[0], INCLUDE_ALL_SUBDOMAIN);
      } else {
        node[0] = deleteBit(node[0], INCLUDE_ALL_SUBDOMAIN);
      }

      node[4] = meta!;
    }
  }
// function deepEqualArray(a: string[], b: string[]) {
//   let len = a.length;
//   if (len !== b.length) return false;
//   while (len--) {
//     if (a[len] !== b[len]) return false;
//   }
//   return true;
// };
  /**
   * Checks whether `needle`, read from `needleIndex` onwards, starts with
   * `haystack`.
   *
   * Only `haystack.length` characters are compared, so a `needle` that keeps
   * going after the matched part still yields `true`, and an empty `haystack`
   * always yields `true`.
   *
   * @param needle the string to read from
   * @param haystack the string every compared character must match
   * @param needleIndex offset into `needle` where the comparison starts
   */
  function subStringEqual(needle: string, haystack: string, needleIndex = 0) {
    const span = haystack.length;

    for (let offset = 0; offset < span; offset += 1) {
      if (needle[offset + needleIndex] !== haystack[offset]) {
        return false;
      }
    }

    return true;
  }