trie.ts 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414
  1. /**
  2. * Suffix Trie based on Mnemonist Trie
  3. */
  4. import { fastStringArrayJoin } from './misc';
  5. // const { Error, Bun, JSON, Symbol } = globalThis;
  6. const noop = () => { /** noop */ };
  7. type TrieNode = [
  8. boolean, /** sentinel */
  9. TrieNode | null, /** parent */
  10. Map<string, TrieNode> /** children */
  11. ];
  12. const deepTrieNodeToJSON = (node: TrieNode) => {
  13. const obj: Record<string, any> = {};
  14. if (node[0]) {
  15. obj['[start]'] = node[0];
  16. }
  17. node[2].forEach((value, key) => {
  18. obj[key] = deepTrieNodeToJSON(value);
  19. });
  20. return obj;
  21. };
  22. const createNode = (parent: TrieNode | null = null): TrieNode => {
  23. return [false, parent, new Map<string, TrieNode>()] as TrieNode;
  24. };
  25. const hostnameToTokens = (hostname: string): string[] => {
  26. return hostname.split('.').reduce<string[]>((acc, token, index) => {
  27. if (index !== 0) {
  28. acc.push('.');
  29. }
  30. acc.push(token);
  31. return acc;
  32. }, []);
  33. };
  34. export const createTrie = (from?: string[] | Set<string> | null, hostnameMode = false, smolTree = false) => {
  35. let size = 0;
  36. const root: TrieNode = createNode();
  37. const isHostnameMode = (_token: string | string[]): _token is string[] => hostnameMode;
  38. const suffixToTokens = hostnameMode
  39. ? hostnameToTokens
  40. : (suffix: string) => suffix;
  41. /**
  42. * Method used to add the given suffix to the trie.
  43. */
  44. const add = smolTree
  45. ? (suffix: string): void => {
  46. let node: TrieNode = root;
  47. let token: string;
  48. const tokens = suffixToTokens(suffix);
  49. for (let i = tokens.length - 1; i >= 0; i--) {
  50. token = tokens[i];
  51. if (node[2].has(token)) {
  52. node = node[2].get(token)!;
  53. // During the adding of `[start]blog|.skk.moe` and find out that there is a `[start].skk.moe` in the trie
  54. // Dedupe the covered subdomain by skipping
  55. if (token === '.' && node[0]) {
  56. return;
  57. }
  58. } else {
  59. const newNode = createNode(node);
  60. node[2].set(token, newNode);
  61. node = newNode;
  62. }
  63. }
  64. // If we are in smolTree mode, we need to do something at the end of the loop
  65. if (tokens[0] === '.') {
  66. // Trying to add `[start].sub.example.com` where there is already a `[start]blog.sub.example.com` in the trie
  67. const parent = node[1]!;
  68. // Make sure parent `[start]sub.example.com` (without dot) is removed (SETINEL to false)
  69. parent[0] = false;
  70. // Removing the rest of the parent's child nodes
  71. node[2].clear();
  72. // The SENTINEL of this node will be set to true at the end of the function, so we don't need to set it here
  73. // we can use else-if here, because the children is now empty, we don't need to check the leading "."
  74. } else if (node[2].get('.')?.[0] === true) {
  75. // Trying to add `example.com` when there is already a `.example.com` in the trie
  76. // No need to increment size and set SENTINEL to true (skip this "new" item)
  77. return;
  78. }
  79. node[0] = true;
  80. }
  81. : (suffix: string): void => {
  82. let node: TrieNode = root;
  83. let token: string;
  84. const tokens = suffixToTokens(suffix);
  85. for (let i = tokens.length - 1; i >= 0; i--) {
  86. token = tokens[i];
  87. if (node[2].has(token)) {
  88. node = node[2].get(token)!;
  89. } else {
  90. const newNode = createNode(node);
  91. node[2].set(token, newNode);
  92. node = newNode;
  93. }
  94. }
  95. if (!node[0]) { // smol tree don't have size, so else-if here
  96. size++;
  97. node[0] = true;
  98. }
  99. };
  100. const walkIntoLeafWithTokens = (
  101. tokens: string | string[],
  102. onLoop: (node: TrieNode, parent: TrieNode, token: string) => void = noop
  103. ) => {
  104. let node: TrieNode = root;
  105. let parent: TrieNode = node;
  106. let token: string;
  107. for (let i = tokens.length - 1; i >= 0; i--) {
  108. token = tokens[i];
  109. if (hostnameMode && token === '') {
  110. break;
  111. }
  112. parent = node;
  113. if (node[2].has(token)) {
  114. node = node[2].get(token)!;
  115. } else {
  116. return null;
  117. }
  118. onLoop(node, parent, token);
  119. }
  120. return { node, parent };
  121. };
  122. const contains = (suffix: string): boolean => {
  123. const tokens = suffixToTokens(suffix);
  124. return walkIntoLeafWithTokens(tokens) !== null;
  125. };
  126. const walk = (
  127. onMatches: (suffix: string | string[]) => void,
  128. initialNode = root,
  129. initialSuffix: string | string[] = hostnameMode ? [] : ''
  130. ) => {
  131. const nodeStack: TrieNode[] = [initialNode];
  132. // Resolving initial string (begin the start of the stack)
  133. const suffixStack: Array<string | string[]> = [initialSuffix];
  134. let node: TrieNode = root;
  135. do {
  136. node = nodeStack.pop()!;
  137. const suffix = suffixStack.pop()!;
  138. node[2].forEach((childNode, k) => {
  139. // Pushing the child node to the stack for next iteration of DFS
  140. nodeStack.push(childNode);
  141. suffixStack.push(isHostnameMode(suffix) ? [k, ...suffix] : k + suffix);
  142. });
  143. // If the node is a sentinel, we push the suffix to the results
  144. if (node[0]) {
  145. onMatches(suffix);
  146. }
  147. } while (nodeStack.length);
  148. };
  149. interface FindSingleChildLeafResult {
  150. node: TrieNode,
  151. toPrune: TrieNode | null,
  152. tokenToPrune: string | null,
  153. parent: TrieNode
  154. }
  155. const getSingleChildLeaf = (tokens: string | string[]): FindSingleChildLeafResult | null => {
  156. let toPrune: TrieNode | null = null;
  157. let tokenToPrune: string | null = null;
  158. const onLoop = (node: TrieNode, parent: TrieNode, token: string) => {
  159. // Keeping track of a potential branch to prune
  160. // Even if the node size is 1, but the single child is ".", we should retain the branch
  161. // Since the "." could be special if it is the leaf-est node
  162. const onlyChild = node[2].size < 2 && (!hostnameMode || !node[2].has('.'));
  163. if (toPrune != null) { // the top-est branch that could potentially being pruned
  164. if (!onlyChild) {
  165. // The branch has moew than single child, retain the branch.
  166. // And we need to abort prune the parent, so we set it to null
  167. toPrune = null;
  168. tokenToPrune = null;
  169. }
  170. } else if (onlyChild) {
  171. // There is only one token child, or no child at all, we can prune it safely
  172. // It is now the top-est branch that could potentially being pruned
  173. toPrune = parent;
  174. tokenToPrune = token;
  175. }
  176. };
  177. const res = walkIntoLeafWithTokens(tokens, onLoop);
  178. if (res === null) return null;
  179. return { node: res.node, toPrune, tokenToPrune, parent: res.parent };
  180. };
  181. /**
  182. * Method used to retrieve every item in the trie with the given prefix.
  183. */
  184. const find = (inputSuffix: string, /** @default true */ includeEqualWithSuffix = true): string[] => {
  185. if (smolTree) {
  186. throw new Error('A Trie with smolTree enabled cannot perform find!');
  187. }
  188. const inputTokens = suffixToTokens(inputSuffix);
  189. const res = walkIntoLeafWithTokens(inputTokens);
  190. if (res === null) return [];
  191. const matches: Array<string | string[]> = [];
  192. const onMatches = includeEqualWithSuffix
  193. ? (suffix: string | string[]) => matches.push(suffix)
  194. : (
  195. hostnameMode
  196. ? (suffix: string[]) => {
  197. if (suffix.some((t, i) => t !== inputTokens[i])) {
  198. matches.push(suffix);
  199. }
  200. }
  201. : (suffix: string) => {
  202. if (suffix !== inputTokens) {
  203. matches.push(suffix);
  204. }
  205. }
  206. );
  207. walk(
  208. onMatches as any,
  209. res.node, // Performing DFS from prefix
  210. inputTokens
  211. );
  212. return hostnameMode ? matches.map((m) => fastStringArrayJoin(m as string[], '')) : matches as string[];
  213. };
  214. /**
  215. * Works like trie.find, but instead of returning the matches as an array, it removes them from the given set in-place.
  216. */
  217. const substractSetInPlaceFromFound = (inputSuffix: string, set: Set<string>) => {
  218. if (smolTree) {
  219. throw new Error('A Trie with smolTree enabled cannot perform substractSetInPlaceFromFound!');
  220. }
  221. const inputTokens = suffixToTokens(inputSuffix);
  222. const res = walkIntoLeafWithTokens(inputTokens);
  223. if (res === null) return;
  224. const onMatches = hostnameMode
  225. ? (suffix: string[]) => set.delete(fastStringArrayJoin(suffix, ''))
  226. : (suffix: string) => set.delete(suffix);
  227. walk(
  228. onMatches as any,
  229. res.node, // Performing DFS from prefix
  230. inputTokens
  231. );
  232. };
  233. /**
  234. * Method used to delete a prefix from the trie.
  235. */
  236. const remove = (suffix: string): boolean => {
  237. const res = getSingleChildLeaf(suffixToTokens(suffix));
  238. if (res === null) return false;
  239. if (!res.node[0]) return false;
  240. size--;
  241. const { node, toPrune, tokenToPrune } = res;
  242. if (tokenToPrune && toPrune) {
  243. toPrune[2].delete(tokenToPrune);
  244. } else {
  245. node[0] = false;
  246. }
  247. return true;
  248. };
  249. /**
  250. * Method used to assert whether the given prefix exists in the Trie.
  251. */
  252. const has = (suffix: string): boolean => {
  253. const tokens = suffixToTokens(suffix);
  254. const res = walkIntoLeafWithTokens(tokens);
  255. return res
  256. ? res.node[0]
  257. : false;
  258. };
  259. const dump = () => {
  260. const results: string[] = [];
  261. walk(suffix => {
  262. results.push(
  263. isHostnameMode(suffix) ? fastStringArrayJoin(suffix, '') : suffix
  264. );
  265. });
  266. return results;
  267. };
  268. const whitelist = (suffix: string) => {
  269. if (!hostnameMode && !smolTree) {
  270. throw new Error('whitelist method is only available in hostname mode or smolTree mode.');
  271. }
  272. const tokens = suffixToTokens(suffix);
  273. const res = getSingleChildLeaf(tokens);
  274. if (res === null) return;
  275. const { node, toPrune, tokenToPrune, parent } = res;
  276. // Trying to whitelist `[start].sub.example.com` where there is already a `[start]blog.sub.example.com` in the trie
  277. if (tokens[0] === '.') {
  278. // If there is a `[start]sub.example.com` here, remove it
  279. parent[0] = false;
  280. // Removing all the child nodes by disconnecting "."
  281. parent[2].delete('.');
  282. }
  283. // Trying to whitelist `example.com` when there is already a `.example.com` in the trie
  284. const dotNode = node[2].get('.');
  285. if (dotNode) {
  286. dotNode[0] = false;
  287. }
  288. // if (dotNode?.s === true) {
  289. // dotnode[0] = false;
  290. // }
  291. // return early if not found
  292. if (!node[0]) return;
  293. if (tokenToPrune && toPrune) {
  294. toPrune[2].delete(tokenToPrune);
  295. } else {
  296. node[0] = false;
  297. }
  298. };
  299. // Actually build trie
  300. if (Array.isArray(from)) {
  301. for (let i = 0, l = from.length; i < l; i++) {
  302. add(from[i]);
  303. }
  304. } else if (from) {
  305. from.forEach(add);
  306. }
  307. return {
  308. add,
  309. contains,
  310. find,
  311. substractSetInPlaceFromFound,
  312. remove,
  313. delete: remove,
  314. has,
  315. dump,
  316. get size() {
  317. if (smolTree) {
  318. throw new Error('A Trie with smolTree enabled cannot have correct size!');
  319. }
  320. return size;
  321. },
  322. get root() {
  323. return root;
  324. },
  325. whitelist,
  326. [Bun.inspect.custom]: (depth: number) => JSON.stringify(deepTrieNodeToJSON(root), null, 2).split('\n').map((line) => ' '.repeat(depth) + line).join('\n'),
  327. hostnameMode,
  328. smolTree
  329. };
  330. };
  331. export type Trie = ReturnType<typeof createTrie>;
  332. export default createTrie;