// trie.ts
/**
 * Suffix Trie based on Mnemonist Trie
 */
// import { Trie } from 'mnemonist';
  5. export const SENTINEL = Symbol('SENTINEL');
  6. const PARENT = Symbol('Parent Node');
  7. type TrieNode = {
  8. [SENTINEL]: boolean,
  9. [PARENT]: TrieNode | null,
  10. [Bun.inspect.custom]: () => string
  11. } & Map<string, TrieNode>;
  12. const deepTrieNodeToJSON = (node: TrieNode) => {
  13. const obj: Record<string, any> = {};
  14. if (node[SENTINEL]) {
  15. obj['[start]'] = node[SENTINEL];
  16. }
  17. node.forEach((value, key) => {
  18. obj[key] = deepTrieNodeToJSON(value);
  19. });
  20. return obj;
  21. };
  22. function trieNodeInspectCustom(this: TrieNode) {
  23. return JSON.stringify(deepTrieNodeToJSON(this), null, 2);
  24. }
  25. const createNode = (parent: TrieNode | null = null): TrieNode => {
  26. const node = new Map<string, TrieNode>() as TrieNode;
  27. node[SENTINEL] = false;
  28. node[PARENT] = parent;
  29. node[Bun.inspect.custom] = trieNodeInspectCustom;
  30. return node;
  31. };
  32. const hostnameToTokens = (hostname: string): string[] => {
  33. let buf = '';
  34. const tokens: string[] = [];
  35. for (let i = 0, l = hostname.length; i < l; i++) {
  36. const c = hostname[i];
  37. if (c === '.') {
  38. if (buf) {
  39. tokens.push(buf, /* . */ c);
  40. buf = '';
  41. } else {
  42. tokens.push(/* . */ c);
  43. }
  44. } else {
  45. buf += c;
  46. }
  47. }
  48. if (buf) {
  49. tokens.push(buf);
  50. }
  51. return tokens;
  52. };
  53. export const createTrie = (from?: string[] | Set<string> | null, hostnameMode = false, smolTree = false) => {
  54. let size = 0;
  55. const root: TrieNode = createNode();
  56. const isHostnameMode = (_token: string | string[]): _token is string[] => hostnameMode;
  57. const suffixToTokens = hostnameMode
  58. ? hostnameToTokens
  59. : (suffix: string) => suffix;
  60. /**
  61. * Method used to add the given prefix to the trie.
  62. */
  63. const add = (suffix: string): void => {
  64. let node: TrieNode = root;
  65. let token: string;
  66. const tokens = suffixToTokens(suffix);
  67. for (let i = tokens.length - 1; i >= 0; i--) {
  68. token = tokens[i];
  69. if (node.has(token)) {
  70. node = node.get(token)!;
  71. // During the adding of `[start]blog.skk.moe` and find out that there is a `[start].skk.moe` in the trie
  72. // Dedupe the covered subdomain by skipping
  73. if (smolTree && (node.get('.')?.[SENTINEL])) {
  74. return;
  75. }
  76. } else {
  77. const newNode = createNode(node);
  78. node.set(token, newNode);
  79. node = newNode;
  80. }
  81. }
  82. // If we are in smolTree mode, we need to do something at the end of the loop
  83. if (smolTree) {
  84. if (tokens[0] === '.') {
  85. // Trying to add `[start].sub.example.com` where there is already a `[start]blog.sub.example.com` in the trie
  86. const parent = node[PARENT]!;
  87. // Make sure parent `[start]sub.example.com` (without dot) is removed (SETINEL to false)
  88. parent[SENTINEL] = false;
  89. // Removing the rest of the parent's child nodes by disconnecting the old one and creating a new node
  90. const newNode = createNode(node);
  91. // The SENTINEL of this newNode will be set to true at the end of the function, so we don't need to set it here
  92. parent.set('.', newNode);
  93. // Now the real leaf-est node is the new node, change the pointer to it
  94. node = newNode;
  95. }
  96. if (node.get('.')?.[SENTINEL] === true) {
  97. // Trying to add `example.com` when there is already a `.example.com` in the trie
  98. // No need to increment size and set SENTINEL to true (skip this "new" item)
  99. return;
  100. }
  101. }
  102. // Do we need to increase size?
  103. if (!node[SENTINEL]) {
  104. size++;
  105. }
  106. node[SENTINEL] = true;
  107. };
  108. /**
  109. * @param {string} $suffix
  110. */
  111. const contains = (suffix: string): boolean => {
  112. let node: TrieNode | undefined = root;
  113. let token: string;
  114. const tokens = suffixToTokens(suffix);
  115. for (let i = tokens.length - 1; i >= 0; i--) {
  116. token = tokens[i];
  117. node = node.get(token);
  118. if (!node) return false;
  119. }
  120. return true;
  121. };
  122. /**
  123. * Method used to retrieve every item in the trie with the given prefix.
  124. */
  125. const find = (inputSuffix: string, /** @default true */ includeEqualWithSuffix = true): string[] => {
  126. if (smolTree) {
  127. throw new Error('A Trie with smolTree enabled cannot perform find!');
  128. }
  129. let node: TrieNode | undefined = root;
  130. let token: string;
  131. const inputTokens = suffixToTokens(inputSuffix);
  132. for (let i = inputTokens.length - 1; i >= 0; i--) {
  133. token = inputTokens[i];
  134. if (hostnameMode && token === '') {
  135. break;
  136. }
  137. node = node.get(token);
  138. if (!node) return [];
  139. }
  140. const matches: Array<string | string[]> = [];
  141. // Performing DFS from prefix
  142. const nodeStack: TrieNode[] = [node];
  143. const suffixStack: Array<string | string[]> = [inputTokens];
  144. do {
  145. const suffix: string | string[] = suffixStack.pop()!;
  146. node = nodeStack.pop()!;
  147. if (node[SENTINEL]) {
  148. if (includeEqualWithSuffix) {
  149. matches.push(suffix);
  150. } else if (isHostnameMode(suffix)) {
  151. if (suffix.some((t, i) => t !== inputTokens[i])) {
  152. matches.push(suffix);
  153. }
  154. } else if (suffix !== inputTokens) {
  155. matches.push(suffix);
  156. }
  157. }
  158. node.forEach((childNode, k) => {
  159. nodeStack.push(childNode);
  160. if (isHostnameMode(suffix)) {
  161. suffixStack.push([k, ...suffix]);
  162. } else {
  163. suffixStack.push(k + suffix);
  164. }
  165. });
  166. } while (nodeStack.length);
  167. return hostnameMode ? matches.map((m) => (m as string[]).join('')) : matches as string[];
  168. };
  169. /**
  170. * Works like trie.find, but instead of returning the matches as an array, it removes them from the given set in-place.
  171. */
  172. const substractSetInPlaceFromFound = (inputSuffix: string, set: Set<string>) => {
  173. if (smolTree) {
  174. throw new Error('A Trie with smolTree enabled cannot perform substractSetInPlaceFromFound!');
  175. }
  176. let node: TrieNode | undefined = root;
  177. let token: string;
  178. const inputTokens = suffixToTokens(inputSuffix);
  179. // Find the leaf-est node, and early return if not any
  180. for (let i = inputTokens.length - 1; i >= 0; i--) {
  181. token = inputTokens[i];
  182. node = node.get(token);
  183. if (!node) return;
  184. }
  185. // Performing DFS from prefix
  186. const nodeStack: TrieNode[] = [node];
  187. const suffixStack: Array<string | string[]> = [inputTokens];
  188. do {
  189. const suffix = suffixStack.pop()!;
  190. node = nodeStack.pop()!;
  191. if (node[SENTINEL]) {
  192. // found match, delete it from set
  193. if (isHostnameMode(suffix)) {
  194. set.delete(suffix.join(''));
  195. } else if (suffix !== inputTokens) {
  196. set.delete(suffix);
  197. }
  198. }
  199. node.forEach((childNode, k) => {
  200. nodeStack.push(childNode);
  201. if (isHostnameMode(suffix)) {
  202. const stack = [k, ...suffix];
  203. suffixStack.push(stack);
  204. } else {
  205. suffixStack.push(k + suffix);
  206. }
  207. });
  208. } while (nodeStack.length);
  209. };
  210. /**
  211. * Method used to delete a prefix from the trie.
  212. */
  213. const remove = (suffix: string): boolean => {
  214. let node: TrieNode | undefined = root;
  215. let toPrune: TrieNode | null = null;
  216. let tokenToPrune: string | null = null;
  217. let parent: TrieNode = node;
  218. let token: string;
  219. const suffixTokens = suffixToTokens(suffix);
  220. for (let i = suffixTokens.length - 1; i >= 0; i--) {
  221. token = suffixTokens[i];
  222. parent = node;
  223. node = node.get(token);
  224. if (!node) return false;
  225. // Keeping track of a potential branch to prune
  226. // Even if the node size is 1, but the single child is ".", we should retain the branch
  227. // Since the "." could be special if it is the leaf-est node
  228. const onlyChild = node.size < 2 && (!hostnameMode || !node.has('.'));
  229. if (toPrune != null) { // the top-est branch that could potentially being pruned
  230. if (!onlyChild) {
  231. // The branch has moew than single child, retain the branch.
  232. // And we need to abort prune the parent, so we set it to null
  233. toPrune = null;
  234. tokenToPrune = null;
  235. }
  236. } else if (onlyChild) {
  237. // There is only one token child, or no child at all, we can prune it safely
  238. // It is now the top-est branch that could potentially being pruned
  239. toPrune = parent;
  240. tokenToPrune = token;
  241. }
  242. }
  243. if (!node[SENTINEL]) return false;
  244. size--;
  245. if (tokenToPrune && toPrune) {
  246. toPrune.delete(tokenToPrune);
  247. } else {
  248. node[SENTINEL] = false;
  249. }
  250. return true;
  251. };
  252. /**
  253. * Method used to assert whether the given prefix exists in the Trie.
  254. */
  255. const has = (suffix: string): boolean => {
  256. let node: TrieNode = root;
  257. const tokens = suffixToTokens(suffix);
  258. for (let i = tokens.length - 1; i >= 0; i--) {
  259. const token = tokens[i];
  260. if (!node.has(token)) {
  261. return false;
  262. }
  263. node = node.get(token)!;
  264. }
  265. return node[SENTINEL];
  266. };
  267. const dump = () => {
  268. const nodeStack: TrieNode[] = [];
  269. const suffixStack: Array<string | string[]> = [];
  270. nodeStack.push(root);
  271. // Resolving initial string (begin the start of the stack)
  272. suffixStack.push(hostnameMode ? [] : '');
  273. const results: string[] = [];
  274. let node: TrieNode;
  275. do {
  276. node = nodeStack.pop()!;
  277. const suffix = suffixStack.pop()!;
  278. node.forEach((childNode, k) => {
  279. // Pushing the child node to the stack for next iteration of DFS
  280. nodeStack.push(childNode);
  281. suffixStack.push(isHostnameMode(suffix) ? [k, ...suffix] : k + suffix);
  282. });
  283. // If the node is a sentinel, we push the suffix to the results
  284. if (node[SENTINEL]) {
  285. results.push(isHostnameMode(suffix) ? suffix.join('') : suffix);
  286. }
  287. } while (nodeStack.length);
  288. return results;
  289. };
  290. const whitelist = (suffix: string) => {
  291. if (!hostnameMode && !smolTree) {
  292. throw new Error('whitelist method is only available in hostname mode or smolTree mode.');
  293. }
  294. let node: TrieNode | undefined = root;
  295. let toPrune: TrieNode | null = null;
  296. let tokenToPrune: string | null = null;
  297. let parent: TrieNode = node;
  298. const tokens = suffixToTokens(suffix);
  299. let token: string;
  300. for (let i = tokens.length - 1; i >= 0; i--) {
  301. token = tokens[i];
  302. parent = node;
  303. node = node.get(token);
  304. if (!node) return;
  305. // Keeping track of a potential branch to prune
  306. // Even if the node size is 1, but the single child is ".", we should retain the branch
  307. // Since the "." could be special if it is the leaf-est node
  308. const onlyChild = node.size < 2 && !node.has('.');
  309. // const onlyChild = node.size < 2 && (!hostnameMode || !node.has('.'));
  310. if (toPrune !== null) { // the top-est branch that could potentially being pruned
  311. if (!onlyChild) {
  312. // The branch has moew than single child, retain the branch.
  313. // And we need to abort prune the parent, so we set it to null
  314. toPrune = null;
  315. tokenToPrune = null;
  316. }
  317. } else if (onlyChild) {
  318. // There is only one token child, or no child at all, we can prune it safely
  319. // It is now the top-est branch that could potentially being pruned
  320. toPrune = parent;
  321. tokenToPrune = token;
  322. }
  323. }
  324. // Trying to whitelist `[start].sub.example.com` where there is already a `[start]blog.sub.example.com` in the trie
  325. if (tokens[0] === '.') {
  326. // If there is a `[start]sub.example.com` here, remove it
  327. parent[SENTINEL] = false;
  328. // Removing all the child nodes by disconnecting "."
  329. parent.delete('.');
  330. }
  331. // Trying to whitelist `example.com` when there is already a `.example.com` in the trie
  332. const dotNode = node.get('.');
  333. if (dotNode?.[SENTINEL] === true) {
  334. dotNode[SENTINEL] = false;
  335. }
  336. // if (!node[SENTINEL]) return;
  337. if (tokenToPrune && toPrune) {
  338. toPrune.delete(tokenToPrune);
  339. } else {
  340. node[SENTINEL] = false;
  341. }
  342. };
  343. if (Array.isArray(from)) {
  344. for (let i = 0, l = from.length; i < l; i++) {
  345. add(from[i]);
  346. }
  347. } else if (from) {
  348. from.forEach(add);
  349. }
  350. return {
  351. add,
  352. contains,
  353. find,
  354. substractSetInPlaceFromFound,
  355. remove,
  356. delete: remove,
  357. has,
  358. dump,
  359. get size() {
  360. if (smolTree) {
  361. throw new Error('A Trie with smolTree enabled cannot have correct size!');
  362. }
  363. return size;
  364. },
  365. get root() {
  366. return root;
  367. },
  368. whitelist,
  369. [Bun.inspect.custom]: (depth: number) => JSON.stringify(deepTrieNodeToJSON(root), null, 2).split('\n').map((line) => ' '.repeat(depth) + line).join('\n'),
  370. hostnameMode,
  371. smolTree
  372. };
  373. };
  374. export type Trie = ReturnType<typeof createTrie>;
  375. export default createTrie;