trie.ts 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460
  1. /**
  2. * Suffix Trie based on Mnemonist Trie
  3. */
  4. // import { Trie } from 'mnemonist';
  /** Symbol key under which a node stores its "an entry ends here" flag. */
  export const SENTINEL = Symbol('SENTINEL');

  /** Symbol key under which a node stores a reference to the node it was created under. */
  const PARENT = Symbol('Parent Node');

  /**
   * A trie node: a `Map` from token to child node, decorated with three
   * symbol-keyed fields so they can never collide with a token string.
   */
  interface TrieNode extends Map<string, TrieNode> {
    /** `true` when a stored entry terminates at this node. */
    [SENTINEL]: boolean;
    /** Node this one was created under, or `null`. */
    [PARENT]: TrieNode | null;
    /** Pretty-printer hook keyed by `Bun.inspect.custom`. */
    [Bun.inspect.custom]: () => string;
  }
  /**
   * Recursively converts a trie node into a plain object for printing.
   *
   * A node whose SENTINEL flag is set gets a `'[start]'` key holding that flag;
   * every child token then becomes a key whose value is the converted child.
   */
  const deepTrieNodeToJSON = (node: TrieNode) => {
    const plain: Record<string, any> = {};

    const endsHere = node[SENTINEL];
    if (endsHere) {
      plain['[start]'] = endsHere;
    }

    // Children are written after the marker, so a child literally named
    // '[start]' would overwrite it.
    for (const [token, child] of node) {
      plain[token] = deepTrieNodeToJSON(child);
    }

    return plain;
  };
  /**
   * Inspect hook installed on every node: renders the subtree rooted at `this`
   * as 2-space-indented JSON.
   */
  function trieNodeInspectCustom(this: TrieNode) {
    const snapshot = deepTrieNodeToJSON(this);
    return JSON.stringify(snapshot, null, 2);
  }
  /**
   * Creates an empty trie node.
   *
   * @param parent node recorded under the PARENT key; defaults to `null`
   * @returns a fresh `Map` with SENTINEL cleared, PARENT set and the inspect hook attached
   */
  const createNode = (parent: TrieNode | null = null): TrieNode => Object.assign(
    new Map<string, TrieNode>(),
    {
      [SENTINEL]: false,
      [PARENT]: parent,
      [Bun.inspect.custom]: trieNodeInspectCustom
    }
  );
  /**
   * Splits a hostname into label tokens and `'.'` separator tokens, in order.
   *
   * Every dot becomes its own token, and empty labels produce no token, so
   * `'a..b'` yields `['a', '.', '.', 'b']` and `'.a'` yields `['.', 'a']`.
   *
   * @param hostname the string to tokenize
   * @returns the tokens from left to right (empty array for an empty string)
   */
  const hostnameToTokens = (hostname: string): string[] => {
    const tokens: string[] = [];
    const end = hostname.length;
    let labelStart = 0;

    // Walk one position past the end so the trailing label is flushed by the
    // same code path that handles a dot.
    for (let pos = 0; pos <= end; pos++) {
      const atEnd = pos === end;
      if (!atEnd && hostname[pos] !== '.') {
        continue;
      }

      if (pos > labelStart) {
        tokens.push(hostname.slice(labelStart, pos));
      }
      if (!atEnd) {
        tokens.push('.');
      }
      labelStart = pos + 1;
    }

    return tokens;
  };
  /**
   * Creates a suffix trie.
   *
   * Entries are walked from their last token to their first, so entries that
   * share a suffix share a path from the root.
   *
   * @param from optional initial entries; each one is passed to `add`
   * @param hostnameMode when `true`, entries are tokenized with `hostnameToTokens`
   *   (labels and `'.'` separators); otherwise the string itself is indexed
   *   position by position
   * @param smolTree when `true`, `add` skips entries already covered by a
   *   `.domain` entry and drops entries that a newly added `.domain` covers;
   *   `find`, `substractSetInPlaceFromFound` and `size` then throw
   * @returns the trie API object
   */
  export const createTrie = (from?: string[] | Set<string> | null, hostnameMode = false, smolTree = false) => {
    // Number of stored entries as maintained by add/remove.
    // NOTE(review): `whitelist` does not update it — confirm whether that matters to callers.
    let size = 0;
    const root: TrieNode = createNode();

    const suffixToTokens = hostnameMode
      ? hostnameToTokens
      : (suffix: string) => suffix;

    /**
     * Adds the given suffix to the trie.
     */
    const add = (suffix: string): void => {
      let node: TrieNode = root;
      let token: string;
      const tokens = suffixToTokens(suffix);

      for (let i = tokens.length - 1; i >= 0; i--) {
        token = tokens[i];

        const existing = node.get(token);
        if (existing) {
          node = existing;
          // During the adding of `[start]blog.skk.moe` and find out that there is a `[start].skk.moe` in the trie
          // Dedupe the covered subdomain by skipping
          if (smolTree && (node.get('.')?.[SENTINEL])) {
            return;
          }
        } else {
          const newNode = createNode(node);
          node.set(token, newNode);
          node = newNode;
        }

        if (smolTree) {
          // Trying to add `[start].sub.example.com` where there is already a `[start]blog.sub.example.com` in the trie
          if (i === 1 && tokens[0] === '.') {
            // If there is a `[start]sub.example.com` here, remove it
            node[SENTINEL] = false;
            // Removing the rest of the child nodes by creating a new node and disconnecting the old one
            const newNode = createNode(node);
            node.set('.', newNode);
            node = newNode;
            break;
          }
          if (i === 0) {
            // Trying to add `example.com` when there is already a `.example.com` in the trie
            if (node.get('.')?.[SENTINEL] === true) {
              return;
            }
          }
        }
      }

      // Only count the entry if it was not already stored
      if (!node[SENTINEL]) {
        size++;
      }
      node[SENTINEL] = true;
    };

    /**
     * Tells whether the path for the given suffix exists in the trie, i.e.
     * whether the suffix itself or anything ending with it could be stored.
     * Unlike `has`, the end marker is not checked.
     */
    const contains = (suffix: string): boolean => {
      let node: TrieNode | undefined = root;
      let token: string;
      const tokens = suffixToTokens(suffix);

      for (let i = tokens.length - 1; i >= 0; i--) {
        token = tokens[i];
        node = node.get(token);
        if (!node) return false;
      }

      return true;
    };

    /**
     * Retrieves every entry in the trie that ends with the given suffix.
     *
     * @param inputSuffix the suffix to look up
     * @param includeEqualWithSuffix when `false`, an entry exactly equal to
     *   `inputSuffix` is left out of the result
     * @throws Error when the trie was created with `smolTree`
     */
    const find = (inputSuffix: string, /** @default true */ includeEqualWithSuffix = true): string[] => {
      if (smolTree) {
        throw new Error('A Trie with smolTree enabled cannot perform find!');
      }

      let node: TrieNode | undefined = root;
      let token: string;
      const inputTokens = suffixToTokens(inputSuffix);

      for (let i = inputTokens.length - 1; i >= 0; i--) {
        token = inputTokens[i];
        if (hostnameMode && token === '') {
          break;
        }
        node = node.get(token);
        if (!node) return [];
      }

      const matches: Array<string | string[]> = [];

      // Performing DFS from the node the suffix leads to
      const nodeStack: TrieNode[] = [node];
      const suffixStack: Array<string | string[]> = [inputTokens];

      do {
        const suffix: string | string[] = suffixStack.pop()!;
        node = nodeStack.pop()!;

        // Only the starting node carries `inputTokens` itself: every descendant
        // gets a strictly longer, freshly built value, so a plain comparison is
        // enough to recognise "equal to the input" in both modes.
        if (node[SENTINEL] && (includeEqualWithSuffix || suffix !== inputTokens)) {
          matches.push(suffix);
        }

        node.forEach((childNode, k) => {
          nodeStack.push(childNode);
          if (hostnameMode) {
            suffixStack.push([k, ...suffix]);
          } else {
            suffixStack.push(k + (suffix as string));
          }
        });
      } while (nodeStack.length);

      return hostnameMode ? matches.map((m) => (m as string[]).join('')) : matches as string[];
    };

    /**
     * Works like trie.find, but instead of returning the matches as an array, it removes them from the given set in-place.
     *
     * NOTE(review): in hostname mode an entry equal to `inputSuffix` is deleted
     * from the set as well, while in plain mode it is skipped — confirm this
     * asymmetry is intended.
     *
     * @throws Error when the trie was created with `smolTree`
     */
    const substractSetInPlaceFromFound = (inputSuffix: string, set: Set<string>) => {
      if (smolTree) {
        throw new Error('A Trie with smolTree enabled cannot perform substractSetInPlaceFromFound!');
      }

      let node: TrieNode | undefined = root;
      let token: string;
      const inputTokens = suffixToTokens(inputSuffix);

      // Find the leaf-est node, and early return if not any
      for (let i = inputTokens.length - 1; i >= 0; i--) {
        token = inputTokens[i];
        node = node.get(token);
        if (!node) return;
      }

      // Performing DFS from the node the suffix leads to
      const nodeStack: TrieNode[] = [node];
      const suffixStack: Array<string | string[]> = [inputTokens];

      do {
        const suffix = suffixStack.pop()!;
        node = nodeStack.pop()!;

        if (node[SENTINEL]) {
          // found match, delete it from set
          if (hostnameMode) {
            set.delete((suffix as string[]).join(''));
          } else if (suffix !== inputTokens) {
            set.delete(suffix as string);
          }
        }

        node.forEach((childNode, k) => {
          nodeStack.push(childNode);
          if (hostnameMode) {
            const stack = [k, ...suffix];
            suffixStack.push(stack);
          } else {
            suffixStack.push(k + (suffix as string));
          }
        });
      } while (nodeStack.length);
    };

    /**
     * Deletes the given suffix from the trie.
     *
     * Nodes that only existed to reach the removed entry are cut off, so
     * `contains` stops reporting them; nodes that still lead to, or are, another
     * entry are always kept.
     *
     * @returns `true` when the suffix was stored and has been removed
     */
    const remove = (suffix: string): boolean => {
      const suffixTokens = suffixToTokens(suffix);

      let node: TrieNode = root;
      // Highest point from which the walked path carries nothing but the entry
      // being removed: deleting `tokenToPrune` from `toPrune` drops that chain.
      let toPrune: TrieNode | null = null;
      let tokenToPrune: string | null = null;

      for (let i = suffixTokens.length - 1; i >= 0; i--) {
        const token = suffixTokens[i];
        const child = node.get(token);
        if (!child) {
          return false;
        }

        // What hangs off this node: its children plus its own end marker.
        // The end marker must be counted, otherwise cutting here could drop
        // another stored entry that ends at, or passes through, this node.
        const load = child.size + (child[SENTINEL] ? 1 : 0);
        if (load > 1) {
          // Something else lives here, retain everything down to this node
          toPrune = null;
          tokenToPrune = null;
        } else if (toPrune === null) {
          // First node of a chain that so far serves a single purpose
          toPrune = node;
          tokenToPrune = token;
        }

        node = child;
      }

      if (!node[SENTINEL]) return false;

      size--;

      if (toPrune !== null && tokenToPrune !== null) {
        toPrune.delete(tokenToPrune);
      } else {
        node[SENTINEL] = false;
      }

      return true;
    };

    /**
     * Tells whether the given suffix is stored in the trie as an entry.
     */
    const has = (suffix: string): boolean => {
      let node: TrieNode = root;
      const tokens = suffixToTokens(suffix);

      for (let i = tokens.length - 1; i >= 0; i--) {
        const next = node.get(tokens[i]);
        if (!next) {
          return false;
        }
        node = next;
      }

      return node[SENTINEL];
    };

    /**
     * Returns every entry stored in the trie, in DFS order.
     */
    const dump = () => {
      const nodeStack: TrieNode[] = [];
      const suffixStack: Array<string | string[]> = [];

      nodeStack.push(root);
      // The root stands for the empty suffix
      suffixStack.push(hostnameMode ? [] : '');

      const results: string[] = [];
      let node: TrieNode;

      do {
        node = nodeStack.pop()!;
        const suffix = suffixStack.pop()!;

        node.forEach((childNode, k) => {
          nodeStack.push(childNode);
          if (hostnameMode) {
            suffixStack.push([k, ...suffix]);
          } else {
            suffixStack.push(k + (suffix as string));
          }
        });

        if (node[SENTINEL]) {
          results.push(hostnameMode ? (suffix as string[]).join('') : (suffix as string));
        }
      } while (nodeStack.length);

      return results;
    };

    /**
     * Removes the given suffix from the trie, treating a leading `.` as
     * "this domain and everything below it".
     *
     * NOTE(review): the result is `false` when the path exists but no entry ends
     * there and `undefined` otherwise; kept as-is because the exported `Trie`
     * type is derived from it.
     *
     * @throws Error when neither `hostnameMode` nor `smolTree` is enabled
     */
    const whitelist = (suffix: string) => {
      if (!hostnameMode && !smolTree) {
        throw new Error('whitelist method is only available in hostname mode or smolTree mode.');
      }

      let node: TrieNode | undefined = root;
      let toPrune: TrieNode | null = null;
      let tokenToPrune: string | null = null;
      let parent: TrieNode = node;

      const tokens = suffixToTokens(suffix);
      let token: string;

      for (let i = tokens.length - 1; i >= 0; i--) {
        token = tokens[i];
        parent = node;
        node = node.get(token);
        if (!node) return;

        // During the whitelist of `[start]blog.skk.moe` and find out that there is a `[start].skk.moe` in the trie
        // Dedupe the covered subdomain by skipping
        if (i > 1 && node.get('.')?.[SENTINEL] === true) {
          return;
        }

        // Trying to whitelist `[start].sub.example.com` where there is already a `[start]blog.sub.example.com` in the trie
        if (i === 1 && tokens[0] === '.') {
          // If there is a `[start]sub.example.com` here, remove it
          node[SENTINEL] = false;
          // Removing all the child nodes by disconnecting "."
          node.delete('.');
        } else if (i === 0) {
          // Trying to whitelist `example.com` when there is already a `.example.com` in the trie
          const dotNode = node.get('.');
          if (dotNode?.[SENTINEL] === true) {
            dotNode[SENTINEL] = false;
          }
        }

        // Keeping track of a potential branch to prune
        // If the node is to be pruned, but there is more than one token child in it, we can't prune it
        // If there is only one token child, or no child at all, we can prune it safely
        if (toPrune != null) { // the first branch that could potentially be pruned
          if (node.size > 1 || node.has('.')) {
            // not only child, retain the branch.
            // And we need to abort pruning the parent, so we set it to null
            toPrune = null;
            tokenToPrune = null;
          }
        } else if (node.size < 2 && !node.has('.')) {
          toPrune = parent;
          tokenToPrune = token;
        }
      }

      if (!node[SENTINEL]) return false;

      if (tokenToPrune && toPrune) {
        toPrune.delete(tokenToPrune);
      } else {
        node[SENTINEL] = false;
      }
    };

    // Seed the trie with the initial entries, if any
    if (Array.isArray(from)) {
      for (let i = 0, l = from.length; i < l; i++) {
        add(from[i]);
      }
    } else if (from) {
      from.forEach(add);
    }

    return {
      add,
      contains,
      find,
      substractSetInPlaceFromFound,
      remove,
      delete: remove,
      has,
      dump,
      /** Number of stored entries; throws when the trie was created with `smolTree`. */
      get size() {
        if (smolTree) {
          throw new Error('A Trie with smolTree enabled cannot have correct size!');
        }
        return size;
      },
      /** The root node of the trie. */
      get root() {
        return root;
      },
      whitelist,
      [Bun.inspect.custom]: () => JSON.stringify(deepTrieNodeToJSON(root), null, 2),
      hostnameMode,
      smolTree
    };
  };

  /** The API object returned by `createTrie`. */
  export type Trie = ReturnType<typeof createTrie>;

  export default createTrie;