build-reject-domainset-worker.js 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. const Piscina = require('piscina');
  2. // Pre-check that every fullset domain starts with a ".".
  3. // This avoids calling charCodeAt repeatedly.
  4. const { canExcludeFromDedupe } = require('../lib/parse-filter')
  // The pool hands this worker an array of strings through workerData,
  // ordered by length with the shortest entries first.
  const { workerData: fullsetDomainStartsWithADot } = Piscina;
  // Cached once so the hot loop does not re-read the array length.
  const totalLen = fullsetDomainStartsWithADot.length;
  /**
   * Flags the entries of `chunk` that are matched by an entry of the
   * worker-wide full set (`fullsetDomainStartsWithADot`).
   *
   * An input entry is flagged when some full-set entry, different from the
   * input itself, either
   *   - equals the input with one extra leading character (checked only when
   *     the input does not start with "."), or
   *   - is a suffix of the input.
   * Entries for which `canExcludeFromDedupe` returns a truthy value are
   * skipped and keep the value 0.
   *
   * @param {{ chunk: string[] }} task - `chunk` holds the domains to check.
   * @returns {*} the result of `Piscina.move()` applied to an Int8Array with
   *   one slot per chunk entry: 1 where a match was found, 0 otherwise.
   */
  module.exports.dedupe = ({ chunk }) => {
    const chunkLength = chunk.length;
    const outputToBeRemoved = new Int8Array(chunkLength);

    for (let i = 0; i < chunkLength; i++) {
      const domainFromInput = chunk[i];
      if (canExcludeFromDedupe(domainFromInput)) {
        continue;
      }

      // These depend only on the input entry, so compute them once per entry
      // instead of once per full-set comparison.
      const domainFromInputLen = domainFromInput.length;
      // 46 is the char code of ".".
      const inputLacksLeadingDot = domainFromInput.charCodeAt(0) !== 46;

      for (let j = 0; j < totalLen; j++) {
        const domainFromFullSet = fullsetDomainStartsWithADot[j];
        // An entry never removes itself.
        if (domainFromFullSet === domainFromInput) continue;

        const domainFromFullSetLen = domainFromFullSet.length;

        // Equivalent of `.${domainFromInput}` === domainFromFullSet without
        // building a new string: with exactly one extra character in front,
        // a suffix match compares everything after that first character.
        if (
          inputLacksLeadingDot
          && domainFromInputLen + 1 === domainFromFullSetLen
          && domainFromFullSet.endsWith(domainFromInput)
        ) {
          outputToBeRemoved[i] = 1;
          break;
        }

        // Suffix match; applies to inputs with or without a leading ".".
        // The length guard skips the string comparison when it cannot succeed.
        if (
          domainFromInputLen >= domainFromFullSetLen
          && domainFromInput.endsWith(domainFromFullSet)
        ) {
          outputToBeRemoved[i] = 1;
          break;
        }
      }
    }

    return Piscina.move(outputToBeRemoved);
  };