// test-jsonl-parser.mjs (2.4 KB)

  'use strict';
  import test from 'tape-six';
  import fs from 'fs';
  import path from 'path';
  import zlib from 'zlib';
  import {Writable} from 'stream';
  import {fileURLToPath} from 'url';
  import {readString} from './helpers.mjs';
  import chain from '../src/index.js';
  import parser from '../src/jsonl/parser.js';
  /**
   * Generates sample objects, serializes them as newline-separated JSON,
   * pushes that text through the JSONL parser and asserts that the parsed
   * values deep-equal the generated objects.
   *
   * @param {object} t - tester; only `t.deepEqual()` is used here.
   * @param {Function} resolve - invoked from the sink's `final()` right after the assertion.
   * @param {number} [len=0] - how many objects to generate; when omitted the input is empty.
   * @param {number} [quant] - forwarded unchanged as the second argument of `readString()`
   *   (presumably the read chunk size; confirm in ./helpers.mjs).
   */
  const roundtrip = (t, resolve, len = 0, quant) => {
    const objects = [];
    for (let n = 0; n < len; ++n) {
      objects.push({
        stringWithTabsAndNewlines: "Did it work?\nNo...\t\tI don't think so...",
        anArray: [n + 1, n + 2, true, 'tabs?\t\t\t\u0001\u0002\u0003', false],
        n
      });
    }

    // One JSON document per line, with no trailing newline after the last one.
    const input = objects.map(object => JSON.stringify(object)).join('\n');
    const result = [];

    chain([
      readString(input, quant),
      parser(),
      new Writable({
        objectMode: true,
        write(chunk, _, callback) {
          // Only the `value` property of each parser item is collected.
          result.push(chunk.value);
          callback(null);
        },
        final(callback) {
          // All input has been consumed: compare, then release the test.
          t.deepEqual(objects, result);
          resolve();
          callback(null);
        }
      })
    ]);
  };
  // Smoke test: `roundtrip` is called without a length, so no objects are
  // generated and the empty-input path is exercised.
  test.asPromise('jsonl parser: smoke test', (t, resolve) => roundtrip(t, resolve));
  // Registers one test per object count from 1 to 12; the read quantum is left unspecified.
  for (let i = 1; i <= 12; ++i) {
    test.asPromise(`jsonl parser: roundtrip with a set of objects - ${i}`, (t, resolve) => {
      roundtrip(t, resolve, i);
    });
  }
  // Registers one test per `quant` value from 1 to 12, always with 10 objects.
  for (let i = 1; i <= 12; ++i) {
    test.asPromise(`jsonl parser: roundtrip with different window sizes - ${i}`, (t, resolve) => {
      roundtrip(t, resolve, 10, i);
    });
  }
  // Streams the gzipped sample file next to this module through the parser and
  // asserts that every item's `key` equals its running index and that exactly
  // 100 items are seen.
  test.asPromise('jsonl parser: read file', (t, resolve) => {
    if (!/^file:\/\//.test(import.meta.url)) throw new Error('Cannot get the current working directory');
    // fileURLToPath() percent-decodes the URL and handles platform-specific
    // forms (e.g. Windows drive letters), which a raw substring(7) does not.
    const fileName = path.join(path.dirname(fileURLToPath(import.meta.url)), './data/sample.jsonl.gz');
    let count = 0;
    chain([
      fs.createReadStream(fileName),
      zlib.createGunzip(),
      parser(),
      new Writable({
        objectMode: true,
        write(chunk, _, callback) {
          t.equal(count, chunk.key);
          ++count;
          callback(null);
        },
        final(callback) {
          t.equal(count, 100);
          resolve();
          callback(null);
        }
      })
    ]);
  });
  // Input that is not JSON must surface as an 'error' event on the pipeline;
  // receiving 'data' or reaching 'end' counts as a failure.
  test.asPromise('jsonl parser: bad json', (t, resolve) => {
    const pipeline = chain([readString(' not json '), parser()]);
    pipeline.on('data', () => t.fail("We shouldn't be here."));
    pipeline.on('error', e => {
      t.ok(e);
      resolve();
    });
    pipeline.on('end', () => {
      t.fail("We shouldn't be here.");
      // Still settle the test so an unexpected clean end does not hang it.
      resolve();
    });
  });