test-jsonl-parser.mjs 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. 'use strict';
  2. import test from 'tape-six';
  3. import fs from 'fs';
  4. import path from 'path';
  5. import zlib from 'zlib';
  6. import {Writable} from 'stream';
  7. import {readString} from './helpers.mjs';
  8. import chain from '../src/index.js';
  9. import parser from '../src/jsonl/parser.js';
  /**
   * Serializes generated objects as JSONL (one JSON document per line), runs
   * the text through the parser, and checks that the parsed values match the
   * objects that were serialized.
   *
   * @param {object} t - tester object; only `t.deepEqual()` is used here.
   * @param {Function} resolve - called from the sink's `final()` hook, after
   *   the comparison has been made.
   * @param {number} [len] - how many objects to generate. When omitted the
   *   loop never runs, so the parser is fed an empty string.
   * @param {number} [quant] - forwarded unchanged to `readString()`.
   *   NOTE(review): presumably the chunk size of the source stream — confirm
   *   against ./helpers.mjs.
   */
  const roundtrip = (t, resolve, len, quant) => {
    const expected = [];
    let index = 0;
    while (index < len) {
      expected.push({
        stringWithTabsAndNewlines: "Did it work?\nNo...\t\tI don't think so...",
        anArray: [index + 1, index + 2, true, 'tabs?\t\t\t\u0001\u0002\u0003', false],
        n: index
      });
      index += 1;
    }

    // One serialized object per line, no trailing newline.
    const input = expected.map(item => JSON.stringify(item)).join('\n');
    const received = [];

    const source = readString(input, quant);
    const jsonl = parser();
    const sink = new Writable({
      objectMode: true,
      write(chunk, _, callback) {
        // Only the `value` property of each parser item is collected.
        received.push(chunk.value);
        callback(null);
      },
      final(callback) {
        t.deepEqual(expected, received);
        resolve();
        callback(null);
      }
    });

    chain([source, jsonl, sink]);
  };
  // Smoke test: `len` is omitted, so roundtrip() runs on an empty input.
  test.asPromise('jsonl parser: smoke test', (t, resolve) => roundtrip(t, resolve));

  // Vary the number of objects (1..12); `quant` is left undefined.
  for (let objectCount = 1; objectCount <= 12; ++objectCount) {
    const title = `jsonl parser: roundtrip with a set of objects - ${objectCount}`;
    test.asPromise(title, (t, resolve) => {
      roundtrip(t, resolve, objectCount);
    });
  }

  // Keep 10 objects and vary the `quant` argument (1..12) instead.
  for (let quant = 1; quant <= 12; ++quant) {
    const title = `jsonl parser: roundtrip with different window sizes - ${quant}`;
    test.asPromise(title, (t, resolve) => {
      roundtrip(t, resolve, 10, quant);
    });
  }
  // Reads the gzipped JSONL fixture that sits next to this module, parses it,
  // and checks that each item's `key` equals its running index and that
  // exactly 100 items arrive.
  test.asPromise('jsonl parser: read file', (t, resolve) => {
    if (!/^file:\/\//.test(import.meta.url)) {
      throw new Error('Cannot get the current working directory');
    }

    // Resolve the fixture as a URL relative to this module instead of slicing
    // a fixed-length prefix off `import.meta.url`: the sliced string keeps
    // percent-escapes (e.g. "%20" for a space in the checkout path) and so
    // names a file that does not exist. `fs.createReadStream()` accepts a
    // `file:` URL object directly and decodes it per platform.
    const fixtureUrl = new URL('./data/sample.jsonl.gz', import.meta.url);

    let count = 0;
    chain([
      fs.createReadStream(fixtureUrl),
      zlib.createGunzip(),
      parser(),
      new Writable({
        objectMode: true,
        write(chunk, _, callback) {
          // Each item's key is expected to equal its position in the stream.
          t.equal(count, chunk.key);
          ++count;
          callback(null);
        },
        final(callback) {
          t.equal(count, 100);
          resolve();
          callback(null);
        }
      })
    ]);
  });
  // Feeds text that is not valid JSON through the parser: the pipeline must
  // report an error; receiving data or reaching 'end' counts as a failure.
  test.asPromise('jsonl parser: bad json', (t, resolve) => {
    const unexpected = () => t.fail("We shouldn't be here.");

    const pipeline = chain([readString(' not json '), parser()]);

    pipeline.on('data', () => unexpected());

    pipeline.on('error', error => {
      t.ok(error);
      resolve();
    });

    pipeline.on('end', () => {
      unexpected();
      resolve();
    });
  });