1
/**
2
Convert CSV formatted data to TSV format.
3

4
This program converts comma-separated value data to tab-separated format.
5

6
Copyright (c) 2016-2020, eBay Inc.
7
Initially written by Jon Degenhardt
8

9
License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
10
*/
11

12
module tsv_utils.csv2tsv;
13

14
import std.stdio;
15
import std.exception : enforce;
16
import std.format : format;
17
import std.range;
18
import std.traits : isArray, Unqual;
19
import std.typecons : tuple;
20

21
/** Short help text printed for '--help'. */
immutable helpText = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts comma-separated text (CSV) to tab-separated format (TSV). Records
are read from files or standard input, converted records written to standard output.
Use '--help-verbose' for details on the CSV formats accepted.

Options:
EOS";
30

31
/** Full help text printed for '--help-verbose'. */
immutable helpTextVerbose = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts CSV (comma-separated) text to TSV (tab-separated) format. Records
are read from files or standard input, converted records written to standard output.

Both formats represent tabular data, each record on its own line, fields separated
by a delimiter character. The key difference is that CSV uses escape sequences to
represent newlines and field separators in the data, whereas TSV disallows these
characters in the data. The most common field delimiters are comma for CSV and tab
for TSV, but any character can be used.

Conversion to TSV is done by removing CSV escape syntax, changing field delimiters,
and replacing newlines and field delimiters in the data. By default, newlines and
field delimiters in the data are replaced by spaces. Most details are customizable.

There is no single spec for CSV, any number of variants can be found. The escape
syntax is common enough: fields containing newlines or field delimiters are placed
in double quotes. Inside a quoted field, a double quote is represented by a pair of
double quotes. As with field separators, the quoting character is customizable.

Behaviors of this program that often vary between CSV implementations:
  * Newlines are supported in quoted fields.
  * Double quotes are permitted in a non-quoted field. However, a field starting
    with a quote must follow quoting rules.
  * Each record can have a different number of fields.
  * The three common forms of newlines are supported: CR, CRLF, LF. Output is
    written using Unix newlines (LF).
  * A newline will be added if the file does not end with one.
  * A UTF-8 Byte Order Mark (BOM) at the start of a file will be removed.
  * No whitespace trimming is done.

This program does not validate CSV correctness, but will terminate with an error
upon reaching an inconsistent state. Improperly terminated quoted fields are the
primary cause.

UTF-8 input is assumed. Convert other encodings prior to invoking this tool.

Options:
EOS";
71

72
/** Container for command line options.
 *
 * Fields are populated by processArgs. Defaults reflect standard CSV input
 * (comma delimiter, double-quote quoting) and standard TSV output (TAB
 * delimiter, space replacement for TABs and newlines found in the data).
 */
struct Csv2tsvOptions
{
    string programName;                // Derived from cmdArgs[0]; used in error messages.
    bool helpVerbose = false;          // --help-verbose
    bool hasHeader = false;            // --H|header
    char csvQuoteChar = '"';           // --q|quote
    char csvDelimChar = ',';           // --c|csv-delim
    char tsvDelimChar = '\t';          // --t|tsv-delim
    string tsvDelimReplacement = " ";  // --r|tab-replacement
    string newlineReplacement = " ";   // --n|newline-replacement
    bool versionWanted = false;        // --V|version

    /** Parse command line arguments, populating this struct's fields.
     *
     * cmdArgs is updated in-place: getopt removes the options it consumes,
     * leaving the program name and any file arguments for the caller.
     *
     * Returns: A tuple (okToContinue, exitCode). okToContinue is false when
     * the program should terminate immediately: exit code 0 after printing
     * help or version info, exit code 1 on an argument error (the error is
     * written to stderr here).
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : canFind;
        import std.getopt;
        import std.path : baseName, stripExtension;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            /* caseSensitive/caseInsensitive toggles ensure the single-letter
             * short options 'H' and 'V' are matched case-sensitively, while
             * other options remain case-insensitive. */
            auto r = getopt(
                cmdArgs,
                "help-verbose",          "     Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",              "     Treat the first line of each file as a header. Only the header of the first file is output.", &hasHeader,
                std.getopt.config.caseSensitive,
                "q|quote",               "CHR  Quoting character in CSV data. Default: double-quote (\")", &csvQuoteChar,
                "c|csv-delim",           "CHR  Field delimiter in CSV data. Default: comma (,).", &csvDelimChar,
                "t|tsv-delim",           "CHR  Field delimiter in TSV data. Default: TAB", &tsvDelimChar,
                "r|tab-replacement",     "STR  Replacement for TSV field delimiters (typically TABs) found in CSV input. Default: Space.", &tsvDelimReplacement,
                "n|newline-replacement", "STR  Replacement for newlines found in CSV input. Default: Space.", &newlineReplacement,
                std.getopt.config.caseSensitive,
                "V|version",             "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("csv2tsv"));
                return tuple(false, 0);
            }

            /* Consistency checks. Delimiters, quote characters and replacement
             * strings must not conflict with each other or with the newline
             * characters used as record separators. */
            enforce(csvQuoteChar != '\n' && csvQuoteChar != '\r',
                    "CSV quote character cannot be newline (--q|quote).");

            enforce(csvQuoteChar != csvDelimChar,
                    "CSV quote and CSV field delimiter characters must be different (--q|quote, --c|csv-delim).");

            enforce(csvQuoteChar != tsvDelimChar,
                    "CSV quote and TSV field delimiter characters must be different (--q|quote, --t|tsv-delim).");

            enforce(csvDelimChar != '\n' && csvDelimChar != '\r',
                    "CSV field delimiter cannot be newline (--c|csv-delim).");

            enforce(tsvDelimChar != '\n' && tsvDelimChar != '\r',
                    "TSV field delimiter cannot be newline (--t|tsv-delim).");

            enforce(!canFind!(c => (c == '\n' || c == '\r' || c == tsvDelimChar))(tsvDelimReplacement),
                    "Replacement character cannot contain newlines or TSV field delimiters (--r|tab-replacement).");

            enforce(!canFind!(c => (c == '\n' || c == '\r' || c == tsvDelimChar))(newlineReplacement),
                    "Replacement character cannot contain newlines or TSV field delimiters (--n|newline-replacement).");
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
159

160
/* druntime 2.085+: skip the garbage collector's final cleanup collection at
 * program exit. The OS reclaims memory anyway, so this speeds shutdown. */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
161

162
version(unittest)
{
    // No main in unittest
}
else
{
    /** Program entry point.
     *
     * Processes command line arguments, then converts each input file (or
     * standard input) from CSV to TSV. Returns 0 on success, 1 on error.
     * Conversion errors are reported to stderr with the program name and
     * the exception message.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        Csv2tsvOptions cmdopt;
        const r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];    // Early exit: help/version printed, or argument error.

        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();    // Exclude argument processing from LDC profile data.
        }

        try csv2tsvFiles(cmdopt, cmdArgs[1..$]);
        catch (Exception exc)
        {
            writeln();
            /* Bug fix: was 'stdin.flush()'. Flushing stdin is meaningless; the
             * intent is to flush stdout so any partial output already written
             * reaches the terminal before the error message goes to stderr. */
            stdout.flush();
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}
197

198
/** csv2tsvFiles converts each input file in turn, sharing a single output buffer.
 *
 * Runs csv2tsv on every input file, or on standard input when the file list is
 * empty. All converted records are written to one BufferedOutputRange wrapping
 * standard output. When --H|header is in effect, only the first file's header
 * is emitted; each subsequent file has one leading line skipped.
 */
void csv2tsvFiles(const ref Csv2tsvOptions cmdopt, const string[] inputFiles)
{
    import tsv_utils.common.utils : BufferedOutputRange;

    /* Buffer sizing notes:
     *
     * ReadBufferSize matches the buffered-read size used by most tsv-utils
     * programs. The BufferedOutputRange defaults are overridden: the reserve
     * size is raised and the max size lowered to roughly the read buffer size.
     *
     * Rationale: with heavily quoted CSV input, writes to BufferedOutputRange
     * land on record (newline) boundaries and the normal flush-size behavior
     * works well. With lightly quoted input, data is pushed on arbitrary byte
     * boundaries, so nothing would reach standard output until 'max size'
     * bytes accumulate. The smaller max size keeps output flowing promptly in
     * that case; the larger reserve avoids reallocation for the same reason.
     */
    enum ReadBufferSize = 1024L * 128L;
    enum OutputBufferFlushSize = 1024L * 10L;
    enum OutputBufferReserveSize = 1024L * 129L;
    enum OutputBufferMaxSize = 1024L * 128L;

    ubyte[ReadBufferSize] rawReadBuffer;
    auto outputWriter = BufferedOutputRange!(typeof(stdout))(
        stdout, OutputBufferFlushSize, OutputBufferReserveSize, OutputBufferMaxSize);

    const files = (inputFiles.length > 0) ? inputFiles : ["-"];

    foreach (fileIndex, path; files)
    {
        const isStdin = (path == "-");
        auto inputStream = isStdin ? stdin : path.File;
        const displayName = isStdin ? "stdin" : path;

        /* Skip the header line on every file after the first. */
        const size_t headerLinesToSkip = (cmdopt.hasHeader && fileIndex > 0) ? 1 : 0;

        csv2tsv(inputStream, outputWriter, rawReadBuffer, displayName, headerLinesToSkip,
                cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement,
                cmdopt.newlineReplacement);
    }
}
259

260
/* csv2tsv buffered conversion algorithm
261

262
This version of csv2tsv uses a buffered approach to csv-to-tsv conversion. This is a
263
change from the original version, which used a character-at-a-time approach, with
264
characters coming from an infinite stream of characters. The character-at-a-time
265
approach was nice from a simplicity perspective, but the approach didn't optimize well.
266
Note that the original version read input in blocks and wrote to stdout in blocks, it
267
was the conversion algorithm itself that was character oriented.
268

269
The idea is to convert a buffer at a time, writing larger blocks to the output stream
270
rather than one character at a time. In addition, the read buffer is modified in-place
271
when the only change is to convert a single character. The notable case is converting
272
the field delimiter character, typically comma to TAB. The result is writing longer
273
blocks to the output stream (BufferedOutputRange).
274

275
Performance improvements from the new algorithm are notable. This is especially true
276
versus the previous version 2.0.0. Note though that the more recent versions of
277
csv2tsv were slower due to degradations coming from compiler and/or language version.
278
Version 1.1.19 was quite a bit faster. Regardless of version, the performance
279
improvement is especially good when run against "simple" CSV files, with limited
280
amounts of CSV escape syntax. In these files the main change is converting the field
281
delimiter character, typically comma to TAB.
282

283
In some benchmarks on Mac OS, the new version was 40% faster than csv2tsv 2.0.0 on
284
files with significant CSV escapes, and 60% faster on files with limited CSV escapes.
285
Versus csv2tsv version 1.1.19, the new version is 10% and 40% faster on the same
286
files. On the "simple CSV" file, where Unix 'tr' is an option, 'tr' was still faster,
287
by about 20%. But getting into the 'tr' ballpark while retaining safety of correct
288
csv2tsv conversion is a good result.
289

290
Algorithm notes:
291

292
The algorithm works by reading an input block, then examining each byte in-order to
293
identify needed modifications. The region of consecutive characters without a change
294
is tracked. Single character changes are done in-place, in the read buffer. This
295
allows assembling longer blocks before write is needed. The region being tracked is
296
written to the output stream when it can no longer be extended in a continuous
297
fashion. At this point a new region is started. When the current read buffer has
298
been processed the current region is written out and a new block of data read in.
299

300
The read buffer uses fixed size blocks. This means the algorithm is actually
301
operating on bytes (UTF-8 code units), and not characters. This works because all
302
delimiters and CSV escape syntax characters are single byte UTF-8 characters. These
303
are the only characters requiring interpretation. The main nuisance is the 2-byte
304
CRLF newline sequence, as this might be split across two read buffers. This is
305
handled by embedding 'CR' states in the finite state machine.
306

307
Processing CSV escapes will often cause character removals and additions. These
will not be representable in a continuous stream of bytes without moving bytes around.
Instead of moving bytes, these cases are handled by immediately writing to the output
310
stream. This allows restarting a new block of contiguous characters. Handling by the
311
new algorithm is described below. Note that the length of the replacement characters
312
for TSV field and record delimiters (e.g. TAB, newline) affects the processing.
313

314
All replacement character lengths:
315

316
* Windows newline (CRLF) at the end of a line - Replace the CRLF with LF.
317

318
  Replace the CR with LF, add it to the current write region and terminate it. The
319
  next write region starts at the character after the LF.
320

321
* Double quote starting or ending a field - Drop the double quote.
322

323
  Terminate the current write region, next write region starts at the next character.
324

325
* Double quote pair inside a quoted field - Drop one of the double quotes.
326

327
  The algorithm drops the first double quote and keeps the second. This avoids
  look-ahead, and both the field-terminating double quote and a double quote pair
  can be handled the same way. Terminate the current write region without adding the double
330
  quote. The next write region starts at the next character.
331

332
Single byte replacement characters:
333

334
* Windows newline (CRLF) in a quoted field
335

336
  Replace the CR with the replacement char, add it to the current write region and
337
  terminate it. The next write region starts at the character after the LF.
338

339
Multi-byte replacement sequences:
340

341
* TSV Delimiter (TAB by default) in a field
342

343
  Terminate the current write region, write it out and the replacement. The next
344
  write region starts at the next character.
345

346
* LF, CR, or CRLF in a quoted field
347

348
  Terminate the current write region, write it and the replacement. The next write
349
  region starts at the next character.
350

351
csv2tsv API
352

353
At the API level, it is desirable to handle both open files and input streams.
354
Open files are the key requirement, but handling input streams simplifies unit
355
testing, and in-memory conversion is likely to be useful anyway. Internally, it
356
should be easy enough to encapsulate the differences between input streams and files.
357
Reading files can be done using File.byChunk and reading from input streams can be
358
done using std.range.chunks.
359

360
This has been handled by creating a new range that can iterate either files or
361
input streams chunk-by-chunk.
362
*/
363

364
/** Defines the 'bufferable' input sources supported by inputSourceByChunk.
 *
 * This includes std.stdio.File objects and mutable dynamic ubyte arrays (inputRange
 * with slicing).
 *
 * Note: The mutable, dynamic arrays restriction is based on what is supported by
 * std.range.chunks. This could be extended to include any type of array with ubyte
 * elements, but it would require custom code in inputSourceByChunk. A test could be
 * added as '(isArray!(R) && is(Unqual!(typeof(R.init[0])) == ubyte))'.
 */
enum bool isBufferableInputSource(R) =
    isFileHandle!(Unqual!R) ||
    (isInputRange!R && is(ElementEncodingType!R == ubyte) && hasSlicing!R);
377

378
@safe unittest
{
    /* File handles and mutable dynamic ubyte arrays qualify; char arrays and
     * strings do not (their element encoding type is char, not ubyte). */
    static assert(isBufferableInputSource!(File));
    static assert(isBufferableInputSource!(typeof(stdin)));
    static assert(isBufferableInputSource!(ubyte[]));
    static assert(!isBufferableInputSource!(char[]));
    static assert(!isBufferableInputSource!(string));

    ubyte[10] x1;
    const ubyte[1] x2;
    immutable ubyte[1] x3;
    ubyte[] x4 = new ubyte[](10);
    const ubyte[] x5 = new ubyte[](10);
    immutable ubyte[] x6 = new ubyte[](10);

    /* Only the mutable dynamic array (x4) qualifies. Static arrays and
     * const/immutable dynamic arrays are excluded. */
    static assert(!isBufferableInputSource!(typeof(x1)));
    static assert(!isBufferableInputSource!(typeof(x2)));
    static assert(!isBufferableInputSource!(typeof(x3)));
    static assert(isBufferableInputSource!(typeof(x4)));
    static assert(!isBufferableInputSource!(typeof(x5)));
    static assert(!isBufferableInputSource!(typeof(x6)));

    /* All of the above still have ubyte elements; exclusion was for other reasons. */
    static assert(is(Unqual!(ElementType!(typeof(x1))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(x2))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(x3))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(x4))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(x5))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(x6))) == ubyte));

    /* S1: an input range of ubyte, but without slicing — must not qualify. */
    struct S1
    {
        void popFront();
        @property bool empty();
        @property ubyte front();
    }

    /* S2: an input range of ubyte with length and slicing — must qualify. */
    struct S2
    {
        @property ubyte front();
        void popFront();
        @property bool empty();
        @property auto save() { return this; }
        @property size_t length();
        S2 opSlice(size_t, size_t);
    }

    static assert(isInputRange!S1);
    static assert(!isBufferableInputSource!S1);

    static assert(isInputRange!S2);
    static assert(is(ElementEncodingType!S2 == ubyte));
    static assert(hasSlicing!S2);
    static assert(isBufferableInputSource!S2);

    /* For code coverage. */
    S2 s2;
    auto x = s2.save;
}
436

437
/** inputSourceByChunk returns a range that reads either a file handle (File) or a
 * ubyte[] array a chunk at a time.
 *
 * This is a cover for File.byChunk that allows passing an in-memory array as well.
 * At present the motivation is primarily to enable unit testing of chunk-based
 * algorithms using in-memory strings. At present the in-memory input types are
 * limited. In the future this may be changed to accept any type of character or
 * ubyte array.
 *
 * inputSourceByChunk takes either a File open for reading or a ubyte[] array
 * containing input data. Data is read a buffer at a time. The buffer can be
 * user provided, or allocated by inputSourceByChunk based on a caller provided
 * buffer size.
 *
 * A ubyte[] input source must satisfy isBufferableInputSource, which at present
 * means that it is a dynamic, mutable ubyte[].
 *
 * The chunks are returned as an input range.
 */
auto inputSourceByChunk(InputSource)(InputSource source, size_t size)
{
    /* Allocate a buffer of the requested size and delegate to the
     * buffer-based overload below. */
    return inputSourceByChunk(source, new ubyte[](size));
}
460

461
/// Ditto
auto inputSourceByChunk(InputSource)(InputSource source, ubyte[] buffer)
if (isBufferableInputSource!InputSource)
{
    static if (isFileHandle!(Unqual!InputSource))
    {
        /* Files: File.byChunk already provides exactly this behavior. */
        return source.byChunk(buffer);
    }
    else
    {
        /* In-memory arrays: wrap std.range.chunks, copying each chunk into the
         * caller-provided buffer so that 'front' is always a slice of that
         * buffer, matching File.byChunk semantics. */
        static struct BufferedChunk
        {
            private Chunks!InputSource _chunks;
            private ubyte[] _buffer;

            /* Copies the next chunk into _buffer. A zero-length _buffer is the
             * end-of-input signal used by 'empty'. */
            private void readNextChunk()
            {
                if (_chunks.empty)
                {
                    _buffer.length = 0;
                }
                else
                {
                    size_t len = _chunks.front.length;
                    _buffer[0 .. len] = _chunks.front[];
                    _chunks.popFront;

                    /* Only the last chunk should be shorter than the buffer. */
                    assert(_buffer.length == len || _chunks.empty);

                    /* Shrink the buffer for a short (final) chunk so 'front'
                     * exposes only valid data. */
                    if (_buffer.length != len) _buffer.length = len;
                }
            }

            /* Params: source = input data; buffer = non-empty chunk buffer.
             * Throws: Exception if the buffer is empty. */
            this(InputSource source, ubyte[] buffer)
            {
                enforce(buffer.length > 0, "buffer size must be larger than 0");
                _chunks = source.chunks(buffer.length);
                _buffer = buffer;
                readNextChunk();
            }

            @property bool empty()
            {
                return (_buffer.length == 0);
            }

            /* Returns the current chunk as a slice of the internal buffer. The
             * slice is invalidated by popFront. */
            @property ubyte[] front()
            {
                assert(!empty, "Attempting to fetch the front of an empty inputSourceByChunks");
                return _buffer;
            }

            void popFront()
            {
                assert(!empty, "Attempting to popFront an empty inputSourceByChunks");
                readNextChunk();
            }
        }

        return BufferedChunk(source, buffer);
    }
}
524

525
unittest  // inputSourceByChunk
{
    import tsv_utils.common.unittest_utils;   // tsv-utils unit test helpers
    import std.file : mkdir, rmdirRecurse;
    import std.path : buildPath;

    auto testDir = makeUnittestTempDir("csv2tsv_inputSourceByChunk");
    scope(exit) testDir.rmdirRecurse;

    import std.algorithm : equal, joiner;
    import std.format;
    import std.string : representation;

    /* Multi-byte UTF-8 characters included so chunk boundaries can split
     * characters mid-sequence; byte-level equality must still hold. */
    auto charData = "abcde,ßÀß,あめりか物語,012345";
    ubyte[] ubyteData = charData.dup.representation;

    ubyte[1024] rawBuffer;  // Must be larger than largest bufferSize in tests.

    /* Writes raw bytes to a file for the File-based test variants. */
    void writeFileData(string filePath, ubyte[] data)
    {
        import std.stdio;

        auto f = filePath.File("w");
        f.rawWrite(data);
        f.close;
    }

    /* Test matrix: every data length from zero up to the full test string,
     * crossed with every buffer size from 1 to dataSize + 1. Four variants:
     * in-memory/size (A), in-memory/buffer (B), file/size (C), file/buffer (D). */
    foreach (size_t dataSize; 0 .. ubyteData.length)
    {
        auto data = ubyteData[0 .. dataSize];
        auto filePath = buildPath(testDir, format("data_%d.txt", dataSize));
        writeFileData(filePath, data);

        foreach (size_t bufferSize; 1 .. dataSize + 2)
        {
            assert(data.inputSourceByChunk(bufferSize).joiner.equal(data),
                   format("[Test-A] dataSize: %d, bufferSize: %d", dataSize, bufferSize));

            assert (rawBuffer.length >= bufferSize);

            ubyte[] buffer = rawBuffer[0 .. bufferSize];
            assert(data.inputSourceByChunk(buffer).joiner.equal(data),
                   format("[Test-B] dataSize: %d, bufferSize: %d", dataSize, bufferSize));

            {
                auto inputStream = filePath.File;
                assert(inputStream.inputSourceByChunk(bufferSize).joiner.equal(data),
                       format("[Test-C] dataSize: %d, bufferSize: %d", dataSize, bufferSize));
                inputStream.close;
            }

            {
                auto inputStream = filePath.File;
                assert(inputStream.inputSourceByChunk(buffer).joiner.equal(data),
                       format("[Test-D] dataSize: %d, bufferSize: %d", dataSize, bufferSize));
                inputStream.close;
            }
        }
    }
}
585

586
/** Read CSV from an input source, convert to TSV and write to an output source.
 *
 * Input is consumed a buffer at a time and run through a small finite state
 * machine. Output is written as contiguous regions of the input buffer; a byte
 * that must be modified or dropped (quotes, delimiters, newlines) terminates
 * the current write region.
 *
 * Params:
 *   inputSource           =  A "bufferable" input source, either a file open
 *                            for read or a dynamic, mutable ubyte array.
 *   outputStream          =  An output range accepting chars; receives the TSV.
 *   readBuffer            =  Buffer used for reading. Must hold at least one byte.
 *   filename              =  Name used when reporting errors. A descriptive
 *                            string may be used in lieu of a real file name.
 *   skipLines             =  Number of leading lines to consume without writing
 *                            them out. Typically used to skip header lines.
 *   csvQuote              =  Quote character used in the CSV input.
 *   csvDelim              =  Field delimiter character used in the CSV input.
 *   tsvDelim              =  Field delimiter character used in the TSV output.
 *   tsvDelimReplacement   =  Replacement string for TSV field delimiters (e.g.
 *                            TABs) occurring in CSV field data.
 *   tsvNewlineReplacement =  Replacement string for newlines occurring in CSV
 *                            field data.
 *   discardBOM            =  If true (the default), a UTF-8 Byte Order Mark at
 *                            the start of the input stream is dropped.
 *
 * Throws: Exception on inconsistent CSV. The exception text includes the
 *         filename and the line number where the error was identified.
 */
void csv2tsv(InputSource, OutputRange)(
    InputSource inputSource,
    auto ref OutputRange outputStream,
    ubyte[] readBuffer,
    string filename = "(none)",
    size_t skipLines = 0,
    const char csvQuote = '"',
    const char csvDelim = ',',
    const char tsvDelim = '\t',
    const string tsvDelimReplacement = " ",
    const string tsvNewlineReplacement = " ",
    bool discardBOM = true,
)
if (isBufferableInputSource!InputSource &&
    isOutputRange!(OutputRange, char))
{
    import std.conv: hexString;

    assert(readBuffer.length >= 1);

    enum char LF = '\n';
    enum char CR = '\r';

    enum ubyte[3] UTF8_BOM = cast(ubyte[3])hexString!"efbbbf";

    /* Finite state machine states. */
    enum CSVState
    {
     FieldEnd,           // Start of input, or just consumed a field/record delimiter.
     NonQuotedField,     // Inside a non-quoted field.
     QuotedField,        // Inside a quoted field.
     QuoteInQuotedField, // Previous char was a quote inside a quoted field.
     CRAtFieldEnd,       // Previous char was a CR terminating a record/line.
     CRInQuotedField,    // Previous char was a CR inside a quoted field.
    }

    /* Parser state carried across buffer chunks:
     *   csvState  - Current finite state machine state.
     *   recordNum - Current CSV input line/record number. One-based.
     *   fieldNum  - Field number within the current record, one-based. Reset to
     *               zero when a new record starts, before its first character.
     */
    CSVState csvState = CSVState.FieldEnd;
    size_t recordNum = 1;
    size_t fieldNum = 0;

    foreach (chunkIndex, rawChunk; inputSource.inputSourceByChunk(readBuffer).enumerate)
    {
        /* Index into inputChunk where the next write region begins. */
        size_t regionStart = 0;

        /* Discard a byte order mark at the start of input. Slicing the chunk
         * this way generates very good code, better than manipulating indices.
         */
        auto inputChunk =
            (discardBOM &&
             chunkIndex == 0 &&
             rawChunk.length >= UTF8_BOM.length &&
             rawChunk[0 .. UTF8_BOM.length] == UTF8_BOM
            )
            ? rawChunk[UTF8_BOM.length .. $]
            : rawChunk[];

        /* Writes the current region and starts the next region one byte past
         * the current region's end. Optional appendChars are written as well.
         *
         * Called when the current byte terminates the region and must not
         * itself be written, which is why the next region skips one byte.
         *
         * Also invoked when the 'skipLines' region has been consumed, to
         * discard the region without writing it; that is handled by the
         * explicit recordNum/skipLines checks in the state machine on record
         * terminating newlines. (Would be nice to refactor this.)
         */
        void flushRegion(size_t regionEnd, const char[] appendChars = "")
        {
            assert(regionEnd <= inputChunk.length);

            if (recordNum > skipLines)
            {
                if (regionEnd > regionStart)
                {
                    outputStream.put(inputChunk[regionStart .. regionEnd]);
                }
                if (appendChars.length > 0)
                {
                    outputStream.put(appendChars);
                }
            }

            regionStart = regionEnd + 1;
        }

        foreach (size_t idx, char ch; inputChunk)
        {
        StateSwitch: final switch (csvState)
            {
            case CSVState.FieldEnd:
                /* Start of input, or just consumed a field terminator. */
                ++fieldNum;

                /* Can't use a switch here: 'goto case' must target StateSwitch. */
                if (ch == csvQuote)
                {
                    flushRegion(idx);
                    csvState = CSVState.QuotedField;
                    break StateSwitch;
                }
                else
                {
                    /* State change only; the character is not consumed. */
                    csvState = CSVState.NonQuotedField;
                    goto case CSVState.NonQuotedField;
                }

            case CSVState.NonQuotedField:
                switch (ch)
                {
                default:
                    break StateSwitch;
                case csvDelim:
                    inputChunk[idx] = tsvDelim;
                    csvState = CSVState.FieldEnd;
                    break StateSwitch;
                case LF:
                    if (recordNum == skipLines) flushRegion(idx);
                    ++recordNum;
                    fieldNum = 0;
                    csvState = CSVState.FieldEnd;
                    break StateSwitch;
                case CR:
                    inputChunk[idx] = LF;
                    if (recordNum == skipLines) flushRegion(idx);
                    ++recordNum;
                    fieldNum = 0;
                    csvState = CSVState.CRAtFieldEnd;
                    break StateSwitch;
                case tsvDelim:
                    if (tsvDelimReplacement.length == 1)
                    {
                        inputChunk[idx] = tsvDelimReplacement[0];
                    }
                    else
                    {
                        flushRegion(idx, tsvDelimReplacement);
                    }
                    break StateSwitch;
                }

            case CSVState.QuotedField:
                switch (ch)
                {
                default:
                    break StateSwitch;
                case csvQuote:
                    /* Flush the region without the quote, then let the next
                     * state decide whether a quote gets written.
                     */
                    flushRegion(idx);
                    csvState = CSVState.QuoteInQuotedField;
                    break StateSwitch;

                case tsvDelim:
                    if (tsvDelimReplacement.length == 1)
                    {
                        inputChunk[idx] = tsvDelimReplacement[0];
                    }
                    else
                    {
                        flushRegion(idx, tsvDelimReplacement);
                    }
                    break StateSwitch;
                case LF:
                    /* Newline inside a quoted field. */
                    if (tsvNewlineReplacement.length == 1)
                    {
                        inputChunk[idx] = tsvNewlineReplacement[0];
                    }
                    else
                    {
                        flushRegion(idx, tsvNewlineReplacement);
                    }
                    break StateSwitch;
                case CR:
                    /* Carriage return inside a quoted field. */
                    if (tsvNewlineReplacement.length == 1)
                    {
                        inputChunk[idx] = tsvNewlineReplacement[0];
                    }
                    else
                    {
                        flushRegion(idx, tsvNewlineReplacement);
                    }
                    csvState = CSVState.CRInQuotedField;
                    break StateSwitch;
                }

            case CSVState.QuoteInQuotedField:
                /* Just saw a quote inside a quoted field; the buffer was
                 * flushed without it. Legal characters here: quote, field
                 * delimiter, newline (record delimiter).
                 */
                switch (ch)
                {
                case csvQuote:
                    csvState = CSVState.QuotedField;
                    break StateSwitch;
                case csvDelim:
                    inputChunk[idx] = tsvDelim;
                    csvState = CSVState.FieldEnd;
                    break StateSwitch;
                case LF:
                    if (recordNum == skipLines) flushRegion(idx);
                    ++recordNum;
                    fieldNum = 0;
                    csvState = CSVState.FieldEnd;
                    break StateSwitch;
                case CR:
                    inputChunk[idx] = LF;
                    if (recordNum == skipLines) flushRegion(idx);
                    ++recordNum;
                    fieldNum = 0;
                    csvState = CSVState.CRAtFieldEnd;
                    break StateSwitch;
                default:
                    throw new Exception(
                        format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                               (filename == "-") ? "Standard Input" : filename,
                               recordNum));
                }

            case CSVState.CRInQuotedField:
                if (ch == LF)
                {
                    flushRegion(idx);
                    csvState = CSVState.QuotedField;
                    break StateSwitch;
                }
                else {
                    /* Naked CR. State change only; character not consumed. */
                    csvState = CSVState.QuotedField;
                    goto case CSVState.QuotedField;
                }

            case CSVState.CRAtFieldEnd:
                if (ch == LF)
                {
                    flushRegion(idx);
                    csvState = CSVState.FieldEnd;
                    break StateSwitch;
                }
                else {
                    /* Naked CR. State change only; character not consumed. */
                    csvState = CSVState.FieldEnd;
                    goto case CSVState.FieldEnd;
                }
            }
        }

        /* End of the buffer; write out whatever remains in the region. */
        if (regionStart < inputChunk.length && recordNum > skipLines)
        {
            outputStream.put(inputChunk[regionStart .. $]);
        }

        regionStart = 0;
    }

    enforce(csvState != CSVState.QuotedField,
            format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                   (filename == "-") ? "Standard Input" : filename,
                   recordNum));

    /* Emit a newline if the CSV input lacked a terminating newline. */
    if (fieldNum > 0 && recordNum > skipLines) put(outputStream, '\n');
}
898

899
unittest
{
    /* Unit tests for the csv2tsv function.
     *
     * These exercise assorted CSV combinations and escaping cases. Within each
     * numbered test, the data content is identical across variants; only the
     * delimiters change (e.g. csv6a and csv6b carry the same data).
     *
     * A property exploited here: changing the CSV delimiters does not change
     * the resulting TSV, so one expected TSV set covers CSVs with different
     * delimiter sets. Changing the TSV delimiters does change the output, as
     * TSV cannot contain its own delimiters in the data.
     *
     * main, file handling, and error messages are not covered here; they are
     * exercised by tests run against the executable.
     *
     * Note: non-@safe due to the string-to-ubyte[] casts. Could likely be
     * rewritten with std.string.representation, which is @safe.
     */

    /* CSV Set A: default delimiters (double-quote, comma). */
    auto csv1a = "a,b,c";
    auto csv2a = "a,bc,,,def";
    auto csv3a = ",a, b , cd ,";
    auto csv4a = "ß,ßÀß,あめりか物語,书名: 五色石";
    auto csv5a = "\"\n\",\"\n\n\",\"\n\n\n\"";
    auto csv6a = "\"\t\",\"\t\t\",\"\t\t\t\"";
    auto csv7a = "\",\",\",,\",\",,,\"";
    auto csv8a = "\"\",\"\"\"\",\"\"\"\"\"\"";
    auto csv9a = "\"ab, de\tfg\"\"\nhij\"";
    auto csv10a = "";
    auto csv11a = ",";
    auto csv12a = ",,";
    auto csv13a = "\"\r\",\"\r\r\",\"\r\r\r\"";
    auto csv14a = "\"\r\n\",\"\r\n\r\n\",\"\r\n\r\n\r\n\"";
    auto csv15a = "\"ab, de\tfg\"\"\rhij\"";
    auto csv16a = "\"ab, de\tfg\"\"\r\nhij\"";
    auto csv17a = "ab\",ab\"cd";
    auto csv18a = "\n\n\n";
    auto csv19a = "\t";
    auto csv20a = "\t\t";
    auto csv21a = "a\n";
    auto csv22a = "a,\n";
    auto csv23a = "a,b\n";
    auto csv24a = ",\n";
    auto csv25a = "#";
    auto csv26a = "^";
    auto csv27a = "#^#";
    auto csv28a = "^#^";
    auto csv29a = "$";
    auto csv30a = "$,$\n\"$\",\"$$\",$$\n^#$,$#^,#$^,^$#\n";
    auto csv31a = "1-1\n2-1,2-2\n3-1,3-2,3-3\n\n,5-2\n,,6-3\n";
    auto csv32a = ",1-2,\"1-3\"\n\"2-1\",\"2-2\",\n\"3-1\",,\"3-3\"";

    // Newlines terminating a line ending a non-quoted field
    auto csv33a = "\rX\r\nX\n\r\nX\r\n";

    // Newlines inside a quoted field and terminating a line following a quoted field
    auto csv34a = "\"\r\",\"X\r\",\"X\rY\",\"\rY\"\r\"\r\n\",\"X\r\n\",\"X\r\nY\",\"\r\nY\"\r\n\"\n\",\"X\n\",\"X\nY\",\"\nY\"\n";

    // CR at field end
    auto csv35a = "abc,def\r\"ghi\",\"jkl\"\r\"mno\",pqr\r";

    /* CSV Set B: same data and expected TSV as Set A, using '#' for quote and '^' for comma. */
    auto csv1b = "a^b^c";
    auto csv2b = "a^bc^^^def";
    auto csv3b = "^a^ b ^ cd ^";
    auto csv4b = "ß^ßÀß^あめりか物語^书名: 五色石";
    auto csv5b = "#\n#^#\n\n#^#\n\n\n#";
    auto csv6b = "#\t#^#\t\t#^#\t\t\t#";
    auto csv7b = "#,#^#,,#^#,,,#";
    auto csv8b = "##^#\"#^#\"\"#";
    auto csv9b = "#ab, de\tfg\"\nhij#";
    auto csv10b = "";
    auto csv11b = "^";
    auto csv12b = "^^";
    auto csv13b = "#\r#^#\r\r#^#\r\r\r#";
    auto csv14b = "#\r\n#^#\r\n\r\n#^#\r\n\r\n\r\n#";
    auto csv15b = "#ab, de\tfg\"\rhij#";
    auto csv16b = "#ab, de\tfg\"\r\nhij#";
    auto csv17b = "ab\"^ab\"cd";
    auto csv18b = "\n\n\n";
    auto csv19b = "\t";
    auto csv20b = "\t\t";
    auto csv21b = "a\n";
    auto csv22b = "a^\n";
    auto csv23b = "a^b\n";
    auto csv24b = "^\n";
    auto csv25b = "####";
    auto csv26b = "#^#";
    auto csv27b = "###^###";
    auto csv28b = "#^##^#";
    auto csv29b = "$";
    auto csv30b = "$^$\n#$#^#$$#^$$\n#^##$#^#$##^#^###$^#^#^$###\n";
    auto csv31b = "1-1\n2-1^2-2\n3-1^3-2^3-3\n\n^5-2\n^^6-3\n";
    auto csv32b = "^1-2^#1-3#\n#2-1#^#2-2#^\n#3-1#^^#3-3#";
    auto csv33b = "\rX\r\nX\n\r\nX\r\n";
    auto csv34b = "#\r#^#X\r#^#X\rY#^#\rY#\r#\r\n#^#X\r\n#^#X\r\nY#^#\r\nY#\r\n#\n#^#X\n#^#X\nY#^#\nY#\n";
    auto csv35b = "abc^def\r#ghi#^#jkl#\r#mno#^pqr\r";

    /* Expected results for Sets A and B, with the default TSV delimiters. */
    auto tsv1 = "a\tb\tc\n";
    auto tsv2 = "a\tbc\t\t\tdef\n";
    auto tsv3 = "\ta\t b \t cd \t\n";
    auto tsv4 = "ß\tßÀß\tあめりか物語\t书名: 五色石\n";
    auto tsv5 = " \t  \t   \n";
    auto tsv6 = " \t  \t   \n";
    auto tsv7 = ",\t,,\t,,,\n";
    auto tsv8 = "\t\"\t\"\"\n";
    auto tsv9 = "ab, de fg\" hij\n";
    auto tsv10 = "";
    auto tsv11 = "\t\n";
    auto tsv12 = "\t\t\n";
    auto tsv13 = " \t  \t   \n";
    auto tsv14 = " \t  \t   \n";
    auto tsv15 = "ab, de fg\" hij\n";
    auto tsv16 = "ab, de fg\" hij\n";
    auto tsv17 = "ab\"\tab\"cd\n";
    auto tsv18 = "\n\n\n";
    auto tsv19 = " \n";
    auto tsv20 = "  \n";
    auto tsv21 = "a\n";
    auto tsv22 = "a\t\n";
    auto tsv23 = "a\tb\n";
    auto tsv24 = "\t\n";
    auto tsv25 = "#\n";
    auto tsv26 = "^\n";
    auto tsv27 = "#^#\n";
    auto tsv28 = "^#^\n";
    auto tsv29 = "$\n";
    auto tsv30 = "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n";
    auto tsv31 = "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n";
    auto tsv32 = "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n";
    auto tsv33 = "\nX\nX\n\nX\n";
    auto tsv34 = " \tX \tX Y\t Y\n \tX \tX Y\t Y\n \tX \tX Y\t Y\n";
    auto tsv35 = "abc\tdef\nghi\tjkl\nmno\tpqr\n";

    /* Expected results with '$' as the TSV delimiter rather than TAB. This also
     * changes which characters get replaced when TAB and '$' appear in the CSV.
     */
    auto tsv1_x = "a$b$c\n";
    auto tsv2_x = "a$bc$$$def\n";
    auto tsv3_x = "$a$ b $ cd $\n";
    auto tsv4_x = "ß$ßÀß$あめりか物語$书名: 五色石\n";
    auto tsv5_x = " $  $   \n";
    auto tsv6_x = "\t$\t\t$\t\t\t\n";
    auto tsv7_x = ",$,,$,,,\n";
    auto tsv8_x = "$\"$\"\"\n";
    auto tsv9_x = "ab, de\tfg\" hij\n";
    auto tsv10_x = "";
    auto tsv11_x = "$\n";
    auto tsv12_x = "$$\n";
    auto tsv13_x = " $  $   \n";
    auto tsv14_x = " $  $   \n";
    auto tsv15_x = "ab, de\tfg\" hij\n";
    auto tsv16_x = "ab, de\tfg\" hij\n";
    auto tsv17_x = "ab\"$ab\"cd\n";
    auto tsv18_x = "\n\n\n";
    auto tsv19_x = "\t\n";
    auto tsv20_x = "\t\t\n";
    auto tsv21_x = "a\n";
    auto tsv22_x = "a$\n";
    auto tsv23_x = "a$b\n";
    auto tsv24_x = "$\n";
    auto tsv25_x = "#\n";
    auto tsv26_x = "^\n";
    auto tsv27_x = "#^#\n";
    auto tsv28_x = "^#^\n";
    auto tsv29_x = " \n";
    auto tsv30_x = " $ \n $  $  \n^# $ #^$# ^$^ #\n";
    auto tsv31_x = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n";
    auto tsv32_x = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n";
    auto tsv33_x = "\nX\nX\n\nX\n";
    auto tsv34_x = " $X $X Y$ Y\n $X $X Y$ Y\n $X $X Y$ Y\n";
    auto tsv35_x = "abc$def\nghi$jkl\nmno$pqr\n";

    /* Expected results with '$' as the TSV delimiter and "|--|" as the
     * delimiter/newline replacement: newlines and '$' in the data become |--|.
     */
    auto tsv1_y = "a$b$c\n";
    auto tsv2_y = "a$bc$$$def\n";
    auto tsv3_y = "$a$ b $ cd $\n";
    auto tsv4_y = "ß$ßÀß$あめりか物語$书名: 五色石\n";
    auto tsv5_y = "|--|$|--||--|$|--||--||--|\n";
    auto tsv6_y = "\t$\t\t$\t\t\t\n";
    auto tsv7_y = ",$,,$,,,\n";
    auto tsv8_y = "$\"$\"\"\n";
    auto tsv9_y = "ab, de\tfg\"|--|hij\n";
    auto tsv10_y = "";
    auto tsv11_y = "$\n";
    auto tsv12_y = "$$\n";
    auto tsv13_y = "|--|$|--||--|$|--||--||--|\n";
    auto tsv14_y = "|--|$|--||--|$|--||--||--|\n";
    auto tsv15_y = "ab, de\tfg\"|--|hij\n";
    auto tsv16_y = "ab, de\tfg\"|--|hij\n";
    auto tsv17_y = "ab\"$ab\"cd\n";
    auto tsv18_y = "\n\n\n";
    auto tsv19_y = "\t\n";
    auto tsv20_y = "\t\t\n";
    auto tsv21_y = "a\n";
    auto tsv22_y = "a$\n";
    auto tsv23_y = "a$b\n";
    auto tsv24_y = "$\n";
    auto tsv25_y = "#\n";
    auto tsv26_y = "^\n";
    auto tsv27_y = "#^#\n";
    auto tsv28_y = "^#^\n";
    auto tsv29_y = "|--|\n";
    auto tsv30_y = "|--|$|--|\n|--|$|--||--|$|--||--|\n^#|--|$|--|#^$#|--|^$^|--|#\n";
    auto tsv31_y = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n";
    auto tsv32_y = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n";
    auto tsv33_y = "\nX\nX\n\nX\n";
    auto tsv34_y = "|--|$X|--|$X|--|Y$|--|Y\n|--|$X|--|$X|--|Y$|--|Y\n|--|$X|--|$X|--|Y$|--|Y\n";
    auto tsv35_y = "abc$def\nghi$jkl\nmno$pqr\n";

    /* Expected results with TAB replaced by "<TAB>" and newline by "<NL>". */
    auto tsv1_z = "a\tb\tc\n";
    auto tsv2_z = "a\tbc\t\t\tdef\n";
    auto tsv3_z = "\ta\t b \t cd \t\n";
    auto tsv4_z = "ß\tßÀß\tあめりか物語\t书名: 五色石\n";
    auto tsv5_z = "<NL>\t<NL><NL>\t<NL><NL><NL>\n";
    auto tsv6_z = "<TAB>\t<TAB><TAB>\t<TAB><TAB><TAB>\n";
    auto tsv7_z = ",\t,,\t,,,\n";
    auto tsv8_z = "\t\"\t\"\"\n";
    auto tsv9_z = "ab, de<TAB>fg\"<NL>hij\n";
    auto tsv10_z = "";
    auto tsv11_z = "\t\n";
    auto tsv12_z = "\t\t\n";
    auto tsv13_z = "<NL>\t<NL><NL>\t<NL><NL><NL>\n";
    auto tsv14_z = "<NL>\t<NL><NL>\t<NL><NL><NL>\n";
    auto tsv15_z = "ab, de<TAB>fg\"<NL>hij\n";
    auto tsv16_z = "ab, de<TAB>fg\"<NL>hij\n";
    auto tsv17_z = "ab\"\tab\"cd\n";
    auto tsv18_z = "\n\n\n";
    auto tsv19_z = "<TAB>\n";
    auto tsv20_z = "<TAB><TAB>\n";
    auto tsv21_z = "a\n";
    auto tsv22_z = "a\t\n";
    auto tsv23_z = "a\tb\n";
    auto tsv24_z = "\t\n";
    auto tsv25_z = "#\n";
    auto tsv26_z = "^\n";
    auto tsv27_z = "#^#\n";
    auto tsv28_z = "^#^\n";
    auto tsv29_z = "$\n";
    auto tsv30_z = "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n";
    auto tsv31_z = "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n";
    auto tsv32_z = "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n";
    auto tsv33_z = "\nX\nX\n\nX\n";
    auto tsv34_z = "<NL>\tX<NL>\tX<NL>Y\t<NL>Y\n<NL>\tX<NL>\tX<NL>Y\t<NL>Y\n<NL>\tX<NL>\tX<NL>Y\t<NL>Y\n";
    auto tsv35_z = "abc\tdef\nghi\tjkl\nmno\tpqr\n";

    /* Parallel arrays of inputs and expected outputs. */
    auto csvSet1a = [csv1a, csv2a, csv3a, csv4a, csv5a, csv6a, csv7a, csv8a, csv9a, csv10a,
                     csv11a, csv12a, csv13a, csv14a, csv15a, csv16a, csv17a, csv18a, csv19a, csv20a,
                     csv21a, csv22a, csv23a, csv24a, csv25a, csv26a, csv27a, csv28a, csv29a, csv30a,
                     csv31a, csv32a, csv33a, csv34a, csv35a];

    auto csvSet1b = [csv1b, csv2b, csv3b, csv4b, csv5b, csv6b, csv7b, csv8b, csv9b, csv10b,
                     csv11b, csv12b, csv13b, csv14b, csv15b, csv16b, csv17b, csv18b, csv19b, csv20b,
                     csv21b, csv22b, csv23b, csv24b, csv25b, csv26b, csv27b, csv28b, csv29b, csv30b,
                     csv31b, csv32b, csv33b, csv34b, csv35b];

    auto tsvSet1  = [tsv1, tsv2, tsv3, tsv4, tsv5, tsv6, tsv7, tsv8, tsv9, tsv10,
                     tsv11, tsv12, tsv13, tsv14, tsv15, tsv16, tsv17, tsv18, tsv19, tsv20,
                     tsv21, tsv22, tsv23, tsv24, tsv25, tsv26, tsv27, tsv28, tsv29, tsv30,
                     tsv31, tsv32, tsv33, tsv34, tsv35];

    auto tsvSet1_x  = [tsv1_x, tsv2_x, tsv3_x, tsv4_x, tsv5_x, tsv6_x, tsv7_x, tsv8_x, tsv9_x, tsv10_x,
                       tsv11_x, tsv12_x, tsv13_x, tsv14_x, tsv15_x, tsv16_x, tsv17_x, tsv18_x, tsv19_x, tsv20_x,
                       tsv21_x, tsv22_x, tsv23_x, tsv24_x, tsv25_x, tsv26_x, tsv27_x, tsv28_x, tsv29_x, tsv30_x,
                       tsv31_x, tsv32_x, tsv33_x, tsv34_x, tsv35_x];

    auto tsvSet1_y  = [tsv1_y, tsv2_y, tsv3_y, tsv4_y, tsv5_y, tsv6_y, tsv7_y, tsv8_y, tsv9_y, tsv10_y,
                       tsv11_y, tsv12_y, tsv13_y, tsv14_y, tsv15_y, tsv16_y, tsv17_y, tsv18_y, tsv19_y, tsv20_y,
                       tsv21_y, tsv22_y, tsv23_y, tsv24_y, tsv25_y, tsv26_y, tsv27_y, tsv28_y, tsv29_y, tsv30_y,
                       tsv31_y, tsv32_y, tsv33_y, tsv34_y, tsv35_y];

    auto tsvSet1_z  = [tsv1_z, tsv2_z, tsv3_z, tsv4_z, tsv5_z, tsv6_z, tsv7_z, tsv8_z, tsv9_z, tsv10_z,
                       tsv11_z, tsv12_z, tsv13_z, tsv14_z, tsv15_z, tsv16_z, tsv17_z, tsv18_z, tsv19_z, tsv20_z,
                       tsv21_z, tsv22_z, tsv23_z, tsv24_z, tsv25_z, tsv26_z, tsv27_z, tsv28_z, tsv29_z, tsv30_z,
                       tsv31_z, tsv32_z, tsv33_z, tsv34_z, tsv35_z];

    /* Run every test case at several read-buffer sizes to exercise chunking. */
    auto bufferSizeTests = [1, 2, 3, 8, 128];

    foreach (bufferSize; bufferSizeTests)
    {
        ubyte[] readBuffer = new ubyte[](bufferSize);

        foreach (i, csva, csvb, tsv, tsv_x, tsv_y, tsv_z; lockstep(csvSet1a, csvSet1b, tsvSet1, tsvSet1_x, tsvSet1_y, tsvSet1_z))
        {
            import std.conv : to;

            /* Byte streams for csv2tsv. Consumed by csv2tsv; reset before re-use. */
            ubyte[] csvInputA = cast(ubyte[])csva;
            ubyte[] csvInputB = cast(ubyte[])csvb;

            /* Set A against the default-delimiter expectations. */
            auto resultA = appender!(char[])();
            csv2tsv(csvInputA, resultA, readBuffer, "csvInputA_defaultTSV");
            assert(tsv == resultA.data,
                   format("Unittest failure. tsv != tsvResultA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, csva, tsv, resultA.data));

            /* Set B against the same expectations; only the CSV delimiters differ. */
            auto resultB = appender!(char[])();
            csv2tsv(csvInputB, resultB, readBuffer, "csvInputB_defaultTSV", 0, '#', '^');
            assert(tsv == resultB.data,
                   format("Unittest failure. tsv != tsvResultB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, csvb, tsv, resultB.data));

            /* Set A with '$' as the TSV separator. */
            csvInputA = cast(ubyte[])csva;
            auto resultXA = appender!(char[])();
            csv2tsv(csvInputA, resultXA, readBuffer, "csvInputA_TSV_WithDollarDelimiter", 0, '"', ',', '$');
            assert(tsv_x == resultXA.data,
                   format("Unittest failure. tsv_x != tsvResult_XA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, csva, tsv_x, resultXA.data));

            /* Set B with '$' as the TSV separator. Same expectations as Set A. */
            csvInputB = cast(ubyte[])csvb;
            auto resultXB = appender!(char[])();
            csv2tsv(csvInputB, resultXB, readBuffer, "csvInputB__TSV_WithDollarDelimiter", 0, '#', '^', '$');
            assert(tsv_x == resultXB.data,
                   format("Unittest failure. tsv_x != tsvResult_XB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, csvb, tsv_x, resultXB.data));

            /* Set A with '$' separator and "|--|" delimiter/newline replacement. */
            csvInputA = cast(ubyte[])csva;
            auto resultYA = appender!(char[])();
            csv2tsv(csvInputA, resultYA, readBuffer, "csvInputA_TSV_WithDollarAndDelimReplacement", 0, '"', ',', '$', "|--|", "|--|");
            assert(tsv_y == resultYA.data,
                   format("Unittest failure. tsv_y != tsvResult_YA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, csva, tsv_y, resultYA.data));

            /* Set B with '$' separator and "|--|" replacement. Same expectations as Set A. */
            csvInputB = cast(ubyte[])csvb;
            auto resultYB = appender!(char[])();
            csv2tsv(csvInputB, resultYB, readBuffer, "csvInputB__TSV_WithDollarAndDelimReplacement", 0, '#', '^', '$', "|--|", "|--|");
            assert(tsv_y == resultYB.data,
                   format("Unittest failure. tsv_y != tsvResult_YB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, csvb, tsv_y, resultYB.data));

            /* Set A with "<TAB>" as the TAB replacement and "<NL>" as the newline replacement. */
            csvInputA = cast(ubyte[])csva;
            auto resultZA = appender!(char[])();
            csv2tsv(csvInputA, resultZA, readBuffer, "csvInputA_TSV_WithDifferentTABandNLReplacements", 0, '"', ',', '\t', "<TAB>", "<NL>");
            assert(tsv_z == resultZA.data,
                   format("Unittest failure. tsv_z != tsvResult_ZA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, csva, tsv_z, resultZA.data));
        }
    }
}
1256

1257
// csv2tsv skiplines tests
unittest
{
    import std.string : representation;

    /* Inputs cover the main line-termination forms (LF, CR, CRLF), both bare
     * and inside quoted fields, with zero to three records each. This exercises
     * the skip-lines logic against every newline style csv2tsv accepts.
     */
    auto csv1 = "";
    auto csv2 = "a";

    auto csv3 = "\n";
    auto csv4 = "\n\n";
    auto csv5 = "\n\n\n";

    auto csv6 = "a\n";
    auto csv7 = "a\nb\n";
    auto csv8 = "a\nb\nc\n";

    auto csv9 = "\"\n\"\n";
    auto csv10 = "\"\n\"\n\"\n\"\n";
    auto csv11 = "\"\n\"\n\"\n\"\n\"\n\"\n";

    auto csv12 = "\r";
    auto csv13 = "\r\r";
    auto csv14 = "\r\r\r";

    auto csv15 = "a\r";
    auto csv16 = "a\rb\r";
    auto csv17 = "a\rb\rc\r";

    auto csv18 = "\"\r\"\r";
    auto csv19 = "\"\r\"\r\"\r\"\r";
    auto csv20 = "\"\r\"\r\"\r\"\r\"\r\"\r";

    auto csv21 = "\r\n";
    auto csv22 = "\r\n\r\n";
    auto csv23 = "\r\n\r\n\r\n";

    auto csv24 = "a\r\n";
    auto csv25 = "a\r\nb\r\n";
    auto csv26 = "a\r\nb\r\nc\r\n";

    auto csv27 = "\"\r\n\"\r\n";
    auto csv28 = "\"\r\n\"\r\n\"\r\n\"\r\n";
    auto csv29 = "\"\r\n\"\r\n\"\r\n\"\r\n\"\r\n\"\r\n";

    /* The Skip 1 expected results. */
    auto tsv1Skip1 = "";
    auto tsv2Skip1 = "";

    auto tsv3Skip1 = "";
    auto tsv4Skip1 = "\n";
    auto tsv5Skip1 = "\n\n";

    auto tsv6Skip1 = "";
    auto tsv7Skip1 = "b\n";
    auto tsv8Skip1 = "b\nc\n";

    auto tsv9Skip1 = "";
    auto tsv10Skip1 = " \n";
    auto tsv11Skip1 = " \n \n";

    auto tsv12Skip1 = "";
    auto tsv13Skip1 = "\n";
    auto tsv14Skip1 = "\n\n";

    auto tsv15Skip1 = "";
    auto tsv16Skip1 = "b\n";
    auto tsv17Skip1 = "b\nc\n";

    auto tsv18Skip1 = "";
    auto tsv19Skip1 = " \n";
    auto tsv20Skip1 = " \n \n";

    auto tsv21Skip1 = "";
    auto tsv22Skip1 = "\n";
    auto tsv23Skip1 = "\n\n";

    auto tsv24Skip1 = "";
    auto tsv25Skip1 = "b\n";
    auto tsv26Skip1 = "b\nc\n";

    auto tsv27Skip1 = "";
    auto tsv28Skip1 = " \n";
    auto tsv29Skip1 = " \n \n";

    /* The Skip 2 expected results. */
    auto tsv1Skip2 = "";
    auto tsv2Skip2 = "";

    auto tsv3Skip2 = "";
    auto tsv4Skip2 = "";
    auto tsv5Skip2 = "\n";

    auto tsv6Skip2 = "";
    auto tsv7Skip2 = "";
    auto tsv8Skip2 = "c\n";

    auto tsv9Skip2 = "";
    auto tsv10Skip2 = "";
    auto tsv11Skip2 = " \n";

    auto tsv12Skip2 = "";
    auto tsv13Skip2 = "";
    auto tsv14Skip2 = "\n";

    auto tsv15Skip2 = "";
    auto tsv16Skip2 = "";
    auto tsv17Skip2 = "c\n";

    auto tsv18Skip2 = "";
    auto tsv19Skip2 = "";
    auto tsv20Skip2 = " \n";

    auto tsv21Skip2 = "";
    auto tsv22Skip2 = "";
    auto tsv23Skip2 = "\n";

    auto tsv24Skip2 = "";
    auto tsv25Skip2 = "";
    auto tsv26Skip2 = "c\n";

    auto tsv27Skip2 = "";
    auto tsv28Skip2 = "";
    auto tsv29Skip2 = " \n";

    auto csvSet =
        [csv1, csv2, csv3, csv4, csv5, csv6, csv7, csv8, csv9, csv10,
         csv11, csv12, csv13, csv14, csv15, csv16, csv17, csv18, csv19, csv20,
         csv21, csv22, csv23, csv24, csv25, csv26, csv27, csv28, csv29];

    auto tsvSkip1Set =
        [tsv1Skip1, tsv2Skip1, tsv3Skip1, tsv4Skip1, tsv5Skip1, tsv6Skip1, tsv7Skip1, tsv8Skip1, tsv9Skip1, tsv10Skip1,
         tsv11Skip1, tsv12Skip1, tsv13Skip1, tsv14Skip1, tsv15Skip1, tsv16Skip1, tsv17Skip1, tsv18Skip1, tsv19Skip1, tsv20Skip1,
         tsv21Skip1, tsv22Skip1, tsv23Skip1, tsv24Skip1, tsv25Skip1, tsv26Skip1, tsv27Skip1, tsv28Skip1, tsv29Skip1];

    auto tsvSkip2Set =
        [tsv1Skip2, tsv2Skip2, tsv3Skip2, tsv4Skip2, tsv5Skip2, tsv6Skip2, tsv7Skip2, tsv8Skip2, tsv9Skip2, tsv10Skip2,
         tsv11Skip2, tsv12Skip2, tsv13Skip2, tsv14Skip2, tsv15Skip2, tsv16Skip2, tsv17Skip2, tsv18Skip2, tsv19Skip2, tsv20Skip2,
         tsv21Skip2, tsv22Skip2, tsv23Skip2, tsv24Skip2, tsv25Skip2, tsv26Skip2, tsv27Skip2, tsv28Skip2, tsv29Skip2];

    /* Buffer sizes smaller than, equal to, and larger than the inputs, to
     * exercise read-buffer boundary handling while skipping lines.
     */
    auto bufferSizeTests = [1, 2, 3, 4, 8, 128];

    foreach (bufferSize; bufferSizeTests)
    {
        ubyte[] readBuffer = new ubyte[](bufferSize);

        foreach (i, csv, tsvSkip1, tsvSkip2; lockstep(csvSet, tsvSkip1Set, tsvSkip2Set))
        {
            ubyte[] csvInput = csv.dup.representation;
            auto csvToTSVSkip1 = appender!(char[])();
            auto csvToTSVSkip2 = appender!(char[])();

            csv2tsv(csvInput, csvToTSVSkip1, readBuffer, "csvToTSVSkip1", 1);

            /* Failure messages name the variables actually being compared
             * (tsvSkip1/tsvSkip2), not the generic tsv/csvToTSV names.
             */
            assert(tsvSkip1 == csvToTSVSkip1.data,
                   format("Unittest failure. tsvSkip1 != csvToTSVSkip1.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csv, tsvSkip1, csvToTSVSkip1.data));

            csv2tsv(csvInput, csvToTSVSkip2, readBuffer, "csvToTSVSkip2", 2);

            assert(tsvSkip2 == csvToTSVSkip2.data,
                   format("Unittest failure. tsvSkip2 != csvToTSVSkip2.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csv, tsvSkip2, csvToTSVSkip2.data));
        }
    }
}
1422

1423
// csv2tsv BOM tests. Note: std.range.lockstep prevents use of @safe
unittest
{
    import std.conv : hexString;
    import std.string : representation;

    /* UTF-8 byte-order mark: EF BB BF. */
    enum utf8BOM = hexString!"efbbbf";

    auto csv1 = "";
    auto csv2 = "a";
    auto csv3 = "ab";
    auto csv4 = "a,b";
    auto csv5 = "a,b\ncdef,ghi\njklmn,opqrs\ntuv,wxyz";

    auto csv1BOM = utf8BOM ~ csv1;
    auto csv2BOM = utf8BOM ~ csv2;
    auto csv3BOM = utf8BOM ~ csv3;
    auto csv4BOM = utf8BOM ~ csv4;
    auto csv5BOM = utf8BOM ~ csv5;

    auto tsv1 = "";
    auto tsv2 = "a\n";
    auto tsv3 = "ab\n";
    auto tsv4 = "a\tb\n";
    auto tsv5 = "a\tb\ncdef\tghi\njklmn\topqrs\ntuv\twxyz\n";

    /* Note: csv1 is the empty string, so tsv1 does not have a trailing newline.
     * However, with the BOM prepended the tsv gets a trailing newline.
     */
    auto tsv1BOM = utf8BOM ~ tsv1 ~ "\n";
    auto tsv2BOM = utf8BOM ~ tsv2;
    auto tsv3BOM = utf8BOM ~ tsv3;
    auto tsv4BOM = utf8BOM ~ tsv4;
    auto tsv5BOM = utf8BOM ~ tsv5;

    auto csvSet = [csv1, csv2, csv3, csv4, csv5];
    auto csvBOMSet = [csv1BOM, csv2BOM, csv3BOM, csv4BOM, csv5BOM];

    auto tsvSet = [tsv1, tsv2, tsv3, tsv4, tsv5];
    auto tsvBOMSet = [tsv1BOM, tsv2BOM, tsv3BOM, tsv4BOM, tsv5BOM];

    /* Buffer sizes both smaller and larger than the three-byte BOM, to cover
     * the case where BOM detection cannot fit in the first read.
     */
    auto bufferSizeTests = [1, 2, 3, 4, 8, 128];

    foreach (bufferSize; bufferSizeTests)
    {
        ubyte[] readBuffer = new ubyte[](bufferSize);

        foreach (i, csv, csvBOM, tsv, tsvBOM; lockstep(csvSet, csvBOMSet, tsvSet, tsvBOMSet))
        {
            ubyte[] csvInput = csv.dup.representation;
            ubyte[] csvBOMInput = csvBOM.dup.representation;

            auto csvToTSV = appender!(char[])();
            auto csvToTSV_NoBOMRemoval = appender!(char[])();
            auto csvBOMToTSV = appender!(char[])();
            auto csvBOMToTSV_NoBOMRemoval = appender!(char[])();

            /* No BOM in the input: removal on or off makes no difference. */
            csv2tsv(csvInput, csvToTSV, readBuffer, "csvToTSV", 0, '"', ',', '\t', " ", " ", true);
            assert(tsv == csvToTSV.data,
                   format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csv, tsv, csvToTSV.data));

            csv2tsv(csvInput, csvToTSV_NoBOMRemoval, readBuffer, "csvToTSV_NoBOMRemoval", 0, '"', ',', '\t', " ", " ", false);
            assert(tsv == csvToTSV_NoBOMRemoval.data,
                   format("Unittest failure. tsv != csvToTSV_NoBOMRemoval.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csv, tsv, csvToTSV_NoBOMRemoval.data));

            csv2tsv(csvBOMInput, csvBOMToTSV, readBuffer, "csvBOMToTSV", 0, '"', ',', '\t', " ", " ", true);
            if (readBuffer.length < utf8BOM.length)
            {
                /* Removing BOMs, but didn't provide enough buffer, so no removal. */
                assert(tsvBOM == csvBOMToTSV.data,
                       format("Unittest failure. tsvBOM != csvBOMToTSV.data. (Small buffer) Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                              i + 1, bufferSize, csvBOM, tsvBOM, csvBOMToTSV.data));
            }
            else
            {
                assert(tsv == csvBOMToTSV.data,
                       format("Unittest failure. tsv != csvBOMToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                              i + 1, bufferSize, csvBOM, tsv, csvBOMToTSV.data));
            }

            csv2tsv(csvBOMInput, csvBOMToTSV_NoBOMRemoval, readBuffer, "csvBOMToTSV_NoBOMRemoval", 0, '"', ',', '\t', " ", " ", false);
            assert(tsvBOM == csvBOMToTSV_NoBOMRemoval.data,
                   format("Unittest failure. tsvBOM != csvBOMToTSV_NoBOMRemoval.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csvBOM, tsvBOM, csvBOMToTSV_NoBOMRemoval.data));
        }
    }
}
