1
/**
2
Command line tool for splitting a file (or files) into multiple output files.
3
Several methods for splitting are available, including splitting by line count,
4
splitting by random assignment, and splitting by random assignment based on
5
key fields.
6

7
Copyright (c) 2020, eBay Inc.
8
Initially written by Jon Degenhardt
9

10
License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
11
*/
12
module tsv_utils.tsv_split;
13

14
import std.exception : enforce;
15
import std.format : format;
16
import std.range;
17
import std.stdio;
18
import std.typecons : tuple, Flag;
19

20
/* Pass "gcopt=cleanup:none" to druntime (supported from compiler version 2.085):
 * skip GC collection/cleanup at program exit. NOTE(review): presumably chosen to
 * speed up process shutdown for this short-lived tool — the OS reclaims memory. */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
21

22
version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Program entry point.
     *
     * Runs command line argument processing, then hands the validated options to
     * tsvSplit for the actual work. Exceptions escaping tsvSplit are caught and
     * reported to the user on stderr.
     */
    int main(string[] cmdArgs)
    {
        /* Merge coverage reports across runs when built with DMD coverage. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSplitOptions cmdopt;
        const argResult = cmdopt.processArgs(cmdArgs);
        if (!argResult[0]) return argResult[1];

        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        int exitCode = 0;
        try
        {
            tsvSplit(cmdopt);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            exitCode = 1;
        }
        return exitCode;
    }
}
62

63
/** Basic help text, printed for '--help'. */
immutable helpText = q"EOS
Synopsis: tsv-split [options] [file...]

Split input lines into multiple output files. There are three modes of
operation:

* Fixed number of lines per file (--l|lines-per-file NUM): Each input
  block of NUM lines is written to a new file. Similar to Unix 'split'.

* Random assignment (--n|num-files NUM): Each input line is written to a
  randomly selected output file. Random selection is from NUM files.

* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
  Input lines are written to output files using fields as a key. Each
  unique key is randomly assigned to one of NUM output files. All lines
  with the same key are written to the same file.

By default, files are written to the current directory and have names
of the form 'part_NNN<suffix>', with 'NNN' being a number and <suffix>
being the extension of the first input file. If the input file is
'file.txt', the names will take the form 'part_NNN.txt'. The output
directory and file names are customizable.

Fields are specified using field number or field name. Field names
require that the input file has a header line.

Use '--help-verbose' for more detailed information.

Options:
EOS";
93

94
/** Detailed help text, printed for '--help-verbose'. */
immutable helpTextVerbose = q"EOS
Synopsis: tsv-split [options] [file...]

Split input lines into multiple output files. There are three modes of
operation:

* Fixed number of lines per file (--l|lines-per-file NUM): Each input
  block of NUM lines is written to a new file. Similar to Unix 'split'.

* Random assignment (--n|num-files NUM): Each input line is written to a
  randomly selected output file. Random selection is from NUM files.

* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
  Input lines are written to output files using fields as a key. Each
  unique key is randomly assigned to one of NUM output files. All lines
  with the same key are written to the same file.

Output files: By default, files are written to the current directory and
have names of the form 'part_NNN<suffix>', with 'NNN' being a number and
<suffix> being the extension of the first input file. If the input file is
'file.txt', the names will take the form 'part_NNN.txt'. The suffix is
empty when reading from standard input. The numeric part defaults to 3
digits for '--l|lines-per-file'. For '--n|num-files' enough digits are
used so all filenames are the same length. The output directory and file
names are customizable.

Header lines: There are two ways to handle input with headers: write a
header to all output files (--H|header), or exclude headers from all
output files ('--I|header-in-only'). The best choice depends on the
follow-up processing. All tsv-utils tools support header lines in multiple
input files, but many other tools do not. For example, GNU parallel works
best on files without header lines.

Random assignment (--n|num-files): Random distribution of records to a set
of files is a common task. When data fits in memory the preferred approach
is usually to shuffle the data and split it into fixed sized blocks. E.g.
'tsv-sample data.tsv | tsv-split -l NUM'. However, alternate approaches
are needed when data is too large for convenient shuffling. tsv-split's
random assignment feature is useful in this case. Each input line is
written to a randomly selected output file. Note that output files will
have similar but not identical numbers of records.

Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS): This
splits a data set into multiple files sharded by key. All lines with the
same key are written to the same file. This partitioning enables parallel
computation based on the key. For example, statistical calculation
('tsv-summarize --group-by') or duplicate removal ('tsv-uniq --fields').
These operations can be parallelized using tools like GNU parallel, which
simplifies concurrent operations on multiple files. Fields are specified
using field number or field name. Field names require that the input file
has a header line. Use '--help-fields' for details about field names.

Random seed: By default, each tsv-split invocation using random assignment
or random assignment by key produces different assignments to the output
files. Using '--s|static-seed' changes this so multiple runs produce the
same assignments. This works by using the same random seed each run. The
seed can be specified using '--v|seed-value'.

Appending to existing files: By default, an error is triggered if an
output file already exists. '--a|append' changes this so that lines are
appended to existing files. (Header lines are not appended to files with
data.) This is useful when adding new data to files created by a previous
tsv-split run. Random assignment should use the same '--n|num-files' value
each run, but different random seeds (avoid '--s|static-seed'). Random
assignment by key should use the same '--n|num-files', '--k|key-fields',
and seed ('--s|static-seed' or '--v|seed-value') each run.

Max number of open files: Random assignment and random assignment by key
are dramatically faster when all output files are kept open. However,
keeping a large number of open files can bump into system limits or limit
resources available to other processes. By default, tsv-split uses up to
4096 open files or the system per-process limit, whichever is smaller.
This can be changed using '--max-open-files', though it cannot be set
larger than the system limit. The system limit varies considerably between
systems. On many systems it is unlimited. On MacOS it is often set to 256.
Use Unix 'ulimit' to display and modify the limits:
* 'ulimit -n' - Show the "soft limit". The per-process maximum.
* 'ulimit -Hn' - Show the "hard limit". The max allowed soft limit.
* 'ulimit -Sn NUM' - Change the "soft limit" to NUM.

Examples:

  # Split a 10 million line file into 1000 files, 10,000 lines each.
  # Output files are part_000.txt, part_001.txt, ... part_999.txt.
  tsv-split data.txt --lines-per-file 10000

  # Same as the previous example, but write files to a subdirectory.
  tsv-split data.txt --dir split_files --lines-per-file 10000

  # Split a file into 10,000 line files, writing a header line to each
  tsv-split data.txt -H --lines-per-file 10000

  # Same as the previous example, but dropping the header line.
  tsv-split data.txt -I --lines-per-file 10000

  # Randomly assign lines to 1000 files
  tsv-split data.txt --num-files 1000

  # Randomly assign lines to 1000 files while keeping unique entries
  # from the 'url' field together.
  tsv-split data.tsv -H -k url --num-files 1000

  # Randomly assign lines to 1000 files. Later, randomly assign lines
  # from a second data file to the same output files.
  tsv-split data1.tsv -n 1000
  tsv-split data2.tsv -n 1000 --append

  # Randomly assign lines to 1000 files using field 3 as a key.
  # Later, add a second file to the same output files.
  tsv-split data1.tsv -n 1000 -k 3 --static-seed
  tsv-split data2.tsv -n 1000 -k 3 --static-seed --append

  # Change the system per-process open file limit for one command.
  # The parens create a sub-shell. The current shell is not changed.
  ( ulimit -Sn 1000 && tsv-split --num-files 1000 data.txt )

Options:
EOS";
212

213
/** Container for command line options and derived data.
 *
 * TsvSplitOptions handles several aspects of command line options. On the input side,
 * it defines the command line options available, performs validation, and sets up any
 * derived state based on the options provided. These activities are handled by the
 * processArgs() member.
 *
 * Once argument processing is complete, TsvSplitOptions is used as a container
 * holding the specific processing options used by the splitting algorithms.
 */
struct TsvSplitOptions
{
    import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader;

    /* Sentinel for "no --suffix given"; '/' is invalid in filenames, so a user
     * cannot pass this value. */
    enum invalidFileSuffix = "///////";

    string programName;                        /// Program name
    InputSourceRange inputSources;             /// Input files
    bool headerInOut = false;                  /// --H|header
    bool headerIn = false;                     /// --I|header-in-only
    size_t linesPerFile = 0;                   /// --l|lines-per-file
    uint numFiles = 0;                         /// --n|num-files
    size_t[] keyFields;                        /// Derived: --k|key-fields
    string dir;                                /// --dir
    string prefix = "part_";                   /// --prefix
    string suffix = invalidFileSuffix;         /// --suffix
    uint digitWidth = 0;                       /// --w|digit-width
    bool appendToExistingFiles = false;        /// --a|append
    bool staticSeed = false;                   /// --s|static-seed
    uint seedValueOptionArg = 0;               /// --v|seed-value
    char delim = '\t';                         /// --d|delimiter
    uint maxOpenFilesArg = 0;                  /// --max-open-files
    bool hasHeader = false;                    /// Derived. True if either '--H|header' or '--I|header-in-only' is set.
    bool keyIsFullLine = false;                /// Derived. True if '--k|key-fields 0' is specified.
    bool usingUnpredictableSeed = true;        /// Derived from --static-seed, --seed-value
    uint seed = 0;                             /// Derived from --static-seed, --seed-value
    uint maxOpenOutputFiles;                   /// Derived.

    /** Process tsv-split command line arguments.
     *
     * Defines the command line options, performs validation, and derives additional
     * state. std.getopt.getopt is called to do the main option processing followed
     * additional validation and derivation.
     *
     * Help text is printed to standard output if help was requested. Error text is
     * written to stderr if invalid input is encountered.
     *
     * A tuple is returned. First value is true if command line arguments were
     * successfully processed and execution should continue, or false if an error
     * occurred or the user asked for help. If false, the second value is the
     * appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. Field indices will have been converted to zero-based.
     */
    auto processArgs(ref string[] cmdArgs)
    {
        import std.algorithm : all, canFind, each, min;
        import std.conv : to;
        import std.file : exists, isDir;
        import std.getopt;
        import std.math : isNaN;
        import std.path : baseName, expandTilde, extension, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist;

        bool helpVerbose = false;                  // --help-verbose
        bool helpFields = false;                   // --help-fields
        bool versionWanted = false;                // --V|version
        string keyFieldsArg;                       // --k|key-fields

        string keyFieldsOptionString = "k|key-fields";

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "     Print more detailed help.", &helpVerbose,
                "help-fields",     "     Print help on specifying fields.", &helpFields,

                std.getopt.config.caseSensitive,
                "H|header",         "     Input files have a header line. Write the header to each output file.", &headerInOut,
                "I|header-in-only", "     Input files have a header line. Do not write the header to output files.", &headerIn,
                std.getopt.config.caseInsensitive,

                "l|lines-per-file", "NUM  Number of lines to write to each output file (excluding the header line).", &linesPerFile,
                "n|num-files",      "NUM  Number of output files to generate.", &numFiles,

                keyFieldsOptionString,
                "<field-list>  Fields to use as key. Lines with the same key are written to the same output file. Use '--k|key-fields 0' to use the entire line as the key.",
                &keyFieldsArg,

                "dir",              "STR  Directory to write to. Default: Current working directory.", &dir,
                "prefix",           "STR  Filename prefix. Default: 'part_'", &prefix,
                "suffix",           "STR  Filename suffix. Default: First input file extension. None for standard input.", &suffix,
                "w|digit-width",    "NUM  Number of digits in filename numeric portion. Default: '--l|lines-per-file': 3. '--n|num-files': Chosen so filenames have the same length. '--w|digit-width 0' uses the default.", &digitWidth,
                "a|append",         "     Append to existing files.", &appendToExistingFiles,

                "s|static-seed",    "     Use the same random seed every run.", &staticSeed,

                std.getopt.config.caseSensitive,
                "v|seed-value",     "NUM  Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
                std.getopt.config.caseInsensitive,

                "d|delimiter",      "CHR  Field delimiter.", &delim,
                "max-open-files",   "NUM  Maximum open file handles to use. Min of 5 required.", &maxOpenFilesArg,

                std.getopt.config.caseSensitive,
                "V|version",        "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (helpFields)
            {
                writeln(fieldListHelpText);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-split"));
                return tuple(false, 0);
            }

            /* Remaining command line args are files.
             */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;

            /* Validation and derivations - Do as much validation prior to header line
             * processing as possible (avoids waiting on stdin).
             *
             * Note: keyFields depends on header line processing, but keyFieldsArg
             * can be used to detect whether the command line argument was specified.
             */

            enforce(!(headerInOut && headerIn),
                    "Use only one of '--H|header' and '--I|header-in-only'.");

            hasHeader = headerInOut || headerIn;

            enforce(linesPerFile != 0 || numFiles != 0,
                    "Either '--l|lines-per-file' or '--n|num-files' is required.");

            enforce(linesPerFile == 0 || numFiles == 0,
                    "'--l|lines-per-file' and '--n|num-files' cannot be used together.");

            enforce(linesPerFile == 0 || keyFieldsArg.length == 0,
                    "'--l|lines-per-file' and '--k|key-fields' cannot be used together.");

            enforce(numFiles != 1, "'--n|num-files' must be two or more.");

            if (!dir.empty)
            {
                dir = dir.expandTilde;
                enforce(dir.exists, format("Directory does not exist: --dir '%s'", dir));
                enforce(dir.isDir, format("Path is not a directory: --dir '%s'", dir));
            }

            /* Seed. */
            import std.random : unpredictableSeed;

            usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);

            if (usingUnpredictableSeed) seed = unpredictableSeed;
            else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
            else if (staticSeed) seed = 2438424139;
            else assert(0, "Internal error, invalid seed option states.");

            /* Maximum number of open files. Mainly applies when --num-files is used.
             *
             * Derive maxOpenOutputFiles. Inputs:
             * - Internal default limit: 4096. This is a somewhat conservative setting.
             * - rlimit open files limit. Defined by '$ ulimit -n'.
             * - '--max-open-files' (maxOpenFilesArg). This adjusts the internal limit,
             *   but only up to the rlimit value.
             * - Four open files are reserved for stdin, stdout, stderr, and one input
             *   file.
             */

            immutable uint internalDefaultMaxOpenFiles = 4096;
            immutable uint numReservedOpenFiles = 4;
            immutable uint rlimitOpenFilesLimit = rlimitCurrOpenFilesLimit();

            enforce(maxOpenFilesArg == 0 || maxOpenFilesArg > numReservedOpenFiles,
                    format("'--max-open-files' must be at least %d.",
                           numReservedOpenFiles + 1));

            enforce(maxOpenFilesArg <= rlimitOpenFilesLimit,
                    format("'--max-open-files' value (%d) greater than current system limit (%d)." ~
                           "\nRun 'ulimit -n' to see the soft limit." ~
                           "\nRun 'ulimit -Hn' to see the hard limit." ~
                           "\nRun 'ulimit -Sn NUM' to change the soft limit.",
                           maxOpenFilesArg, rlimitOpenFilesLimit));

            enforce(rlimitOpenFilesLimit > numReservedOpenFiles,
                    format("System open file limit too small. Current value: %d. Must be %d or more." ~
                           "\nRun 'ulimit -n' to see the soft limit." ~
                           "\nRun 'ulimit -Hn' to see the hard limit." ~
                           "\nRun 'ulimit -Sn NUM' to change the soft limit.",
                           rlimitOpenFilesLimit, numReservedOpenFiles + 1));

            immutable uint openFilesLimit =
                (maxOpenFilesArg != 0)
                ? maxOpenFilesArg
                : min(internalDefaultMaxOpenFiles, rlimitOpenFilesLimit);

            assert(openFilesLimit > numReservedOpenFiles);

            maxOpenOutputFiles = openFilesLimit - numReservedOpenFiles;

            /* Suffix - If not provided, use the extension of the first input file.
             * No suffix if reading from standard input.
             */
            if (suffix == invalidFileSuffix) suffix = filepaths[0].extension;

            /* Ensure forward slash is not included in the filename prefix and suffix.
             * Forward slash is an invalid Unix filename character. However, open file
             * calls could match a directory path, resulting in unintended file
             * creation.
             *
             * The other invalid filename character on Unix is the NULL character.
             * However, the NULL character cannot be entered via Unix command lines,
             * so there is no need to test for it explicitly.
             */
            enforce(!prefix.canFind('/'),
                    "'--prefix' cannot contain forward slash characters. Use '--dir' to specify an output directory.");

            enforce(!suffix.canFind('/'),
                    "'--suffix' cannot contain forward slash characters. Use '--dir' to specify an output directory.");

            /* Digit width - If not specified, or specified as zero, the width is
             * determined by the number of files for --num-files, or defaulted to 3
             * for --lines-per-file.
             */
            if (digitWidth == 0)
            {
                if (numFiles > 0)
                {
                    /* Width of the largest file number, numbering from zero. */
                    digitWidth = 1;
                    uint n = numFiles - 1;
                    while (n >= 10)
                    {
                        n /= 10;
                        ++digitWidth;
                    }
                }
                else
                {
                    digitWidth = 3;
                }
            }
            assert(digitWidth != 0);

            /*
             * Create the inputSourceRange and perform header line processing.
             */
            ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
            inputSources = inputSourceRange(filepaths, readHeader);

            string[] headerFields;

            if (hasHeader) headerFields = inputSources.front.header.split(delim).to!(string[]);

            if (!keyFieldsArg.empty)
            {
                keyFields =
                    keyFieldsArg
                    .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
                    (hasHeader, headerFields, keyFieldsOptionString)
                    .array;
            }

            if (keyFields.length > 0)
            {
                if (keyFields.length == 1 && keyFields[0] == 0)
                {
                    keyIsFullLine = true;
                }
                else
                {
                    enforce(keyFields.all!(x => x != 0),
                            "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");

                    keyFields.each!((ref x) => --x);  // Convert to zero-based indexing.
                }
            }

        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
523

524
/* TsvSplitOptions unit tests (command-line argument processing).
 *
 * Basic tests. Many cases are covered in executable tests, including all error cases,
 * as errors write to stderr.
 */
unittest
{
    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.conv : to;
    import std.file : mkdir, rmdirRecurse;
    import std.path : buildPath;

    /* Point the tests at a real dummy file so argument processing never needs to
     * touch standard input, which it might otherwise read when looking for a
     * header line.
     */
    auto tmpDir = makeUnittestTempDir("tsv_split_bylinecount");
    scope(exit) tmpDir.rmdirRecurse;

    string inputFile = buildPath(tmpDir, "somefile.txt");
    inputFile.File("w").writeln("Hello World!");

    {
        auto testArgs = ["unittest", "--lines-per-file", "10", inputFile];
        TsvSplitOptions opts;
        const result = opts.processArgs(testArgs);

        assert(opts.linesPerFile == 10);
        assert(opts.keyFields.empty);
        assert(opts.numFiles == 0);
        assert(!opts.hasHeader);
    }
    {
        auto testArgs = ["unittest", "--num-files", "20", inputFile];
        TsvSplitOptions opts;
        const result = opts.processArgs(testArgs);

        assert(opts.linesPerFile == 0);
        assert(opts.keyFields.empty);
        assert(opts.numFiles == 20);
        assert(!opts.hasHeader);
    }
    {
        auto testArgs = ["unittest", "-n", "5", "--key-fields", "1-3", inputFile];
        TsvSplitOptions opts;
        const result = opts.processArgs(testArgs);

        assert(opts.linesPerFile == 0);
        assert(opts.keyFields == [0, 1, 2]);
        assert(opts.numFiles == 5);
        assert(!opts.hasHeader);
        assert(!opts.keyIsFullLine);
    }
    {
        auto testArgs = ["unittest", "-n", "5", "-k", "0", inputFile];
        TsvSplitOptions opts;
        const result = opts.processArgs(testArgs);

        assert(opts.linesPerFile == 0);
        assert(opts.numFiles == 5);
        assert(!opts.hasHeader);
        assert(opts.keyIsFullLine);
    }
    {
        auto testArgs = ["unittest", "-n", "2", "--header", inputFile];
        TsvSplitOptions opts;
        const result = opts.processArgs(testArgs);

        assert(opts.headerInOut);
        assert(opts.hasHeader);
        assert(!opts.headerIn);
    }
    {
        auto testArgs = ["unittest", "-n", "2", "--header-in-only", inputFile];
        TsvSplitOptions opts;
        const result = opts.processArgs(testArgs);

        assert(!opts.headerInOut);
        assert(opts.hasHeader);
        assert(opts.headerIn);
    }

    /* Verify the derived suffix for a given command line. */
    static void testSuffix(string[] args, string expectedSuffix)
    {
        TsvSplitOptions opts;
        auto savedArgs = args.to!string;
        const result = opts.processArgs(args);

        assert(result[0], format("[testSuffix] cmdopt.processArgs(%s) returned false.", savedArgs));
        assert(opts.suffix == expectedSuffix,
               format("[testSuffix] Incorrect cmdopt.suffix. Expected: '%s', Actual: '%s'\n   cmdopt.processArgs(%s)",
                      expectedSuffix, opts.suffix, savedArgs));
    }

    /* In these tests, don't use headers and when files are listed, use 'inputFile' first.
     * This makes sure there is no attempt to read standard input and that there won't be an
     * open failure trying to find a file.
     */
    testSuffix(["unittest", "-n", "2"], "");
    testSuffix(["unittest", "-n", "2", "--", "-"], "");
    testSuffix(["unittest", "-n", "2", "--suffix", "_123"], "_123");
    testSuffix(["unittest", "-n", "2", inputFile], ".txt");
    testSuffix(["unittest", "-n", "2", inputFile, "anotherfile.pqr"], ".txt");
    testSuffix(["unittest", "-n", "2", "--suffix", ".X", inputFile, "anotherfile.pqr"], ".X");
    testSuffix(["unittest", "-n", "2", "--suffix", "", inputFile], "");
    testSuffix(["unittest", "-n", "2", "--", "-", inputFile], "");
    testSuffix(["unittest", "-n", "2", "--", inputFile, "-"], ".txt");

    /* Verify the derived digit width for a given command line. */
    static void testDigitWidth(string[] args, uint expected)
    {
        TsvSplitOptions opts;
        auto savedArgs = args.to!string;
        const result = opts.processArgs(args);

        assert(result[0], format("[testDigitWidth] cmdopt.processArgs(%s) returned false.", savedArgs));
        assert(opts.digitWidth == expected,
               format("[testDigitWidth] Incorrect cmdopt.digitWidth. Expected: %d, Actual: %d\n   cmdopt.processArgs(%s)",
                      expected, opts.digitWidth, savedArgs));
    }

    testDigitWidth(["unittest", "-n", "2", inputFile], 1);
    testDigitWidth(["unittest", "-n", "2", "--digit-width" , "0", inputFile], 1);
    testDigitWidth(["unittest", "-n", "10", inputFile], 1);
    testDigitWidth(["unittest", "-n", "11", inputFile], 2);
    testDigitWidth(["unittest", "-n", "555", inputFile], 3);
    testDigitWidth(["unittest", "-n", "555", "--digit-width" , "2", inputFile], 2);
    testDigitWidth(["unittest", "-n", "555", "--digit-width" , "4", inputFile], 4);
    testDigitWidth(["unittest", "-l", "10", inputFile], 3);
    testDigitWidth(["unittest", "-l", "10000", inputFile], 3);
    testDigitWidth(["unittest", "-l", "10000", "--digit-width", "0", inputFile], 3);
    testDigitWidth(["unittest", "-l", "10000", "--digit-width", "1", inputFile], 1);
    testDigitWidth(["unittest", "-l", "10000", "--digit-width", "5", inputFile], 5);
}
657

658
/** Get the rlimit current number of open files the process is allowed.
 *
 * This routine returns the current soft limit on the number of open files the process
 * is allowed. This is the number returned by the command: '$ ulimit -n'.
 *
 * This routine translates this value to a 'uint', as tsv-split uses 'uint' for
 * tracking output files. The rlimit 'rlim_t' type is usually 'ulong' or 'long'.
 * RLIM_INFINITY and any value larger than 'uint.max' is translated to 'uint.max'.
 *
 * An exception is thrown if call to 'getrlimit' fails.
 */
uint rlimitCurrOpenFilesLimit()
{
    import core.sys.posix.sys.resource :
        rlim_t, rlimit, getrlimit, RLIMIT_NOFILE, RLIM_INFINITY, RLIM_SAVED_CUR;
    import std.conv : to;

    rlimit openFilesRLimit;

    enforce(getrlimit(RLIMIT_NOFILE, &openFilesRLimit) == 0,
            "Internal error: getrlimit call failed");

    immutable rlim_t softLimit = openFilesRLimit.rlim_cur;

    /* Fold infinity, the POSIX "saved" sentinel, and anything out of uint range
     * (including negative values when rlim_t is signed) into uint.max. */
    immutable bool fitsInUint =
        softLimit != RLIM_INFINITY &&
        softLimit != RLIM_SAVED_CUR &&
        softLimit >= 0 &&
        softLimit <= uint.max;

    return fitsInUint ? softLimit.to!uint : uint.max;
}
692

693
/** Invokes the proper split routine based on the command line arguments.
 *
 * This routine is the top-level control after command line argument processing is
 * done. It's primary job is to set up data structures and invoke the correct
 * processing routine based on the command line arguments.
 */
void tsvSplit(ref TsvSplitOptions cmdopt)
{
    /* Argument processing guarantees at least one input source (stdin by default). */
    assert(!cmdopt.inputSources.empty);

    if (cmdopt.linesPerFile != 0)
    {
        splitByLineCount(cmdopt);
        return;
    }

    /* Random assignment modes: distribute input lines over a fixed set of files. */
    auto outputFiles = SplitOutputFiles(
        cmdopt.numFiles, cmdopt.dir, cmdopt.prefix, cmdopt.suffix,
        cmdopt.digitWidth, cmdopt.headerInOut, cmdopt.maxOpenOutputFiles,
        cmdopt.inputSources.front.header);

    if (!cmdopt.appendToExistingFiles)
    {
        string existingFile = outputFiles.checkIfFilesExist;
        enforce(existingFile.length == 0,
                format("One or more output files already exist. Use '--a|append' to append to existing files. File: '%s'.",
                       existingFile));
    }

    if (cmdopt.keyFields.length == 0)
    {
        splitLinesRandomly(cmdopt, outputFiles);
    }
    else
    {
        splitLinesByKey(cmdopt, outputFiles);
    }
}
736

737
/** A SplitOutputFiles struct holds a collection of output files.
 *
 * This struct manages a collection of output files used when writing to multiple
 * files at once. This includes constructing filenames, opening and closing files,
 * and writing data and header lines.
 *
 * Both random assignment (splitLinesRandomly) and random assignment by key
 * (splitLinesByKey) use a SplitOutputFiles struct to manage output files.
 *
 * All the properties of the output file set, including the header line, are
 * specified in the constructor. Files are not opened at construction time; each
 * file is opened lazily on first write, and may be transparently closed and
 * reopened (in append mode) to stay under the open file handle limit.
 *
 * Individual output files are written to based on their zero-based index in the
 * output collection. The caller selects the output file number to write to and
 * calls 'writeDataLine' to write a line. The header is written if needed.
 */
struct SplitOutputFiles
{
    import std.conv : to;
    import std.file : exists;
    import std.path : buildPath;
    import std.stdio : File;

    /* Per-file bookkeeping. */
    static struct OutputFile
    {
        string filename;  // Full path to the output file.
        File ofile;       // File handle; only meaningful while isOpen is true.
        bool hasData;     // True once the file contains any data (governs header writes).
        bool isOpen;    // Track separately due to https://github.com/dlang/phobos/pull/7397
    }

    private uint _numFiles;       // Total number of output files in the set.
    private bool _writeHeaders;   // Whether to write the header line to each file.
    private uint _maxOpenFiles;   // Cap on simultaneously open file handles.

    private OutputFile[] _outputFiles;
    private uint _numOpenFiles = 0;   // Invariant: _numOpenFiles <= _maxOpenFiles.
    private string _header;           // Header line written when _writeHeaders is set.

    /* Constructor: records the output file set properties and assigns a filename
     * to every output file. No files are opened here; opening is deferred to the
     * first writeDataLine call for each file.
     */
    this(uint numFiles, string dir, string filePrefix, string fileSuffix,
         uint fileDigitWidth, bool writeHeaders, uint maxOpenFiles, string header)
    {
        assert(numFiles >= 2);
        assert(maxOpenFiles >= 1);

        _numFiles = numFiles;
        _writeHeaders = writeHeaders;
        _maxOpenFiles = maxOpenFiles;
        _header = header;

        _outputFiles.length = numFiles;

        /* Filename assignment. */
        foreach (i, ref f; _outputFiles)
        {
            f.filename =
                buildPath(dir, format("%s%.*d%s", filePrefix, fileDigitWidth, i, fileSuffix));
        }
    }

    /* Destructor ensures all files are closed.
     *
     * Note: A dual check on whether the file is open is made. This is to avoid a
     * Phobos bug where std.File doesn't properly maintain the state of open files
     * if the File.open call fails. See: https://github.com/dlang/phobos/pull/7397.
     */
    ~this()
    {
        foreach (ref f; _outputFiles)
        {
            if (f.isOpen && f.ofile.isOpen)
            {
                assert(_numOpenFiles >= 1);

                f.ofile.close;
                f.isOpen = false;
                _numOpenFiles--;
            }
        }
    }

    /* Check if any of the files already exist.
     *
     * Returns the empty string if none of the files exist. Otherwise returns the
     * filename of the first existing file found. This is to facilitate error
     * message generation.
     */
    string checkIfFilesExist()
    {
        foreach (f; _outputFiles) if (f.filename.exists) return f.filename;
        return "";
    }

    /* Picks a random file to close. Used when the open file handle limit has been
     * reached.
     *
     * A random starting point is chosen and the file list is scanned cyclically
     * from there until an open file is found.
     */
    private void closeSomeFile()
    {
        import std.random : uniform;
        assert(_numOpenFiles > 0);

        immutable uint start = uniform(0, _numFiles);

        foreach (i; cycle(iota(_numFiles), start).take(_numFiles))
        {
            if (_outputFiles[i].isOpen)
            {
                _outputFiles[i].ofile.close;
                _outputFiles[i].isOpen = false;
                _numOpenFiles--;

                return;
            }
        }

        assert(false, "[SplitOutputFiles.closeSomeFile]: Could not find file to close.");
    }

    /* Write a line to the specified file number.
     *
     * A header is written to the file if headers are being written and this is the
     * first data written to the file.
     */
    void writeDataLine(uint fileNum, const char[] data)
    {
        assert(fileNum < _numFiles);
        assert(fileNum < _outputFiles.length);
        assert(_numOpenFiles <= _maxOpenFiles);

        OutputFile* outputFile = &_outputFiles[fileNum];

        if (!outputFile.isOpen)
        {
            /* Stay under the handle limit: close a random open file first. */
            if (_numOpenFiles == _maxOpenFiles) closeSomeFile();
            assert(_numOpenFiles < _maxOpenFiles);

            outputFile.ofile = outputFile.filename.File("a");
            outputFile.isOpen = true;
            _numOpenFiles++;

            if (!outputFile.hasData)
            {
                /* The file may have pre-existing data (e.g. append mode).
                 * 'ulong.max' is File.size's error/unknown return value. */
                ulong filesize = outputFile.ofile.size;
                outputFile.hasData = (filesize > 0 && filesize != ulong.max);
            }
        }

        if (_writeHeaders && !outputFile.hasData) outputFile.ofile.writeln(_header);

        outputFile.ofile.writeln(data);
        outputFile.hasData = true;
    }
}
891

892
/** Write input lines to multiple files, randomly selecting an output file for each line.
 */
void splitLinesRandomly(ref TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles)
{
    import std.random : Random = Mt19937, uniform;
    import tsv_utils.common.utils : bufferedByLine, InputSourceRange;

    /* inputSources must be an InputSourceRange and include at least stdin. */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto rng = Random(cmdopt.seed);

    /* Each line is independently assigned a uniformly random output file. */
    foreach (inputStream; cmdopt.inputSources)
    {
        foreach (line; inputStream.file.bufferedByLine)
        {
            immutable uint fileNum = uniform(0, cmdopt.numFiles, rng);
            outputFiles.writeDataLine(fileNum, line);
        }
    }
}
915

916
/** Write input lines to multiple output files using fields as a random selection key.
 *
 * Each input line is written to an output file. The output file is chosen using
 * fields as a key. Each unique key is assigned to a file. All lines having the
 * same key are written to the same file.
 */
void splitLinesByKey(ref TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles)
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.digest.murmurhash;
    import tsv_utils.common.utils : bufferedByLine, InputFieldReordering,
        InputSourceRange, throwIfWindowsNewlineOnUnix;

    assert(cmdopt.keyFields.length > 0);

    /* inputSources must be an InputSourceRange and include at least stdin. */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys.

    /* Create a mapping for the key fields. Null when the full line is the key. */
    auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields);

    /* Process each line. Data lines start at line 2 when a header is present. */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        foreach (fileLineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);

            /* Murmurhash works by successively adding individual keys, then finalizing.
             * Adding individual keys is simpler if the full-line-as-key and individual
             * fields as keys cases are separated.
             */
            auto hasher = MurmurHash3!32(cmdopt.seed);

            if (cmdopt.keyIsFullLine)
            {
                hasher.put(cast(ubyte[]) line);
            }
            else
            {
                assert(keyFieldsReordering !is null);

                /* Gather the key field values and assemble the key. */
                keyFieldsReordering.initNewLine;
                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                {
                    keyFieldsReordering.processNextField(fieldIndex, fieldValue);
                    if (keyFieldsReordering.allFieldsFilled) break;
                }

                enforce(keyFieldsReordering.allFieldsFilled,
                        format("Not enough fields in line. File: %s, Line: %s",
                               inputStream.name, fileLineNum));

                /* Key fields are hashed in order, joined by the delimiter, so a
                 * multi-field key hashes the same as its delimited text form. */
                foreach (count, key; keyFieldsReordering.outputFields.enumerate)
                {
                    if (count > 0) hasher.put(delimArray);
                    hasher.put(cast(ubyte[]) key);
                }
            }

            hasher.finish;
            immutable uint outputFileNum = hasher.get % cmdopt.numFiles;
            outputFiles.writeDataLine(outputFileNum, line);
        }
    }
}
990

991
/** Write input lines to multiple files, splitting based on line count.
 *
 * Input is read in fixed-size chunks and each chunk is scanned for newlines,
 * writing sub-chunks to the current output file until its line quota is met.
 *
 * Note: readBufferSize is an argument primarily for unit test purposes. Normal uses
 * should use the default value.
 */
void splitByLineCount(ref TsvSplitOptions cmdopt, const size_t readBufferSize = 1024L * 128L)
{
    import std.file : exists;
    import std.path : buildPath;
    import std.stdio : File;
    import tsv_utils.common.utils : InputSourceRange;

    assert (readBufferSize > 0);
    ubyte[] readBuffer = new ubyte[readBufferSize];

    /* inputSources must be an InputSourceRange and include at least stdin. */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* Header line (terminator included) replicated at the start of each output file. */
    string header = !cmdopt.headerInOut ? "" :
        cmdopt.inputSources.front.header(Yes.keepTerminator);
    size_t nextOutputFileNum = 0;
    File outputFile;
    string outputFileName;
    bool isOutputFileOpen = false;           // Open file status tracked separately due to phobos bugs
    size_t outputFileRemainingLines;         // Lines still owed to the current output file.

    /* nextNewlineIndex finds the index of the next newline character. It is an
     * alternative to std.algorithm.countUntil. Invoking 'find' directly results in
     * 'memchr' being used (faster). The current 'countUntil' implementation does
     * forward to find, but the way it is done avoids the memchr call optimization.
     *
     * Returns -1 when the buffer contains no newline.
     */
    static long nextNewlineIndex(const ubyte[] buffer)
    {
        import std.algorithm : find;
        immutable ubyte newlineChar = '\n';
        immutable size_t buflen = buffer.length;
        immutable size_t findlen = buffer.find(newlineChar).length;

        return findlen > 0 ? buflen - findlen : -1;
    }

    foreach (inputStream; cmdopt.inputSources)
    {
        foreach (ref ubyte[] inputChunk; inputStream.file.byChunk(readBuffer))
        {
            /* A chunk may span multiple output files; walk it in sub-chunks. */
            size_t nextOutputChunkStart = 0;
            auto remainingInputChunk = inputChunk[nextOutputChunkStart .. $];

            while (!remainingInputChunk.empty)
            {
                /* See if the next output file needs to be opened. */
                if (!isOutputFileOpen)
                {
                    outputFileName =
                        buildPath(cmdopt.dir,
                                  format("%s%.*d%s", cmdopt.prefix,
                                         cmdopt.digitWidth, nextOutputFileNum, cmdopt.suffix));

                    enforce(cmdopt.appendToExistingFiles || !outputFileName.exists,
                            format("Output file already exists. Use '--a|append' to append to existing files. File: '%s'.",
                                   outputFileName));

                    outputFile = outputFileName.File("ab");
                    outputFile.setvbuf(1024L * 64L, _IOFBF);
                    isOutputFileOpen = true;
                    ++nextOutputFileNum;
                    outputFileRemainingLines = cmdopt.linesPerFile;

                    if (cmdopt.headerInOut)
                    {
                        /* Write the header only when the file is empty ('ulong.max'
                         * is File.size's error/unknown return value). */
                        ulong filesize = outputFile.size;
                        if (filesize == 0 || filesize == ulong.max) outputFile.rawWrite(header);
                    }
                }

                /* Find more newlines for the current output file. */

                assert(outputFileRemainingLines > 0);

                size_t nextOutputChunkEnd = nextOutputChunkStart;

                while (outputFileRemainingLines != 0 && !remainingInputChunk.empty)
                {
                    /* Note: newlineIndex is relative to 'remainingInputChunk', not
                     * 'inputChunk'. Updates to variables referring to 'inputChunk'
                     * need to reflect this. In particular, 'nextOutputChunkEnd'.
                     */
                    immutable newlineIndex = nextNewlineIndex(remainingInputChunk);

                    if (newlineIndex == -1)
                    {
                        /* No newline in the remainder of the chunk; the partial line
                         * still belongs to the current output file. */
                        nextOutputChunkEnd = inputChunk.length;
                    }
                    else
                    {
                        --outputFileRemainingLines;
                        nextOutputChunkEnd += (newlineIndex + 1);
                    }

                    remainingInputChunk = inputChunk[nextOutputChunkEnd .. $];
                }

                assert(nextOutputChunkStart < nextOutputChunkEnd);
                assert(nextOutputChunkEnd <= inputChunk.length);

                outputFile.rawWrite(inputChunk[nextOutputChunkStart .. nextOutputChunkEnd]);

                if (outputFileRemainingLines == 0)
                {
                    outputFile.close;
                    isOutputFileOpen = false;
                }

                nextOutputChunkStart = nextOutputChunkEnd;

                assert(remainingInputChunk.length == inputChunk.length - nextOutputChunkStart);
            }
        }
    }
}
1112

1113
/* splitByLineCount unit tests.
 *
 * These tests are primarily for buffer management. There are edge cases involving the
 * interaction of buffer size, input file size, lines-per-file, and newline placement
 * that are difficult to test against the executable.
 */
unittest
{
    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.algorithm : min;
    import std.array : appender;
    import std.conv : to;
    import std.file : exists, mkdir, rmdirRecurse;
    import std.path : buildPath;
    import std.process : escapeShellCommand, executeShell;

    /* Test setup
     *
     * A set of input files is created, with names: input_NxM.txt, where
     * N is the number of characters in each row and M is the number of rows (lines).
     * The resulting files are put in the "lc_input" directory ('inputDir' variable)
     * and have names:
     *    input_0x2.txt, input_0x3.txt, ... input_5x5.txt.
     *
     * A standalone block of code produces the expected result files for splitting an
     * input file into a set of output files. This duplicates the splitByLineCount
     * output. This is done for lines-per-file counts 1 to 5. Each result set is placed
     * in a subdirectory under "lc_expected" ('expectedDir' variable). Subdirectories
     * have names like: "0x2_by_1", "0x3_by_1", ..., "5x5_by_4".
     *
     * splitByLineCount is called for all the same input files and lines-per-file
     * settings used to produce the expected output. This is done via
     * testSplitByLineCount, which calls command line argument processing and
     * splitByLineCount, similar to how the main program works. The results are
     * written to a subdirectory. The subdirectory is compared to the expected output
     * directory using the system 'diff' command.
     *
     * splitByLineCount is called multiple times for each expected output case. The
     * different calls iterate over a series of small readBufferSizes. This is how
     * tests for edge cases in the readBufferSize vs line lengths, newline placement,
     * etc., are accomplished.
     *
     * Note: One way to understand what is going on is to comment out the line:
     *
     *    scope(exit) testDir.rmdirRecurse;
     *
     * Then run the test (e.g. 'make test') and look at the directory structure left
     * behind. Print out the 'testDir' directory to see where it is located.
     */

    /* testSplitByLineCount acts as a surrogate for main() and tsvSplit(). It makes the
     * call to splitByLineCount and calls 'diff' to compare the output directory to the
     * expected directory. An assert is thrown if the directories do not match.
     */
    static void testSplitByLineCount(string[] cmdArgs, string expectedDir,
                                 size_t readBufferSize = 1024L * 512L)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testSplitByLineCount] cmdArgs must not be empty.");

        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testSplitByLineCount] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSplitOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
        assert(cmdopt.linesPerFile != 0, "[testSplitByLineCount] --lines-per-file is required.");
        assert(!cmdopt.dir.empty, "[testSplitByLineCount] --dir is required.");

        splitByLineCount(cmdopt, readBufferSize);

        /* Diff command setup. */
        auto diffCmdArgs = ["diff", expectedDir, cmdopt.dir];
        auto diffResult = executeShell(escapeShellCommand(diffCmdArgs));
        assert(diffResult.status == 0,
               format("[testSplitByLineCount]\n  cmd: %s\n  readBufferSize: %d\n  expectedDir: %s\n------ Diff ------%s\n-------",
                      savedCmdArgs, readBufferSize, expectedDir, diffResult.output));
    }

    auto testDir = makeUnittestTempDir("tsv_split_bylinecount");
    scope(exit) testDir.rmdirRecurse;

    auto inputDir = buildPath(testDir, "lc_input");
    auto outputDir = buildPath(testDir, "lc_output");
    auto expectedDir = buildPath(testDir, "lc_expected");

    mkdir(inputDir);
    mkdir(outputDir);
    mkdir(expectedDir);

    /* Builds the canonical input file path for a line-length/line-count combo. */
    static string buildInputFilePath(string dir, long inputLineLength, long inputFileNumLines)
    {
        return buildPath(dir, format("input_%dx%d.txt", inputLineLength, inputFileNumLines));
    }

    string[5] outputRowData =
        [
            "abcde",
            "fghij",
            "klmno",
            "pqrst",
            "uvwxy"
        ];

    /* The main test loop. Iterates over input line lengths, numbers of rows,
     * lines-per-file, and finally readBufferSize lengths. All combos are tested.
     */
    foreach (inputLineLength; 0 .. 6)
    {
        foreach (inputFileNumLines; 2 .. 6)
        {
            auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);

            {
                /* Generate the input file for this combo. */
                auto ofile = inputFile.File("w");
                auto output = appender!(char[])();
                foreach (m; 0 .. inputFileNumLines)
                {
                    put(output, outputRowData[m][0 .. inputLineLength]);
                    put(output, '\n');
                }
                ofile.write(output.data);
                ofile.close;
            }

            /* Iterate over the different lines-per-file lengths.
             * - Create an expected output directory and files for each.
             * - Test with different readBufferSize values.
             */
            foreach (outputFileNumLines; 1 .. min(5, inputFileNumLines))
            {
                auto expectedSubDir =
                    buildPath(expectedDir, format("%dx%d_by_%d", inputLineLength,
                                                  inputFileNumLines, outputFileNumLines));
                mkdir(expectedSubDir);

                size_t filenum = 0;
                size_t linesWritten = 0;
                while (linesWritten < inputFileNumLines)
                {
                    auto expectedFile = buildPath(expectedSubDir, format("part_%d.txt", filenum));
                    auto f = expectedFile.File("w");
                    auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);
                    foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
                    {
                        f.writeln(line[0 .. inputLineLength]);
                    }
                    linesWritten += linesToWrite;
                    ++filenum;
                    f.close;
                }

                /* Test the different readBufferSizes.
                 * - An output directory is created for the run and deleted afterward.
                 * - First test the default size.
                 * - Then iterate over small readBufferSize values.
                 */
                auto outputSubDir =
                    buildPath(outputDir, format("%dx%d_by_%d", inputLineLength,
                                                inputFileNumLines, outputFileNumLines));
                mkdir(outputSubDir);

                testSplitByLineCount(
                    ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
                     "--digit-width", "1", inputFile],
                    expectedSubDir);

                outputSubDir.rmdirRecurse;

                foreach (readBufSize; 1 .. 8)
                {
                     mkdir(outputSubDir);

                     testSplitByLineCount(
                         ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
                          "--digit-width", "1", inputFile],
                         expectedSubDir, readBufSize);

                     outputSubDir.rmdirRecurse;
                }
            }
        }
    }

    {
        /* Tests for the special case where readBufferSize is smaller than the header
         * line. We'll reuse the input_5x4.txt input file and write 1 line-per-file.
         */
        immutable inputLineLength = 5;
        immutable inputFileNumLines = 4;
        immutable outputFileNumLines = 1;

        auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);
        assert(inputFile.exists);

        auto expectedSubDirHeader =
            buildPath(expectedDir, format("%dx%d_by_%d_header", inputLineLength,
                                          inputFileNumLines, outputFileNumLines));

        auto expectedSubDirHeaderInOnly =
            buildPath(expectedDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
                                          inputFileNumLines, outputFileNumLines));

        mkdir(expectedSubDirHeader);
        mkdir(expectedSubDirHeaderInOnly);

        /* Generate the expected results. Cheat by starting with linesWritten = 1. This
         * automatically excludes the header line, but keeps the loop code consistent
         * with the main test loop.
         */
        size_t filenum = 0;
        size_t linesWritten = 1;
        while (linesWritten < inputFileNumLines)
        {
            auto expectedFileHeader = buildPath(expectedSubDirHeader, format("part_%d.txt", filenum));
            auto expectedFileHeaderInOnly = buildPath(expectedSubDirHeaderInOnly,
                                                      format("part_%d.txt", filenum));
            auto fHeader = expectedFileHeader.File("w");
            auto fHeaderInOnly = expectedFileHeaderInOnly.File("w");
            auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);

            /* --header replicates the header into every output file;
             * --header-in-only consumes it without writing it out. */
            fHeader.writeln(outputRowData[0][0 .. inputLineLength]);
            foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
            {
                fHeader.writeln(line[0 .. inputLineLength]);
                fHeaderInOnly.writeln(line[0 .. inputLineLength]);
            }
            linesWritten += linesToWrite;
            ++filenum;
            fHeader.close;
            fHeaderInOnly.close;
        }

        /* Now run the tests. */
        auto outputSubDirHeader =
            buildPath(outputDir, format("%dx%d_by_%d_header", inputLineLength,
                                        inputFileNumLines, outputFileNumLines));
        auto outputSubDirHeaderInOnly =
            buildPath(outputDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
                                        inputFileNumLines, outputFileNumLines));

        foreach (readBufSize; 1 .. 6)
        {
            mkdir(outputSubDirHeader);
            mkdir(outputSubDirHeaderInOnly);

            testSplitByLineCount(
                ["test", "--header", "--lines-per-file", outputFileNumLines.to!string,
                 "--dir", outputSubDirHeader, "--digit-width", "1", inputFile],
                expectedSubDirHeader, readBufSize);

            testSplitByLineCount(
                ["test", "--header-in-only", "--lines-per-file", outputFileNumLines.to!string,
                 "--dir", outputSubDirHeaderInOnly, "--digit-width", "1", inputFile],
                expectedSubDirHeaderInOnly, readBufSize);

            outputSubDirHeader.rmdirRecurse;
            outputSubDirHeaderInOnly.rmdirRecurse;
        }
    }
}

Read our documentation on viewing source code .

Loading