1
/**
2
Command line tool for shuffling or sampling lines from input streams. Several methods
3
are available, including weighted and unweighted shuffling, simple and weighted random
4
sampling, sampling with replacement, Bernoulli sampling, and distinct sampling.
5

6
Copyright (c) 2017-2020, eBay Inc.
7
Initially written by Jon Degenhardt
8

9
License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
10
*/
11
module tsv_utils.tsv_sample;
12

13
import std.array : appender, Appender, RefAppender;
14
import std.exception : enforce;
15
import std.format : format;
16
import std.range;
17
import std.stdio;
18
import std.typecons : tuple, Flag;
19

20
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
21

22
version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Program entry point.
     *
     * Processes the command line arguments, then delegates the real work to
     * tsvSample. Any exception escaping the sampling routines is caught and
     * reported to the user on stderr, producing a non-zero exit code.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSampleOptions cmdopt;
        const argStatus = cmdopt.processArgs(cmdArgs);
        if (!argStatus[0]) return argStatus[1];

        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        try
        {
            import tsv_utils.common.utils : BufferedOutputRange;

            /* Buffer stdout writes; individual line writes would be slow. */
            auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
            tsvSample(cmdopt, bufferedOutput);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}
65

66
/** Standard help text, shown for '--h|help'. Ends with "Options:" so getopt
 * can append the generated option descriptions. See helpTextVerbose for the
 * extended version shown by '--help-verbose'.
 */
immutable helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Shuffling (the default): All input lines are output in random order. All
  orderings are equally likely.
* Random sampling (--n|num N): A random sample of N lines are selected and
  written to standard output. By default, selected lines are written in
  random order. All sample sets and orderings are equally likely. Use
  --i|inorder to write the selected lines in the original input order.
* Weighted random sampling (--n|num N, --w|weight-field F): A weighted
  sample of N lines is produced. Weights are taken from field F. Lines are
  output in weighted selection order. Use --i|inorder to write in original
  input order. Omit --n|num to shuffle all lines (weighted shuffling).
* Sampling with replacement (--r|replace, --n|num N): All input lines are
  read in, then lines are repeatedly selected at random and written out.
  This continues until N lines are output. Individual lines can be written
  multiple times. Output continues forever if N is zero or not provided.
* Bernoulli sampling (--p|prob P): A random subset of lines is selected
  based on probability P, a 0.0-1.0 value. This is a streaming operation.
  A decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled
  based on the values in the key fields. A subset of keys are chosen based
  on the inclusion probability (a 'distinct' set of keys). All lines with
  one of the selected keys are output. Line order is not changed.

Fields are specified using field number or field name. Field names require
that the input file has a header line.

Use '--help-verbose' for detailed information.

Options:
EOS";
100

101
/** Extended help text, shown for '--help-verbose'. Repeats the mode summary
 * from helpText and adds detail on fields, sample size, seeding, memory use,
 * weighted sampling, random value printing, and compatibility mode. Ends with
 * "Options:" so getopt can append the generated option descriptions.
 */
immutable helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Shuffling (the default): All input lines are output in random order. All
  orderings are equally likely.
* Random sampling (--n|num N): A random sample of N lines are selected and
  written to standard output. By default, selected lines are written in
  random order. All sample sets and orderings are equally likely. Use
  --i|inorder to write the selected lines in the original input order.
* Weighted random sampling (--n|num N, --w|weight-field F): A weighted
  sample of N lines is produced. Weights are taken from field F. Lines are
  output in weighted selection order. Use --i|inorder to write in original
  input order. Omit --n|num to shuffle all lines (weighted shuffling).
* Sampling with replacement (--r|replace, --n|num N): All input lines are
  read in, then lines are repeatedly selected at random and written out.
  This continues until N lines are output. Individual lines can be written
  multiple times. Output continues forever if N is zero or not provided.
* Bernoulli sampling (--p|prob P): A random subset of lines is selected
  based on probability P, a 0.0-1.0 value. This is a streaming operation.
  A decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled
  based on the values in the key fields. A subset of keys are chosen based
  on the inclusion probability (a 'distinct' set of keys). All lines with
  one of the selected keys are output. Line order is not changed.

Fields: Fields are specified by field number or name. Field names require
the input file to have a header line. Use '--help-fields' for details.

Sample size: The '--n|num' option controls the sample size for all
sampling methods. In the case of simple and weighted random sampling it
also limits the amount of memory required.

Controlling the random seed: By default, each run produces a different
randomization or sampling. Using '--s|static-seed' changes this so
multiple runs produce the same results. This works by using the same
random seed each run. The random seed can be specified using
'--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
value is a no-op and ignored.)

Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, there is no memory accumulation. These algorithms
can run on arbitrary size inputs. Sampling with replacement reads all
lines into memory and is limited by available memory. Shuffling also reads
all lines into memory and is similarly limited. Random sampling uses
reservoir sampling, and only needs to hold the sample size (--n|num) in
memory. The input data can be of any length.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Pavlos Efraimidis and Paul Spirakis. Weights should be
positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights, it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and given the value zero. Input order is not retained, instead
lines are output ordered by the randomized weight that was assigned. This
means that a smaller valid sample can be produced by taking the first N
lines of output. For more info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Printing random values: Most of the sampling algorithms work by generating
a random value for each line. (See "Compatibility mode" below.) The nature
of these values depends on the sampling algorithm. They are used for both
line selection and output ordering. The '--p|print-random' option can be
used to print these values. The random value is prepended to the line
separated by the --d|delimiter char (TAB by default). The
'--gen-random-inorder' option takes this one step further, generating
random values for all input lines without changing the input order. The
types of values currently used by these sampling algorithms:
* Unweighted sampling: Uniform random value in the interval [0,1]. This
  includes Bernoulli sampling and unweighted line order randomization.
* Weighted sampling: Value in the interval [0,1]. Distribution depends on
  the values in the weight field. It is used as a partial ordering.
* Distinct sampling: An integer, zero and up, representing a selection
  group. The inclusion probability determines the number of selection groups.
* Sampling with replacement: Random value printing is not supported.

The specifics behind these random values are subject to change in future
releases.

Compatibility mode: As described above, many of the sampling algorithms
assign a random value to each line. This is useful when printing random
values. It has another occasionally useful property: repeated runs with
the same static seed but different selection parameters are more
compatible with each other, as each line gets assigned the same random
value on every run. For example, if Bernoulli sampling is run with
'--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed',
all the lines selected in the first run will be selected in the second.
This comes at a cost: in some cases there are faster algorithms that don't
preserve this property. By default, tsv-sample will use faster algorithms
when available. However, the '--compatibility-mode' option switches to
algorithms that assign a random value per line. Printing random values
also engages compatibility mode.

Options:
EOS";
199

200
/** Container for command line options and derived data.
 *
 * TsvSampleOptions handles several aspects of command line options. On the input side,
 * it defines the command line options available, performs validation, and sets up any
 * derived state based on the options provided. These activities are handled by the
 * processArgs() member.
 *
 * Once argument processing is complete, TsvSampleOptions is used as a container
 * holding the specific processing options used by the different sampling routines.
 */
struct TsvSampleOptions
{
    import tsv_utils.common.utils : InputSourceRange;

    string programName;                        /// Program name
    InputSourceRange inputSources;             /// Input files
    bool hasHeader = false;                    /// --H|header
    ulong sampleSize = 0;                      /// --n|num - Size of the desired sample
    double inclusionProbability = double.nan;  /// --p|prob - Inclusion probability
    size_t[] keyFields;                        /// Derived: --k|key-fields - Used with inclusion probability
    size_t weightField = 0;                    /// Derived: --w|weight-field - Field holding the weight
    bool srsWithReplacement = false;           /// --r|replace
    bool preserveInputOrder = false;           /// --i|inorder
    bool staticSeed = false;                   /// --s|static-seed
    uint seedValueOptionArg = 0;               /// --v|seed-value
    bool printRandom = false;                  /// --print-random
    bool genRandomInorder = false;             /// --gen-random-inorder
    string randomValueHeader = "random_value"; /// --random-value-header
    bool compatibilityMode = false;            /// --compatibility-mode
    char delim = '\t';                         /// --d|delimiter
    bool preferSkipSampling = false;           /// --prefer-skip-sampling
    bool preferAlgorithmR = false;             /// --prefer-algorithm-r
    bool hasWeightField = false;               /// Derived.
    bool useBernoulliSampling = false;         /// Derived.
    bool useDistinctSampling = false;          /// Derived.
    bool distinctKeyIsFullLine = false;        /// Derived. True if '--k|key-fields 0' is specified.
    bool usingUnpredictableSeed = true;        /// Derived from --static-seed, --seed-value
    uint seed = 0;                             /// Derived from --static-seed, --seed-value

    /** Process tsv-sample command line arguments.
     *
     * Defines the command line options, performs validation, and derives additional
     * state. std.getopt.getopt is called to do the main option processing followed
     * by additional validation and derivation.
     *
     * Help text is printed to standard output if help was requested. Error text is
     * written to stderr if invalid input is encountered.
     *
     * A tuple is returned. First value is true if command line arguments were
     * successfully processed and execution should continue, or false if an error
     * occurred or the user asked for help. If false, the second value is the
     * appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. Field indices will have been converted to zero-based.
     */
    auto processArgs(ref string[] cmdArgs)
    {
        import std.algorithm : all, canFind, each;
        import std.conv : to;
        import std.getopt;
        import std.math : isNaN;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils : inputSourceRange, ReadHeader, throwIfWindowsNewlineOnUnix;
        import tsv_utils.common.fieldlist;

        bool helpVerbose = false;                  // --help-verbose
        bool helpFields = false;                   // --help-fields
        bool versionWanted = false;                // --V|version
        string keyFieldsArg;                       // --k|key-fields
        string weightFieldArg;                     // --w|weight-field

        string keyFieldsOptionString = "k|key-fields";
        string weightFieldOptionString = "w|weight-field";

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "     Print more detailed help.", &helpVerbose,
                "help-fields",     "     Print help on specifying fields.", &helpFields,

                std.getopt.config.caseSensitive,
                "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "n|num",           "NUM  Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize,
                "p|prob",          "NUM  Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected for output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability,

                keyFieldsOptionString,
                "<field-list>  Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.",
                &keyFieldsArg,

                weightFieldOptionString,
                "NUM  Field containing weights. All lines get equal weight if not provided.",
                &weightFieldArg,

                "r|replace",       "     Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement,
                "i|inorder",       "     Output random samples in original input order. Requires use of --n|num.", &preserveInputOrder,
                "s|static-seed",   "     Use the same random seed every run.", &staticSeed,

                std.getopt.config.caseSensitive,
                "v|seed-value",    "NUM  Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
                std.getopt.config.caseInsensitive,

                "print-random",       "     Include the assigned random value (prepended) when writing output lines.", &printRandom,
                "gen-random-inorder", "     Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder,
                "random-value-header",  "     Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader,
                "compatibility-mode", "     Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode,

                "d|delimiter",     "CHR  Field delimiter.", &delim,

                std.getopt.config.caseSensitive,
                "V|version",       "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "prefer-skip-sampling", "     (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.",
                &preferSkipSampling,

                "prefer-algorithm-r",   "     (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.",
                &preferAlgorithmR,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (helpFields)
            {
                writeln(fieldListHelpText);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-sample"));
                return tuple(false, 0);
            }

            /* Input files. Remaining command line args are files. */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;

            /* Validation and derivations - Do as much validation prior to header line
             * processing as possible (avoids waiting on stdin).
             *
             * Note: keyFields and weightField depend on header line processing, but
             * keyFieldsArg and weightFieldArg can be used to detect whether the
             * command line argument was specified.
             */

            /* Set hasWeightField here so it can be used in other validation checks.
             * Field validity checked after reading file header.
             */
            hasWeightField = !weightFieldArg.empty;

            /* Sampling with replacement checks (--r|replace). */
            if (srsWithReplacement)
            {
                enforce(!hasWeightField,
                        "Sampling with replacement (--r|replace) does not support weights (--w|weight-field).");

                enforce(inclusionProbability.isNaN,
                        "Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob).");

                enforce(keyFieldsArg.empty,
                        "Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields).");

                enforce(!printRandom && !genRandomInorder,
                        "Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder).");

                enforce(!preserveInputOrder,
                        "Sampling with replacement (--r|replace) does not support input order preservation (--i|inorder option).");
            }

            /* Distinct sampling checks (--k|key-fields --p|prob).
             * Fix: logical '||' rather than bitwise '|' for short-circuit evaluation.
             */
            enforce(keyFieldsArg.empty || !inclusionProbability.isNaN,
                    "--p|prob is required when using --k|key-fields.");

            /* Inclusion probability (--p|prob) is used for both Bernoulli sampling
             * and distinct sampling.
             */
            if (!inclusionProbability.isNaN)
            {
                enforce(inclusionProbability > 0.0 && inclusionProbability <= 1.0,
                        format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability));

                if (!keyFieldsArg.empty) useDistinctSampling = true;
                else useBernoulliSampling = true;

                enforce(!hasWeightField, "--w|weight-field and --p|prob cannot be used together.");

                enforce(!genRandomInorder || useDistinctSampling,
                        "--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used." ~
                        "\nUse --gen-random-inorder alone to print probabilities for all lines." ~
                        "\nUse --p|prob and --print-random to print probabilities for lines satisfying the probability threshold.");
            }
            else if (genRandomInorder && !hasWeightField)
            {
                useBernoulliSampling = true;
            }

            /* randomValueHeader (--random-value-header) validity. Note that
               randomValueHeader is initialized to a valid, non-empty string.
            */
            enforce(!randomValueHeader.empty && !randomValueHeader.canFind('\n') &&
                    !randomValueHeader.canFind(delim),
                    "--randomValueHeader must be at least one character and not contain field delimiters or newlines.");

            /* Check for incompatible use of (--i|inorder) and shuffling of the full
             * data set. Sampling with replacement is also incompatible, this is
             * detected earlier. Shuffling is the default operation, so it identified
             * by eliminating the other modes of operation.
             */
            enforce(!preserveInputOrder ||
                    sampleSize != 0 ||
                    useBernoulliSampling ||
                    useDistinctSampling,
                    "Preserving input order (--i|inorder) is not compatible with full data set shuffling. Switch to random sampling with a sample size (--n|num) to use --i|inorder.");


            /* Compatibility mode checks:
             * - Random value printing implies compatibility-mode, otherwise user's
             *   selection is used.
             * - Distinct sampling doesn't support compatibility-mode. The routines
             *   don't care, but users might expect larger probabilities to be a
             *   superset of smaller probabilities. This would be confusing, so
             *   flag it as an error.
             */
            enforce(!(compatibilityMode && useDistinctSampling),
                    "Distinct sampling (--k|key-fields --p|prob) does not support --compatibility-mode.");

            if (printRandom || genRandomInorder) compatibilityMode = true;


            /* Seed. */
            import std.random : unpredictableSeed;

            usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);

            if (usingUnpredictableSeed) seed = unpredictableSeed;
            else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
            else if (staticSeed) seed = 2438424139;
            else assert(0, "Internal error, invalid seed option states.");

            string[] headerFields;

            /* fieldListArgProcessing encapsulates the field list processing. It is
             * called prior to reading the header line if headers are not being used,
             * and after if headers are being used.
             */
            void fieldListArgProcessing()
            {
                if (!weightFieldArg.empty)
                {
                    auto fieldIndices =
                        weightFieldArg
                        .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero)
                        (hasHeader, headerFields, weightFieldOptionString)
                        .array;

                    enforce(fieldIndices.length == 1,
                            format("'--%s' must be a single field.", weightFieldOptionString));

                    weightField = fieldIndices[0];
                }

                if (!keyFieldsArg.empty)
                {
                    keyFields =
                        keyFieldsArg
                        .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
                        (hasHeader, headerFields, keyFieldsOptionString)
                        .array;

                    assert(keyFields.length > 0);

                    if (keyFields.length > 0)
                    {
                        if (keyFields.length == 1 && keyFields[0] == 0)
                        {
                            distinctKeyIsFullLine = true;
                        }
                        else
                        {
                            enforce(keyFields.length <= 1 || keyFields.all!(x => x != 0),
                                    "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");

                            keyFields.each!((ref x) => --x);  // Convert to zero-based indexing.
                        }
                    }
                }
            }

            if (!hasHeader) fieldListArgProcessing();

            /*
             * Create the inputSourceRange and perform header line processing.
             */
            ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
            inputSources = inputSourceRange(filepaths, readHeader);

            if (hasHeader)
            {
                throwIfWindowsNewlineOnUnix(inputSources.front.header, inputSources.front.name, 1);
                headerFields = inputSources.front.header.split(delim).to!(string[]);
                fieldListArgProcessing();
            }

        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
528
/** Invokes the appropriate sampling routine based on the command line arguments.
 *
 * tsvSample is the top-level routine handling the different tsv-sample use cases.
 * Its primary role is to dispatch to the correct routine for the sampling mode
 * requested. Modes are tested in the same precedence order used during argument
 * validation; full shuffling is the default when no other mode applies.
 */
void tsvSample(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    if (cmdopt.srsWithReplacement)
    {
        return simpleRandomSamplingWithReplacement(cmdopt, outputStream);
    }
    if (cmdopt.useBernoulliSampling)
    {
        return bernoulliSamplingCommand(cmdopt, outputStream);
    }
    if (cmdopt.useDistinctSampling)
    {
        if (cmdopt.genRandomInorder) return distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        return distinctSampling!(No.generateRandomAll)(cmdopt, outputStream);
    }
    if (cmdopt.genRandomInorder)
    {
        /* The preceding cases handle gen-random-inorder themselves (Bernoulli,
         * Distinct), or don't handle it (SRS w/ Replacement). Only the weighted
         * case remains.
         */
        assert(cmdopt.hasWeightField);
        return generateWeightedRandomValuesInorder(cmdopt, outputStream);
    }
    if (cmdopt.sampleSize != 0)
    {
        return randomSamplingCommand(cmdopt, outputStream);
    }
    shuffleCommand(cmdopt, outputStream);
}
566

567
/** Bernoulli sampling command handler. Invokes the appropriate Bernoulli sampling
 * routine based on the command line arguments.
 *
 * This routine selects the appropriate Bernoulli sampling function and template
 * instantiation to use based on the command line arguments.
 *
 * One of the basic choices is whether to use the vanilla algorithm or skip sampling.
 * Skip sampling is a little bit faster when the inclusion probability is small but
 * doesn't support compatibility mode. See the bernoulliSkipSampling documentation
 * for a discussion of the skipSamplingProbabilityThreshold used here.
 */
void bernoulliSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(!cmdopt.hasWeightField);

    immutable double skipSamplingProbabilityThreshold = 0.04;

    /* Skip sampling is used only when compatibility mode is off and either the
     * user asked for it or the probability is at/below the threshold. Note: the
     * comparison is written as !(prob > threshold) rather than (prob <= threshold)
     * so a NaN probability selects the vanilla path, matching the original logic.
     */
    immutable bool useSkipSampling =
        !cmdopt.compatibilityMode &&
        (cmdopt.preferSkipSampling ||
         !(cmdopt.inclusionProbability > skipSamplingProbabilityThreshold));

    if (useSkipSampling)
    {
        bernoulliSkipSampling(cmdopt, outputStream);
    }
    else if (cmdopt.genRandomInorder)
    {
        bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
    }
    else
    {
        bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream);
    }
}
602

603
/** Bernoulli sampling of lines from the input stream.
 *
 * Each input line is assigned a random value and output if the value is less than
 * cmdopt.inclusionProbability. The order of the lines is not changed.
 *
 * This routine supports random value printing (--print-random) and
 * gen-random-inorder value printing (--gen-random-inorder). In the latter case
 * (generateRandomAll) every line is output with its random value; no filtering
 * by inclusion probability occurs.
 */
void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
        InputSourceRange, throwIfWindowsNewlineOnUnix;

    /* The template flag and the runtime option must agree. */
    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto randomGenerator = Random(cmdopt.seed);

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;

        /* Both output modes prepend the random value column header; only the
         * generateRandomAll case is resolved at compile time.
         */
        static if (generateRandomAll)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }
        else if (cmdopt.printRandom)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }

        outputStream.put(inputStream.header);
        outputStream.put("\n");

        /* Immediately flush the header so subsequent processes in a unix command
         * pipeline see it early. This helps provide timely error messages.
         */
        static if (isFlushableOutputRange!OutputRange) outputStream.flush;
    }

    /* Process each line. Body lines are numbered starting after any header. */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    ulong numLinesWritten = 0;

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);

            /* One uniform draw per line, used both as the inclusion test and as
             * the printable random value.
             */
            immutable double lineScore = uniform01(randomGenerator);

            static if (generateRandomAll)
            {
                /* All lines are output, each prefixed with its random value. */
                outputStream.formatRandomValue(lineScore);
                outputStream.put(cmdopt.delim);
                outputStream.put(line);
                outputStream.put("\n");

                /* --num limits the number of output lines even in this mode. */
                if (cmdopt.sampleSize != 0)
                {
                    ++numLinesWritten;
                    if (numLinesWritten == cmdopt.sampleSize) return;
                }
            }
            else if (lineScore < cmdopt.inclusionProbability)
            {
                if (cmdopt.printRandom)
                {
                    outputStream.formatRandomValue(lineScore);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put("\n");

                if (cmdopt.sampleSize != 0)
                {
                    ++numLinesWritten;
                    if (numLinesWritten == cmdopt.sampleSize) return;
                }
            }
        }
    }
}
698

699
/** bernoulliSkipSampling is an implementation of Bernoulli sampling using skips.
 *
 * Skip sampling works by skipping a random number of lines between selections. This
 * can be faster than assigning a random value to each line when the inclusion
 * probability is low, as it reduces the number of calls to the random number
 * generator. Both the random number generator and the log() function are called when
 * calculating the next skip size. These additional log() calls add up as the
 * inclusion probability increases.
 *
 * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for
 * file-oriented line sampling. This is obviously environment specific. In the
 * environments this implementation has been tested in the performance improvements
 * remain small, less than 7%, even with an inclusion probability as low as 0.0001.
 *
 * The algorithm does not assign random values to individual lines. This makes it
 * incompatible with random value printing. It is not suitable for compatibility mode
 * either. As an example, in compatibility mode a line selected with '--prob 0.2' should
 * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling
 * does not have this property.
 *
 * The algorithm for calculating the skip size has been described by multiple sources.
 * There are two key variants depending on whether the total number of lines in the
 * data set is known in advance. (This implementation does not know the total.)
 * Useful references:
 * $(LIST
 *     * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling",
 *       ACM Trans on Mathematical Software, 1987. On-line:
 *       http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf
 *     * P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book
 *       "Data Stream Management", Springer-Verlag, 2016. On-line:
 *       https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf
 *     * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. On-line:
 *       http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/
 * )
 */
void bernoulliSkipSampling(OutputRange)(ref TsvSampleOptions cmdopt, OutputRange outputStream)
    if (isOutputRange!(OutputRange, char))
{
    import std.conv : to;
    import std.math : log, trunc;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
        InputSourceRange, throwIfWindowsNewlineOnUnix;

    assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.compatibilityMode);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto rng = Random(cmdopt.seed);

    /* log(1 - p): the denominator of the geometric skip-size formula. */
    immutable double logDiscardRate = log(1.0 - cmdopt.inclusionProbability);

    /* Draws the number of lines to skip before the next selected line.
     *
     * The '1.0 - uniform01(rng)' expression flips the half closed interval to
     * (0.0, 1.0], excluding 0.0, so log() never receives zero.
     */
    size_t nextSkipSize()
    {
        return (log(1.0 - uniform01(rng)) / logDiscardRate).trunc.to!size_t;
    }

    size_t remainingSkips = nextSkipSize();

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto headerSource = cmdopt.inputSources.front;

        outputStream.put(headerSource.header);
        outputStream.put("\n");

        /* Immediately flush the header so subsequent processes in a unix command
         * pipeline see it early. This helps provide timely error messages.
         */
        static if (isFlushableOutputRange!OutputRange) outputStream.flush;
    }

    /* Process each line, counting down the current skip between selections. */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    ulong numLinesWritten = 0;

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);

            if (remainingSkips > 0)
            {
                --remainingSkips;
                continue;
            }

            /* Skip exhausted: select this line, then draw the next skip. */
            outputStream.put(line);
            outputStream.put("\n");

            if (cmdopt.sampleSize != 0)
            {
                ++numLinesWritten;
                if (numLinesWritten == cmdopt.sampleSize) return;
            }

            remainingSkips = nextSkipSize();
        }
    }
}
806

807
/** Sample lines by choosing a random set of distinct keys formed from one or more
 * fields on each line.
 *
 * Distinct sampling is a streaming form of sampling, similar to Bernoulli sampling.
 * However, instead of each line being subject to an independent trial, lines are
 * selected based on a key from each line. A portion of keys are randomly selected for
 * output, and every line containing a selected key is included in the output.
 *
 * An example use-case is a query log having <user, query, clicked-url> triples. It is
 * often useful to sample records for portion of the users, but including all records
 * for the users selected. Distinct sampling supports this by selecting a subset of
 * users to include in the output.
 *
 * Distinct sampling is done by hashing the key and mapping the hash value into
 * buckets sized to hold the inclusion probability. Records having a key mapping to
 * bucket zero are output. Buckets are equal size and therefore may be larger than the
 * inclusion probability. (The other approach would be to have the caller specify the
 * the number of buckets. More correct, but less convenient.)
 */
void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.digest.murmurhash;
    import std.math : lrint;
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
        InputFieldReordering, InputSourceRange, throwIfWindowsNewlineOnUnix;

    /* The template flag and the runtime option must agree. */
    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    assert(cmdopt.keyFields.length > 0);
    assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    static if (generateRandomAll)
    {
        import std.format : formatValue, singleSpec;
        /* Format spec used to print each line's bucket number. */
        immutable randomValueFormatSpec = singleSpec("%d");
    }

    immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys.

    /* Bucket count is the rounded reciprocal of the inclusion probability, so
     * the actual inclusion rate may differ slightly from the requested one.
     */
    uint numBuckets = (1.0 / cmdopt.inclusionProbability).lrint.to!uint;

    /* Create a mapping for the key fields. Null when the whole line is the key. */
    auto keyFieldsReordering = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields);

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;

        static if (generateRandomAll)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }
        else if (cmdopt.printRandom)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }

        outputStream.put(inputStream.header);
        outputStream.put("\n");

        /* Immediately flush the header so subsequent processes in a unix command
         * pipeline see it early. This helps provide timely error messages.
         */
        static if (isFlushableOutputRange!OutputRange) outputStream.flush;
    }

    /* Process each line. */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    ulong numLinesWritten = 0;

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);

            /* Murmurhash works by successively adding individual keys, then finalizing.
             * Adding individual keys is simpler if the full-line-as-key and individual
             * fields as keys cases are separated.
             */
            auto hasher = MurmurHash3!32(cmdopt.seed);

            if (cmdopt.distinctKeyIsFullLine)
            {
                hasher.put(cast(ubyte[]) line);
            }
            else
            {
                assert(keyFieldsReordering !is null);

                /* Gather the key field values and assemble the key. */
                keyFieldsReordering.initNewLine;
                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                {
                    keyFieldsReordering.processNextField(fieldIndex, fieldValue);
                    /* Stop splitting once all key fields have been captured. */
                    if (keyFieldsReordering.allFieldsFilled) break;
                }

                enforce(keyFieldsReordering.allFieldsFilled,
                        format("Not enough fields in line. File: %s, Line: %s",
                               inputStream.name, fileLineNum));

                /* Hash the key fields, delimiter-separated, in key-field order. */
                foreach (count, key; keyFieldsReordering.outputFields.enumerate)
                {
                    if (count > 0) hasher.put(delimArray);
                    hasher.put(cast(ubyte[]) key);
                }
            }

            hasher.finish;

            static if (generateRandomAll)
            {
                /* Every line is output, prefixed by its bucket number. */
                import std.conv : to;
                outputStream.formatValue(hasher.get % numBuckets, randomValueFormatSpec);
                outputStream.put(cmdopt.delim);
                outputStream.put(line);
                outputStream.put("\n");

                if (cmdopt.sampleSize != 0)
                {
                    ++numLinesWritten;
                    if (numLinesWritten == cmdopt.sampleSize) return;
                }
            }
            else if (hasher.get % numBuckets == 0)
            {
                /* Keys hashing to bucket zero are the selected sample. */
                if (cmdopt.printRandom)
                {
                    /* Selected lines always map to bucket zero, so the printed
                     * "random value" is the constant '0'.
                     */
                    outputStream.put('0');
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put("\n");

                if (cmdopt.sampleSize != 0)
                {
                    ++numLinesWritten;
                    if (numLinesWritten == cmdopt.sampleSize) return;
                }
            }
        }
    }
}
965

966
/** Random sampling command handler. Dispatches to the appropriate sampling routine
 * based on the command line arguments.
 *
 * Random sampling selects a fixed size random sample from the input stream. Both
 * simple random sampling (equal likelihood) and weighted random sampling are
 * supported. Selected lines are output either in random order or original input order.
 * For weighted sampling the random order is the weighted selection order.
 *
 * Two algorithms are used, reservoir sampling via a heap and reservoir sampling via
 * Algorithm R. This routine selects the appropriate reservoir sampling function and
 * template instantiation to use based on the command line arguments.
 *
 * Weighted sampling always uses the heap approach. Compatibility mode does as well,
 * as it is the method that uses per-line random value assignments. The implication
 * of compatibility mode is that a larger sample size includes all the results from
 * a smaller sample, assuming the same random seed is used.
 *
 * For unweighted sampling there is a performance tradeoff between implementations.
 * Heap-based sampling is faster for small sample sizes. Algorithm R is faster for
 * large sample sizes. The threshold used was chosen based on performance tests. See
 * the reservoirSamplingAlgorithmR documentation for more information.
 */
void randomSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(cmdopt.sampleSize != 0);

    /* Below this sample size the heap implementation is typically faster than
     * Algorithm R. Chosen based on performance tests.
     */
    enum size_t algorithmRSampleSizeThreshold = 128 * 1024;

    /* Instantiates the heap-based sampler with the requested output ordering. */
    void runHeapSampler(Flag!"isWeighted" isWeighted)()
    {
        if (cmdopt.preserveInputOrder)
        {
            reservoirSamplingViaHeap!(isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream);
        }
        else
        {
            reservoirSamplingViaHeap!(isWeighted, No.preserveInputOrder)(cmdopt, outputStream);
        }
    }

    immutable bool heapPreferredForSize =
        cmdopt.sampleSize < algorithmRSampleSizeThreshold && !cmdopt.preferAlgorithmR;

    if (cmdopt.hasWeightField)
    {
        runHeapSampler!(Yes.isWeighted)();
    }
    else if (cmdopt.compatibilityMode || heapPreferredForSize)
    {
        runHeapSampler!(No.isWeighted)();
    }
    else if (cmdopt.preserveInputOrder)
    {
        reservoirSamplingAlgorithmR!(Yes.preserveInputOrder)(cmdopt, outputStream);
    }
    else
    {
        reservoirSamplingAlgorithmR!(No.preserveInputOrder)(cmdopt, outputStream);
    }
}
1028

1029
/** Reservoir sampling using a heap. Both weighted and unweighted random sampling are
 * supported.
 *
 * The algorithm used here is based on the one-pass algorithm described by Pavlos
 * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S.
 * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are
 * simply set to one.
 *
 * The implementation uses a heap (priority queue) large enough to hold the desired
 * number of lines. Input is read line-by-line, assigned a random value, and added to
 * the heap. The role of the heap is to identify the lines with the highest assigned
 * random values. Once the heap is full, adding a new line means dropping the line with
 * the lowest score. A "min" heap is used for this reason.
 *
 * When done reading all lines, the "min" heap is in reverse of weighted selection
 * order. Weighted selection order is obtained by removing each element one at a time
 * from the heap. The underlying data store will have the elements in weighted selection
 * order (largest weights first).
 *
 * Generating output in weighted order is useful for several reasons:
 *  - For weighted sampling, it preserves the property that smaller valid subsets can be
 *    created by taking the first N lines.
 *  - For unweighted sampling, it ensures that all output permutations are possible, and
 *    are not influenced by input order or the heap data structure used.
 *  - Order consistency is maintained when making repeated use of the same random seed,
 *    but with different sample sizes.
 *
 * The other choice is preserving input order. This is supported by recording line
 * numbers and sorting the selected sample.
 *
 * There are use cases where only the selection set matters. For these some performance
 * could be gained by skipping the reordering and simply printing the backing store
 * array in-order. Performance tests indicate only a minor benefit, so this is not
 * supported.
 *
 * Notes:
 * $(LIST
 *    * In tsv-sample versions 1.2.1 and earlier this routine also supported
 *      randomization of all input lines. This was dropped in version 1.2.2 in favor
 *      of the approach used in randomizeLines. The latter has significant advantages
 *      given that all data must be read into memory.
 *    * For large reservoir sizes better performance can be achieved using Algorithm R.
 *      See the reservoirSamplingAlgorithmR documentation for details.
 * )
 */
void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, Flag!"preserveInputOrder" preserveInputOrder, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : sort;
    import std.container.array;
    import std.container.binaryheap;
    import std.meta : AliasSeq;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
        InputSourceRange, throwIfWindowsNewlineOnUnix;

    /* The template flag and the runtime option must agree. */
    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize > 0);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto randomGenerator = Random(cmdopt.seed);

    /* A reservoir entry: the line's random score, the line text, and (when
     * preserving input order) its global line number for the final sort.
     */
    static struct Entry(Flag!"preserveInputOrder" preserveInputOrder)
    {
        double score;
        const(char)[] line;
        static if (preserveInputOrder) ulong lineNumber;
    }

    /* Create the heap and backing data store.
     *
     * Note: An std.container.array is used as the backing store to avoid some issues in
     * the standard library (Phobos) binaryheap implementation. Specifically, when an
     * std.container.array is used as backing store, the heap can efficiently reversed by
     * removing the heap elements. This leaves the backing store in the reversed order.
     * However, the current binaryheap implementation does not support this for all
     * backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094.
     */

    Array!(Entry!preserveInputOrder) dataStore;
    dataStore.reserve(cmdopt.sampleSize);
    auto reservoir = dataStore.heapify!("a.score > b.score")(0);  // Min binaryheap

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;

        if (cmdopt.printRandom)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(inputStream.header);
        outputStream.put("\n");

        /* Immediately flush the header so subsequent processes in a unix command
         * pipeline see it early. This helps provide timely error messages.
         */
        static if (isFlushableOutputRange!OutputRange) outputStream.flush;
    }

    /* Process each line. */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    static if (preserveInputOrder) ulong totalLineNum = 0;

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);

            static if (!isWeighted)
            {
                immutable double lineScore = uniform01(randomGenerator);
            }
            else
            {
                /* Weighted score per Efraimidis-Spirakis: u^(1/weight). Lines
                 * with non-positive weight get the lowest possible score.
                 */
                immutable double lineWeight =
                    getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, inputStream.name, fileLineNum);
                immutable double lineScore =
                    (lineWeight > 0.0)
                    ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
                    : 0.0;
            }

            /* Extra constructor argument only exists when preserving input order. */
            static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum);
            else alias entryCTArgs = AliasSeq!();

            /* line.dup: NOTE(review) presumably the line buffer is reused by the
             * reader, so the text is copied before being stored — confirm against
             * bufferedByLine.
             */
            if (reservoir.length < cmdopt.sampleSize)
            {
                reservoir.insert(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs));
            }
            else if (reservoir.front.score < lineScore)
            {
                /* Heap is full; the new line displaces the current minimum. */
                reservoir.replaceFront(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs));
            }

            static if (preserveInputOrder) ++totalLineNum;
        }
    }

    /* Done with input, all entries are in the reservoir. */

    /* The asserts here avoid issues with the current binaryheap implementation. They
     * detect use of backing stores having a length not synchronized to the reservoir.
     */
    immutable ulong numLines = reservoir.length;
    assert(numLines == dataStore.length);

    /* Update the backing store so it is in the desired output order.
     */
    static if (preserveInputOrder)
    {
        dataStore[].sort!((a, b) => a.lineNumber < b.lineNumber);
    }
    else
    {
        /* Output in weighted selection order. The heap is in reverse order of assigned
         * weights. Reversing order is done by removing all elements from the heap. This
         * leaves the backing store in the correct order.
         */
        while (!reservoir.empty) reservoir.removeFront;
    }

    assert(numLines == dataStore.length);

    /* Emit the sample, optionally prefixing each line with its random value. */
    foreach (entry; dataStore)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatRandomValue(entry.score);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(entry.line);
        outputStream.put("\n");
    }
}
1215

1216
/** Generate weighted random values for all input lines, preserving input order.
 *
 * This complements weighted reservoir sampling, but instead of using a reservoir it
 * simply iterates over the input lines generating the values. The weighted random
 * values are generated with the same formula used by reservoirSampling.
 */
void generateWeightedRandomValuesInorder(OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
        InputSourceRange, throwIfWindowsNewlineOnUnix;

    assert(cmdopt.hasWeightField);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto rng = Random(cmdopt.seed);

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto headerSource = cmdopt.inputSources.front;

        outputStream.put(cmdopt.randomValueHeader);
        outputStream.put(cmdopt.delim);
        outputStream.put(headerSource.header);
        outputStream.put("\n");

        /* Immediately flush the header so subsequent processes in a unix command
         * pipeline see it early. This helps provide timely error messages.
         */
        static if (isFlushableOutputRange!OutputRange) outputStream.flush;
    }

    /* Process each line, writing its weighted random value ahead of the line. */
    immutable size_t bodyStartLineNum = cmdopt.hasHeader ? 2 : 1;
    ulong linesWritten = 0;

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(bodyStartLineNum))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);

            immutable double weight =
                getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, inputStream.name, fileLineNum);

            /* Same Efraimidis-Spirakis style score as reservoirSamplingViaHeap:
             * u^(1/weight). Non-positive weights get the lowest possible score
             * and consume no random draw.
             */
            double score = 0.0;
            if (weight > 0.0) score = uniform01(rng) ^^ (1.0 / weight);

            outputStream.formatRandomValue(score);
            outputStream.put(cmdopt.delim);
            outputStream.put(line);
            outputStream.put("\n");

            /* --num limits the number of output lines. */
            if (cmdopt.sampleSize != 0 && ++linesWritten == cmdopt.sampleSize) return;
        }
    }
}
1287

1288
/** Reservoir sampling via Algorithm R
 *
 * This is an implementation of reservoir sampling using what is commonly known as
 * "Algorithm R", credited to Alan Waterman by Donald Knuth in the "The Art of
 * Computer Programming, Volume 2: Seminumerical Algorithms". More information about
 * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with
 * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling"
 * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R).
 *
 * Algorithm R is used for unweighted sampling without replacement. The heap-based
 * algorithm in reservoirSamplingViaHeap is used for weighted sampling.
 *
 * The classic algorithm stops after identifying the selected set of items. This
 * implementation goes one step further and randomizes the order of the selected
 * lines. This is consistent with shuffling (line order randomization), a primary
 * tsv-sample use-case.
 *
 * This algorithm is faster than reservoirSamplingViaHeap when the sample size
 * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size.
 * Insertion in this algorithm is O(1). Similarly, generating the random order in the
 * heap is O(k * log k), while in this algorithm the final randomization step is O(k).
 *
 * This speed advantage may be offset a certain amount by using a more expensive random
 * value generator. reservoirSamplingViaHeap generates values between zero and one,
 * whereas reservoirSamplingAlgorithmR generates random integers over an ever growing
 * interval. The latter is expected to be more expensive. This is consistent with
 * performance tests indicating that reservoirSamplingViaHeap is faster when using
 * small-to-medium size reservoirs and large input streams.
 */
void reservoirSamplingAlgorithmR(Flag!"preserveInputOrder" preserveInputOrder, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.meta : AliasSeq;
    import std.random : Random = Mt19937, randomShuffle, uniform;
    import std.algorithm : sort;
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
        InputSourceRange, throwIfWindowsNewlineOnUnix;

    assert(cmdopt.sampleSize > 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.compatibilityMode);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* A reservoir entry: the line, plus its input line number when the output must
     * preserve input order. */
    static struct Entry(Flag!"preserveInputOrder" preserveInputOrder)
    {
        const(char)[] line;
        static if (preserveInputOrder) ulong lineNumber;
    }

    Entry!preserveInputOrder[] reservoir;
    auto reservoirAppender = appender(&reservoir);
    reservoirAppender.reserve(cmdopt.sampleSize);

    auto randomGenerator = Random(cmdopt.seed);

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;

        outputStream.put(inputStream.header);
        outputStream.put("\n");

        /* Immediately flush the header so subsequent processes in a unix command
         * pipeline see it early. This helps provide timely error messages.
         */
        static if (isFlushableOutputRange!OutputRange) outputStream.flush;
    }

    /* Process each line. */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    ulong totalLineNum = 0;

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);

            /* Add lines to the reservoir until the reservoir is filled.
             * After that lines are added with decreasing likelihood, based on
             * the total number of lines seen. If added to the reservoir, the
             * line replaces a randomly chosen existing line.
             */
            static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum);
            else alias entryCTArgs = AliasSeq!();

            if (totalLineNum < cmdopt.sampleSize)
            {
                reservoirAppender ~= Entry!preserveInputOrder(line.idup, entryCTArgs);
            }
            else
            {
                /* This is the (totalLineNum + 1)st line seen. Algorithm R selects it
                 * with probability sampleSize / (totalLineNum + 1), achieved by drawing
                 * an index uniformly over all lines seen so far, including the current
                 * one. (Drawing over [0, totalLineNum) instead would always select the
                 * first line past the reservoir and bias the sample against the
                 * initial lines.)
                 */
                immutable size_t i = uniform(0, totalLineNum + 1, randomGenerator);
                if (i < reservoir.length)
                {
                    reservoir[i] = Entry!preserveInputOrder(line.idup, entryCTArgs);
                }
            }

            ++totalLineNum;
        }
    }

    /* Done with input. The sample is in the reservoir. Update the order and print. */

    static if (preserveInputOrder)
    {
        reservoir.sort!((a, b) => a.lineNumber < b.lineNumber);
    }
    else
    {
        reservoir.randomShuffle(randomGenerator);
    }

    foreach (ref entry; reservoir)
    {
        outputStream.put(entry.line);
        outputStream.put("\n");
    }
}
1417

1418
/** Shuffling command handler. Invokes the appropriate shuffle (line order
 * randomization) routine based on the command line arguments.
 *
 * Shuffling has similarities to random sampling, but the algorithms used are
 * different. Random sampling selects a subset, so only the current selection needs
 * to be kept in memory (this is what reservoir sampling exploits). Shuffling, by
 * contrast, must hold all input, so all lines are read into memory and then
 * reordered.
 *
 * Dispatch rules:
 * $(LIST
 *   * Weighted shuffling     - sort on random weights (randomizeLinesViaSort).
 *   * Compatibility mode     - sort on random weights (randomizeLinesViaSort).
 *   * Unweighted (default)   - array shuffle (randomizeLinesViaShuffle).
 * )
 *
 * All the algorithms used here are limited by available memory.
 */
void shuffleCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    if (cmdopt.hasWeightField) randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream);
    else if (cmdopt.compatibilityMode) randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream);
    else randomizeLinesViaShuffle(cmdopt, outputStream);
}
1449

1450
/** Shuffle all input lines by assigning random weights and sorting.
 *
 * randomizeLinesViaSort reads in all input lines and writes them out in random order.
 * The algorithm works by assigning a random value to each line and sorting. Both
 * weighted and unweighted shuffling are supported.
 *
 * Notes:
 * $(LIST
 *   * For unweighted shuffling randomizeLinesViaShuffle is faster and should be used
 *     unless compatibility mode is needed.
 *   * This routine is significantly faster than heap-based reservoir sampling in the
 *     case where the entire file is being read.
 *   * Input data must be read entirely in memory. Disk oriented techniques are needed
 *     when data sizes get too large for available memory. One option is to generate
 *     random values for each line, e.g. --gen-random-inorder, and sort with a disk-
 *     backed sort program like GNU sort.
 * )
 */
void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : sort;

    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize == 0);

    /* Read all file data into memory, then split the data into lines, assigning a
     * random value to each. readFileData also writes the first header line.
     */
    const fileData = readFileData!(Yes.hasRandomValue)(cmdopt, outputStream);
    auto inputLines = fileData.identifyInputLines!(Yes.hasRandomValue, isWeighted)(cmdopt);

    /* A descending sort on the random values produces the shuffled order. */
    inputLines.sort!((a, b) => a.randomValue > b.randomValue);

    foreach (lineEntry; inputLines)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatRandomValue(lineEntry.randomValue);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(lineEntry.data);
        outputStream.put("\n");
    }
}
1502

1503
/** Shuffle (randomize) all input lines using a shuffling algorithm.
 *
 * All lines in files and/or standard input are read in and written out in random
 * order. This routine uses array shuffling, which is faster than sorting. It is a
 * good alternative to randomizeLinesViaSort when doing unweighted shuffling (the
 * most common case).
 *
 * Input data size is limited by available memory. Disk oriented techniques are needed
 * when data sizes are larger. For example, generating random values line-by-line (ala
 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
 *
 * This routine does not support random value printing or compatibility-mode.
 */
void randomizeLinesViaShuffle(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, randomShuffle;

    assert(cmdopt.sampleSize == 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    /* Read all file data into memory and split into lines. readFileData also writes
     * the first header line.
     */
    const fileData = readFileData!(No.hasRandomValue)(cmdopt, outputStream);
    auto inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt);

    /* Randomly shuffle and print each line.
     *
     * Note: Also tried randomCover, but that was exceedingly slow.
     */
    auto randomGenerator = Random(cmdopt.seed);
    inputLines.randomShuffle(randomGenerator);

    foreach (ref line; inputLines)
    {
        outputStream.put(line.data);
        outputStream.put("\n");
    }
}
1549

1550
/** Simple random sampling with replacement.
 *
 * All lines in files and/or standard input are read in. Then random lines are selected
 * one at a time and output. Lines can be selected multiple times. This process continues
 * until the desired number of samples (--n|num) has been output. Output continues
 * indefinitely if a sample size was not provided.
 */
void simpleRandomSamplingWithReplacement(OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform;

    /* Read all file data into memory and split the data into lines. readFileData
     * also writes the first header line.
     */
    const fileData = readFileData!(No.hasRandomValue)(cmdopt, outputStream);
    const inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt);

    if (inputLines.length > 0)
    {
        auto randomGenerator = Random(cmdopt.seed);

        /* Repeat forever if sampleSize is zero, otherwise print sampleSize lines.
         * When sampleSize is zero, numLeft is pinned at one and never decremented.
         */
        size_t numLeft = (cmdopt.sampleSize == 0) ? 1 : cmdopt.sampleSize;
        while (numLeft != 0)
        {
            immutable size_t index = uniform(0, inputLines.length, randomGenerator);
            outputStream.put(inputLines[index].data);
            outputStream.put("\n");
            if (cmdopt.sampleSize != 0) numLeft--;
        }
    }
}
1585

1586
/** A container holding data read from a file or standard input.
 *
 * The InputBlock struct is used to represent a block of data read from a file or
 * standard input. An array of InputBlocks is returned by readFileData. Typically one
 * block per file. Multiple blocks are used for standard input and when the file size
 * cannot be determined. Individual lines are not allowed to span blocks. The blocks
 * allocated to an individual file are numbered starting with zero.
 *
 * Invariant: every block except a file's final block ends with a newline, so each
 * block starts at the beginning of a line.
 *
 * See readFileData() for more information.
 */
static struct InputBlock
{
    string filename;          /// Original filename or path. "-" denotes standard input.
    size_t fileBlockNumber;   /// Zero-based block number for the file.
    char[] data;              /// The actual data. Newline terminated or last block for the file.
}
1602

1603
/** Read data from one or more files. This routine is used by algorithms needing to
 * read all data into memory.
 *
 * readFileData reads in all data from a set of files. Data is returned as an array
 * of InputBlock structs. Normally one InputBlock per file, sized to match the size
 * of the file. Standard input is read in one or more blocks, as are files whose size
 * cannot be determined. Multiple blocks are used in these last two cases to avoid
 * expensive memory reallocations. This is not necessary when file size is known as
 * the necessary memory can be preallocated.
 *
 * Individual lines never span multiple blocks, and newlines are preserved. This
 * means that each block starts at the beginning of a line and ends with a newline
 * unless the end of a file has been reached.
 *
 * Each file gets its own block. Prior to using InputSourceRange this was so header
 * processing can be done. With InputSourceRange the header is read separately, so
 * this could be changed.
 */
InputBlock[] readFileData(HasRandomValue hasRandomValue, OutputRange)
(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import tsv_utils.common.utils : InputSourceRange, isFlushableOutputRange,
        throwIfWindowsNewlineOnUnix;

    static if(!hasRandomValue) assert(!cmdopt.printRandom);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;

        if (cmdopt.printRandom)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(inputStream.header);
        outputStream.put("\n");

        /* Immediately flush the header so subsequent processes in a unix command
         * pipeline see it early. This helps provide timely error messages.
         */
        static if (isFlushableOutputRange!OutputRange) outputStream.flush;
    }

    enum BlockSize = 1024L * 1024L * 1024L;  // 1 GB. ('L' notation avoids overflow w/ 2GB+ sizes.)
    enum ReadSize = 1024L * 128L;
    enum NewlineSearchSize = 1024L * 16L;

    InputBlock[] blocks;
    auto blocksAppender = appender(&blocks);
    blocksAppender.reserve(cmdopt.inputSources.length);  // At least one block per file.

    ubyte[] rawReadBuffer = new ubyte[ReadSize];

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        /* If the file size can be determined then read it as a single block.
         * Otherwise read as multiple blocks. File.size() returns ulong.max
         * if file size cannot be determined, so we'll combine that check
         * with the standard input case.
         */

        immutable ulong filesize = inputStream.isStdin ? ulong.max : inputStream.file.size;
        auto ifile = inputStream.file;

        if (filesize != ulong.max)
        {
            readFileDataAsOneBlock(inputStream.name, ifile, filesize,
                                   blocksAppender, rawReadBuffer);
        }
        else
        {
            readFileDataAsMultipleBlocks(
                inputStream.name, ifile, blocksAppender, rawReadBuffer,
                BlockSize, NewlineSearchSize);
        }
    }
    return blocks;
}
1691

1692
/* readFileData() helper function. Read data from a File handle as a single block. The
 * new block is appended to an existing InputBlock[] array.
 *
 * This is the readFileData code path used when a file's size is known up front.
 * initialBlockSize is normally the file size, so the block's buffer is reserved once
 * and the read loop only copies bytes into it.
 *
 * This routine has been separated out to enable unit testing. At present it is not
 * intended as a general API. See readFileData for more info.
 */
private void readFileDataAsOneBlock(
    string filename,
    ref File ifile,
    const ulong initialBlockSize,
    ref RefAppender!(InputBlock[]) blocksAppender,
    ref ubyte[] rawReadBuffer)
{
    /* Start a fresh block (block zero for this file) and write into it in place. */
    blocksAppender.put(InputBlock(filename, 0));
    auto blockData = appender(&(blocksAppender.data[$-1].data));
    blockData.reserve(initialBlockSize);

    /* Copy the entire file, chunk by chunk, into the preallocated block. */
    foreach (ref ubyte[] chunk; ifile.byChunk(rawReadBuffer)) blockData.put(cast(char[]) chunk);
}
1718

1719
/* readFileData() helper function. Read data from a File handle as one or more blocks.
 * Blocks are appended to an existing InputBlock[] array.
 *
 * readFileDataAsMultipleBlocks is part of the readFileData logic. It handles the case
 * where a file or standard input is being read as a series of blocks. This is the
 * standard approach for standard input, but also applies when the file size cannot be
 * determined.
 *
 * Invariant maintained: every block except a file's last block ends with a newline,
 * so no line ever spans two blocks. 'blockSize' is the target capacity of each block;
 * 'newlineSearchSize' bounds how far back a full block is searched for a newline
 * before giving up and letting the block grow.
 *
 * This routine has been separated out to enable unit testing. At present it is not
 * intended as a general API. See readFileData for more info.
 */
private void readFileDataAsMultipleBlocks(
    string filename,
    ref File ifile,
    ref RefAppender!(InputBlock[]) blocksAppender,
    ref ubyte[] rawReadBuffer,
    const size_t blockSize,
    const size_t newlineSearchSize)
{
    import std.algorithm : find, min;
    import std.range : retro;

    assert(ifile.isOpen);

    /* Create a new block for the file and an Appender for writing data.
     */
    blocksAppender.put(InputBlock(filename, 0));
    auto dataAppender = appender(&(blocksAppender.data[$-1].data));
    dataAppender.reserve(blockSize);
    size_t blockNumber = 0;

    /* Read all the data and copy it to an InputBlock. */
    foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer))
    {
        assert(blockNumber == blocksAppender.data[$-1].fileBlockNumber);

        immutable size_t remainingCapacity = dataAppender.capacity - dataAppender.data.length;

        if (buffer.length <= remainingCapacity)
        {
            /* Fast path: the whole read buffer fits in the current block. */
            dataAppender.put(cast(char[]) buffer);
        }
        else
        {
            /* Look for the last newline in the input buffer that fits in remaining
             * capacity of the block.
             */
            auto searchRegion = buffer[0 .. remainingCapacity];
            auto appendRegion = searchRegion.retro.find('\n').source;

            if (appendRegion.length > 0)
            {
                /* Copy the first part of the read buffer to the block. The append
                 * region ends just past a newline, preserving the block invariant. */
                dataAppender.put(cast(char[]) appendRegion);

                /* Create a new InputBlock and copy the remaining data to it. */
                blockNumber++;
                blocksAppender.put(InputBlock(filename, blockNumber));
                dataAppender = appender(&(blocksAppender.data[$-1].data));
                dataAppender.reserve(blockSize);
                dataAppender.put(cast(char[]) buffer[appendRegion.length .. $]);

                assert(blocksAppender.data.length >= 2);
                assert(blocksAppender.data[$-2].data[$-1] == '\n');
            }
            else
            {
                /* Search backward in the current block for a newline. If found, it
                 * becomes the last newline in the current block. Anything following
                 * it is moved to a new block. If a newline is not found, simply append
                 * to the current block and let it grow. We'll only search backward
                 * so far.
                 */
                immutable size_t currBlockLength = blocksAppender.data[$-1].data.length;
                immutable size_t searchLength = min(currBlockLength, newlineSearchSize);
                immutable size_t searchStart = currBlockLength - searchLength;
                auto blockSearchRegion = blocksAppender.data[$-1].data[searchStart .. $];
                /* retro.find('\n').source.length is the offset one past the last
                 * newline in the searched region; zero means no newline was found. */
                auto lastNewlineOffset = blockSearchRegion.retro.find('\n').source.length;

                if (lastNewlineOffset != 0)
                {
                    /* Create a new InputBlock. The previous InputBlock is then found
                     * at blocksAppender.data[$-2]. It may be a physically different
                     * struct (a copy) if the blocks array gets reallocated.
                     */
                    blockNumber++;
                    blocksAppender.put(InputBlock(filename, blockNumber));
                    dataAppender = appender(&(blocksAppender.data[$-1].data));
                    dataAppender.reserve(blockSize);

                    /* Copy data following the newline from the last block to the new
                     * block. Then append the current read buffer.
                     */
                    immutable size_t moveRegionStart = searchStart + lastNewlineOffset;
                    dataAppender.put(blocksAppender.data[$-2].data[moveRegionStart .. $]);
                    dataAppender.put(cast(char[]) buffer);

                    /* Now delete the moved region from the last block. */
                    blocksAppender.data[$-2].data.length = moveRegionStart;

                    assert(blocksAppender.data.length >= 2);
                    assert(blocksAppender.data[$-2].data[$-1] == '\n');
                }
                else
                {
                    /* Give up. Allow the current block to grow. */
                    dataAppender.put(cast(char[]) buffer);
                }
            }
        }
    }
}
1831

1832
/** HasRandomValue is a boolean flag used at compile time by identifyInputLines to
 * distinguish use cases needing random value assignments from those that don't.
 *
 * When Yes, the InputLine instantiation includes a 'randomValue' member.
 */
alias HasRandomValue = Flag!"hasRandomValue";
1836

1837
/** An InputLine array is returned by identifyInputLines to represent each non-header
 * line found in a FileData array. The 'data' member contains the line. A 'randomValue'
 * member is included if random values are being generated.
 */
static struct InputLine(HasRandomValue hasRandomValue)
{
    const(char)[] data;
    static if (hasRandomValue) double randomValue;
}
1846

1847
/** identifyInputLines is used by algorithms that read all files into memory prior to
 * processing. It does the initial processing of the file data.
 *
 * Two main tasks are performed. One is splitting all input data into lines. The second
 * is assigning a random value to the line, if random values are being generated.
 *
 * The key input is an InputBlock array. Normally one block for each file, but standard
 * input may have multiple blocks.
 *
 * The return value is an array of InputLine structs. The struct will have a 'randomValue'
 * member if random values are being assigned. The returned lines slice into the
 * InputBlock data; no copies are made.
 */
InputLine!hasRandomValue[] identifyInputLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted)
(const ref InputBlock[] inputBlocks, ref TsvSampleOptions cmdopt)
{
    import std.algorithm : splitter;
    import std.array : appender;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    /* Weighted assignment only makes sense when random values are being generated. */
    static assert(hasRandomValue || !isWeighted);
    static if(!hasRandomValue) assert(!cmdopt.printRandom);

    InputLine!hasRandomValue[] inputLines;

    auto linesAppender = appender(&inputLines);
    static if (hasRandomValue) auto randomGenerator = Random(cmdopt.seed);

    /* Note: fileLineNum is zero-based here. One-based in most other code in this file. */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 1 : 0;
    size_t fileLineNum = fileBodyStartLine;

    foreach (block; inputBlocks)
    {
        /* Drop the last newline to avoid adding an extra empty line. */
        const data = (block.data.length > 0 && block.data[$-1] == '\n') ?
            block.data[0 .. $-1] : block.data;

        /* Block zero marks the start of a new file; restart the line counter. */
        if (block.fileBlockNumber == 0) fileLineNum = fileBodyStartLine;

        foreach (ref line; data.splitter('\n'))
        {
            fileLineNum++;

            /* fileLineNum can only be 1 here when there is no header line, i.e. the
             * header (line 1) was not checked during argument processing. */
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, block.filename, fileLineNum);

            static if (!hasRandomValue)
            {
                linesAppender.put(InputLine!hasRandomValue(line));
            }
            else
            {
                static if (!isWeighted)
                {
                    immutable double randomValue = uniform01(randomGenerator);
                }
                else
                {
                    /* Weighted value: u ^^ (1/weight), so larger weights push the
                     * value toward 1.0 and sort earlier in a descending sort.
                     * Non-positive weights get 0.0, the lowest possible priority. */
                    immutable double lineWeight =
                        getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
                                             block.filename, fileLineNum);
                    immutable double randomValue =
                        (lineWeight > 0.0)
                        ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
                        : 0.0;
                }

                linesAppender.put(InputLine!hasRandomValue(line, randomValue));
            }
        }
    }

    return inputLines;
}
1921

1922

1923
/* Unit tests for ReadFileData. These tests focus on multiple InputBlock scenarios.
 * Other use paths are well tested by the tests at the end cases.
 *
 * Three files with different line-length profiles are generated, then read back with
 * a variety of read-buffer, block, and newline-search sizes to exercise the block
 * splitting paths in readFileDataAsMultipleBlocks.
 */
unittest
{
    import tsv_utils.common.unittest_utils;
    import std.algorithm : equal, find, joiner, splitter;
    import std.array : appender;
    import std.file : rmdirRecurse;
    import std.path : buildPath;
    import std.range : repeat;

    auto rfdTestDir = makeUnittestTempDir("tsv_sample_readFileData");
    scope(exit) rfdTestDir.rmdirRecurse;

    char[] file1Data;
    char[] file2Data;
    char[] file3Data;

    auto app1 = appender(&file1Data);
    auto app2 = appender(&file2Data);
    auto app3 = appender(&file3Data);

    /* File 1: 1000 short lines. */
    app1.put("\n".repeat(100).joiner);
    app1.put("x\n".repeat(100).joiner);
    app1.put("yz\n".repeat(100).joiner);
    app1.put("pqr\n".repeat(100).joiner);
    app1.put("a\nbc\ndef\n".repeat(100).joiner);
    app1.put('\n'.repeat(100));
    app1.put("z\n".repeat(100).joiner);
    app1.put("xy\n".repeat(100).joiner);

    /* File 2: 500 longer lines. */
    app2.put(
        "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n"
        .repeat(100)
        .joiner);
    app2.put(
        "|abcdefghijklmnopqrstuv|\n|0123456789|\n|0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ|\n|abcdefghijklmnopqrstuvwxyz|\n"
        .repeat(100)
        .joiner);
    app2.put(
         "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n"
        .repeat(100)
        .joiner);

    /* File 3: 1000 mixed length lines. */
    app3.put("\n\n|abcde|\n1\n12\n123\n|abcdefghijklmnop|\n|xyz|\n0123456789\nX\n".repeat(100).joiner);

    string file1Path = buildPath(rfdTestDir, "file1.txt");
    string file2Path = buildPath(rfdTestDir, "file2.txt");
    string file3Path = buildPath(rfdTestDir, "file3.txt");

    try
    {
        auto ofile1 = File(file1Path, "w");
        ofile1.write(file1Data);
    }
    catch (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file1Path, e.msg));

    try
    {
        auto ofile2 = File(file2Path, "w");
        ofile2.write(file2Data);
    }
    catch (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file2Path, e.msg));

    try
    {
        auto ofile3 = File(file3Path, "w");
        ofile3.write(file3Data);
    }
    catch  (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file3Path, e.msg));

    /* Expected lines: the concatenated file data split on newlines, dropping the
     * empty trailing element produced by the final newline. */
    auto allData = file1Data ~ file2Data ~ file3Data;
    auto expectedLines = allData.splitter('\n').array[0 .. $-1];

    auto file2DataNoHeader = (file2Data.find('\n'))[1 .. $];
    auto file3DataNoHeader = (file3Data.find('\n'))[1 .. $];
    auto allDataUsingHeader = file1Data ~ file2DataNoHeader ~ file3DataNoHeader;
    auto expectedLinesUsingHeader = allDataUsingHeader.splitter('\n').array[0 .. $-1];

    assert(expectedLines.length == expectedLinesUsingHeader.length + 2);

    TsvSampleOptions cmdoptNoHeader;
    auto noHeaderCmdArgs = ["unittest", file1Path];
    auto r1 = cmdoptNoHeader.processArgs(noHeaderCmdArgs);
    assert(r1[0], format("Invalid command lines arg: '%s'.", noHeaderCmdArgs));

    TsvSampleOptions cmdoptYesHeader;
    auto yesHeaderCmdArgs = ["unittest", "--header", file1Path];
    auto r2 = cmdoptYesHeader.processArgs(yesHeaderCmdArgs);
    assert(r2[0], format("Invalid command lines arg: '%s'.", yesHeaderCmdArgs));

    /* Only used by the disabled header-processing test below (version(none)). */
    auto outputStream = appender!(char[])();

    {
        /* Reading as single blocks. */
        ubyte[] rawReadBuffer = new ubyte[256];
        InputBlock[] blocks;
        auto blocksAppender = appender(&blocks);
        blocksAppender.reserve(3);
        foreach (f; [ file1Path, file2Path, file3Path ])
        {
            auto ifile = f.File;
            ulong filesize = ifile.size;
            if (filesize == ulong.max) filesize = 1000;
            readFileDataAsOneBlock(f, ifile, filesize, blocksAppender, rawReadBuffer);
        }
        auto inputLines =
            identifyInputLines!(No.hasRandomValue, No.isWeighted)(
                blocks, cmdoptNoHeader);

        assert(equal!((a, b) => a.data == b)(inputLines, expectedLines));
    }

    {
        /* Reading as multiple blocks. The size combinations force frequent block
         * splits, exercising every path in readFileDataAsMultipleBlocks. */
        foreach (size_t searchSize; [ 0, 1, 2, 64 ])
        {
            foreach (size_t blockSize; [ 1, 2, 16, 64, 256 ])
            {
                foreach (size_t readSize; [ 1, 2, 8, 32 ])
                {
                    ubyte[] rawReadBuffer = new ubyte[readSize];
                    InputBlock[] blocks;
                    auto blocksAppender = appender(&blocks);
                    blocksAppender.reserve(3);
                    foreach (f; [ file1Path, file2Path, file3Path ])
                    {
                        auto ifile = f.File;
                        readFileDataAsMultipleBlocks(f, ifile, blocksAppender,
                                                     rawReadBuffer, blockSize, searchSize);
                    }
                    auto inputLines =
                        identifyInputLines!(No.hasRandomValue, No.isWeighted)(
                            blocks, cmdoptNoHeader);

                    assert(equal!((a, b) => a.data == b)(inputLines, expectedLines));
                }
            }
        }
    }
    /* The following test is disabled (version(none)). */
    version(none) {
    {
        /* Reading as multiple blocks, with header processing. */
        const size_t readSize = 32;
        const size_t blockSize = 48;
        const size_t searchSize = 16;

        ubyte[] rawReadBuffer = new ubyte[readSize];
        InputBlock[] blocks;
        auto blocksAppender = appender(&blocks);
        blocksAppender.reserve(3);
        foreach (f; [ file1Path, file2Path, file3Path ])
        {
            auto ifile = f.File;
            readFileDataAsMultipleBlocks(f, ifile, blocksAppender,
                                         rawReadBuffer, blockSize, searchSize);
        }
        auto inputLines =
            identifyInputLines!(No.hasRandomValue, No.isWeighted)(
                blocks, cmdoptYesHeader);

        assert(outputStream.data == expectedLinesUsingHeader[0] ~ '\n');
        assert(equal!((a, b) => a.data == b)(inputLines, expectedLinesUsingHeader[1 .. $]));
    }
    }
}
2093

2094
/** Write a floating point random value to an output stream.
 *
 * This routine is used for floating point random value printing. This routine writes
 * 17 significant digits, the range available in doubles. This routine prefers decimal
 * format, without exponents. It will generate somewhat large precision numbers,
 * currently up to 28 digits, before switching to exponents.
 *
 * The primary reason for this approach is to enable faster sorting on random values
 * by GNU sort and similar external sorting programs. GNU sort is dramatically faster
 * on decimal format numeric sorts ('n' switch) than general numeric sorts ('g' switch).
 * The 'general numeric' handles exponential notation. The difference is 5-10x.
 *
 * Random values generated by Bernoulli sampling are nearly always greater than 1e-12.
 * No examples less than 1e-09 were seen in hundred of millions of trials. Similar
 * results were seen with weighted sampling with integer weights. The same is not true
 * with floating point weights. These produce quite large exponents. However, even
 * for floating point weights this can be useful. For random weights [0,1] less than 5%
 * will be less than 1e-12 and use exponential notation.
 */
void formatRandomValue(OutputRange)(auto ref OutputRange outputStream, double value)
if (isOutputRange!(OutputRange, char))
{
    import std.format : format, formatValue, singleSpec;

    /* Decimal-format thresholds. A value satisfying value >= decimalThresholds[i]
     * (and failing all earlier thresholds) is printed with "%.<17+i>f", keeping 17
     * significant digits. Values below the final threshold — and NaN, which fails
     * every '>=' comparison — fall through to "%.17g".
     */
    static immutable double[12] decimalThresholds =
        [ 1e-01, 1e-02, 1e-03, 1e-04, 1e-05, 1e-06,
          1e-07, 1e-08, 1e-09, 1e-10, 1e-11, 1e-12 ];

    /* Advance while the value does NOT meet the threshold. The negated form keeps
     * NaN on the fall-through path, matching a ternary ladder of '>=' tests. */
    size_t idx = 0;
    while (idx < decimalThresholds.length && !(value >= decimalThresholds[idx])) ++idx;

    if (idx < decimalThresholds.length)
    {
        immutable decimalSpec = singleSpec(format("%%.%sf", 17 + idx));
        outputStream.formatValue(value, decimalSpec);
    }
    else
    {
        immutable generalSpec = singleSpec("%.17g");
        outputStream.formatValue(value, generalSpec);
    }
}
2149

2150
@safe unittest
{
    import std.array : appender;

    /* Table-driven check: each value must format to exactly the expected string. */
    static struct TestCase { double value; string expected; }

    immutable TestCase[] cases =
        [TestCase(1.0,   "1.00000000000000000"),
         TestCase(0.1,   "0.10000000000000001"),
         TestCase(0.01,  "0.010000000000000000"),
         TestCase(1e-03, "0.0010000000000000000"),
         TestCase(1e-04, "0.00010000000000000000"),
         TestCase(1e-05, "0.000010000000000000001"),
         TestCase(1e-06, "0.0000010000000000000000"),
         TestCase(1e-07, "0.00000010000000000000000"),
         TestCase(1e-08, "0.000000010000000000000000"),
         TestCase(1e-09, "0.0000000010000000000000001"),
         TestCase(1e-10, "0.00000000010000000000000000"),
         TestCase(1e-11, "0.000000000009999999999999999"),
         TestCase(1e-12, "0.0000000000010000000000000000"),
         TestCase(1e-13, "1e-13"),
         TestCase(1e-14, "1e-14"),
         TestCase(12345678901234567e-15, "12.34567890123456735"),
         TestCase(12345678901234567e-16, "1.23456789012345669"),
         TestCase(12345678901234567e-17, "0.12345678901234566"),
         TestCase(12345678901234567e-18, "0.012345678901234567"),
         TestCase(12345678901234567e-19, "0.0012345678901234567"),
         TestCase(12345678901234567e-20, "0.00012345678901234567"),
         TestCase(12345678901234567e-21, "0.000012345678901234568"),
         TestCase(12345678901234567e-22, "0.0000012345678901234567"),
         TestCase(12345678901234567e-23, "0.00000012345678901234566"),
         TestCase(12345678901234567e-24, "0.000000012345678901234567"),
         TestCase(12345678901234567e-25, "0.0000000012345678901234566"),
         TestCase(12345678901234567e-26, "0.00000000012345678901234568"),
         TestCase(12345678901234567e-27, "0.000000000012345678901234567"),
         TestCase(12345678901234567e-28, "0.0000000000012345678901234567"),
         TestCase(12345678901234567e-29, "1.2345678901234566e-13")];

    foreach (tc; cases)
    {
        auto sink = appender!string();
        sink.formatRandomValue(tc.value);
        assert(sink.data == tc.expected,
               format("[testFormatValue] value: %g; expected: %s; actual: %s",
                      tc.value, tc.expected, sink.data));
    }
}
2193

2194

2195
import std.traits : isSomeChar;

/** Convenience function for extracting a single field from a line. See
 * [tsv_utils.common.utils.getTsvFieldValue] for details. This wrapper creates error
 * text tailored for this program.
 *
 * Params:
 *   line = input line to extract from
 *   fieldIndex = zero-based index of the field to extract
 *   delim = field delimiter character
 *   filename = input file name, used in error messages ("-" means standard input)
 *   lineNum = one-based line number, used in error messages
 *
 * Returns: the field value converted to type T.
 *
 * Throws: Exception with a program-tailored message when the field cannot be
 * converted to T or the line does not have enough fields.
 */
T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, ulong lineNum) pure @safe
if (isSomeChar!C)
{
    import std.conv : ConvException, to;
    import tsv_utils.common.utils : getTsvFieldValue;

    try
    {
        return getTsvFieldValue!T(line, fieldIndex, delim);
    }
    catch (ConvException exc)
    {
        /* Field present but not convertible to type T. */
        immutable inputName = (filename == "-") ? "Standard Input" : filename;
        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s%s",
                   exc.msg, inputName, lineNum,
                   (lineNum == 1) ? "\n  Is this a header line? Use --H|header to skip." : ""));
    }
    catch (Exception exc)
    {
        /* Not enough fields on the line. */
        immutable inputName = (filename == "-") ? "Standard Input" : filename;
        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s",
                   exc.msg, inputName, lineNum));
    }
}
2228

2229
@safe unittest
{
    /* getFieldValue unit tests. getTsvFieldValue has its own tests.
     * These tests make basic sanity checks on the getFieldValue wrapper:
     * valid numeric fields convert successfully; conversion failures and
     * out-of-range field indexes throw (the filename/lineNum arguments only
     * affect the generated error message text).
     */
    import std.exception;

    assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123);
    assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4);
    assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 1));
    assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 2));
    assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 1));
    assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 2));
}
2243

2244
/* Unit tests for the main program start here.
2245
 *
2246
 * Portability note: Many of the tests here rely on generating consistent random numbers
2247
 * across different platforms when using the same random seed. So far this has succeeded
2248
 * on several different platform, compiler, and library versions. However, it is certainly
2249
 * possible this condition will not hold on other platforms.
2250
 *
2251
 * For tsv-sample, this portability implies generating the same results on different
2252
 * platforms when using the same random seed. This is NOT part of tsv-sample guarantees,
2253
 * but it is convenient for testing. If platforms are identified that do not generate
2254
 * the same results these tests will need to be adjusted.
2255
 */
2256
version(unittest)
{
    /* Unit test helper functions. */

    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.conv : to;

    /** Runs tsvSample as if invoked from the command line and checks the output.
     *
     * cmdArgs is processed as a command line (argv-style, program name first),
     * tsvSample is run with the resulting options, and the captured output is
     * compared to the expected rows. Assert failures with diagnostic messages
     * are triggered on argument processing errors or output mismatches.
     */
    void testTsvSample(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");

        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            return format("[testTsvSample] %s: " ~ msg, cmdArgs[0], formatArgs);
        }

        TsvSampleOptions cmdopt;
        immutable savedCmdArgs = cmdArgs.to!string;   // processArgs mutates cmdArgs.
        auto parseResult = cmdopt.processArgs(cmdArgs);
        assert(parseResult[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));

        auto sink = appender!(char[])();

        tsvSample(cmdopt, sink);    // This invokes the main code line.

        auto expectedOutput = expected.tsvDataToString;

        assert(sink.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, sink.data.to!string));
    }
}
2291

2292
unittest
2293
{
2294
    import std.path : buildPath;
2295
    import std.file : rmdirRecurse;
2296

2297 1
    auto testDir = makeUnittestTempDir("tsv_sample");
2298 1
    scope(exit) testDir.rmdirRecurse;
2299

2300
    /* Tabular data sets and expected results use the built-in static seed.
2301
     * Tests are run by writing the data set to a file, then calling the main
2302
     * routine to process. The function testTsvSample plays the role of the
2303
     * main program. Rather than writing to expected output, the results are
2304
     * matched against expected. The expected results were verified by hand
2305
     * prior to inclusion in the test.
2306
     *
2307
     * The initial part of this section is simply setting up data files and
2308
     * expected results.
2309
     *
2310
     * Expected results naming conventions:
2311
     *  - Prefix: dataNxMExpected. N and M are numbers. e.g. data3x6Expected
2312
     *  - Sampling Type (required): Permute (Shuffle), Sample, Replace, Bernoulli, Distinct
2313
     *  - Compatibility: Compat, AlgoR, Skip, Swap, Inorder
2314
     *  - Weight Field: Wt<num>, e.g. Wt3
2315
     *  - Sample Size: Num<num>, eg. Num3
2316
     *  - Seed Value: V<num>, eg. V77
2317
     *  - Key Field: K<num>, e.g. K2
2318
     *  - Probability: P<num>, e.g P05 (5%)
2319
     *  - Printing Probabilities: Probs
2320
     *  - Printing Probs in order: ProbsInorder
2321
     *  - Printing Probs with custom header: RVCustom
2322
     */
2323

2324
    /* Empty file. */
2325 1
    string[][] dataEmpty = [];
2326 1
    string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv");
2327 1
    writeUnittestTsvFile(fpath_dataEmpty, dataEmpty);
2328

2329
    /* 3x0, header only. */
2330 1
    string[][] data3x0 = [["field_a", "field_b", "field_c"]];
2331 1
    string fpath_data3x0 = buildPath(testDir, "data3x0.tsv");
2332 1
    writeUnittestTsvFile(fpath_data3x0, data3x0);
2333

2334
    /* 3x1 */
2335 1
    string[][] data3x1 =
2336
        [["field_a", "field_b", "field_c"],
2337
         ["tan", "タン", "8.5"]];
2338

2339 1
    string fpath_data3x1 = buildPath(testDir, "data3x1.tsv");
2340 1
    string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv");
2341 1
    writeUnittestTsvFile(fpath_data3x1, data3x1);
2342 1
    writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1 .. $]);
2343

2344 1
    string[][] data3x1ExpectedReplaceNum3 =
2345
        [["field_a", "field_b", "field_c"],
2346
         ["tan", "タン", "8.5"],
2347
         ["tan", "タン", "8.5"],
2348
         ["tan", "タン", "8.5"]];
2349

2350
    /* 3x2 */
2351 1
    string[][] data3x2 =
2352
        [["field_a", "field_b", "field_c"],
2353
         ["brown", "褐色", "29.2"],
2354
         ["gray", "グレー", "6.2"]];
2355

2356 1
    string fpath_data3x2 = buildPath(testDir, "data3x2.tsv");
2357 1
    string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv");
2358 1
    writeUnittestTsvFile(fpath_data3x2, data3x2);
2359 1
    writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1 .. $]);
2360

2361 1
    string[][] data3x2PermuteCompat =
2362
        [["field_a", "field_b", "field_c"],
2363
         ["gray", "グレー", "6.2"],
2364
         ["brown", "褐色", "29.2"]];
2365

2366 1
    string[][] data3x2PermuteShuffle =
2367
        [["field_a", "field_b", "field_c"],
2368
         ["gray", "グレー", "6.2"],
2369
         ["brown", "褐色", "29.2"]];
2370

2371
    /* 3x3 */
2372 1
    string[][] data3x3 =
2373
        [["field_a", "field_b", "field_c"],
2374
         ["orange", "オレンジ", "2.5"],
2375
         ["pink", "ピンク", "1.1"],
2376
         ["purple", "紫の", "42"]];
2377

2378 1
    string fpath_data3x3 = buildPath(testDir, "data3x3.tsv");
2379 1
    string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv");
2380 1
    writeUnittestTsvFile(fpath_data3x3, data3x3);
2381 1
    writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1 .. $]);
2382

2383 1
    string[][] data3x3ExpectedPermuteCompat =
2384
        [["field_a", "field_b", "field_c"],
2385
         ["purple", "紫の", "42"],
2386
         ["pink", "ピンク", "1.1"],
2387
         ["orange", "オレンジ", "2.5"]];
2388

2389 1
    string[][] data3x3ExpectedPermuteSwap =
2390
        [["field_a", "field_b", "field_c"],
2391
         ["purple", "紫の", "42"],
2392
         ["orange", "オレンジ", "2.5"],
2393
         ["pink", "ピンク", "1.1"]];
2394

2395
    /* 3x6 */
2396 1
    string[][] data3x6 =
2397
        [["field_a", "field_b", "field_c"],
2398
         ["red", "赤", "23.8"],
2399
         ["green", "緑", "0.0072"],
2400
         ["white", "白", "1.65"],
2401
         ["yellow", "黄", "12"],
2402
         ["blue", "青", "12"],
2403
         ["black", "黒", "0.983"]];
2404 1
    string fpath_data3x6 = buildPath(testDir, "data3x6.tsv");
2405 1
    string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv");
2406 1
    writeUnittestTsvFile(fpath_data3x6, data3x6);
2407 1
    writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1 .. $]);
2408

2409
    // Randomization, all lines
2410 1
    string[][] data3x6ExpectedPermuteCompat =
2411
        [["field_a", "field_b", "field_c"],
2412
         ["yellow", "黄", "12"],
2413
         ["black", "黒", "0.983"],
2414
         ["blue", "青", "12"],
2415
         ["white", "白", "1.65"],
2416
         ["green", "緑", "0.0072"],
2417
         ["red", "赤", "23.8"]];
2418

2419 1
    string[][] data3x6ExpectedPermuteSwap =
2420
        [["field_a", "field_b", "field_c"],
2421
         ["black", "黒", "0.983"],
2422
         ["green", "緑", "0.0072"],
2423
         ["red", "赤", "23.8"],
2424
         ["yellow", "黄", "12"],
2425
         ["white", "白", "1.65"],
2426
         ["blue", "青", "12"]];
2427

2428 1
    string[][] data3x6ExpectedPermuteCompatProbs =
2429
        [["random_value", "field_a", "field_b", "field_c"],
2430
         ["0.96055546286515892", "yellow", "黄", "12"],
2431
         ["0.75710153928957880", "black", "黒", "0.983"],
2432
         ["0.52525980887003243", "blue", "青", "12"],
2433
         ["0.49287854949943721", "white", "白", "1.65"],
2434
         ["0.15929344086907804", "green", "緑", "0.0072"],
2435
         ["0.010968807619065046", "red", "赤", "23.8"]];
2436

2437
    /* Note: data3x6ExpectedSampleAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because
2438
     * both are effectively the same algorithm given that --num is data length. Both read
2439
     * in the full data in order then call randomShuffle.
2440
     */
2441 1
    string[][] data3x6ExpectedSampleAlgoRNum6 =
2442
        [["field_a", "field_b", "field_c"],
2443
         ["black", "黒", "0.983"],
2444
         ["green", "緑", "0.0072"],
2445
         ["red", "赤", "23.8"],
2446
         ["yellow", "黄", "12"],
2447
         ["white", "白", "1.65"],
2448
         ["blue", "青", "12"]];
2449

2450 1
    string[][] data3x6ExpectedSampleAlgoRNum5 =
2451
        [["field_a", "field_b", "field_c"],
2452
         ["red", "赤", "23.8"],
2453
         ["black", "黒", "0.983"],
2454
         ["white", "白", "1.65"],
2455
         ["green", "緑", "0.0072"],
2456
         ["yellow", "黄", "12"]];
2457

2458 1
    string[][] data3x6ExpectedSampleAlgoRNum4 =
2459
        [["field_a", "field_b", "field_c"],
2460
         ["blue", "青", "12"],
2461
         ["green", "緑", "0.0072"],
2462
         ["black", "黒", "0.983"],
2463
         ["white", "白", "1.65"]];
2464

2465 1
    string[][] data3x6ExpectedSampleAlgoRNum3 =
2466
        [["field_a", "field_b", "field_c"],
2467
         ["red", "赤", "23.8"],
2468
         ["black", "黒", "0.983"],
2469
         ["green", "緑", "0.0072"]];
2470

2471 1
    string[][] data3x6ExpectedSampleAlgoRNum2 =
2472
        [["field_a", "field_b", "field_c"],
2473
         ["black", "黒", "0.983"],
2474
         ["red", "赤", "23.8"]];
2475

2476 1
    string[][] data3x6ExpectedSampleAlgoRNum1 =
2477
        [["field_a", "field_b", "field_c"],
2478
         ["green", "緑", "0.0072"]];
2479

2480
    /* Inorder versions. */
2481 1
    string[][] data3x6ExpectedSampleAlgoRNum6Inorder =
2482
        [["field_a", "field_b", "field_c"],
2483
         ["red", "赤", "23.8"],
2484
         ["green", "緑", "0.0072"],
2485
         ["white", "白", "1.65"],
2486
         ["yellow", "黄", "12"],
2487
         ["blue", "青", "12"],
2488
         ["black", "黒", "0.983"]];
2489

2490 1
    string[][] data3x6ExpectedSampleAlgoRNum5Inorder =
2491
        [["field_a", "field_b", "field_c"],
2492
         ["red", "赤", "23.8"],
2493
         ["green", "緑", "0.0072"],
2494
         ["white", "白", "1.65"],
2495
         ["yellow", "黄", "12"],
2496
         ["black", "黒", "0.983"]];
2497

2498 1
    string[][] data3x6ExpectedSampleAlgoRNum4Inorder =
2499
        [["field_a", "field_b", "field_c"],
2500
         ["green", "緑", "0.0072"],
2501
         ["white", "白", "1.65"],
2502
         ["blue", "青", "12"],
2503
         ["black", "黒", "0.983"]];
2504

2505 1
    string[][] data3x6ExpectedSampleAlgoRNum3Inorder =
2506
        [["field_a", "field_b", "field_c"],
2507
         ["red", "赤", "23.8"],
2508
         ["green", "緑", "0.0072"],
2509
         ["black", "黒", "0.983"]];
2510

2511 1
    string[][] data3x6ExpectedSampleAlgoRNum2Inorder =
2512
        [["field_a", "field_b", "field_c"],
2513
         ["red", "赤", "23.8"],
2514
         ["black", "黒", "0.983"]];
2515

2516 1
    string[][] data3x6ExpectedSampleAlgoRNum1Inorder =
2517
        [["field_a", "field_b", "field_c"],
2518
         ["green", "緑", "0.0072"]];
2519

2520
    /* Reservoir inorder */
2521 1
    string[][] data3x6ExpectedSampleCompatNum6Inorder =
2522
        [["field_a", "field_b", "field_c"],
2523
         ["red", "赤", "23.8"],
2524
         ["green", "緑", "0.0072"],
2525
         ["white", "白", "1.65"],
2526
         ["yellow", "黄", "12"],
2527
         ["blue", "青", "12"],
2528
         ["black", "黒", "0.983"]];
2529

2530 1
    string[][] data3x6ExpectedSampleCompatNum5Inorder =
2531
        [["field_a", "field_b", "field_c"],
2532
         ["green", "緑", "0.0072"],
2533
         ["white", "白", "1.65"],
2534
         ["yellow", "黄", "12"],
2535
         ["blue", "青", "12"],
2536
         ["black", "黒", "0.983"]];
2537

2538 1
    string[][] data3x6ExpectedSampleCompatNum4Inorder =
2539
        [["field_a", "field_b", "field_c"],
2540
         ["white", "白", "1.65"],
2541
         ["yellow", "黄", "12"],
2542
         ["blue", "青", "12"],
2543
         ["black", "黒", "0.983"]];
2544

2545 1
    string[][] data3x6ExpectedSampleCompatNum3Inorder =
2546
        [["field_a", "field_b", "field_c"],
2547
         ["yellow", "黄", "12"],
2548
         ["blue", "青", "12"],
2549
         ["black", "黒", "0.983"]];
2550

2551 1
    string[][] data3x6ExpectedSampleCompatNum2Inorder =
2552
        [["field_a", "field_b", "field_c"],
2553
         ["yellow", "黄", "12"],
2554
         ["black", "黒", "0.983"]];
2555

2556 1
    string[][] data3x6ExpectedSampleCompatNum1Inorder =
2557
        [["field_a", "field_b", "field_c"],
2558
         ["yellow", "黄", "12"]];
2559

2560

2561
    /* Reservoir inorder with probabilities. */
2562 1
    string[][] data3x6ExpectedSampleCompatNum6ProbsInorder =
2563
        [["random_value", "field_a", "field_b", "field_c"],
2564
         ["0.010968807619065046", "red", "赤", "23.8"],
2565
         ["0.15929344086907804", "green", "緑", "0.0072"],
2566
         ["0.49287854949943721", "white", "白", "1.65"],
2567
         ["0.96055546286515892", "yellow", "黄", "12"],
2568
         ["0.52525980887003243", "blue", "青", "12"],
2569
         ["0.75710153928957880", "black", "黒", "0.983"]];
2570

2571 1
    string[][] data3x6ExpectedSampleCompatNum5ProbsInorder =
2572
        [["random_value", "field_a", "field_b", "field_c"],
2573
         ["0.15929344086907804", "green", "緑", "0.0072"],
2574
         ["0.49287854949943721", "white", "白", "1.65"],
2575
         ["0.96055546286515892", "yellow", "黄", "12"],
2576
         ["0.52525980887003243", "blue", "青", "12"],
2577
         ["0.75710153928957880", "black", "黒", "0.983"]];
2578

2579 1
    string[][] data3x6ExpectedSampleCompatNum4ProbsInorder =
2580
        [["random_value", "field_a", "field_b", "field_c"],
2581
         ["0.49287854949943721", "white", "白", "1.65"],
2582
         ["0.96055546286515892", "yellow", "黄", "12"],
2583
         ["0.52525980887003243", "blue", "青", "12"],
2584
         ["0.75710153928957880", "black", "黒", "0.983"]];
2585

2586 1
    string[][] data3x6ExpectedSampleCompatNum3ProbsInorder =
2587
        [["random_value", "field_a", "field_b", "field_c"],
2588
         ["0.96055546286515892", "yellow", "黄", "12"],
2589
         ["0.52525980887003243", "blue", "青", "12"],
2590
         ["0.75710153928957880", "black", "黒", "0.983"]];
2591

2592 1
    string[][] data3x6ExpectedSampleCompatNum2ProbsInorder =
2593
        [["random_value", "field_a", "field_b", "field_c"],
2594
         ["0.96055546286515892", "yellow", "黄", "12"],
2595
         ["0.75710153928957880", "black", "黒", "0.983"]];
2596

2597 1
    string[][] data3x6ExpectedSampleCompatNum1ProbsInorder =
2598
        [["random_value", "field_a", "field_b", "field_c"],
2599
         ["0.96055546286515892", "yellow", "黄", "12"]];
2600

2601 1
    string[][] data3x6ExpectedWt3Num6Inorder =
2602
        [["field_a", "field_b", "field_c"],
2603
         ["red", "赤", "23.8"],
2604
         ["green", "緑", "0.0072"],
2605
         ["white", "白", "1.65"],
2606
         ["yellow", "黄", "12"],
2607
         ["blue", "青", "12"],
2608
         ["black", "黒", "0.983"]];
2609

2610 1
    string[][] data3x6ExpectedWt3Num5Inorder =
2611
        [["field_a", "field_b", "field_c"],
2612
         ["green", "緑", "0.0072"],
2613
         ["white", "白", "1.65"],
2614
         ["yellow", "黄", "12"],
2615
         ["blue", "青", "12"],
2616
         ["black", "黒", "0.983"]];
2617

2618 1
    string[][] data3x6ExpectedWt3Num4Inorder =
2619
        [["field_a", "field_b", "field_c"],
2620
         ["white", "白", "1.65"],
2621
         ["yellow", "黄", "12"],
2622
         ["blue", "青", "12"],
2623
         ["black", "黒", "0.983"]];
2624

2625 1
    string[][] data3x6ExpectedWt3Num3Inorder =
2626
        [["field_a", "field_b", "field_c"],
2627
         ["yellow", "黄", "12"],
2628
         ["blue", "青", "12"],
2629
         ["black", "黒", "0.983"]];
2630

2631 1
    string[][] data3x6ExpectedWt3Num2Inorder =
2632
        [["field_a", "field_b", "field_c"],
2633
         ["yellow", "黄", "12"],
2634
         ["black", "黒", "0.983"]];
2635

2636 1
    string[][] data3x6ExpectedWt3Num1Inorder =
2637
        [["field_a", "field_b", "field_c"],
2638
         ["yellow", "黄", "12"]];
2639

2640

2641 1
    string[][] data3x6ExpectedBernoulliProbsP100 =
2642
        [["random_value", "field_a", "field_b", "field_c"],
2643
         ["0.010968807619065046", "red", "赤", "23.8"],
2644
         ["0.15929344086907804", "green", "緑", "0.0072"],
2645
         ["0.49287854949943721", "white", "白", "1.65"],
2646
         ["0.96055546286515892", "yellow", "黄", "12"],
2647
         ["0.52525980887003243", "blue", "青", "12"],
2648
         ["0.75710153928957880", "black", "黒", "0.983"]];
2649

2650 1
    string[][] data3x6ExpectedBernoulliCompatProbsP60 =
2651
        [["random_value", "field_a", "field_b", "field_c"],
2652
         ["0.010968807619065046", "red", "赤", "23.8"],
2653
         ["0.15929344086907804", "green", "緑", "0.0072"],
2654
         ["0.49287854949943721", "white", "白", "1.65"],
2655
         ["0.52525980887003243", "blue", "青", "12"]];
2656

2657 1
    string[][] data3x6ExpectedBernoulliSkipP40 =
2658
        [["field_a", "field_b", "field_c"],
2659
         ["red", "赤", "23.8"],
2660
         ["green", "緑", "0.0072"],
2661
         ["yellow", "黄", "12"]];
2662

2663 1
    string[][] data3x6ExpectedBernoulliCompatP60 =
2664
        [["field_a", "field_b", "field_c"],
2665
         ["red", "赤", "23.8"],
2666
         ["green", "緑", "0.0072"],
2667
         ["white", "白", "1.65"],
2668
         ["blue", "青", "12"]];
2669

2670 1
    string[][] data3x6ExpectedDistinctK1K3P60 =
2671
        [["field_a", "field_b", "field_c"],
2672
         ["green", "緑", "0.0072"],
2673
         ["white", "白", "1.65"],
2674
         ["blue", "青", "12"]];
2675

2676 1
    string[][] data3x6ExpectedDistinctK1K3P60Probs =
2677
        [["random_value", "field_a", "field_b", "field_c"],
2678
         ["0", "green", "緑", "0.0072"],
2679
         ["0", "white", "白", "1.65"],
2680
         ["0", "blue", "青", "12"]];
2681

2682 1
    string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom =
2683
        [["custom_random_value_header", "field_a", "field_b", "field_c"],
2684
         ["0", "green", "緑", "0.0072"],
2685
         ["0", "white", "白", "1.65"],
2686
         ["0", "blue", "青", "12"]];
2687

2688 1
    string[][] data3x6ExpectedDistinctK2P2ProbsInorder =
2689
        [["random_value", "field_a", "field_b", "field_c"],
2690
         ["1", "red", "赤", "23.8"],
2691
         ["0", "green", "緑", "0.0072"],
2692
         ["0", "white", "白", "1.65"],
2693
         ["1", "yellow", "黄", "12"],
2694
         ["3", "blue", "青", "12"],
2695
         ["2", "black", "黒", "0.983"]];
2696

2697 1
    string[][] data3x6ExpectedPermuteWt3Probs =
2698
        [["random_value", "field_a", "field_b", "field_c"],
2699
         ["0.99665198757645390", "yellow", "黄", "12"],
2700
         ["0.94775884809836686", "blue", "青", "12"],
2701
         ["0.82728234682286661", "red", "赤", "23.8"],
2702
         ["0.75346697377181959", "black", "黒", "0.983"],
2703
         ["0.65130103496422487", "white", "白", "1.65"],
2704
         ["1.5636943712879866e-111", "green", "緑", "0.0072"]];
2705

2706 1
    string[][] data3x6ExpectedWt3ProbsInorder =
2707
        [["random_value", "field_a", "field_b", "field_c"],
2708
         ["0.82728234682286661", "red", "赤", "23.8"],
2709
         ["1.5636943712879866e-111", "green", "緑", "0.0072"],
2710
         ["0.65130103496422487", "white", "白", "1.65"],
2711
         ["0.99665198757645390", "yellow", "黄", "12"],
2712
         ["0.94775884809836686", "blue", "青", "12"],
2713
         ["0.75346697377181959", "black", "黒", "0.983"]];
2714

2715 1
    string[][] data3x6ExpectedPermuteWt3 =
2716
        [["field_a", "field_b", "field_c"],
2717
         ["yellow", "黄", "12"],
2718
         ["blue", "青", "12"],
2719
         ["red", "赤", "23.8"],
2720
         ["black", "黒", "0.983"],
2721
         ["white", "白", "1.65"],
2722
         ["green", "緑", "0.0072"]];
2723

2724

2725 1
    string[][] data3x6ExpectedReplaceNum10 =
2726
        [["field_a", "field_b", "field_c"],
2727
         ["black", "黒", "0.983"],
2728
         ["green", "緑", "0.0072"],
2729
         ["green", "緑", "0.0072"],
2730
         ["red", "赤", "23.8"],
2731
         ["yellow", "黄", "12"],
2732
         ["red", "赤", "23.8"],
2733
         ["white", "白", "1.65"],
2734
         ["yellow", "黄", "12"],
2735
         ["yellow", "黄", "12"],
2736
         ["white", "白", "1.65"],
2737
        ];
2738

2739 1
    string[][] data3x6ExpectedReplaceNum10V77 =
2740
        [["field_a", "field_b", "field_c"],
2741
         ["black", "黒", "0.983"],
2742
         ["red", "赤", "23.8"],
2743
         ["black", "黒", "0.983"],
2744
         ["yellow", "黄", "12"],
2745
         ["green", "緑", "0.0072"],
2746
         ["green", "緑", "0.0072"],
2747
         ["green", "緑", "0.0072"],
2748
         ["yellow", "黄", "12"],
2749
         ["blue", "青", "12"],
2750
         ["white", "白", "1.65"],
2751
        ];
2752

2753
    /* Using a different static seed. */
2754 1
    string[][] data3x6ExpectedPermuteCompatV41Probs =
2755
        [["random_value", "field_a", "field_b", "field_c"],
2756
         ["0.68057272653095424", "green", "緑", "0.0072"],
2757
         ["0.67681624367833138", "blue", "青", "12"],
2758
         ["0.32097338931635022", "yellow", "黄", "12"],
2759
         ["0.25092361867427826", "red", "赤", "23.8"],
2760
         ["0.15535934292711318", "black", "黒", "0.983"],
2761
         ["0.046095821075141430", "white", "白", "1.65"]];
2762

2763 1
    string[][] data3x6ExpectedBernoulliCompatP60V41Probs =
2764
        [["random_value", "field_a", "field_b", "field_c"],
2765
         ["0.25092361867427826", "red", "赤", "23.8"],
2766
         ["0.046095821075141430", "white", "白", "1.65"],
2767
         ["0.32097338931635022", "yellow", "黄", "12"],
2768
         ["0.15535934292711318", "black", "黒", "0.983"]];
2769

2770 1
    string[][] data3x6ExpectedPermuteWt3V41Probs =
2771
        [["random_value", "field_a", "field_b", "field_c"],
2772
         ["0.96799377498910666", "blue", "青", "12"],
2773
         ["0.94356245792573568", "red", "赤", "23.8"],
2774
         ["0.90964601024271996", "yellow", "黄", "12"],
2775
         ["0.15491658409260103", "white", "白", "1.65"],
2776
         ["0.15043620392537033", "black", "黒", "0.983"],
2777
         ["6.1394674830701461e-24", "green", "緑", "0.0072"]];
2778

2779 1
    string[][] data3x6ExpectedWt3V41ProbsInorder =
2780
        [["random_value", "field_a", "field_b", "field_c"],
2781
         ["0.94356245792573568", "red", "赤", "23.8"],
2782
         ["6.1394674830701461e-24", "green", "緑", "0.0072"],
2783
         ["0.15491658409260103", "white", "白", "1.65"],
2784
         ["0.90964601024271996", "yellow", "黄", "12"],
2785
         ["0.96799377498910666", "blue", "青", "12"],
2786
         ["0.15043620392537033", "black", "黒", "0.983"]];
2787

2788

2789
    /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
2790 1
    string[][] combo1ExpectedPermuteCompat =
2791
        [["field_a", "field_b", "field_c"],
2792
         ["yellow", "黄", "12"],
2793
         ["tan", "タン", "8.5"],
2794
         ["brown", "褐色", "29.2"],
2795
         ["green", "緑", "0.0072"],
2796
         ["red", "赤", "23.8"],
2797
         ["purple", "紫の", "42"],
2798
         ["black", "黒", "0.983"],
2799
         ["white", "白", "1.65"],
2800
         ["gray", "グレー", "6.2"],
2801
         ["blue", "青", "12"],
2802
         ["pink", "ピンク", "1.1"],
2803
         ["orange", "オレンジ", "2.5"]];
2804

2805 1
    string[][] combo1ExpectedPermuteCompatProbs =
2806
        [["random_value", "field_a", "field_b", "field_c"],
2807
         ["0.97088520275428891", "yellow", "黄", "12"],
2808
         ["0.96055546286515892", "tan", "タン", "8.5"],
2809
         ["0.81756894313730299", "brown", "褐色", "29.2"],
2810
         ["0.75710153928957880", "green", "緑", "0.0072"],
2811
         ["0.52525980887003243", "red", "赤", "23.8"],
2812
         ["0.49287854949943721", "purple", "紫の", "42"],
2813
         ["0.47081507067196071", "black", "黒", "0.983"],
2814
         ["0.38388182921335101", "white", "白", "1.65"],
2815
         ["0.29215990612283349", "gray", "グレー", "6.2"],
2816
         ["0.24033216014504433", "blue", "青", "12"],
2817
         ["0.15929344086907804", "pink", "ピンク", "1.1"],
2818
         ["0.010968807619065046", "orange", "オレンジ", "2.5"]];
2819

2820
    /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
2821 1
    string[][] combo1ExpectedProbsInorder =
2822
        [["random_value", "field_a", "field_b", "field_c"],
2823
         ["0.010968807619065046", "orange", "オレンジ", "2.5"],
2824
         ["0.15929344086907804", "pink", "ピンク", "1.1"],
2825
         ["0.49287854949943721", "purple", "紫の", "42"],
2826
         ["0.96055546286515892", "tan", "タン", "8.5"],
2827
         ["0.52525980887003243", "red", "赤", "23.8"],
2828
         ["0.75710153928957880", "green", "緑", "0.0072"],
2829
         ["0.38388182921335101", "white", "白", "1.65"],
2830
         ["0.97088520275428891", "yellow", "黄", "12"],
2831
         ["0.24033216014504433", "blue", "青", "12"],
2832
         ["0.47081507067196071", "black", "黒", "0.983"],
2833
         ["0.81756894313730299", "brown", "褐色", "29.2"],
2834
         ["0.29215990612283349", "gray", "グレー", "6.2"]];
2835

2836 1
    string[][] combo1ExpectedBernoulliCompatP50Probs =
2837
        [["random_value", "field_a", "field_b", "field_c"],
2838
         ["0.010968807619065046", "orange", "オレンジ", "2.5"],
2839
         ["0.15929344086907804", "pink", "ピンク", "1.1"],
2840
         ["0.49287854949943721", "purple", "紫の", "42"],
2841
         ["0.38388182921335101", "white", "白", "1.65"],
2842
         ["0.24033216014504433", "blue", "青", "12"],
2843
         ["0.47081507067196071", "black", "黒", "0.983"],
2844
         ["0.29215990612283349", "gray", "グレー", "6.2"]];
2845

2846 1
    string[][] combo1ExpectedBernoulliCompatP40 =
2847
        [["field_a", "field_b", "field_c"],
2848
         ["orange", "オレンジ", "2.5"],
2849
         ["pink", "ピンク", "1.1"],
2850
         ["white", "白", "1.65"],
2851
         ["blue", "青", "12"],
2852
         ["gray", "グレー", "6.2"]];
2853

2854 1
    string[][] combo1ExpectedDistinctK1P40 =
2855
        [["field_a", "field_b", "field_c"],
2856
         ["orange", "オレンジ", "2.5"],
2857
         ["red", "赤", "23.8"],
2858
         ["green", "緑", "0.0072"],
2859
         ["blue", "青", "12"],
2860
         ["black", "黒", "0.983"]];
2861

2862 1
    string[][] combo1ExpectedPermuteWt3Probs =
2863
        [["random_value", "field_a", "field_b", "field_c"],
2864
         ["0.99754077523718754", "yellow", "黄", "12"],
2865
         ["0.99527665440088786", "tan", "タン", "8.5"],
2866
         ["0.99312578945741659", "brown", "褐色", "29.2"],
2867
         ["0.98329602553389361", "purple", "紫の", "42"],
2868
         ["0.97330961938083660", "red", "赤", "23.8"],
2869
         ["0.88797551521739648", "blue", "青", "12"],
2870
         ["0.81999230489041786", "gray", "グレー", "6.2"],
2871
         ["0.55975569204250941", "white", "白", "1.65"],
2872
         ["0.46472135609205739", "black", "黒", "0.983"],
2873
         ["0.18824582704191337", "pink", "ピンク", "1.1"],
2874
         ["0.16446131853299920", "orange", "オレンジ", "2.5"],
2875
         ["1.6438086931020549e-17", "green", "緑", "0.0072"]];
2876

2877 1
    string[][] combo1ExpectedPermuteWt3 =
2878
        [["field_a", "field_b", "field_c"],
2879
         ["yellow", "黄", "12"],
2880
         ["tan", "タン", "8.5"],
2881
         ["brown", "褐色", "29.2"],
2882
         ["purple", "紫の", "42"],
2883
         ["red", "赤", "23.8"],
2884
         ["blue", "青", "12"],
2885
         ["gray", "グレー", "6.2"],
2886
         ["white", "白", "1.65"],
2887
         ["black", "黒", "0.983"],
2888
         ["pink", "ピンク", "1.1"],
2889
         ["orange", "オレンジ", "2.5"],
2890
         ["green", "緑", "0.0072"]];
2891

2892 1
        string[][] combo1ExpectedSampleAlgoRNum4 =
2893
        [["field_a", "field_b", "field_c"],
2894
         ["blue", "青", "12"],
2895
         ["gray", "グレー", "6.2"],
2896
         ["brown", "褐色", "29.2"],
2897
         ["white", "白", "1.65"]];
2898

2899 1
        string[][] combo1ExpectedSampleAlgoRNum4Inorder =
2900
        [["field_a", "field_b", "field_c"],
2901
         ["white", "白", "1.65"],
2902
         ["blue", "青", "12"],
2903
         ["brown", "褐色", "29.2"],
2904
         ["gray", "グレー", "6.2"]];
2905

2906 1
    string[][] combo1ExpectedReplaceNum10 =
2907
        [["field_a", "field_b", "field_c"],
2908
         ["gray", "グレー", "6.2"],
2909
         ["yellow", "黄", "12"],
2910
         ["yellow", "黄", "12"],
2911
         ["white", "白", "1.65"],
2912
         ["tan", "タン", "8.5"],
2913
         ["white", "白", "1.65"],
2914
         ["blue", "青", "12"],
2915
         ["black", "黒", "0.983"],
2916
         ["tan", "タン", "8.5"],
2917
         ["purple", "紫の", "42"]];
2918

2919
    /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. */
2920 1
    string[][] data1x200 =
2921
        [["field_a"],
2922
         ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"],
2923
         ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"],
2924
         ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"],
2925
         ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"],
2926
         ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"],
2927
         ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"],
2928
         ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"],
2929
         ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"],
2930
         ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"],
2931
         ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"],
2932
         ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"],
2933
         ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"],
2934
         ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"],
2935
         ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"],
2936
         ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"],
2937
         ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"],
2938
         ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"],
2939
         ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"],
2940
         ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"],
2941
         ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"],
2942
        ];
2943

2944 1
    string fpath_data1x200 = buildPath(testDir, "data1x200.tsv");
2945 1
    string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv");
2946 1
    writeUnittestTsvFile(fpath_data1x200, data1x200);
2947 1
    writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1 .. $]);
2948

2949 1
    string[][] data1x200ExpectedBernoulliSkipV333P01 =
2950
        [["field_a"],
2951
         ["077"],
2952
         ["119"]];
2953

2954 1
    string[][] data1x200ExpectedBernoulliSkipV333P02 =
2955
        [["field_a"],
2956
         ["038"],
2957
         ["059"],
2958
         ["124"],
2959
         ["161"],
2960
         ["162"],
2961
         ["183"]];
2962

2963 1
    string[][] data1x200ExpectedBernoulliSkipV333P03 =
2964
        [["field_a"],
2965
         ["025"],
2966
         ["039"],
2967
         ["082"],
2968
         ["107"],
2969
         ["108"],
2970
         ["122"],
2971
         ["136"],
2972
         ["166"],
2973
         ["182"]];
2974

2975 1
    string[][] data1x200ExpectedBernoulliCompatV333P01 =
2976
        [["field_a"],
2977
         ["072"]];
2978

2979 1
    string[][] data1x200ExpectedBernoulliCompatV333P02 =
2980
        [["field_a"],
2981
         ["004"],
2982
         ["072"]];
2983

2984 1
    string[][] data1x200ExpectedBernoulliCompatV333P03 =
2985
        [["field_a"],
2986
         ["004"],
2987
         ["072"],
2988
         ["181"]];
2989

2990
    /* Combo 2, for bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files,
2991
     * only expected results. The header is from 3x0, the results are offset 1-position
2992
     * from data1x200ExpectedBernoulliSkipV333P03 due to insertion of a single preceding line.
2993
     */
2994 1
    string[][] combo2ExpectedBernoulliSkipV333P03 =
2995
        [["field_a", "field_b", "field_c"],
2996
         ["024"],
2997
         ["038"],
2998
         ["081"],
2999
         ["106"],
3000
         ["107"],
3001
         ["121"],
3002
         ["135"],
3003
         ["165"],
3004
         ["181"]];
3005

3006

3007
    /* 1x10 - Simple 1-column file. */
3008 1
    string[][] data1x10 =
3009
        [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]];
3010 1
    string fpath_data1x10 = buildPath(testDir, "data1x10.tsv");
3011 1
    string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv");
3012 1
    writeUnittestTsvFile(fpath_data1x10, data1x10);
3013 1
    writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1 .. $]);
3014

3015 1
    string[][] data1x10ExpectedPermuteCompat =
3016
        [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]];
3017