/**
Command line tool that reads TSV files and summarizes field values associated with
equivalent keys.

Copyright (c) 2016-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_summarize;

import std.algorithm : all, any, canFind, each, find, findSplit, map, joiner, splitter;
import std.array : join;
import std.conv : to;
import std.exception : enforce;
import std.format : format;
import std.range;
import std.stdio;
import std.typecons : tuple;
import std.container : DList;

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSummarizeOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try tsvSummarize(cmdopt);
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

auto helpTextVerbose = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize reads tabular data files (tab-separated by default), tracks
field values for each unique key, and runs summarization algorithms. Consider
the file data.tsv:

  Make    Color   Time
  ford    blue    131
  chevy   green   124
  ford    red     128
  bmw     black   118
  bmw     black   126
  ford    blue    122

The min and average times for each make are generated by the command:

  $ tsv-summarize --header --group-by Make --min Time --mean Time data.tsv

This produces:

  Make   Time_min Time_mean
  ford   122      127
  chevy  124      124
  bmw    118      122

Using '--group-by Make,Color' will group by both 'Make' and 'Color'.
Omitting the '--group-by' entirely summarizes fields for the full file.

The previous example uses field names to identify fields. Field numbers
can be used as well. The next two commands are equivalent:

  $ tsv-summarize -H --group-by Make,Color --min Time --mean Time data.tsv
  $ tsv-summarize -H --group-by 1,2 --min 3 --mean 3 data.tsv

The program tries to generate useful headers, but custom headers can be
specified. Example (using -H and -g shortcuts for --header and --group-by):

  $ tsv-summarize -H -g 1 --min 3:Fastest --mean 3:Average data.tsv

Most operators take custom headers in a similar way, generally following:

  --<operator-name> FIELD[:header]

Operators can be specified multiple times. They can also take multiple
fields (though not when a custom header is specified). Examples:

  --median 2,3,4
  --median 2-5,7-11
  --median elapsed_time,system_time,user_time
  --median '*_time'              # Wildcard. All fields ending in '_time'.

The quantile operator requires one or more probabilities after the fields:

  --quantile run_time:0.25       # Quartile 1 of the 'run_time' field
  --quantile 2:0.25              # Quartile 1 of field 2
  --quantile 2-4:0.25,0.5,0.75   # Q1, Median, Q3 of fields 2, 3, 4

Summarization operators available are:
  count       range        mad            values
  retain      sum          var            unique-values
  first       mean         stdev          unique-count
  last        median       mode           missing-count
  min         quantile     mode-count     not-missing-count
  max

Calculated numeric values are printed to 12 significant digits by default.
This can be changed using the '--p|float-precision' option. If six or less,
it sets the number of significant digits after the decimal point. If
greater than six, it sets the total number of significant digits.

Calculations hold onto the minimum data needed while reading data. A few
operations like median keep all data values in memory. These operations will
start to encounter performance issues as available memory becomes scarce. The
size that can be handled effectively is machine dependent, but often quite
large files can be handled.

Operations requiring numeric entries will signal an error and terminate
processing if a non-numeric entry is found.

Missing values are not treated specially by default; this can be changed
using the '--x|exclude-missing' or '--r|replace-missing' option. The former
turns off processing for missing values; the latter uses a replacement value.

Options:
EOS";

auto helpText = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize runs aggregation operations on fields in tab-separated value
files. Operations can be run against the full input data or grouped by key
fields. Fields can be specified either by field number or field name. Use
'--help-verbose' for more detailed help.

Options:
EOS";

/** Command line options - Container and processing. The processArgs method is used to
 * process the command line.
 */
struct TsvSummarizeOptions {
    import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange;

    string programName;                /// Program name
    ByLineSourceRange!() inputSources; /// Input Files
    size_t[] keyFields;                /// -g, --group-by
    bool hasHeader = false;            /// --header
    bool writeHeader = false;          /// -w, --write-header
    char inputFieldDelimiter = '\t';   /// --d|delimiter
    char valuesDelimiter = '|';        /// --v|values-delimiter
    size_t floatPrecision = 12;        /// --p|float-precision
    DList!Operator operators;          /// Operators, in the order specified.
    size_t endFieldIndex = 0;          /// Derived value. Max field index used plus one.
    MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy;   /// Derived value.

    /* tsv-summarize operators require access to the header line when the operator is
     * created. This is because fields may be identified by name as well as by number. To
     * enable this, a CmdOptionHandler delegate is added to the cmdLineOperatorOptions
     * array during initial processing by std.getopt. The group-by option is handled
     * similarly, but is added to the cmdLineOtherFieldOptions array instead. At least one
     * cmdLineOperatorOptions entry is required.
     *
     * The different handlers are defined after processArgs.
     */

    /* CmdOptionHandler delegate signature - This is the call made to process the command
     * line option arguments after the header line has been read.
     */
    alias CmdOptionHandler = void delegate(bool hasHeader, string[] headerFields);

    private CmdOptionHandler[]  cmdLineOperatorOptions;
    private CmdOptionHandler[]  cmdLineOtherFieldOptions;

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     */
    auto processArgs (ref string[] cmdArgs) {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist : fieldListHelpText;
        import tsv_utils.common.getopt_inorder;
        import tsv_utils.common.utils : throwIfWindowsNewline;

        bool helpVerbose = false;          // --help-verbose
        bool helpFields = false;           // --help-fields
        bool versionWanted = false;        // --V|version
        bool excludeMissing = false;       // --x|exclude-missing
        string missingValueReplacement;    // --r|replace-missing


        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getoptInorder(
                cmdArgs,
                "help-verbose",       "              Print full help.", &helpVerbose,
                "help-fields",        "              Print help on specifying fields.", &helpFields,

                std.getopt.config.caseSensitive,
                "V|version",          "              Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "g|group-by",         "<field-list>  Fields to use as key.", &addGroupByOptionHandler,

                std.getopt.config.caseSensitive,
                "H|header",           "              Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "w|write-header",     "              Write an output header even if there is no input header.", &writeHeader,
                "d|delimiter",        "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &inputFieldDelimiter,
                "v|values-delimiter", "CHR           Values delimiter. Default: vertical bar (|). (Single byte UTF-8 characters only.)", &valuesDelimiter,
                "p|float-precision",  "NUM           'Precision' to use printing floating point numbers. Affects the number of digits printed and exponent use. Default: 12", &floatPrecision,
                "x|exclude-missing",  "              Exclude missing (empty) fields from calculations.", &excludeMissing,
                "r|replace-missing",  "STR           Replace missing (empty) fields with STR in calculations.", &missingValueReplacement,
                "count",              "              Count occurrences of each unique key ('--g|group-by'), or the total number of records if no key field is specified.", &addCountOptionHandler,
                "count-header",       "STR           Count occurrences of each unique key, like '--count', but use STR as the header.", &addCountHeaderOptionHandler,
                "retain",             "<field-list>  Retain one copy of the field.", &addOperatorOptionHandler!RetainOperator,
                "first",              "<field-list>[:STR]  First value seen.", &addOperatorOptionHandler!FirstOperator,
                "last",               "<field-list>[:STR]  Last value seen.", &addOperatorOptionHandler!LastOperator,
                "min",                "<field-list>[:STR]  Min value. (Fields with numeric values only.)", &addOperatorOptionHandler!MinOperator,
                "max",                "<field-list>[:STR]  Max value. (Fields with numeric values only.)", &addOperatorOptionHandler!MaxOperator,
                "range",              "<field-list>[:STR]  Difference between min and max values. (Fields with numeric values only.)", &addOperatorOptionHandler!RangeOperator,
                "sum",                "<field-list>[:STR]  Sum of the values. (Fields with numeric values only.)", &addOperatorOptionHandler!SumOperator,
                "mean",               "<field-list>[:STR]  Mean (average). (Fields with numeric values only.)", &addOperatorOptionHandler!MeanOperator,
                "median",             "<field-list>[:STR]  Median value. (Fields with numeric values only. Reads all values into memory.)", &addOperatorOptionHandler!MedianOperator,
                "quantile",           "<field-list>:p[,p...][:STR]  Quantiles. One or more fields, then one or more 0.0-1.0 probabilities. (Fields with numeric values only. Reads all values into memory.)", &addQuantileOperatorOptionHandler,
                "mad",                "<field-list>[:STR]  Median absolute deviation from the median. Raw value, not scaled. (Fields with numeric values only. Reads all values into memory.)", &addOperatorOptionHandler!MadOperator,
                "var",                "<field-list>[:STR]  Variance. (Sample variance, numeric fields only).", &addOperatorOptionHandler!VarianceOperator,
                "stdev",              "<field-list>[:STR]  Standard deviation. (Sample st.dev, numeric fields only).", &addOperatorOptionHandler!StDevOperator,
                "mode",               "<field-list>[:STR]  Mode. The most frequent value. (Reads all unique values into memory.)", &addOperatorOptionHandler!ModeOperator,
                "mode-count",         "<field-list>[:STR]  Count of the most frequent value. (Reads all unique values into memory.)", &addOperatorOptionHandler!ModeCountOperator,
                "unique-count",       "<field-list>[:STR]  Number of unique values. (Reads all unique values into memory.)", &addOperatorOptionHandler!UniqueCountOperator,
                "missing-count",      "<field-list>[:STR]  Number of missing (empty) fields. Not affected by '--x|exclude-missing' or '--r|replace-missing'.", &addOperatorOptionHandler!MissingCountOperator,
                "not-missing-count",  "<field-list>[:STR]  Number of filled (non-empty) fields. Not affected by '--r|replace-missing'.", &addOperatorOptionHandler!NotMissingCountOperator,
                "values",             "<field-list>[:STR]  All the values, separated by --v|values-delimiter. (Reads all values into memory.)", &addOperatorOptionHandler!ValuesOperator,
                "unique-values",      "<field-list>[:STR]  All the unique values, separated by --v|values-delimiter. (Reads all unique values into memory.)", &addOperatorOptionHandler!UniqueValuesOperator,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (helpFields)
            {
                writeln(fieldListHelpText);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-summarize"));
                return tuple(false, 0);
            }

            /* Remaining command line args are files. Use standard input if files
             * were not provided. Truncate cmdArgs to consume the arguments.
             */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;

            /* Validation and derivations - Do as much validation prior to header line
             * processing as possible (avoids waiting on stdin).
             */

            enforce(!cmdLineOperatorOptions.empty, "At least one summary operator is required.");

            enforce(inputFieldDelimiter != valuesDelimiter,
                    "Cannot use the same character for both --d|delimiter and --v|values-delimiter.");

            enforce(!(excludeMissing && missingValueReplacement.length != 0),
                    "Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");

            /* Missing field policy. */
            globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);

            string[] headerFields;

            /* fieldListArgProcessing encapsulates the field list processing. It is
             * called prior to reading the header line if headers are not being used,
             * and after if headers are being used.
             */
            void fieldListArgProcessing()
            {
                /* Run all the operator handlers. */
                cmdLineOtherFieldOptions.each!(dg => dg(hasHeader, headerFields));
                cmdLineOperatorOptions.each!(dg => dg(hasHeader, headerFields));

                /* keyFields need to be part of the endFieldIndex, which is one past
                 * the last field index. */
                keyFields.each!(delegate (size_t x)
                                {
                                    if (x >= endFieldIndex) endFieldIndex = x + 1;
                                } );
            }

            if (!hasHeader) fieldListArgProcessing();

            /*
             * Create the byLineSourceRange and perform header line processing.
             */
            inputSources = byLineSourceRange(filepaths);


            if (hasHeader)
            {
                if (!inputSources.front.byLine.empty)
                {
                    throwIfWindowsNewline(inputSources.front.byLine.front, inputSources.front.name, 1);
                    headerFields = inputSources.front.byLine.front.split(inputFieldDelimiter).to!(string[]);
                }

                fieldListArgProcessing();
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }

    private void addGroupByOptionHandler(string option, string optionVal)
    {
        cmdLineOtherFieldOptions ~=
            (bool hasHeader, string[] headerFields)
            => groupByOptionHandler(hasHeader, headerFields, option, optionVal);
    }

    private void groupByOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal)
    {
        import tsv_utils.common.fieldlist;

        try
        {
            keyFields =
                optionVal
                .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, headerFields)
                .array;
        }
        catch (Exception e)
        {
            e.msg = format("[--%s %s]. %s", option, optionVal, e.msg);
            throw e;
        }
    }

    private void addOperatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
    {
        cmdLineOperatorOptions ~=
            (bool hasHeader, string[] headerFields)
            => operatorOptionHandler!OperatorClass(hasHeader, headerFields, option, optionVal);
    }

    /* operatorOptionHandler functions are callbacks that process command line options
     * specifying summarization operations, e.g. '--max 5', '--last 3:LastEntry'. Handlers
     * check syntactic correctness and instantiate Operator objects that do the work. This
     * is also where 1-upped field numbers are converted to 0-based indices.
     */
    private void operatorOptionHandler(OperatorClass : SingleFieldOperator)
    (bool hasHeader, string[] headerFields, string option, string optionVal)
    {
        import std.range : enumerate;
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist;

        try
        {
            auto optionValParse =
                optionVal
                .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
                (hasHeader, headerFields);

            auto fieldIndices = optionValParse.array;
            bool hasOptionalHeader = optionVal.length > optionValParse.consumed;
            string optionalHeader;

            if (hasOptionalHeader)
            {
                enforce(fieldIndices.length <= 1, "Cannot specify a custom header when using multiple fields.");
                enforce(optionVal.length - optionValParse.consumed > 1,
                        format("No value after field list.\n   Expected: '--%s <field-list>' or '--%s <field>:<header>'.",
                               option, option));
                optionalHeader = optionVal[optionValParse.consumed + 1 .. $].idup;
            }

            foreach (fieldIndex; fieldIndices)
            {
                auto op = new OperatorClass(fieldIndex, globalMissingPolicy);

                if (hasOptionalHeader)
                {
                    enforce(op.allowCustomHeader, "Operator does not support custom headers.");
                    op.setCustomHeader(optionalHeader);
                }

                operators.insertBack(op);
                if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
            }
        }
        catch (Exception exc)
        {
            import std.format : format;
            exc.msg = format("[--%s %s] %s", option, optionVal, exc.msg);
            throw exc;
        }
    }

    private void addQuantileOperatorOptionHandler(string option, string optionVal)
    {
        cmdLineOperatorOptions ~=
            (bool hasHeader, string[] headerFields)
            => quantileOperatorOptionHandler(hasHeader, headerFields, option, optionVal);
    }

    /* QuantileOperator has a different syntax and needs a custom command option handler. */
    private void quantileOperatorOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal)
    {
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist;

        try
        {
            auto optionValParse =
                optionVal
                .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
                (hasHeader, headerFields);

            auto fieldIndices = optionValParse.array;
            enforce(optionVal.length - optionValParse.consumed > 1, "No probabilities entered.");

            auto splitRemaining =
                optionVal[optionValParse.consumed + 1 .. $]
                .findSplit(":");

            enforce(splitRemaining[1].empty || !splitRemaining[2].empty,
                    "Empty custom header.");

            auto probStr = splitRemaining[0];
            auto header = splitRemaining[2];

            double[] probs;

            foreach (str; probStr.splitter(','))
            {
                double p = str.to!double;
                enforce(p >= 0.0 && p <= 1.0,
                        format("Probability '%g' is not in the interval [0.0,1.0].", p));
                probs ~= p;
            }

            enforce(header.empty || (fieldIndices.length <= 1 && probs.length <= 1),
                    format("Cannot specify a custom header when using multiple fields or multiple probabilities."));

            assert (fieldIndices.length > 0);
            assert (probs.length > 0);
            assert (header.empty || (fieldIndices.length == 1 && probs.length == 1));

            foreach (fieldIndex; fieldIndices)
            {
                foreach (p; probs)
                {
                    auto op = new QuantileOperator(fieldIndex, globalMissingPolicy, p);
                    if (!header.empty) op.setCustomHeader(header);
                    operators.insertBack(op);
                }
                if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
            }
        }
        catch (Exception e)
        {
            e.msg = format(
                "[--%s %s]. %s\n   Expected: '--%s <field-list>:<prob>[,<prob>]' or '--%s <field>:<prob>:<header>' where <prob> is a number between 0.0 and 1.0.",
                option, optionVal, e.msg, option, option);
            throw e;
        }

    }

    private void addCountOptionHandler()
    {
        cmdLineOperatorOptions ~=
            (bool hasHeader, string[] headerFields)
            => countOptionHandler(hasHeader, headerFields);
    }

    private void countOptionHandler(bool hasHeader, string[] headerFields)
    {
        operators.insertBack(new CountOperator());
    }

    private void addCountHeaderOptionHandler(string option, string optionVal)
    {
        cmdLineOperatorOptions ~=
            (bool hasHeader, string[] headerFields)
            => countHeaderOptionHandler(hasHeader, headerFields, option, optionVal);
    }

    private void countHeaderOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal)
    {
        auto op = new CountOperator();
        op.setCustomHeader(optionVal);
        operators.insertBack(op);
    }
}

/** tsvSummarize does the primary work of the tsv-summarize program.
 */
void tsvSummarize(ref TsvSummarizeOptions cmdopt)
{
    import tsv_utils.common.utils : BufferedOutputRange, ByLineSourceRange,
        bufferedByLine, throwIfWindowsNewline;

    /* Check that the input files were set up as expected. Should at least have one
     * input, stdin if nothing else, and newlines removed from the byLine range.
     */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator)));

    /* BufferedOutputRange is faster than writing directly to stdout if many lines are
     * being written. This will happen mostly when group-by is used.
     */
    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

    /* Pick the Summarizer based on the number of key-fields entered. */
    auto summarizer =
        (cmdopt.keyFields.length == 0)
        ? new NoKeySummarizer!(typeof(bufferedOutput))(
            cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : (cmdopt.keyFields.length == 1)
        ? new OneKeySummarizer!(typeof(bufferedOutput))(
            cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : new MultiKeySummarizer!(typeof(bufferedOutput))(
            cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

    /* Add the operators to the Summarizer. */
    summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

    /* If there's no input header line, but writing an output header anyway, then
     * write it now. This helps tasks further on in a unix pipeline detect errors
     * quickly, without waiting for all the data to flow through the pipeline.
     */
    auto printOptions = SummarizerPrintOptions(
        cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);

    if (!cmdopt.hasHeader && cmdopt.writeHeader)
    {
        summarizer.writeSummaryHeader(bufferedOutput, printOptions);
        bufferedOutput.flush;
    }

    /* Process each input file, one line at a time. */
    auto lineFields = new char[][](cmdopt.endFieldIndex);
    bool headerFound = false;
    foreach (inputStream; cmdopt.inputSources)
    {
        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            if (lineNum == 1) throwIfWindowsNewline(line, inputStream.name, lineNum);

            /* Copy the needed number of fields to the fields array.
             * Note: The number is zero if no operator needs fields. Notably, the count
             * operator. Used by itself, it counts the number of input lines (a la 'wc -l').
             */
            if (cmdopt.endFieldIndex > 0)
            {
                size_t fieldIndex = 0;
                foreach (fieldValue; line.splitter(cmdopt.inputFieldDelimiter))
                {
                    if (fieldIndex == cmdopt.endFieldIndex) break;
                    lineFields[fieldIndex] = fieldValue;
                    fieldIndex++;
                }

                if (fieldIndex == 0)
                {
                    assert(cmdopt.endFieldIndex > 0);
                    assert(line.length == 0);

                    /* Bug work-around. Empty lines are not handled properly by splitter.
                     *   - Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                     *   - Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                     * This can arise for: '$ tsv-summarize -g 1 --count'. This counts the
                     * unique values in field 1. If there's only one column, then an empty
                     * line becomes an empty string for field 1. Work-around: Point to the
                     * line. It's an empty string.
                     */
                    lineFields[fieldIndex] = line;
                    fieldIndex++;
                }

                enforce(fieldIndex >= cmdopt.endFieldIndex,
                        format("Not enough fields in line. File: %s, Line: %s",
                               inputStream.name, lineNum));
            }

            if (cmdopt.hasHeader && lineNum == 1)
            {
                if (!headerFound)
                {
                    summarizer.processHeaderLine(lineFields);
                    headerFound = true;

                    /* Write the header now. This helps tasks further on in a unix
                     * pipeline detect errors quickly, without waiting for all the
                     * data to flow through the pipeline. Note that an upstream task
                     * may have flushed its header line, so the header may arrive
                     * long before the main block of data.
                     */
                    summarizer.writeSummaryHeader(bufferedOutput, printOptions);
                    bufferedOutput.flush;
                }
            }
            else
            {
                /* Process the line. Processing will fail (throw) if a field cannot be
                 * converted to the expected type.
                 */
                try summarizer.processNextLine(lineFields);
                catch (Exception exc)
                {
                    throw new Exception(
                        format("Could not process line or field: %s\n  File: %s Line: %s%s",
                               exc.msg, inputStream.name, lineNum,
                               (lineNum == 1) ? "\n  Is this a header line? Use --header to skip." : ""));
                }
            }
        }
    }

    debug writeln("[tsvSummarize] After reading all data.");

    /* Whew! We're done processing input data. Run the calculations and print. */

    summarizer.writeSummaryBody(bufferedOutput, printOptions);
}

/** The default field header. This is used when the input doesn't have field headers,
 * but field headers are used in the output. The default is "fieldN", where N is the
 * 1-upped field number.
 */
string fieldHeaderFromIndex(size_t fieldIndex)
{
    enum prefix = "field";
    return prefix ~ (fieldIndex + 1).to!string;
}

unittest
{
    assert(fieldHeaderFromIndex(0) == "field1");
    assert(fieldHeaderFromIndex(10) == "field11");
}

/** Produce a summary header from a field header.
 *
 * The result has the form `<fieldHeader>_<operation>`. e.g. If the field header is
 * "length" and the operation is "max", the summary header is "length_max". The field
 * header typically comes from a header line in the input data or was constructed by
 * fieldHeaderFromIndex().
 *
 * If operationName is the empty string, then fieldHeader is used unchanged. This supports
 * the Retain operator.
 */
string summaryHeaderFromFieldHeader(string fieldHeader, string operationName)
{
    return (operationName.length > 0) ? fieldHeader ~ "_" ~ operationName : fieldHeader;
}

unittest
{
    assert(summaryHeaderFromFieldHeader("originalfield", "mycalc") == "originalfield_mycalc");
    assert(summaryHeaderFromFieldHeader("originalfield", "") == "originalfield");
}

/** SummarizerPrintOptions holds printing options for Summarizers and Calculators. Typically
 * specified with command line options, it is separated out for modularity.
 */
struct SummarizerPrintOptions
{
    char fieldDelimiter;
    char valuesDelimiter;
    size_t floatPrecision = 12;

    import std.traits : isFloatingPoint, isIntegral;

    auto formatNumber(T)(T n) const
    if (isFloatingPoint!T || isIntegral!T)
    {
        import tsv_utils.common.numerics : formatNumber;
        return formatNumber!T(n, floatPrecision);
    }
}
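
/* A small usage sketch of SummarizerPrintOptions.formatNumber (added for illustration,
 * not one of the original tests). Formatting is delegated to
 * tsv_utils.common.numerics.formatNumber, so only loose properties of the result are
 * asserted here; the exact rendering depends on that routine and on floatPrecision.
 */
unittest
{
    auto printOptions = SummarizerPrintOptions('\t', '|', 12);
    assert(printOptions.formatNumber(3.25).length > 0);    // Floating point value.
    assert(printOptions.formatNumber(42).length > 0);      // Integral value.
}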

/** A Summarizer object maintains the state of the summarization and performs basic
 * processing. Handling of files and input lines is left to the caller.
 *
 * Classes supporting the Summarizer must implement the methods:
 *  - setOperators - Called after initializing the object for each operator to be processed.
 *  - processHeaderLine - Called to process the header line of each file. Returns true if
 *    it was the first header line processed (used when reading multiple files).
 *  - processNextLine - Called to process non-header lines.
 *  - writeSummaryHeader - Called to write the header line.
 *  - writeSummaryBody - Called to write the result lines.
 *
 */
interface Summarizer(OutputRange)
{
    /** Called after initializing the object for each operator to be processed. */
    void setOperators(InputRange!Operator op);

    /** Called to process the header line of each file. Returns true if it was the
     *  first header line processed (used when reading multiple files).
     */
    bool processHeaderLine(const char[][] lineFields);

    /** Called to process non-header lines. */
    void processNextLine(const char[][] lineFields);

    /** Called to write the header line. */
    void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);

    /** Called to write the result lines. */
    void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
}

/** SummarizerBase performs work shared by all summarizers, most everything except for
 * handling of unique keys.
 *
 * The base class handles creation, allocates storage for Operators and SharedFieldValues,
 * and similar. Derived classes deal primarily with unique keys and the associated Calculators
 * and UniqueKeyValuesLists.
 */
class SummarizerBase(OutputRange) : Summarizer!OutputRange
{
    private char _inputFieldDelimiter;
    private bool _hasProcessedFirstHeaderLine = false;
    private SharedFieldValues _sharedFieldValues = null;  // Null if no shared field value lists.
    protected MissingFieldPolicy _missingPolicy;
    protected DList!Operator _operators;
    protected size_t _numOperators = 0;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        _inputFieldDelimiter = inputFieldDelimiter;
        _missingPolicy = missingPolicy;
    }

    char inputFieldDelimiter() const @property
    {
        return _inputFieldDelimiter;
    }

    /** Sets the Operators used by the Summarizer. Called after construction. */
    void setOperators(InputRange!Operator operators)
    {
        foreach (op; operators)
        {
            _operators.insertBack(op);
            _numOperators++;
            auto numericFieldsToSave = op.numericFieldsToSave();
            auto textFieldsToSave = op.textFieldsToSave();

            if (numericFieldsToSave.length > 0 || textFieldsToSave.length > 0)
            {
                if (_sharedFieldValues is null)
                {
                    _sharedFieldValues = new SharedFieldValues();
                }
                numericFieldsToSave.each!(x => _sharedFieldValues.addNumericIndex(x));
                textFieldsToSave.each!(x => _sharedFieldValues.addTextIndex(x));
            }
        }
    }

    /** Called to process the header line of each file. Returns true if it was the
     *  first header line processed (used when reading multiple files).
     */
    bool processHeaderLine(const char[][] lineFields)
    {
        if (!_hasProcessedFirstHeaderLine)
        {
            _operators.each!(x => x.processHeaderLine(lineFields));
            _hasProcessedFirstHeaderLine = true;
            return true;
        }
        else
        {
            return false;
        }
    }

    protected final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        return (_sharedFieldValues is null)
            ? null
            : _sharedFieldValues.makeUniqueKeyValuesLists;
    }

    abstract void processNextLine(const char[][] lineFields);
    abstract void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);
    abstract void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
}

/** The NoKeySummarizer is used when summarizing values across the entire input.
 *
 * Note: NoKeySummarizer is used in Operator unit tests and gets extensive testing
 * through that mechanism.
 */
final class NoKeySummarizer(OutputRange) : SummarizerBase!OutputRange
{
    private Calculator[] _calculators;
    private UniqueKeyValuesLists _valueLists;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Called after initializing the object for each operator to be processed. */
    override void setOperators(InputRange!Operator operators)
    {
        super.setOperators(operators);

        /* Only one Calculator per Operation, so create them as Operators are added. */
        foreach (op; operators) _calculators ~= op.makeCalculator;
        _valueLists = super.makeUniqueKeyValuesLists();
    }

    /** Called to process non-header lines. */
    override void processNextLine(const char[][] lineFields)
    {
        _calculators.each!(x => x.processNextLine(lineFields));
        if (_valueLists !is null) _valueLists.processNextLine(lineFields, _missingPolicy);
    }

    /** Called to write the header line. */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }

    /** Called to write the result lines. */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream,
            _calculators[]
            .map!(x => x.calculate(_valueLists, printOptions))
            .join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }
}
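
/* A minimal end-to-end sketch (added for illustration, not one of the original tests)
 * showing the Summarizer call sequence with the NoKeySummarizer: setOperators, then
 * processNextLine for each data line, then writeSummaryHeader and writeSummaryBody.
 * It uses CountOperator, so only the record count appears in the output; assertions
 * are kept loose rather than assuming CountOperator's exact default header.
 */
unittest
{
    import std.array : appender;

    auto output = appender!(char[])();
    auto summarizer = new NoKeySummarizer!(typeof(output))('\t', new MissingFieldPolicy);

    Operator[] ops;
    ops ~= new CountOperator();
    summarizer.setOperators(inputRangeObject(ops));

    summarizer.processNextLine(["a", "1"]);
    summarizer.processNextLine(["b", "2"]);

    auto printOptions = SummarizerPrintOptions('\t', '|', 12);
    summarizer.writeSummaryHeader(output, printOptions);
    summarizer.writeSummaryBody(output, printOptions);

    assert(output.data.length > 0);
    assert(output.data.canFind("2"));    // Two data lines were processed.
}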

/** KeySummarizerBase does work shared by the single-key and multi-key summarizers.
 *
 * The primary difference between those two is the formation of the key. The primary
 * reason for separating those into two separate classes is to simplify (speed up)
 * handling of single field keys, which are the most common use case.
 */
class KeySummarizerBase(OutputRange) : SummarizerBase!OutputRange
{
    protected struct UniqueKeyData
    {
        Calculator[] calculators;
        UniqueKeyValuesLists valuesLists;
    }

    private DList!string _uniqueKeys;
    private UniqueKeyData[string] _uniqueKeyData;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    protected void processNextLineWithKey(T : const char[])(T key, const char[][] lineFields)
    {
        debug writefln("[%s]: %s", __FUNCTION__, lineFields.to!string);

        auto dataPtr = (key in _uniqueKeyData);
        auto data = (dataPtr is null) ? addUniqueKey(key.to!string) : *dataPtr;

        data.calculators.each!(x => x.processNextLine(lineFields));
        if (data.valuesLists !is null) data.valuesLists.processNextLine(lineFields, _missingPolicy);
    }

    protected UniqueKeyData addUniqueKey(string key)
    {
        assert(key !in _uniqueKeyData);

        _uniqueKeys.insertBack(key);

        auto calculators = new Calculator[_numOperators];
        size_t i = 0;
        foreach (op; _operators)
        {
            calculators[i] = op.makeCalculator;
            i++;
        }

        return _uniqueKeyData[key] = UniqueKeyData(calculators, super.makeUniqueKeyValuesLists());
    }

    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, keyFieldHeader());
        put(outputStream, printOptions.fieldDelimiter);
        put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }

    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        foreach (key; _uniqueKeys)
        {
            auto data = _uniqueKeyData[key];
            put(outputStream, key);
            put(outputStream, printOptions.fieldDelimiter);
            put(outputStream,
                data.calculators[]
                .map!(x => x.calculate(data.valuesLists, printOptions))
                .join(printOptions.fieldDelimiter));
            put(outputStream, '\n');
        }
    }

    abstract string keyFieldHeader() const @property;
}

/** This Summarizer is for the case where the unique key is based on exactly one field.
 */
final class OneKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t _keyFieldIndex = 0;
    private string _keyFieldHeader;
    private DList!string _uniqueKeys;

    this(size_t keyFieldIndex, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndex = keyFieldIndex;
        _keyFieldHeader = fieldHeaderFromIndex(keyFieldIndex);
    }

    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    override bool processHeaderLine(const char[][] lineFields)
    {
        assert(_keyFieldIndex <= lineFields.length);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = lineFields[_keyFieldIndex].to!string;
        }
        return isFirstHeaderLine;
    }

    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndex < lineFields.length);
        processNextLineWithKey(lineFields[_keyFieldIndex], lineFields);
    }
}

/** This Summarizer is for the case where the unique key is based on multiple fields.
 */
final class MultiKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t[] _keyFieldIndices;
    private string _keyFieldHeader;
    private DList!string _uniqueKeys;

    this(const size_t[] keyFieldIndices, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndices = keyFieldIndices.dup;
        _keyFieldHeader =
            _keyFieldIndices.map!(i => fieldHeaderFromIndex(i))
            .join(inputFieldDelimiter);
    }

    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    override bool processHeaderLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string;
        }
        return isFirstHeaderLine;
    }

    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        string key = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string;
        processNextLineWithKey(key, lineFields);
    }
}

version(unittest)
{
    /* testSummarizer is a helper that can run many types of unit tests against
     * Summarizers. It can also test operators, but there are separate helper functions
     * better suited for that purpose.
     *
     * Arguments are command line args, an input file, and the expected output. The
     * input file and expected output are already split into lines and fields; the helper
     * manages re-assembly. The program name from the command line args is printed if an
     * error occurs; it is useful for identifying the test that failed.
     *
     * Note: Much of this is a duplication of tsvSummarize logic. Better abstraction of
     * file input/output would enable running unit tests directly on top of tsvSummarize.
     *
     * Update (April 2020): With the introduction of InputSourceRange and ByLineSource,
     * there needs to be a physical file when calling processArgs. It's hard to get around,
     * as the intent is to read the header line of the first input file during command
     * line argument processing. Eventually this unit test process will need to be
     * rewritten. For now, a file with the equivalent data is being added to the command
     * line.
     *
     * Update (Sept 2020): The physical file needs to be closed for unit tests on
     * Windows. This is so the temporary file can be deleted without trouble. Since it's
     * a placeholder in these tests, it's getting iterated but not popped off the
     * inputSources and closed. Normal collection is not closing it quickly enough. So
     * all inputSources are closed at the end of this function.
     */
    void testSummarizer(string[] cmdArgs, string[][] file, string[][] expected)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testSummarizer] cmdArgs must not be empty.");

        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testSummarizer] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSummarizeOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command line args: '%s'.", savedCmdArgs));

        assert(file.all!(line => line.length >= cmdopt.endFieldIndex),
               formatAssertMessage("group-by or operator field number greater than the number of fields in a line of the input file."));

        /* Pick the Summarizer based on the number of key-fields entered. */
        auto summarizer =
            (cmdopt.keyFields.length == 0)
            ? new NoKeySummarizer!(typeof(appender!(char[])()))(
                cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

            : (cmdopt.keyFields.length == 1)
            ? new OneKeySummarizer!(typeof(appender!(char[])()))(
                cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

            : new MultiKeySummarizer!(typeof(appender!(char[])()))(
                cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

        /* Add the operators to the Summarizer. */
        summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

        /* Process the file one line at a time. */
        auto lineFields = new char[][](cmdopt.endFieldIndex);
        bool headerFound = false;
        foreach (lineNum, line; file.enumerate(1))
        {
            /* Copy the needed fields to the fields array. */
            foreach (i, val; line[0..cmdopt.endFieldIndex]) lineFields[i] = val.dup;

            if (cmdopt.hasHeader && lineNum == 1)
            {
                if (!headerFound)
                {
                    summarizer.processHeaderLine(lineFields);
                    headerFound = true;
                }
            }
            else
            {
                try summarizer.processNextLine(lineFields);
                catch (Exception exc)
                {
                    assert(false, formatAssertMessage(exc.msg));
                }
            }
        }
        auto printOptions = SummarizerPrintOptions(
            cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);

        auto summarizerOutput = appender!(char[])();

        if (cmdopt.hasHeader || cmdopt.writeHeader)
        {
            summarizer.writeSummaryHeader(summarizerOutput, printOptions);
        }

        summarizer.writeSummaryBody(summarizerOutput, printOptions);
        auto expectedOutput = expected.map!(x => x.joiner(cmdopt.inputFieldDelimiter.to!string)).joiner("\n").to!string;
        if (expectedOutput.length > 0 && expectedOutput[$-1] != '\n') expectedOutput ~= "\n";

        assert(summarizerOutput.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, summarizerOutput.data.to!string));

        /* Ensure all files are closed by emptying the stack. */
        while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
    }

    void writeDataFile(string filepath, string[][] fileData, string delimiter = "\t")
    {
        import std.algorithm;
        import std.stdio;

        auto f = filepath.File("wb");
        foreach (record; fileData) f.writeln(record.joiner(delimiter));
        f.close;
    }
}
1168

1169
unittest
1170
{
1171
    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
1172
    import std.file : mkdir, rmdirRecurse;
1173
    import std.path : buildPath;
1174

1175 15
    auto testDir = makeUnittestTempDir("tsv_summarizer");
1176 15
    scope(exit) testDir.rmdirRecurse;
1177

1178
    /* Summarizer unit tests. Primarily single-key and multi-key summarizers. To a limited
1179
     * extent, command line option handling (TsvSummarizeOptions). Individual operators
1180
     * have separate tests, those tests test the no-key summarizer. The Values operator is
1181
     * used in these tests. It engages a number of behaviors, and the results have limited
1182
     * ambiguity. Using only one operator limits dependence on individual operators.
1183
     *
1184
     * Update (April 2020): There now needs to be a real file passed to testSummarizer.
1185
     * See the comments with testSummarizer for details.
1186
     */
1187

1188 15
    auto file1 = [["fld1", "fld2", "fld3"],
                  ["a", "a",  "3"],
                  ["c", "a",  "2b"],
                  ["c", "bc", ""],
                  ["a", "c",  "2b"],
                  ["",  "bc", ""],
                  ["c", "bc", "3"]];

    auto file1Path = buildPath(testDir, "file1.tsv");
    auto file1NoHeaderPath = buildPath(testDir, "file1_noheader.tsv");
    writeDataFile(file1Path, file1);
    writeDataFile(file1NoHeaderPath, file1[1 .. $]);

    /* Single-key summarizer tests.
     */
    testSummarizer(["unittest-sk-1", "--header", "--group-by", "1", "--values", "1", file1Path],
                   file1,
                   [["fld1", "fld1_values"],
                    ["a", "a|a"],
                    ["c", "c|c|c"],
                    ["",  ""]]
        );
    testSummarizer(["unittest-sk-1-named", "--header", "--group-by", "fld1", "--values", "fld1", file1Path],
                   file1,
                   [["fld1", "fld1_values"],
                    ["a", "a|a"],
                    ["c", "c|c|c"],
                    ["",  ""]]
        );
    testSummarizer(["unittest-sk-2", "-H", "--group-by", "1", "--values", "2", file1Path],
                   file1,
                   [["fld1", "fld2_values"],
                    ["a", "a|c"],
                    ["c", "a|bc|bc"],
                    ["",  "bc"]]
        );
    testSummarizer(["unittest-sk-2-named", "-H", "--group-by", "fld1", "--values", "fld2", file1Path],
                   file1,
                   [["fld1", "fld2_values"],
                    ["a", "a|c"],
                    ["c", "a|bc|bc"],
                    ["",  "bc"]]
        );
    testSummarizer(["unittest-sk-3", "-H", "-g", "1", "--values", "3", file1Path],
                   file1,
                   [["fld1", "fld3_values"],
                    ["a", "3|2b"],
                    ["c", "2b||3"],
                    ["",  ""]]
        );
    testSummarizer(["unittest-sk-4", "-H", "--group-by", "1", "--values", "1,2,3", file1Path],
                   file1,
                   [["fld1", "fld1_values", "fld2_values", "fld3_values"],
                    ["a", "a|a",   "a|c",     "3|2b"],
                    ["c", "c|c|c", "a|bc|bc", "2b||3"],
                    ["",  "",      "bc",      ""]]
        );
    testSummarizer(["unittest-sk-4-named-a", "-H", "--group-by", "fld1", "--values", "fld1,fld2,fld3", file1Path],
                   file1,
                   [["fld1", "fld1_values", "fld2_values", "fld3_values"],
                    ["a", "a|a",   "a|c",     "3|2b"],
                    ["c", "c|c|c", "a|bc|bc", "2b||3"],
                    ["",  "",      "bc",      ""]]
        );
    testSummarizer(["unittest-sk-4-named-b", "-H", "--group-by", "fld1", "--values", "fld*", file1Path],
                   file1,
                   [["fld1", "fld1_values", "fld2_values", "fld3_values"],
                    ["a", "a|a",   "a|c",     "3|2b"],
                    ["c", "c|c|c", "a|bc|bc", "2b||3"],
                    ["",  "",      "bc",      ""]]
        );
    testSummarizer(["unittest-sk-5", "-H", "--group-by", "1", "--values", "1-3", file1Path],
                   file1,
                   [["fld1", "fld1_values", "fld2_values", "fld3_values"],
                    ["a", "a|a",   "a|c",     "3|2b"],
                    ["c", "c|c|c", "a|bc|bc", "2b||3"],
                    ["",  "",      "bc",      ""]]
        );
    testSummarizer(["unittest-sk-6", "-H", "--group-by", "1", "--values", "3,2,1", file1Path],
                   file1,
                   [["fld1", "fld3_values", "fld2_values", "fld1_values"],
                    ["a", "3|2b",  "a|c",     "a|a"],
                    ["c", "2b||3", "a|bc|bc", "c|c|c"],
                    ["",  "",      "bc",      ""]]
        );
    testSummarizer(["unittest-sk-7", "-H", "--group-by", "1", "--values", "3-1", file1Path],
                   file1,
                   [["fld1", "fld3_values", "fld2_values", "fld1_values"],
                    ["a", "3|2b",  "a|c",     "a|a"],
                    ["c", "2b||3", "a|bc|bc", "c|c|c"],
                    ["",  "",      "bc",      ""]]
        );
    testSummarizer(["unittest-sk-8", "-H", "--group-by", "2", "--values", "1", file1Path],
                   file1,
                   [["fld2", "fld1_values"],
                    ["a",  "a|c"],
                    ["bc", "c||c"],
                    ["c",  "a"]]
        );
    testSummarizer(["unittest-sk-9", "-H", "--group-by", "2", "--values", "2", file1Path],
                   file1,
                   [["fld2", "fld2_values"],
                    ["a",  "a|a"],
                    ["bc", "bc|bc|bc"],
                    ["c",  "c"]]
        );
    testSummarizer(["unittest-sk-10", "-H", "--group-by", "2", "--values", "3", file1Path],
                   file1,
                   [["fld2", "fld3_values"],
                    ["a",  "3|2b"],
                    ["bc", "||3"],
                    ["c",  "2b"]]
        );
    testSummarizer(["unittest-sk-11", "-H", "--group-by", "2", "--values", "1,3", file1Path],
                   file1,
                   [["fld2", "fld1_values", "fld3_values"],
                    ["a",  "a|c",  "3|2b"],
                    ["bc", "c||c", "||3"],
                    ["c",  "a",    "2b"]]
        );
    testSummarizer(["unittest-sk-12", "-H", "--group-by", "2", "--values", "3,1", file1Path],
                   file1,
                   [["fld2", "fld3_values", "fld1_values"],
                    ["a",  "3|2b", "a|c"],
                    ["bc", "||3",  "c||c"],
                    ["c",  "2b",   "a"]]
        );
    testSummarizer(["unittest-sk-13", "-H", "--group-by", "3", "--values", "1", file1Path],
                   file1,
                   [["fld3", "fld1_values"],
                    ["3",  "a|c"],
                    ["2b", "c|a"],
                    ["",   "c|"]]
        );
    testSummarizer(["unittest-sk-14", "-H", "--group-by", "3", "--values", "2", file1Path],
                   file1,
                   [["fld3", "fld2_values"],
                    ["3",  "a|bc"],
                    ["2b", "a|c"],
                    ["",   "bc|bc"]]
        );
    testSummarizer(["unittest-sk-15", "-H", "--group-by", "3", "--values", "1,2", file1Path],
                   file1,
                   [["fld3", "fld1_values", "fld2_values"],
                    ["3",  "a|c", "a|bc"],
                    ["2b", "c|a", "a|c"],
                    ["",   "c|",  "bc|bc"]]
        );
    testSummarizer(["unittest-sk-15-named", "-H", "--group-by", "fld3", "--values", "fld1,fld2", file1Path],
                   file1,
                   [["fld3", "fld1_values", "fld2_values"],
                    ["3",  "a|c", "a|bc"],
                    ["2b", "c|a", "a|c"],
                    ["",   "c|",  "bc|bc"]]
        );

    /* Multi-key summarizer tests.
     */
    testSummarizer(["unittest-mk-1", "--header", "--group-by", "1,2", "--values", "1", file1Path],
                   file1,
                   [["fld1", "fld2", "fld1_values"],
                    ["a", "a",  "a"],
                    ["c", "a",  "c"],
                    ["c", "bc", "c|c"],
                    ["a", "c",  "a"],
                    ["", "bc",  ""]]
        );
    testSummarizer(["unittest-mk-2", "-H", "--group-by", "1,2", "--values", "2", file1Path],
                   file1,
                   [["fld1", "fld2", "fld2_values"],
                    ["a", "a",  "a"],
                    ["c", "a",  "a"],
                    ["c", "bc", "bc|bc"],
                    ["a", "c",  "c"],
                    ["", "bc",  "bc"]]
        );
    testSummarizer(["unittest-mk-3", "-H", "--group-by", "1,2", "--values", "3", file1Path],
                   file1,
                   [["fld1", "fld2", "fld3_values"],
                    ["a", "a",  "3"],
                    ["c", "a",  "2b"],
                    ["c", "bc", "|3"],
                    ["a", "c",  "2b"],
                    ["", "bc",  ""]]
        );
    testSummarizer(["unittest-mk-4", "-H", "--group-by", "1,2", "--values", "3,1", file1Path],
                   file1,
                   [["fld1", "fld2", "fld3_values", "fld1_values"],
                    ["a", "a",  "3", "a"],
                    ["c", "a",  "2b", "c"],
                    ["c", "bc", "|3", "c|c"],
                    ["a", "c",  "2b", "a"],
                    ["",  "bc", "",   ""]]
        );
    testSummarizer(["unittest-mk-4-named", "-H", "--group-by", "fld1,fld2", "--values", "fld3,fld1", file1Path],
                   file1,
                   [["fld1", "fld2", "fld3_values", "fld1_values"],
                    ["a", "a",  "3", "a"],
                    ["c", "a",  "2b", "c"],
                    ["c", "bc", "|3", "c|c"],
                    ["a", "c",  "2b", "a"],
                    ["",  "bc", "",   ""]]
        );
    testSummarizer(["unittest-mk-5", "-H", "--group-by", "3,2", "--values", "1", file1Path],
                   file1,
                   [["fld3", "fld2", "fld1_values"],
                    ["3",  "a",  "a"],
                    ["2b", "a",  "c"],
                    ["",   "bc", "c|"],
                    ["2b", "c",  "a"],
                    ["3",  "bc", "c"]]
        );
    testSummarizer(["unittest-mk-6", "-H", "--group-by", "3-2", "--values", "1", file1Path],
                   file1,
                   [["fld3", "fld2", "fld1_values"],
                    ["3",  "a",  "a"],
                    ["2b", "a",  "c"],
                    ["",   "bc", "c|"],
                    ["2b", "c",  "a"],
                    ["3",  "bc", "c"]]
        );
    testSummarizer(["unittest-mk-7", "-H", "--group-by", "2,1,3", "--values", "2", file1Path],
                   file1,
                   [["fld2", "fld1", "fld3", "fld2_values"],
                    ["a",  "a", "3",  "a"],
                    ["a",  "c", "2b", "a"],
                    ["bc", "c", "",   "bc"],
                    ["c",  "a", "2b", "c"],
                    ["bc", "",  "",   "bc"],
                    ["bc", "c", "3",  "bc"]]
        );

    /* Missing policies. */
    testSummarizer(["unittest-mis-1", "--header", "--group-by", "1", "--values", "1", "--exclude-missing", file1Path],
                   file1,
                   [["fld1", "fld1_values"],
                    ["a", "a|a"],
                    ["c", "c|c|c"],
                    ["",  ""]]
        );
    testSummarizer(["unittest-mis-2", "-H", "--group-by", "1", "--values", "2", "-x", file1Path],
                   file1,
                   [["fld1", "fld2_values"],
                    ["a", "a|c"],
                    ["c", "a|bc|bc"],
                    ["",  "bc"]]
        );
    testSummarizer(["unittest-mis-3", "-H", "-g", "1", "--values", "3", "-x", file1Path],
                   file1,
                   [["fld1", "fld3_values"],
                    ["a", "3|2b"],
                    ["c", "2b|3"],
                    ["",  ""]]
        );
    testSummarizer(["unittest-mis-4", "-H", "--group-by", "1", "--values", "1,2,3", "-x", file1Path],
                   file1,
                   [["fld1", "fld1_values", "fld2_values", "fld3_values"],
                    ["a", "a|a",   "a|c",     "3|2b"],
                    ["c", "c|c|c", "a|bc|bc", "2b|3"],
                    ["",  "",      "bc",      ""]]
        );
    testSummarizer(["unittest-mis-5", "--header", "--group-by", "1", "--values", "1", "--replace-missing", "NA", file1Path],
                   file1,
                   [["fld1", "fld1_values"],
                    ["a", "a|a"],
                    ["c", "c|c|c"],
                    ["",  "NA"]]
        );
    testSummarizer(["unittest-mis-6", "-H", "--group-by", "1", "--values", "2", "-r", "NA", file1Path],
                   file1,
                   [["fld1", "fld2_values"],
                    ["a", "a|c"],
                    ["c", "a|bc|bc"],
                    ["",  "bc"]]
        );
    testSummarizer(["unittest-mis-7", "-H", "-g", "1", "--values", "3", "-r", "NA", file1Path],
                   file1,
                   [["fld1", "fld3_values"],
                    ["a", "3|2b"],
                    ["c", "2b|NA|3"],
                    ["",  "NA"]]
        );
    testSummarizer(["unittest-mis-7-named", "-H", "-g", "fld1", "--values", "fld3", "-r", "NA", file1Path],
                   file1,
                   [["fld1", "fld3_values"],
                    ["a", "3|2b"],
                    ["c", "2b|NA|3"],
                    ["",  "NA"]]
        );
    testSummarizer(["unittest-mis-8", "-H", "--group-by", "1", "--values", "1,2,3", "-r", "NA", file1Path],
                   file1,
                   [["fld1", "fld1_values", "fld2_values", "fld3_values"],
                    ["a", "a|a",   "a|c",     "3|2b"],
                    ["c", "c|c|c", "a|bc|bc", "2b|NA|3"],
                    ["",  "NA",      "bc",      "NA"]]
        );
    testSummarizer(["unittest-mis-9", "-H", "--group-by", "1,2", "--values", "3,1", "-x", file1Path],
                   file1,
                   [["fld1", "fld2", "fld3_values", "fld1_values"],
                    ["a", "a",  "3", "a"],
                    ["c", "a",  "2b", "c"],
                    ["c", "bc", "3", "c|c"],
                    ["a", "c",  "2b", "a"],
                    ["",  "bc", "",   ""]]
        );
    testSummarizer(["unittest-mis-10", "-H", "--group-by", "3,2", "--values", "1", "-x", file1Path],
                   file1,
                   [["fld3", "fld2", "fld1_values"],
                    ["3",  "a",  "a"],
                    ["2b", "a",  "c"],
                    ["",   "bc", "c"],
                    ["2b", "c",  "a"],
                    ["3",  "bc", "c"]]
        );
    testSummarizer(["unittest-mis-11", "-H", "--group-by", "2,1,3", "--values", "2", "-x", file1Path],
                   file1,
                   [["fld2", "fld1", "fld3", "fld2_values"],
                    ["a",  "a", "3",  "a"],
                    ["a",  "c", "2b", "a"],
                    ["bc", "c", "",   "bc"],
                    ["c",  "a", "2b", "c"],
                    ["bc", "",  "",   "bc"],
                    ["bc", "c", "3",  "bc"]]
        );
    testSummarizer(["unittest-mis-12", "-H", "--group-by", "1,2", "--values", "3,1", "-r", "NA", file1Path],
                   file1,
                   [["fld1", "fld2", "fld3_values", "fld1_values"],
                    ["a", "a",  "3", "a"],
                    ["c", "a",  "2b", "c"],
                    ["c", "bc", "NA|3", "c|c"],
                    ["a", "c",  "2b", "a"],
                    ["",  "bc", "NA",   "NA"]]
        );
    testSummarizer(["unittest-mis-13", "-H", "--group-by", "3,2", "--values", "1", "-r", "NA", file1Path],
                   file1,
                   [["fld3", "fld2", "fld1_values"],
                    ["3",  "a",  "a"],
                    ["2b", "a",  "c"],
                    ["",   "bc", "c|NA"],
                    ["2b", "c",  "a"],
                    ["3",  "bc", "c"]]
        );
    testSummarizer(["unittest-mis-14", "-H", "--group-by", "2,1,3", "--values", "2", "-r", "NA", file1Path],
                   file1,
                   [["fld2", "fld1", "fld3", "fld2_values"],
                    ["a",  "a", "3",  "a"],
                    ["a",  "c", "2b", "a"],
                    ["bc", "c", "",   "bc"],
                    ["c",  "a", "2b", "c"],
                    ["bc", "",  "",   "bc"],
                    ["bc", "c", "3",  "bc"]]
        );

    /* Validate that the no-key summarizer works with testSummarizer helper function.
     */
    testSummarizer(["unittest-nk-1", "-H", "--values", "1,2", file1Path],
                   file1,
                   [["fld1_values", "fld2_values"],
                    ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
        );
    testSummarizer(["unittest-nk-1-named", "-H", "--values", "fld1,fld2", file1Path],
                   file1,
                   [["fld1_values", "fld2_values"],
                    ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
        );

    /* Header variations: no header line; auto-generated header line; custom headers.
     */
    testSummarizer(["unittest-hdr-1", "--group-by", "1", "--values", "1", file1NoHeaderPath],
                   file1[1..$],
                   [["a", "a|a"],
                    ["c", "c|c|c"],
                    ["",  ""]]
        );
    testSummarizer(["unittest-hdr-2", "--group-by", "1,2", "--values", "2", file1NoHeaderPath],
                   file1[1..$],
                   [["a", "a",  "a"],
                    ["c", "a",  "a"],
                    ["c", "bc", "bc|bc"],
                    ["a", "c",  "c"],
                    ["", "bc",  "bc"]]
        );
    testSummarizer(["unittest-hdr-3", "--write-header", "--group-by", "2", "--values", "1", file1NoHeaderPath],
                   file1[1..$],
                   [["field2", "field1_values"],
                    ["a",  "a|c"],
                    ["bc", "c||c"],
                    ["c",  "a"]]
        );
    testSummarizer(["unittest-hdr-4", "-w", "--group-by", "3,2", "--values", "1", file1NoHeaderPath],
                   file1[1..$],
                   [["field3", "field2", "field1_values"],
                    ["3",  "a",  "a"],
                    ["2b", "a",  "c"],
                    ["",   "bc", "c|"],
                    ["2b", "c",  "a"],
                    ["3",  "bc", "c"]]
        );
    testSummarizer(["unittest-hdr-5", "-H", "--group-by", "2", "--values", "3:Field3Values", file1Path],
                   file1,
                   [["fld2", "Field3Values"],
                    ["a",  "3|2b"],
                    ["bc", "||3"],
                    ["c",  "2b"]]
        );
    testSummarizer(["unittest-hdr-6", "-H", "--group-by", "1,2", "--values", "3:FieldThreeValues", "--values", "1:FieldOneValues", file1Path],
                   file1,
                   [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"],
                    ["a", "a",  "3", "a"],
                    ["c", "a",  "2b", "c"],
                    ["c", "bc", "|3", "c|c"],
                    ["a", "c",  "2b", "a"],
                    ["",  "bc", "",   ""]]
        );
    testSummarizer(["unittest-hdr-6-named-a", "-H", "--group-by", "fld1,fld2", "--values", "fld3:FieldThreeValues", "--values", "fld1:FieldOneValues", file1Path],
                   file1,
                   [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"],
                    ["a", "a",  "3", "a"],
                    ["c", "a",  "2b", "c"],
                    ["c", "bc", "|3", "c|c"],
                    ["a", "c",  "2b", "a"],
                    ["",  "bc", "",   ""]]
        );
    testSummarizer(["unittest-hdr-6-named-b", "-H", "--group-by", "fld1,fld2", "--values", "fld3 FieldThreeValues", "--values", "fld1 FieldOneValues", file1Path],
                   file1,
                   [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"],
                    ["a", "a",  "3", "a"],
                    ["c", "a",  "2b", "c"],
                    ["c", "bc", "|3", "c|c"],
                    ["a", "c",  "2b", "a"],
                    ["",  "bc", "",   ""]]
        );
    testSummarizer(["unittest-hdr-7", "--write-header", "--group-by", "1", "--values", "3:f3_vals","--values", "2:f2_vals", "--values", "1:f1_vals", file1NoHeaderPath],
                   file1[1..$],
                   [["field1", "f3_vals", "f2_vals", "f1_vals"],
                    ["a", "3|2b",  "a|c",     "a|a"],
                    ["c", "2b||3", "a|bc|bc", "c|c|c"],
                    ["",  "",      "bc",      ""]]
        );
    testSummarizer(["unittest-hdr-8", "--write-header", "--group-by", "1,3,2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2", file1NoHeaderPath],
                   file1[1..$],
                   [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
                    ["a", "3",  "a",  "3",  "a", "a"],
                    ["c", "2b", "a",  "2b", "c", "a"],
                    ["c", "",   "bc", "",   "c", "bc"],
                    ["a", "2b", "c",  "2b", "a", "c"],
                    ["",  "",   "bc", "",   "",  "bc"],
                    ["c", "3",  "bc", "3",  "c", "bc"]]
        );
    testSummarizer(["unittest-hdr-9", "--write-header", "--group-by", "1,3-2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2", file1NoHeaderPath],
                   file1[1..$],
                   [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
                    ["a", "3",  "a",  "3",  "a", "a"],
                    ["c", "2b", "a",  "2b", "c", "a"],
                    ["c", "",   "bc", "",   "c", "bc"],
                    ["a", "2b", "c",  "2b", "a", "c"],
                    ["",  "",   "bc", "",   "",  "bc"],
                    ["c", "3",  "bc", "3",  "c", "bc"]]
        );

    /* Alternate file widths and lengths.
     */

    auto file3x2 = [["fld1", "fld2", "fld3"],
                    ["a", "b", "c"],
                    ["c", "b", "a"]];

    auto file3x2Path = buildPath(testDir, "file3x2.tsv");
    auto file3x2NoHeaderPath = buildPath(testDir, "file3x2_noheader.tsv");
    writeDataFile(file3x2Path, file3x2);
    writeDataFile(file3x2NoHeaderPath, file3x2[1 .. $]);

    testSummarizer(["unittest-3x2-1", "-H", "--group-by", "1", "--values", "3", file3x2Path],
                   file3x2,
                   [["fld1", "fld3_values"],
                    ["a", "c"],
                    ["c", "a"]]
        );
    testSummarizer(["unittest-3x2-2", "-H", "--group-by", "2", "--values", "3", file3x2Path],
                   file3x2,
                   [["fld2", "fld3_values"],
                    ["b", "c|a"]]
        );
    testSummarizer(["unittest-3x2-3", "-H", "--group-by", "2,1", "--values", "3", file3x2Path],
                   file3x2,
                   [["fld2", "fld1", "fld3_values"],
                    ["b", "a", "c"],
                    ["b", "c", "a"]]
        );

    auto file3x1 = [["fld1", "fld2", "fld3"],
                    ["a", "b", "c"]];

    auto file3x1Path = buildPath(testDir, "file3x1.tsv");
    auto file3x1NoHeaderPath = buildPath(testDir, "file3x1_noheader.tsv");
    writeDataFile(file3x1Path, file3x1);
    writeDataFile(file3x1NoHeaderPath, file3x1[1 .. $]);

    testSummarizer(["unittest-3x1-1", "-H", "--group-by", "1", "--values", "3", file3x1Path],
                   file3x1,
                   [["fld1", "fld3_values"],
                    ["a", "c"]]
        );
    testSummarizer(["unittest-3x1-2", "--group-by", "1", "--values", "3", file3x1NoHeaderPath],
                   file3x1[1..$],
                   [["a", "c"]]
        );
    testSummarizer(["unittest-3x1-3", "-H", "--group-by", "2,1", "--values", "3", file3x1Path],
                   file3x1,
                   [["fld2", "fld1", "fld3_values"],
                    ["b", "a", "c"]]
        );
    testSummarizer(["unittest-3x1-3-named", "-H", "--group-by", "fld2,fld1", "--values", "fld3", file3x1Path],
                   file3x1,
                   [["fld2", "fld1", "fld3_values"],
                    ["b", "a", "c"]]
        );
    testSummarizer(["unittest-3x1-4", "--group-by", "2,1", "--values", "3", file3x1NoHeaderPath],
                   file3x1[1..$],
                   [["b", "a", "c"]]
        );

    auto file3x0 = [["fld1", "fld2", "fld3"]];

    auto file3x0Path = buildPath(testDir, "file3x0.tsv");
    auto file3x0NoHeaderPath = buildPath(testDir, "file3x0_noheader.tsv");
    writeDataFile(file3x0Path, file3x0);
    writeDataFile(file3x0NoHeaderPath, file3x0[1 .. $]);


    testSummarizer(["unittest-3x0-1", "-H", "--group-by", "1", "--values", "3", file3x0Path],
                   file3x0,
                   [["fld1", "fld3_values"]]
        );
    testSummarizer(["unittest-3x0-1-named", "-H", "--group-by", "fld1", "--values", "fld3", file3x0Path],
                   file3x0,
                   [["fld1", "fld3_values"]]
        );
    testSummarizer(["unittest-3x0-2", "--group-by", "1", "--values", "3", file3x0NoHeaderPath],
                   file3x0[1..$],
                   []
        );
    testSummarizer(["unittest-3x0-3", "--write-header", "--group-by", "1", "--values", "3", file3x0NoHeaderPath],
                   file3x0[1..$],
                   [["field1", "field3_values"]]
        );


    testSummarizer(["unittest-3x0-4", "-H", "--group-by", "2,1", "--values", "3", file3x0Path],
                   file3x0,
                   [["fld2", "fld1", "fld3_values"]]
        );

    testSummarizer(["unittest-3x0-5", "--group-by", "2,1", "--values", "3", file3x0NoHeaderPath],
                   file3x0[1..$],
                   []
        );

    testSummarizer(["unittest-3x0-6", "--write-header", "--group-by", "2,1", "--values", "3", file3x0NoHeaderPath],
                   file3x0[1..$],
                   [["field2", "field1", "field3_values"]]
        );

    auto file2x1 = [["fld1", "fld2"],
                    ["a", "b"]];

    auto file2x1Path = buildPath(testDir, "file2x1.tsv");
    auto file2x1NoHeaderPath = buildPath(testDir, "file2x1_noheader.tsv");
    writeDataFile(file2x1Path, file2x1);
    writeDataFile(file2x1NoHeaderPath, file2x1[1 .. $]);

    testSummarizer(["unittest-2x1-1", "-H", "--group-by", "1", "--values", "2", file2x1Path],
                   file2x1,
                   [["fld1", "fld2_values"],
                    ["a", "b"]]
        );
    testSummarizer(["unittest-2x1-2", "-H", "--group-by", "2,1", "--values", "1", file2x1Path],
                   file2x1,
                   [["fld2", "fld1", "fld1_values"],
                    ["b", "a", "a"]]
        );

    auto file2x0 = [["fld1", "fld2"]];

    auto file2x0Path = buildPath(testDir, "file2x0.tsv");
    auto file2x0NoHeaderPath = buildPath(testDir, "file2x0_noheader.tsv");
    writeDataFile(file2x0Path, file2x0);
    writeDataFile(file2x0NoHeaderPath, file2x0[1 .. $]);

    testSummarizer(["unittest-2x0-1", "-H", "--group-by", "1", "--values", "2", file2x0Path],
                   file2x0,
                   [["fld1", "fld2_values"]]
        );
    testSummarizer(["unittest-2x0-2", "-H", "--group-by", "2,1", "--values", "1", file2x0Path],
                   file2x0,
                   [["fld2", "fld1", "fld1_values"]]
        );

    auto file1x2 = [["fld1"],
                    ["a"],
                    [""]];

    auto file1x2Path = buildPath(testDir, "file1x2.tsv");
    auto file1x2NoHeaderPath = buildPath(testDir, "file1x2_noheader.tsv");
    writeDataFile(file1x2Path, file1x2);
    writeDataFile(file1x2NoHeaderPath, file1x2[1 .. $]);

    testSummarizer(["unittest-1x2-1", "-H", "--group-by", "1", "--values", "1", file1x2Path],
                   file1x2,
                   [["fld1", "fld1_values"],
                    ["a", "a"],
                    ["",  ""]]
        );

    auto file1x2b = [["fld1"],
                     [""],
                     [""]];

    auto file1x2bPath = buildPath(testDir, "file1x2b.tsv");
    auto file1x2bNoHeaderPath = buildPath(testDir, "file1x2b_noheader.tsv");
    writeDataFile(file1x2bPath, file1x2b);
    writeDataFile(file1x2bNoHeaderPath, file1x2b[1 .. $]);

    testSummarizer(["unittest-1x2b-2", "-H", "--group-by", "1", "--values", "1", file1x2bPath],
                   file1x2b,
                   [["fld1", "fld1_values"],
                    ["", "|"]]
        );

    auto file1x1 = [["fld1"],
                    ["x"]];

    auto file1x1Path = buildPath(testDir, "file1x1.tsv");
    auto file1x1NoHeaderPath = buildPath(testDir, "file1x1_noheader.tsv");
    writeDataFile(file1x1Path, file1x1);
    writeDataFile(file1x1NoHeaderPath, file1x1[1 .. $]);

    testSummarizer(["unittest-1x1-1", "-H", "--group-by", "1", "--values", "1", file1x1Path],
                   file1x1,
                   [["fld1", "fld1_values"],
                    ["x", "x"]]
        );
    testSummarizer(["unittest-1x1-1-named", "-H", "--group-by", "fld1", "--values", "fld1", file1x1Path],
                   file1x1,
                   [["fld1", "fld1_values"],
                    ["x", "x"]]
        );

    testSummarizer(["unittest-1x1-2", "--group-by", "1", "--values", "1", file1x1NoHeaderPath],
                   file1x1[1..$],
                   [["x", "x"]]
        );

    testSummarizer(["unittest-1x1-3", "--write-header", "--group-by", "1", "--values", "1", file1x1NoHeaderPath],
                   file1x1[1..$],
                   [["field1", "field1_values"],
                    ["x", "x"]]
        );

    auto file1x1b = [["fld1"],
                    [""]];

    auto file1x1bPath = buildPath(testDir, "file1x1b.tsv");
    auto file1x1bNoHeaderPath = buildPath(testDir, "file1x1b_noheader.tsv");
    writeDataFile(file1x1bPath, file1x1b);
    writeDataFile(file1x1bNoHeaderPath, file1x1b[1 .. $]);

    testSummarizer(["unittest-1x1b-1", "-H", "--group-by", "1", "--values", "1", file1x1bPath],
                   file1x1b,
                   [["fld1", "fld1_values"],
                    ["", ""]]
        );

    auto file1x0 = [["fld1"]];

    auto file1x0Path = buildPath(testDir, "file1x0.tsv");
    auto file1x0NoHeaderPath = buildPath(testDir, "file1x0_noheader.tsv");
    writeDataFile(file1x0Path, file1x0);
    writeDataFile(file1x0NoHeaderPath, file1x0[1 .. $]);

    testSummarizer(["unittest-1x0-1", "-H", "--group-by", "1", "--values", "1", file1x0Path],
                   file1x0,
                   [["fld1", "fld1_values"]]
        );

    testSummarizer(["unittest-1x0-2", "--group-by", "1", "--values", "1", file1x0NoHeaderPath],
                   file1x0[1..$],
                   []
        );

    testSummarizer(["unittest-1x0-3", "--write-header", "--group-by", "1", "--values", "1", file1x0NoHeaderPath],
                   file1x0[1..$],
                   [["field1", "field1_values"]]
        );

    /* Alternate delimiters.
     *
     * Note: In the current unit test setup the data is already in memory (file1).
     * 'file1Path' points to a file with equivalent data, but it is only read when
     * processing the header line. A data file is created for the '%' and '#'
     * delimiter cases (these read the header); we don't bother for the others.
     */
    auto file1PctDelimPath = buildPath(testDir, "file1PctDelim.tsv");
    auto file1HashDelimPath = buildPath(testDir, "file1HashDelim.tsv");
    writeDataFile(file1PctDelimPath, file1, "%");
    writeDataFile(file1HashDelimPath, file1, "#");

    testSummarizer(["unittest-delim-1", "-H", "--values", "1,2", "--delimiter", "%", file1PctDelimPath],
                   file1,
                   [["fld1_values", "fld2_values"],
                    ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
        );
    testSummarizer(["unittest-delim-1-named", "-H", "--values", "fld1,fld2", "--delimiter", "%", file1PctDelimPath],
                   file1,
                   [["fld1_values", "fld2_values"],
                    ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
        );
    testSummarizer(["unittest-delim-2", "-H", "--values", "1-2", "--values-delimiter", "$", file1Path],
                   file1,
                   [["fld1_values", "fld2_values"],
                    ["a$c$c$a$$c", "a$a$bc$c$bc$bc"]]
        );
    testSummarizer(["unittest-delim-3", "-H", "--values", "1,2", "--delimiter", "#", "--values-delimiter", ",", file1HashDelimPath],
                   file1,
                   [["fld1_values", "fld2_values"],
                    ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]]
        );
    testSummarizer(["unittest-delim-3-named", "-H", "--values", "fld1,fld2", "--delimiter", "#", "--values-delimiter", ",", file1HashDelimPath],
                   file1,
                   [["fld1_values", "fld2_values"],
                    ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]]
        );
    testSummarizer(["unittest-delim-4", "--write-header", "--group-by", "2", "--values", "1",
                    "--delimiter", "^", "--values-delimiter", ":", file1NoHeaderPath],
                   file1[1..$],
                   [["field2", "field1_values"],
                    ["a",  "a:c"],
                    ["bc", "c::c"],
                    ["c",  "a"]]
        );
    testSummarizer(["unittest-delim-5", "--group-by", "1,2", "--values", "2", "--delimiter", "/",
                    "--values-delimiter", "\\", file1NoHeaderPath],
                   file1[1..$],
                   [["a", "a",  "a"],
                    ["c", "a",  "a"],
                    ["c", "bc", "bc\\bc"],
                    ["a", "c",  "c"],
                    ["", "bc",  "bc"]]
        );
}

/* Summary Operators and Calculators
 *
 * Two types of objects are used in the implementation: Operators and Calculators. An
 * Operator represents a summary calculation specified on the command line, e.g. '--mean 5'.
 * A Calculator is used to manage the summary calculation for each unique key in the input.
 *
 * As an example, consider the command:
 *
 *    $ tsv-summarize --group-by 1 --mean 3 --mean 5
 *
 * This command will create two instances of a MeanOperator, one each for fields 3 and 5.
 * They produce the output field headers (e.g. "field3_mean", "field5_mean"). They also
 * create MeanCalculator objects for each unique value in field 1. For 'mean', a
 * calculator needs to track occurrence count and sum. Calculators produce the final
 * value when all processing is finished.
 *
 * Summary field headers
 *
 * There are several options for specifying summary field headers. The defaults combine the
 * operator name and the header of the field summarized. The defaults can be overridden
 * on the command line. These scenarios are supported via the operator constructor and the
 * processHeaderLine() method.
 *
 * Missing field policy
 *
 * At present, tsv-summarize has a single policy for handling missing values that applies
 * to all operators. However, it is logically operator specific and is implemented that
 * way. The MissingFieldPolicy class describes the policy; each operator contains one.
 * Calculators access their operator's policy object.
 */
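
/* Illustrative sketch (not part of the implementation): how these pieces interact for a
 * single '--mean 3' entry, expressed with the Operator and Calculator interface methods
 * declared below. The names 'headerFields', 'linesForKey', 'valuesLists' and
 * 'printOptions' are hypothetical placeholders.
 *
 *    Operator op = <operator created from '--mean 3'>;    // One Operator per command line entry.
 *    op.processHeaderLine(headerFields);                  // Sets the output header, e.g. "field3_mean".
 *    Calculator calc = op.makeCalculator();               // One Calculator per unique key.
 *    foreach (fields; linesForKey) calc.processNextLine(fields);
 *    string result = calc.calculate(valuesLists, printOptions);  // Run after all input is read.
 */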

/** An Operator represents a summary calculation specified on the command line.
 *  e.g. '--mean 5'.
 */
interface Operator
{
    @property string header();
    @property string name();
    void processHeaderLine(const char[][] fields);
    size_t[] numericFieldsToSave();     // Numeric fields this Operator needs saved
    size_t[] textFieldsToSave();        // Text fields this Operator needs saved
    Calculator makeCalculator();
}

/** Calculators are responsible for the calculation of a single computation. They
 *  process each line and produce the final value when all processing is finished.
 */
interface Calculator
{
    void processNextLine(const char[][] fields);
    string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions);
}

/** This class describes processing behavior when a missing value is encountered.
 */
final class MissingFieldPolicy
{
    private bool _useMissing = true;          // True if missing values are processed unchanged.
    private bool _replaceMissing = false;     // True if missing values are replaced.
    private string _missingReplacement;       // Replacement string if replaceMissing is true.

    this (const bool excludeMissing = false, string missingReplacement = "")
    {
        updatePolicy(excludeMissing, missingReplacement);
    }

    void updatePolicy(const bool excludeMissing, string missingReplacement)
    {
        _missingReplacement = missingReplacement;
        _replaceMissing = missingReplacement.length != 0;
        _useMissing = !excludeMissing && !replaceMissing;
    }

    final bool isMissingField(const char[] field) const
    {
        return field.length == 0;
    }

    final bool useMissing() const @property
    {
        return _useMissing;
    }

    final bool excludeMissing() const @property
    {
        return !_useMissing && !_replaceMissing;
    }

    final bool replaceMissing() const @property
    {
        return _replaceMissing;
    }

    final string missingReplacement() const @property
    {
        return _missingReplacement;
    }
}
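
/* Illustrative sketch only (compiled out via version(none)): how the constructor arguments
 * map to the policy accessors above. This mirrors the command line options
 * '--exclude-missing' and '--replace-missing <value>' exercised by the unit tests.
 */
version(none) unittest
{
    auto keep = new MissingFieldPolicy;               // Default: missing values used as-is.
    assert(keep.useMissing && !keep.excludeMissing && !keep.replaceMissing);

    auto drop = new MissingFieldPolicy(true);         // Corresponds to '--exclude-missing'.
    assert(!drop.useMissing && drop.excludeMissing && !drop.replaceMissing);

    auto fill = new MissingFieldPolicy(false, "NA");  // Corresponds to '--replace-missing NA'.
    assert(!fill.useMissing && !fill.excludeMissing && fill.replaceMissing);
    assert(fill.missingReplacement == "NA");
    assert(fill.isMissingField("") && !fill.isMissingField("0"));
}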

/* The SharedFieldValues and UniqueKeyValuesLists classes manage lists of values collected
 * while reading data. Operations like median collect all values and operate on them when
 * running the final calculation. Value lists are needed for each unique key. A command
 * using multiple Operators may save multiple fields. And, different Operators may be run
 * against the same field.
 *
 * The last part motivates these classes. Handling large data sets necessitates minimizing
 * in-memory storage, making it desirable to share identical lists between Calculators.
 * Otherwise, each Calculator could implement its own storage, which would be simpler.
 *
 * The setup works as follows:
 *  - Operators advertise the fields they need saved ([text|numeric]FieldsToSave methods).
 *  - The SummarizerBase object keeps a SharedFieldValues object, which in turn keeps a
 *    list of the fields advertised by Operators as needing sharing. This list gets created
 *    during command initialization (SummarizerBase.setOperators).
 *  - The SharedFieldValues object is used to create a UniqueKeyValuesLists object every
 *    time a new unique key is found, in parallel to the Calculator objects created for the
 *    key. The UniqueKeyValuesLists objects are managed by derived Summarizer classes.
 *  - A unique key's UniqueKeyValuesLists object is passed each input line, same as
 *    Calculators, saving the values.
 *  - Calculators retrieve the saved values during the calculation phase. The calculator's
 *    processNextField method is typically a no-op.
 *  - Calculators cannot make assumptions about the order of the saved values. This is a
 *    pragmatic concession to median and quantile calculations, which need to sort the data,
 *    at least partially. Rather than generate sorted copies, the current algorithms
 *    sort the data in place.
 *
 * One concession to duplicate storage is that text and numeric versions of the same
 * field might both be stored. The reason is that it's important to convert text to numbers
 * as they are read so that useful error messages can be generated, and storing both
 * forms of the same field should be uncommon.
 *
 * The current implementation uses the same missing values policy for all fields. If
 * multiple policies become supported this will need to change.
 *
 * Built-in calculations - UniqueKeyValuesLists has a built-in median operation. This is
 * to avoid repeated calculation of the median by different Calculators.
 */
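
/* Illustrative sketch only (compiled out via version(none)): the value-sharing flow
 * described above, reduced to one numeric field (index 2) and one text field (index 0).
 * In the real program the indices come from the Operators' numericFieldsToSave and
 * textFieldsToSave advertisements, and one UniqueKeyValuesLists instance is created per
 * unique key; the field values used here are made up.
 */
version(none) unittest
{
    auto sharedFieldValues = new SharedFieldValues();
    sharedFieldValues.addNumericIndex(2);     // e.g. advertised by an operator summarizing field 3
    sharedFieldValues.addTextIndex(0);

    auto keyValues = sharedFieldValues.makeUniqueKeyValuesLists();
    auto missingPolicy = new MissingFieldPolicy;     // Default policy: keep missing values.

    keyValues.processNextLine(["red", "x", "1.5"], missingPolicy);
    keyValues.processNextLine(["blue", "y", "0.5"], missingPolicy);
    keyValues.processNextLine(["green", "z", "2.5"], missingPolicy);

    assert(keyValues.textValues(0) == ["red", "blue", "green"]);
    assert(keyValues.numericValuesSorted(2) == [0.5, 1.5, 2.5]);
    assert(keyValues.numericValuesMedian(2) == 1.5);
}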

final class SharedFieldValues
{
    // Arrays with field indices that need to be saved.
    private size_t[] _numericFieldIndices;
    private size_t[] _textFieldIndices;

    /* Called during summarizer setup to add a shared field value for a specific field index.
     * e.g. '--median 7' will end up calling addNumericIndex(6), 6 being the zero-based index.
     * A specific index is only added once.
     */
    final void addNumericIndex (size_t index)
    {
        if (!canFind(_numericFieldIndices, index)) _numericFieldIndices ~= index;
    }

    /* Similar to addNumericIndex, except adds a text index. */
    final void addTextIndex (size_t index)
    {
        if (!canFind(_textFieldIndices, index)) _textFieldIndices ~= index;
    }

    /* Called every time a new key is found, or once at the beginning of the program if no keys
     * are being used (entire column summarized).
     */
    final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        return new UniqueKeyValuesLists(_numericFieldIndices, _textFieldIndices);
    }
}

final class UniqueKeyValuesLists
{
    /* A FieldValues object holds a list of values collected for a specific field. A
     * unique key may hold several. For example, the command:
     *     $ tsv-summarize --group-by 1 --median 4 --median 5
     * requires keeping lists for both fields 4 and 5. This in turn will result in
     * _numericFieldValues being a 2 element array, one with a list of field 4 values,
     * the second of field 5 values. Linear search is used to find a specific field.
     */
    private FieldValues!double[] _numericFieldValues;
    private FieldValues!string[] _textFieldValues;
    private double[] _numericFieldMedians;

    /* The UniqueKeyValuesLists constructor takes arrays of field indices to be saved. */
    this(const size_t[] numericFieldIndices, const size_t[] textFieldIndices)
    {
        if (numericFieldIndices.length > 0)
        {
            _numericFieldValues = new FieldValues!double[](numericFieldIndices.length);
            foreach (i, fieldIndex; numericFieldIndices)
                _numericFieldValues[i] = new FieldValues!double(fieldIndex);
        }

        if (textFieldIndices.length > 0)
        {
            _textFieldValues = new FieldValues!string[](textFieldIndices.length);
            foreach (i, fieldIndex; textFieldIndices)
                _textFieldValues[i] = new FieldValues!string(fieldIndex);
        }
    }

    void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
    {
        _numericFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy));
        _textFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy));
    }

    private FieldValues!double findNumericFieldValues(size_t index)
    {
        alias pred = (FieldValues!double a, size_t b) => (a.fieldIndex == b);
        auto r = find!pred(_numericFieldValues, index);
        assert(!r.empty);
        return r.front;
    }

    private FieldValues!string findTextFieldValues(size_t index)
    {
        alias pred = (FieldValues!string a, size_t b) => (a.fieldIndex == b);
        auto r = find!pred(_textFieldValues, index);
        assert(!r.empty);
        return r.front;
    }

    final double[] numericValues(size_t index)
    {
        return findNumericFieldValues(index).getArray;
    }

    final double[] numericValuesSorted(size_t index)
    {
        return findNumericFieldValues(index).getSortedArray;
    }

    final string[] textValues(size_t index)
    {
        return findTextFieldValues(index).getArray;
    }

    final string[] textValuesSorted(size_t index)
    {
        return findTextFieldValues(index).getSortedArray;
    }

    final double numericValuesMedian(size_t index)
    {
        return findNumericFieldValues(index).median;
    }

    private final class FieldValues(ValueType)
    {
        import std.array : appender;
        private size_t _fieldIndex;
        private Appender!(ValueType[]) _values;
        private bool _haveMedian = false;
        private bool _isSorted = false;
        private ValueType _medianValue;

        this(size_t fieldIndex)
        {
            _fieldIndex = fieldIndex;
        }

        final size_t length() const @property
        {
            return _values.data.length;
        }

        final size_t fieldIndex() const @property
        {
            return _fieldIndex;
        }

        final void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
        {
            debug writefln("[%s]: %s", __FUNCTION__, fields.to!string);

            const char[] field = fields[_fieldIndex];
            if (missingPolicy.useMissing || !missingPolicy.isMissingField(field))
            {
                _values.put(field.to!ValueType);
                _haveMedian = false;
                _isSorted = false;
            }
            else if (missingPolicy.replaceMissing)
            {
                _values.put(missingPolicy.missingReplacement.to!ValueType);
                _haveMedian = false;
                _isSorted = false;
            }
        }

        /* Return an input range of the values. */
        final auto values()
        {
            return _values.data;
        }

        final ValueType[] getArray()
        {
            return _values.data;
        }

        final ValueType[] getSortedArray()
        {
            if (!_isSorted)
            {
                import std.algorithm : sort;
                sort(_values.data);
                _isSorted = true;
            }
            return _values.data;
        }

        final ValueType median()
        {
            if (!_haveMedian)
            {
                import tsv_utils.common.numerics : rangeMedian;
                _medianValue = _values.data.rangeMedian();
                _haveMedian = true;
            }

            return _medianValue;
        }
    }
}

/** SingleFieldOperator is a base class for single field operators, the most common
 * Operator. Derived classes implement makeCalculator and the Calculator class it returns.
 */
class SingleFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;
    private size_t _fieldIndex;
    private bool _useHeaderSuffix;
    private bool _allowCustomHeader;
    private bool _hasCustomHeader = false;
    private size_t[] _numericFieldsToSave;
    private size_t[] _textFieldsToSave;
    private MissingFieldPolicy _missingPolicy;

    this(string operatorName, size_t fieldIndex, MissingFieldPolicy missingPolicy,
         Flag!"useHeaderSuffix" useHeaderSuffix = Yes.useHeaderSuffix,
         Flag!"allowCustomHeader" allowCustomHeader = Yes.allowCustomHeader)
    {
        _name = operatorName;
        _fieldIndex = fieldIndex;
        _missingPolicy = missingPolicy;
        _useHeaderSuffix = useHeaderSuffix;
        _allowCustomHeader = allowCustomHeader;
        // Default header. May be overridden by a custom header or the header line.
        _header =
            fieldHeaderFromIndex(fieldIndex)
            .summaryHeaderFromFieldHeader(_useHeaderSuffix ? operatorName : "");
    }

    void setCustomHeader (string customHeader)
    {
        assert(_allowCustomHeader);
        _header = customHeader;
        _hasCustomHeader = true;
    }

    final string name() const @property
    {
        return _name;
    }

    final bool allowCustomHeader() const @property
    {
        return _allowCustomHeader;
    }

    /* setSaveFieldValues[Numeric|Text] are called by derived classes to indicate that
     * the field values should be saved. These should be called during construction.
     */
    final void setSaveFieldValuesNumeric()
    {
        _numericFieldsToSave ~= _fieldIndex;
    }

    final void setSaveFieldValuesText()
    {
        _textFieldsToSave ~= _fieldIndex;
    }

    final MissingFieldPolicy missingPolicy() @property
    {
        return _missingPolicy;
    }

    final size_t fieldIndex() const @property
    {
        return _fieldIndex;
    }

    final string header() const @property
    {
        return _header;
    }

    final bool useHeaderSuffix() const @property
    {
        return _useHeaderSuffix;
    }

    void processHeaderLine(const char[][] fields)
    {
        if (!_hasCustomHeader) {
            debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);
            _header = summaryHeaderFromFieldHeader(fields[_fieldIndex].to!string,
                                                   _useHeaderSuffix ? _name : "");
        }
    }

    final size_t[] numericFieldsToSave()
    {
        return _numericFieldsToSave;
    }

    final size_t[] textFieldsToSave()
    {
        return _textFieldsToSave;
    }

    abstract SingleFieldCalculator makeCalculator();
}

/** SingleFieldCalculator is a base class for the common case of calculators using a single
 * field. Derived classes implement processNextField() rather than processNextLine().
 */
class SingleFieldCalculator : Calculator
{
    private size_t _fieldIndex;

    this(size_t fieldIndex)
    {
        _fieldIndex = fieldIndex;
    }

    final size_t fieldIndex() const @property
    {
        return _fieldIndex;
    }

    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);

        auto missingPolicy = getOperator.missingPolicy;
        const char[] field = fields[_fieldIndex];

        if (missingPolicy.useMissing || !missingPolicy.isMissingField(field))
        {
            processNextField(field);
        }
        else if (missingPolicy.replaceMissing)
        {
            processNextField(missingPolicy.missingReplacement);
        }
    }

    abstract SingleFieldOperator getOperator();

    abstract void processNextField(const char[] field);
}

/* Unittest helper functions. Only compiled when -unittest is in effect. */
version(unittest)
{
    /** A helper for SingleFieldOperator unit tests.
     *
     * testSingleFieldOperator takes a set of split file values, a field index, a header
     * suffix, and a set of expected values. The expected values array contains the
     * initial value (zero entries) and the expected values after each line. (One more
     * expected value than input lines.) The zero entry case is what is generated for an
     * empty file. An example testing the 'min' operator against a file with 2 columns,
     * 3 rows, using field index 1:
     *
     *    testSingleFieldOperator!MinOperator(
     *       [["10", "100"],               // The split file. 3 lines by 2 columns.
     *        ["5", "50"],
     *        ["20", "200"]],
     *       1,                            // Field index (zero-based, so "100", "50", "200")
     *       "min",                        // The header suffix, normally the operator name.
     *       ["nan", "100", "50", "50"]);  // Min value after processing each line.
     *
     * A typical operator unit test uses three "files", one each of 1x3, 2x3, and 3x3.
     * The operator is then tested against each column, a total of six calls. Headers
     * are automatically checked. Additional entries can be used to extend coverage.
     *
     * A non-default MissingFieldPolicy can be provided as an optional last argument.
     * Operator tests should include exclusion and replacement variations. See operator
     * unit tests for details.
     *
     * testSingleFieldOperatorBase adds one additional capability: custom operator
     * init arguments. Currently this is used only by the quantile operator.
     *
     * These tests do not check unique key behavior (group-by). Operators don't have info
     * about unique keys, and interact with them only indirectly, via Calculators.
     */
    void testSingleFieldOperator(OperatorClass : SingleFieldOperator)
        (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
         const char[][] expectedValues,
         MissingFieldPolicy missingPolicy = new MissingFieldPolicy)
    {
        testSingleFieldOperatorBase!OperatorClass(splitFile, fieldIndex, headerSuffix, expectedValues, missingPolicy);
    }

    void testSingleFieldOperatorBase(OperatorClass : SingleFieldOperator, T...)
        (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
         const char[][] expectedValues,
         MissingFieldPolicy missingPolicy,
         T extraOpInitArgs)
    {
        import std.format : format;
        import std.array : appender;
        import std.string : chomp;
        import std.traits : EnumMembers;

        auto numFields = (splitFile[0]).length;

        assert(fieldIndex < numFields,
               format("[testSingleFieldOperator] Invalid field index. headerSuffix: %s",
                      headerSuffix));
        assert(splitFile.length + 1 == expectedValues.length,
               format("[testSingleFieldOperator] Need one more expected value than number of rows. headerSuffix: %s",
                      headerSuffix));

        /* printOptions - Only the 'values-delimiter' (2nd arg) is used in these tests. */
        auto printOptions = SummarizerPrintOptions('#', '|');

        /* An input header line. */
        string[] inputHeaderLine = new string[numFields];
        foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;

        /* The different expected output field headers. */
        auto outputFieldHeaderWithNoHeaderLine =
            fieldHeaderFromIndex(fieldIndex)
            .summaryHeaderFromFieldHeader(headerSuffix);
        auto outputFieldHeaderFromHeaderLine =
            inputHeaderLine[fieldIndex]
            .summaryHeaderFromFieldHeader(headerSuffix);
        auto customOutputFieldHeader = "custom";

        enum HeaderUsecase {
            HeaderLine_DefaultHeader,
            HeaderLine_CustomHeader,
            NoHeaderLine_DefaultHeader,
            NoHeaderLine_CustomHeader,
            NoHeaderLine_NoOutputHeader,
        }

        string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
        {
            return format("[testSingleFieldOperator] Unexpected header. Operator: %s; Usecase: %s;  Actual: '%s';  Expected: '%s'",
                          op.name, hc, actual, expected);
        }

        string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, size_t fieldIndex,
                                  const char[] actual, const char[] expected)
        {
            return format("[testSingleFieldOperator] Unexpected value. Operator: %s; Usecase: %s;  RowIndex: %d, FieldIndex: %d\n    Actual: '%s';  Expected: '%s'",
                          op.name, hc, rowIndex, fieldIndex, actual, expected);
        }

        /* Run the logic for each header use case. */
        foreach (hc; EnumMembers!HeaderUsecase)
        {
            bool hasInputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader
                );
            bool hasOutputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
                );
            bool hasCustomHeader = (
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
                );

            if (hasCustomHeader) assert(hasOutputHeader);

            auto op = new OperatorClass(fieldIndex, missingPolicy, extraOpInitArgs);

            if (hasCustomHeader)
            {
                if (!op.allowCustomHeader) continue;   // Custom header not supported by this operator.
                op.setCustomHeader(customOutputFieldHeader);
2532
            }
2533

2534 15
            Operator[] operatorArray;
2535 15
            operatorArray ~= op;
2536

2537 15
            auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
2538 15
            summarizer.setOperators(inputRangeObject(operatorArray));