1
/**
2
A variant of the unix 'cut' program, with the ability to reorder fields.
3

4
tsv-select is a variation on the Unix 'cut' utility, with the added ability to reorder
5
fields. Lines are read from files or standard input and split on a delimiter character.
6
Fields are written to standard output in the order listed. Fields can be listed more
7
than once, and fields not listed can be written out as a group.
8

9
This program is intended both as a useful utility and a D programming language example.
10
Functionality and constructs used include command line argument processing, file I/O,
11
exception handling, ranges, tuples and strings, templates, universal function call syntax
12
(UFCS), lambdas and functional programming constructs. Comments are more verbose than
13
typical to shed light on D programming constructs, but not to the level of a tutorial.
14

15
Copyright (c) 2015-2020, eBay Inc.
16
Initially written by Jon Degenhardt
17

18
License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
19
*/
20

21
module tsv_utils.tsv_select;   // Module name defaults to file name, but hyphens not allowed, so set it here.
22

23
// Imports used by multiple routines. Other imports are made in local context.
24
import std.exception : enforce;
25
import std.range;
26
import std.stdio;
27
import std.typecons : tuple, Tuple;
28

29
// 'Heredoc' style help text. When printed it is followed by a getopt formatted option list.
// Everything between the EOS markers is emitted verbatim, so the text must not contain
// anything other than the intended help message.
immutable helpText = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields can be specified by field number or, for files with header lines,
by field name. Use '--H|header' to enable selection by name. This also
manages header lines from multiple files, retaining only the first header.

Field numbers start with one. The field list is comma separated. Ranges
can be used, and wildcards can be used when specifying fields by name.

Fields can be dropped using '--e|exclude'. Fields not included in the
'--f|fields' option can be selected as a group using '--r|rest'.

Examples:

   # Selecting fields. Output is in the order listed
   tsv-select -H -f date,time file.tsv
   tsv-select -f 2,1 file.tsv
   tsv-select -f 5-7,2,9-11
   tsv-select -H -f '*_date' file.tsv

   # Dropping fields
   tsv-select --exclude 1 file.tsv
   tsv-select -H -e date,time file.tsv

   # Move fields to the front or the back
   tsv-select -f 1 --rest first file.tsv  # Move field 1 to the end
   tsv-select -H -f date --rest last      # Move 'date' field to the front

   # Read multiple files, keep the header from only the first
   tsv-select data*.tsv -H --fields 1,2,4-7,14

Use '--help-verbose' for detailed information. Use '--help-fields' for
details about field lists and field names.

Options:
EOS";
71

72
// Extended help text, printed for '--help-verbose'. Like helpText, it is followed by the
// getopt formatted option list and is emitted verbatim.
immutable helpTextVerbose = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields can be specified by field number or, for files with header lines,
by field name. Use '--H|header' to enable selection by name. This also
manages header lines from multiple files, retaining only the first header.

Field numbers start with one. The field list is comma separated. Fields
can be repeated and ranges can be used. Wildcards can be used when
specifying fields by name, and escapes can be used to specify field names
containing special characters. Run '--help-fields' for details.

Fields can be excluded using '--e|exclude'. All fields not excluded are
output. Fields not included in the '--f|fields' option can be selected as
a group using '--r|rest'. '--f|fields' and '--r|rest' can be used with
'--e|exclude' to reorder non-excluded fields.

Examples:

   # Keep the first field from two files
   tsv-select -f 1 file1.tsv file2.tsv

   # Keep fields 1 and 2, retaining the header from only the first file
   tsv-select -H -f 1,2 file1.tsv file2.tsv

   # Keep the 'time' field
   tsv-select -H -f time file1.tsv

   # Keep all fields ending '_date' or '_time'
   tsv-select -H -f '*_date,*_time' file.tsv

   # Drop all the '*_time' fields
   tsv-select -H --exclude '*_time' file.tsv

   # Field reordering and field ranges
   tsv-select -f 3,2,1 file.tsv
   tsv-select -f 1,4-7,11 file.tsv
   tsv-select -f 1,7-4,11 file.tsv

   # Repeating fields
   tsv-select -f 1,2,1 file.tsv
   tsv-select -f 1-3,3-1 file.tsv

   # Move fields to the front
   tsv-select -f 5 --rest last file.tsv
   tsv-select -H -f Date,Time --rest last file.tsv

   # Move fields to the end
   tsv-select -f 4,5 --rest first file.tsv
   tsv-select -H -f '*_time' --rest first file.tsv

   # Move field 2 to the front and drop fields 10-15
   tsv-select -f 2 -e 10-15 file.tsv

   # Move field 2 to the end, dropping fields 10-15
   tsv-select -f 2 --rest first -e 10-15 file.tsv

Use '--help-fields' for detailed help on field lists.

Notes:
* One of '--f|fields' or '--e|exclude' is required.
* Fields specified by '--f|fields' and '--e|exclude' cannot overlap.
* When '--f|fields' and '--e|exclude' are used together, the effect is to
  specify '--rest last'. This can be overridden by using '--rest first'.
* Each input line must be long enough to contain all fields specified
  with '--f|fields'. This is not necessary for '--e|exclude' fields.
* Specifying names of fields containing special characters may require
  escaping the special characters. See '--help-fields' for details.

Options:
EOS";
147

148
/** Container for command line options.
 *
 * Holds the values set directly by getopt as well as the values derived from them by
 * processArgs: the zero-based field list, the excluded-field lookup table, and the
 * opened input sources.
 */
struct TsvSelectOptions
{
    import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange;

    // The allowed values for the --rest option.
    enum RestOption { none, first, last }

    string programName;                 /// Program name
    ByLineSourceRange!() inputSources;  /// Input Files
    bool hasHeader = false;             /// --H|header
    char delim = '\t';                  /// --d|delimiter
    RestOption restArg;                 /// --rest first|last (none is hidden default)
    size_t[] fields;                    /// Derived from --f|fields
    bool[] excludedFieldsTable;         /// Derived. Lookup table for excluded fields.

    /** Process command line arguments (getopt cover).
     *
     * processArgs calls getopt to process command line arguments. It does any additional
     * validation and parameter derivations needed. A tuple is returned. First value is
     * true if command line arguments were successfully processed and execution should
     * continue, or false if an error occurred or the user asked for help. If false, the
     * second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     *
     * Params:
     *   cmdArgs = The program's command line. Options are consumed by getopt. Once the
     *             help/version checks have passed, the array is truncated to length 1
     *             and the remaining entries are taken as the input file paths.
     *
     * Returns: tuple(true, 0) to continue; tuple(false, 0) after printing help or version
     *          text; tuple(false, 1) after an error, which is reported on stderr.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : canFind, maxElement;
        import std.array : split;
        import std.conv : to;
        import std.format : format;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes;
        import tsv_utils.common.fieldlist;
        import tsv_utils.common.utils : throwIfWindowsNewline;

        bool helpVerbose = false;           // --help-verbose
        bool helpFields = false;            // --help-fields
        bool versionWanted = false;         // --V|version
        string fieldsArg;                   // --f|fields
        string excludedFieldsArg;           // --e|exclude

        // Kept as variables because they are passed both to getopt and to the field
        // list parser.
        string fieldsOptionString = "f|fields";
        string excludedFieldsOptionString = "e|exclude";

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",
                "              Print more detailed help.",
                &helpVerbose,

                "help-fields",
                "              Print help on specifying fields.",
                &helpFields,

                std.getopt.config.caseSensitive,
                "H|header",
                "              Treat the first line of each file as a header.",
                &hasHeader,
                std.getopt.config.caseInsensitive,

                fieldsOptionString,
                "<field-list>  Fields to retain. Fields are output in the order listed.",
                &fieldsArg,

                excludedFieldsOptionString,
                "<field-list>  Fields to exclude.",
                &excludedFieldsArg,

                "r|rest",
                "first|last    Output location for fields not included in '--f|fields'.",
                &restArg,

                "d|delimiter",
                "CHR           Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.)",
                &delim,

                std.getopt.config.caseSensitive,
                "V|version",
                "              Print version information and exit.",
                &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (helpFields)
            {
                writeln(fieldListHelpText);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-select"));
                return tuple(false, 0);
            }

            /* Remaining command line args are files. Use standard input if files
             * were not provided. Truncate cmdArgs to consume the arguments.
             */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;

            /* Validation and derivations - Do as much validation prior to header line
             * processing as possible (avoids waiting on stdin).
             *
             * Note: fields and excludedFields depend on header line processing, but
             * fieldsArg and excludedFieldsArg can be used to detect whether the
             * command line argument was specified.
             */

            enforce(!fieldsArg.empty || !excludedFieldsArg.empty,
                    "One of '--f|fields' or '--e|exclude' is required.");

            string[] headerFields;

            /* fieldListArgProcessing encapsulates the field list processing. It is
             * called prior to reading the header line if headers are not being used,
             * and after if headers are being used.
             */
            void fieldListArgProcessing()
            {
                if (!fieldsArg.empty)
                {
                    fields = fieldsArg
                        .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(
                            hasHeader, headerFields, fieldsOptionString)
                        .array;
                }

                size_t[] excludedFields;

                if (!excludedFieldsArg.empty)
                {
                    excludedFields = excludedFieldsArg
                        .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(
                            hasHeader, headerFields, excludedFieldsOptionString)
                        .array;
                }

                if (excludedFields.length > 0)
                {
                    /* Make sure selected and excluded fields do not overlap. */
                    foreach (e; excludedFields)
                    {
                        enforce(!fields.canFind(e),
                                "'--f|fields' and '--e|exclude' have overlapping fields.");
                    }

                    /* '--exclude' changes '--rest' default to 'last'. */
                    if (restArg == RestOption.none) restArg = RestOption.last;

                    /* Build the excluded field lookup table.
                     *
                     * Note: Users won't have any reason to expect memory is allocated based
                     * on the max field number. However, users might pick arbitrarily large
                     * numbers when trimming fields. So, limit the max field number to something
                     * big but reasonable (more than 1 million). The limit can be raised if use
                     * cases arise.
                     *
                     * maxExcludedField is zero-based at this point, so the strict '<' test
                     * admits one-based field numbers up to and including the limit, which is
                     * what the error message reports.
                     */
                    enum size_t maxAllowedExcludedField = 1024 * 1024;
                    immutable size_t maxExcludedField = excludedFields.maxElement;

                    enforce(maxExcludedField < maxAllowedExcludedField,
                            format("Maximum allowed '--e|exclude' field number is %d.",
                                   maxAllowedExcludedField));

                    excludedFieldsTable.length = maxExcludedField + 1;          // Initialized to false
                    foreach (e; excludedFields) excludedFieldsTable[e] = true;
                }
            }

            if (!hasHeader) fieldListArgProcessing();

            /*
             * Create the byLineSourceRange and perform header line processing.
             */
            inputSources = byLineSourceRange(filepaths);

            if (hasHeader)
            {
                /* An empty first input leaves headerFields empty; the field list
                 * parser is still run so numeric field lists are processed.
                 */
                if (!inputSources.front.byLine.empty)
                {
                    throwIfWindowsNewline(inputSources.front.byLine.front, inputSources.front.name, 1);
                    headerFields = inputSources.front.byLine.front.split(delim).to!(string[]);
                }

                fieldListArgProcessing();
            }

        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
367

368
// Options embedded for the D runtime. Per the druntime GC configuration documentation,
// "gcopt=cleanup:none" tells the garbage collector not to run a cleanup pass over live
// objects at program termination. The declaration is only compiled for front-end
// versions 2.085 and later (presumably the first release recognizing 'cleanup' - confirm
// against the druntime changelog).
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
369

370
/** Main program.
 *
 * Processes the command line via TsvSelectOptions.processArgs, then runs the tsvSelect
 * instantiation matching the --rest option.
 *
 * Returns: The exit code from processArgs when it signals that execution should not
 *          continue (help, version, or argument error); 1 if tsvSelect throws an
 *          Exception (the message is written to stderr); 0 otherwise.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvSelectOptions cmdopt;
    const r = cmdopt.processArgs(cmdArgs);
    if (!r[0]) return r[1];

    /* LDC profile builds only. NOTE(review): presumably resets the collected profile
     * data so argument processing is not part of the profile - confirm against the
     * ldc.profile documentation.
     */
    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    try
    {
        /* Invoke the tsvSelect template matching the --rest option chosen. Command line
         * processing has already consumed the option args and opened the input sources;
         * everything tsvSelect needs is carried in cmdopt.
         */
        final switch (cmdopt.restArg)
        {
        case TsvSelectOptions.RestOption.none:
            tsvSelect!(RestLocation.none)(cmdopt);
            break;
        case TsvSelectOptions.RestOption.first:
            tsvSelect!(RestLocation.first)(cmdopt);
            break;
        case TsvSelectOptions.RestOption.last:
            tsvSelect!(RestLocation.last)(cmdopt);
            break;
        }
    }
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        return 1;
    }

    return 0;
}
416

417
// tsvSelect
418

419
/** Enumeration of the different specializations of the tsvSelect template.
 *
 * RestLocation is logically equivalent to the TsvSelectOptions.RestOption enum. It
 * is used by main to choose the appropriate tsvSelect template instantiation to call. It
 * is distinct from the TsvSelectOptions enum to separate it from the end-user UI. The
 * TsvSelectOptions version specifies the text of allowed values in command line arguments.
 *
 * Member order matches TsvSelectOptions.RestOption (none, first, last).
 */
enum RestLocation { none, first, last }
427

428
/** tsvSelect does the primary work of the tsv-select program.
 *
 * Input is read line by line, extracting the listed fields and writing them out in the order
 * specified. An exception is thrown on error.
 *
 * This function is templatized with instantiations for the different --rest options. This
 * avoids repeatedly running the same if-tests inside the inner loop. The main function
 * instantiates this function three times, once for each of the --rest options. It results
 * in a larger program, but is faster. Run-time improvements of 25% were measured compared
 * to the non-templatized version.
 *
 * Params:
 *   rest   = Compile-time selector for where fields not listed in --fields are written:
 *            not at all (none), before the listed fields (first), or after them (last).
 *            Must agree with cmdopt.restArg.
 *   cmdopt = Command line options as populated by TsvSelectOptions.processArgs. The
 *            input sources it holds are consumed.
 *
 * Throws: Exception when a line does not contain all the fields listed in --fields.
 *         throwIfWindowsNewline is also called on the first line of each input.
 */
void tsvSelect(RestLocation rest)(ref TsvSelectOptions cmdopt)
{
    import tsv_utils.common.utils: BufferedOutputRange, ByLineSourceRange,
        InputFieldReordering, throwIfWindowsNewline;
    import std.algorithm: splitter;
    import std.array : appender;
    import std.format: format;
    import std.range;

    // Ensure the correct template instantiation was called.
    static if (rest == RestLocation.none)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.none);
    else static if (rest == RestLocation.first)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.first);
    else static if (rest == RestLocation.last)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.last);
    else
        static assert(false, "rest template argument does not match cmdopt.restArg.");

    /* Check that the input files were setup as expected. Should at least have one
     * input, stdin if nothing else, and newlines removed from the byLine range.
     */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator)));

    /* The algorithm here assumes RestLocation.none is not used with --exclude. */
    assert(cmdopt.excludedFieldsTable.length == 0 || rest != RestLocation.none);

    /* InputFieldReordering copies select fields from an input line to a new buffer.
     * The buffer is reordered in the process.
     */
    auto fieldReordering = new InputFieldReordering!char(cmdopt.fields);

    /* Fields not on the --fields list are added to a separate buffer so they can be
     * output as a group (the --rest option). This is done using an 'Appender', which
     * is faster than the ~= operator. The Appender is passed a GC allocated buffer
     * that grows as needed and is reused for each line. Typically it'll grow only
     * on the first line.
     */
    static if (rest != RestLocation.none)
    {
        auto leftOverFieldsAppender = appender!(char[][]);
    }

    /* BufferedOutputRange (from common/utils.d) is a performance improvement over
     * writing directly to stdout.
     */
    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

    /* Read each input file (or stdin) and iterate over each line.
     */
    foreach (fileNum, inputStream; cmdopt.inputSources.enumerate)
    {
        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            if (lineNum == 1) throwIfWindowsNewline(line, inputStream.name, lineNum);

            if (lineNum == 1 && fileNum > 0 && cmdopt.hasHeader)
            {
                continue;   // Drop the header line from all but the first file.
            }

            static if (rest != RestLocation.none)
            {
                leftOverFieldsAppender.clear;

                /* Track the field location in the line. This enables bulk appending
                 * after the last specified field has been processed.
                 */
                size_t nextFieldStart = 0;
            }

            fieldReordering.initNewLine;

            foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
            {
                static if (rest == RestLocation.none)
                {
                    fieldReordering.processNextField(fieldIndex, fieldValue);
                    if (fieldReordering.allFieldsFilled) break;
                }
                else
                {
                    /* Processing with 'rest' fields. States:
                     *  - Excluded fields and specified fields remain
                     *  - Only specified fields remain
                     *  - Only excluded fields remain
                     */

                    // Offset just past this field and its one-byte delimiter.
                    nextFieldStart += fieldValue.length + 1;
                    bool excludedFieldsRemain = fieldIndex < cmdopt.excludedFieldsTable.length;
                    immutable isExcluded = excludedFieldsRemain && cmdopt.excludedFieldsTable[fieldIndex];

                    if (!isExcluded)
                    {
                        immutable numMatched = fieldReordering.processNextField(fieldIndex, fieldValue);

                        if (numMatched == 0) leftOverFieldsAppender.put(fieldValue);
                    }
                    else if (fieldIndex + 1 == cmdopt.excludedFieldsTable.length)
                    {
                        // This was the highest excluded field; none are left on the line.
                        excludedFieldsRemain = false;
                    }

                    if (fieldReordering.allFieldsFilled && !excludedFieldsRemain)
                    {
                        /* Processed all specified fields and no excluded fields remain.
                         * Bulk append whatever is left on the line as a single chunk
                         * (embedded delimiters are kept as-is). Cases:
                         * - Current field is the last field: nextFieldStart is
                         *   line.length + 1, so nothing is appended.
                         * - Line ends with a delimiter (empty final field):
                         *   nextFieldStart equals line.length and an empty slice is
                         *   appended, preserving the empty field.
                         * - More fields remain: the rest of the line is appended.
                         */
                        if (nextFieldStart <= line.length)
                        {
                            leftOverFieldsAppender.put(line[nextFieldStart .. $]);
                        }

                        break;
                    }
                }
            }

            // Finished with all fields in the line.
            enforce(fieldReordering.allFieldsFilled,
                    format("Not enough fields in line. File: %s,  Line: %s",
                           inputStream.name, lineNum));

            // Write the re-ordered line.

            static if (rest == RestLocation.first)
            {
                if (leftOverFieldsAppender.data.length > 0)
                {
                    bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim);
                    if (cmdopt.fields.length > 0) bufferedOutput.append(cmdopt.delim);
                }
            }

            bufferedOutput.joinAppend(fieldReordering.outputFields, cmdopt.delim);

            static if (rest == RestLocation.last)
            {
                if (leftOverFieldsAppender.data.length > 0)
                {
                    if (cmdopt.fields.length > 0) bufferedOutput.append(cmdopt.delim);
                    bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim);
                }
            }

            bufferedOutput.appendln;

            /* Send the first line of the first file immediately. This helps detect
             * errors quickly in multi-stage unix pipelines. Note that tsv-select may
             * have been sent one line from an upstream process, usually a header line.
             */
            if (lineNum == 1 && fileNum == 0) bufferedOutput.flush;
        }
    }
}

Read our documentation on viewing source code .

Loading