1
/**
2
Command line tool that identifies equivalent lines in an input stream. Equivalent
3
lines are identified using either the full line or a set of fields as the key. By
4
default, input is written to standard output, retaining only the first occurrence of
5
equivalent lines. There are also options for marking and numbering equivalent lines
6
rather, without filtering out duplicates.
7

8
This tool is similar in spirit to the Unix 'uniq' tool, with some key differences.
9
First, the key can be composed of individual fields, not just the full line. Second,
10
input does not need to be sorted. (Unix 'uniq' only detects equivalent lines when
11
they are adjacent, hence the usual need for sorting.)
12

13
There are a couple alternative to uniq'ing the input lines. One is to mark lines with
14
an equivalence ID, which is a one-upped counter. The other is to number lines, with
15
each unique key have its own set of numbers.
16

17
Copyright (c) 2015-2020, eBay Inc.
18
Initially written by Jon Degenhardt
19

20
License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
21
*/
22
module tsv_utils.tsv_uniq;
23

24
import std.exception : enforce;
25
import std.format : format;
26
import std.range;
27
import std.stdio;
28
import std.typecons : tuple;
29

30
auto helpText = q"EOS
31
Synopsis: tsv-uniq [options] [file...]
32

33
tsv-uniq filters out duplicate lines using fields as a key. Filtering is
34
based on the entire line when key fields are not provided. Options are
35
also available for assigning a unique id to each key and numbering the
36
occurrences of each key. Fields are specified using field number or field
37
name. Field names can be used when the input file has a header line.
38

39
Use '--help-verbose' for more details.
40
Options:
41
EOS";
42

43
auto helpTextVerbose = q"EOS
44
Synopsis: tsv-uniq [options] [file...]
45

46
tsv-uniq identifies equivalent lines in tab-separated value files. Input
47
is read line by line, recording a key for each line based on one or more
48
of the fields. Two lines are equivalent if they have the same key. The
49
first time a key is seen its line is written to standard output.
50
Subsequent lines containing the same key are discarded. This command
51
uniq's a file on fields 2 and 3:
52

53
   tsv-uniq -f 2,3 file.tsv
54

55
This is similar to the Unix 'uniq' program, but based on individual
56
fields and without requiring sorted data.
57

58
Field names can be used if the input file has a header line. This command
59
uniq's a file based on the 'time' and 'date' fields:
60

61
   tsv-uniq -H -f time,date file.tsv
62

63
Use '--help-fields' for details about field names.
64

65
tsv-uniq can be run without specifying a key field. In this case the
66
whole line is used as a key, same as the Unix 'uniq' program. This works
67
on any line-oriented text file, not just TSV files.
68

69
The above is the default behavior ('uniq' mode). The alternates to 'uniq'
70
mode are 'number' mode and 'equiv-class' mode. In 'equiv-class' mode, all
71
lines are written to standard output, but with a field appended marking
72
equivalent entries with an ID. The ID is a one-upped counter. Example:
73

74
   tsv-uniq --header -f 2,3 --equiv file.tsv
75

76
'Number' mode also writes all lines to standard output, but with a field
77
appended numbering the occurrence count for the line's key. The first line
78
with a specific key is assigned the number '1', the second with the key is
79
assigned number '2', etc. 'Number' and 'equiv-class' modes can be combined.
80

81
The '--r|repeated' option can be used to print only lines occurring more
82
than once. Specifically, the second occurrence of a key is printed. The
83
'--a|at-least N' option is similar, printing lines occurring at least N
84
times. (Like repeated, the Nth line with the key is printed.)
85

86
The '--m|max MAX' option changes the behavior to output the first MAX
87
lines for each key, rather than just the first line for each key.
88

89
If both '--a|at-least' and '--m|max' are specified, the occurrences
90
starting with 'at-least' and ending with 'max' are output.
91

92
Options:
93
EOS";
94

95
/** Container for command line options.
96
 */
97
struct TsvUniqOptions
98
{
99
    import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader;
100

101
    enum defaultEquivHeader = "equiv_id";
102
    enum defaultEquivStartID = 1;
103
    enum defaultNumberHeader = "equiv_line";
104

105
    string programName;
106
    InputSourceRange inputSources;            /// Input files
107
    size_t[] fields;                          /// Derived: --f|fields
108
    bool hasHeader = false;                   /// --H|header
109
    bool onlyRepeated = false;                /// --r|repeated. Shorthand for '--atleast 2'
110
    size_t atLeast = 0;                       /// --a|at-least. Zero implies default behavior.
111
    size_t max = 0;                           /// --m|max. Zero implies default behavior.
112
    bool numberMode = false;                  /// --z|number
113
    string numberHeader = defaultNumberHeader;  /// --number-header
114
    bool equivMode = false;                   /// --e|equiv
115
    string equivHeader = defaultEquivHeader;  /// --equiv-header
116
    long equivStartID = defaultEquivStartID;  /// --equiv-start
117
    bool ignoreCase = false;                  /// --i|ignore-case
118
    char delim = '\t';                        /// --d|delimiter
119
    bool keyIsFullLine = false;               /// Derived. True if no fields specified or '--f|fields 0'
120

121
    /* Returns a tuple. First value is true if command line arguments were successfully
122
     * processed and execution should continue, or false if an error occurred or the user
123
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
124
     *
125
     * Returning true (execution continues) means args have been validated and derived
126
     * values calculated. In addition, field indices have been converted to zero-based.
127
     * If the whole line is the key, the individual fields list will be cleared.
128
     *
129
     * Repeat count control variables 'atLeast' and max' - These values are left at zero
130
     * if no repeat count options are specified. They are set if repeat count options
131
     * are specified, as follows:
132
     *   * atLeast - Will be zero unless --r|repeated or --a|at-least is specified.
133
     *     --r|repeated option sets it 2, --a|at-least sets it to the specified value.
134
     *   * max - Default to zero. Is set to the --m|max value if provided. Is set to
135
     *    'atLeast' if --r|repeated or --a|at-least is provided.
136
     *
137
     * An exception to the above: If --e|equiv-mode is specified, then (max == 0)
138
     * represents the default "output all values" case. In this case max may be less
139
     * than the at-least value.
140
     */
141
    auto processArgs (ref string[] cmdArgs)
142
    {
143
        import std.algorithm : all, each;
144
        import std.conv : to;
145
        import std.getopt;
146
        import std.path : baseName, stripExtension;
147
        import std.typecons : Yes, No;
148
        import tsv_utils.common.fieldlist;
149
        import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;
150

151 1
        bool helpVerbose = false;         // --h|help-verbose
152 1
        bool helpFields = false;          // --help-fields
153 1
        bool versionWanted = false;       // --V|version
154 1
        string fieldsArg;                 // --f|fields
155

156 1
        string fieldsOptionString = "f|fields";
157

158 1
        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
159

160
        try
161
        {
162 1
            arraySep = ",";    // Use comma to separate values in command line options
163 1
            auto r = getopt(
164
                cmdArgs,
165
                "help-verbose",  "              Print full help.", &helpVerbose,
166
                "help-fields",   "              Print help on specifying fields.", &helpFields,
167
                std.getopt.config.caseSensitive,
168
                "V|version",     "              Print version information and exit.", &versionWanted,
169
                "H|header",      "              Treat the first line of each file as a header.", &hasHeader,
170
                std.getopt.config.caseInsensitive,
171

172
                fieldsOptionString,      "<field-list>  Fields to use as the key. Default: 0 (entire line).", &fieldsArg,
173

174
                "i|ignore-case", "              Ignore case when comparing keys.", &ignoreCase,
175
                "r|repeated",    "              Output only lines that are repeated (based on the key).", &onlyRepeated,
176
                "a|at-least",    "INT           Output only lines that are repeated INT times (based on the key). Zero and one are ignored.", &atLeast,
177
                "m|max",         "INT           Max number of each unique key to output (zero is ignored).", &max,
178
                "e|equiv",       "              Output equivalence class IDs rather than uniq'ing entries.", &equivMode,
179
                "equiv-header",  "STR           Use STR as the equiv-id field header (when using '-H --equiv'). Default: 'equiv_id'.", &equivHeader,
180
                "equiv-start",   "INT           Use INT as the first equiv-id. Default: 1.", &equivStartID,
181
                "z|number",      "              Output equivalence class occurrence counts rather than uniq'ing entries.", &numberMode,
182
                "number-header", "STR           Use STR as the '--number' field header (when using '-H --number)'. Default: 'equiv_line'.", &numberHeader,
183
                "d|delimiter",   "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
184
            );
185

186 1
            if (r.helpWanted)
187
            {
188 1
                defaultGetoptPrinter(helpText, r.options);
189 1
                return tuple(false, 0);
190
            }
191 1
            else if (helpVerbose)
192
            {
193 1
                defaultGetoptPrinter(helpTextVerbose, r.options);
194 1
                return tuple(false, 0);
195
            }
196 1
            else if (helpFields)
197
            {
198 1
                writeln(fieldListHelpText);
199 1
                return tuple(false, 0);
200
            }
201 1
            else if (versionWanted)
202
            {
203
                import tsv_utils.common.tsvutils_version;
204 1
                writeln(tsvutilsVersionNotice("tsv-uniq"));
205 1
                return tuple(false, 0);
206
            }
207

208
            /* Input files. Remaining command line args are files. */
209 1
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
210 1
            cmdArgs.length = 1;
211

212
            /* Validation - Do as much validation prior to header line processing as
213
             * possible (avoids waiting on stdin).
214
             */
215 1
            if (!equivMode)
216
            {
217 1
                enforce(equivHeader == defaultEquivHeader, "--equiv-header requires --e|equiv");
218 1
                enforce(equivStartID == defaultEquivStartID, "--equiv-start requires --e|equiv");
219
            }
220

221 1
            enforce(numberMode || numberHeader == defaultNumberHeader,
222 1
                    "--number-header requires --z|number");
223

224 1
            string[] headerFields;
225

226
            /* fieldListArgProcessing encapsulates the field list processing. It is
227
             * called prior to reading the header line if headers are not being used,
228
             * and after if headers are being used.
229
             */
230
            void fieldListArgProcessing()
231
            {
232 1
                if (!fieldsArg.empty)
233
                {
234 1
                    fields =
235
                        fieldsArg
236
                        .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
237
                        (hasHeader, headerFields, fieldsOptionString)
238
                        .array;
239
                }
240

241 1
                enforce(fields.length <= 1 || fields.all!(x => x != 0),
242 1
                        "Whole line as key (--f|field 0) cannot be combined with multiple fields.");
243

244 1
                if (fields.length == 0)
245
                {
246 1
                    keyIsFullLine = true;
247
                }
248 1
                else if (fields.length == 1 && fields[0] == 0)
249
                {
250 1
                    keyIsFullLine = true;
251 1
                    fields.length = 0;
252
                }
253

254 1
                if (onlyRepeated && atLeast <= 1) atLeast = 2;
255 1
                if (atLeast >= 2 && max < atLeast)
256
                {
257
                    // Don't modify max if it is zero and equivMode or numberMode is in effect.
258 1
                    if (max != 0 || (!equivMode && !numberMode)) max = atLeast;
259
                }
260

261 1
                if (!keyIsFullLine) fields.each!((ref x) => --x);  // Convert to 0-based indexing.
262
            }
263

264 1
            if (!hasHeader) fieldListArgProcessing();
265

266
            /*
267
             * Create the inputSourceRange and perform header line processing.
268
             */
269 1
            ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
270 1
            inputSources = inputSourceRange(filepaths, readHeader);
271

272 1
            if (hasHeader)
273
            {
274 1
                throwIfWindowsNewlineOnUnix(inputSources.front.header, inputSources.front.name, 1);
275 1
                headerFields = inputSources.front.header.split(delim).to!(string[]);
276 1
                fieldListArgProcessing();
277
            }
278
        }
279
        catch (Exception exc)
280
        {
281 1
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
282 1
            return tuple(false, 1);
283
        }
284 1
        return tuple(true, 0);
285
    }
286
}
287

288
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
289

290
/** Main program. Processes command line arguments and calls tsvUniq which implements
291
 * the main processing logic.
292
 */
293
int main(string[] cmdArgs)
294
{
295
    /* When running in DMD code coverage mode, turn on report merging. */
296
    version(D_Coverage) version(DigitalMars)
297
    {
298
        import core.runtime : dmd_coverSetMerge;
299 1
        dmd_coverSetMerge(true);
300
    }
301

302 1
    TsvUniqOptions cmdopt;
303 1
    auto r = cmdopt.processArgs(cmdArgs);
304 1
    if (!r[0]) return r[1];
305

306
    version(LDC_Profile)
307
    {
308
        import ldc.profile : resetAll;
309
        resetAll();
310
    }
311

312 1
    try tsvUniq(cmdopt);
313
    catch (Exception exc)
314
    {
315 1
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
316 1
        return 1;
317
    }
318 1
    return 0;
319
}
320

321
/** Outputs the unique lines from all the input files.
322
 *
323
 * Processes the lines in each input file. All lines are added to an associated array.
324
 * The first time a line is seen it is output. If key fields are being used these are
325
 * used as the basis for the associative array entries rather than the full line.
326
 */
327
void tsvUniq(ref TsvUniqOptions cmdopt)
328
{
329
    import tsv_utils.common.utils : bufferedByLine, BufferedOutputRange,
330
        InputFieldReordering, InputSourceRange, joinAppend, throwIfWindowsNewlineOnUnix;
331
    import std.algorithm : splitter;
332
    import std.array : appender;
333
    import std.conv : to;
334
    import std.uni : asLowerCase;
335
    import std.utf : byChar;
336

337
    /* inputSources must be an InputSourceRange and include at least stdin. */
338 1
    assert(!cmdopt.inputSources.empty);
339
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
340

341
    /* InputFieldReordering maps the key fields from an input line to a separate buffer. */
342 1
    auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.fields);
343

344
    /* BufferedOutputRange is a performance enhancement for writing to stdout. */
345 1
    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
346

347
    /* The master hash. The key is the specified fields concatenated together (including
348
     * separators). The value is a struct with the equiv-id and occurrence count.
349
     */
350
    static struct EquivEntry { size_t equivID; size_t count; }
351 1
    EquivEntry[string] equivHash;
352

353
    /* Reusable buffers for multi-field keys and case-insensitive keys. */
354 1
    auto multiFieldKeyBuffer = appender!(char[]);
355 1
    auto lowerKeyBuffer = appender!(char[]);
356

357 1
    const size_t numKeyFields = cmdopt.fields.length;
358 1
    long nextEquivID = cmdopt.equivStartID;
359

360
    /* First header is read during command line arg processing. Flush it immediately
361
     * so subsequent processes in a unix command pipeline see it early. This helps
362
     * provide timely error messages.
363
     */
364 1
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
365
    {
366 1
        auto inputStream = cmdopt.inputSources.front;
367

368 1
        bufferedOutput.append(inputStream.header);
369

370 1
        if (cmdopt.equivMode)
371
        {
372 1
            bufferedOutput.append(cmdopt.delim);
373 1
            bufferedOutput.append(cmdopt.equivHeader);
374
        }
375

376 1
        if (cmdopt.numberMode)
377
        {
378 1
            bufferedOutput.append(cmdopt.delim);
379 1
            bufferedOutput.append(cmdopt.numberHeader);
380
        }
381

382 1
        bufferedOutput.appendln();
383 1
        bufferedOutput.flush();
384
    }
385

386 1
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
387

388 1
    foreach (inputStream; cmdopt.inputSources)
389
    {
390 1
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);
391

392 1
        foreach (lineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine))
393
        {
394 1
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum);
395

396
            /* Start by finding the key. */
397 1
            typeof(line) key;
398 1
            if (cmdopt.keyIsFullLine)
399
            {
400 1
                key = line;
401
            }
402
            else
403
            {
404 1
                assert(keyFieldsReordering !is null);
405

406
                /* Copy the key fields to a new buffer. */
407 1
                keyFieldsReordering.initNewLine;
408 1
                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
409
                {
410 1
                    keyFieldsReordering.processNextField(fieldIndex, fieldValue);
411 1
                    if (keyFieldsReordering.allFieldsFilled) break;
412
                }
413

414 1
                enforce(keyFieldsReordering.allFieldsFilled,
415 1
                        format("Not enough fields in line. File: %s, Line: %s",
416
                               inputStream.name, lineNum));
417

418 1
                if (numKeyFields == 1)
419
                {
420 1
                    key = keyFieldsReordering.outputFields[0];
421
                }
422
                else
423
                {
424 1
                    multiFieldKeyBuffer.clear();
425 1
                    keyFieldsReordering.outputFields.joinAppend(multiFieldKeyBuffer, cmdopt.delim);
426 1
                    key = multiFieldKeyBuffer.data;
427
                }
428
            }
429

430 1
            if (cmdopt.ignoreCase)
431
            {
432
                /* Equivalent to key = key.toLower, but without memory allocation. */
433 1
                lowerKeyBuffer.clear();
434 1
                lowerKeyBuffer.put(key.asLowerCase.byChar);
435 1
                key = lowerKeyBuffer.data;
436
            }
437

438 1
            bool isOutput = false;
439 1
            EquivEntry currEntry;
440 1
            EquivEntry* priorEntry = (key in equivHash);
441 1
            if (priorEntry is null)
442
            {
443 1
                isOutput = (cmdopt.atLeast <= 1);
444 1
                currEntry.equivID = nextEquivID;
445 1
                currEntry.count = 1;
446 1
                equivHash[key.to!string] = currEntry;
447 1
                nextEquivID++;
448
            }
449
            else
450
            {
451 1
                (*priorEntry).count++;
452 1
                currEntry = *priorEntry;
453

454 1
                if ((currEntry.count <= cmdopt.max && currEntry.count >= cmdopt.atLeast) ||
455 1
                    (cmdopt.equivMode && cmdopt.max == 0) ||
456 1
                    (cmdopt.numberMode && cmdopt.max == 0))
457
                {
458 1
                    isOutput = true;
459
                }
460
            }
461

462 1
            if (isOutput)
463
            {
464 1
                bufferedOutput.append(line);
465

466 1
                if (cmdopt.equivMode)
467
                {
468 1
                    bufferedOutput.append(cmdopt.delim);
469 1
                    bufferedOutput.append(currEntry.equivID.to!string);
470
                }
471

472 1
                if (cmdopt.numberMode)
473
                {
474 1
                    bufferedOutput.append(cmdopt.delim);
475 1
                    bufferedOutput.append(currEntry.count.to!string);
476
                }
477

478 1
                bufferedOutput.appendln();
479
            }
480
        }
481
    }
482
}

Read our documentation on viewing source code .

Loading