1
/**
2
Command line tool that filters TSV files.
3

4
This tool filters tab-delimited files based on numeric or string comparisons
5
against specific fields. See the helpText string for details.
6

7
Copyright (c) 2015-2020, eBay Inc.
8
Initially written by Jon Degenhardt
9

10
License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
11
*/
12
module tsv_utils.tsv_filter;
13

14
import std.algorithm : canFind, equal, findSplit, max, min;
15
import std.conv : to;
16
import std.exception : enforce;
17
import std.format : format;
18
import std.math : abs, isFinite, isInfinity, isNaN;
19
import std.range;
20
import std.regex;
21
import std.stdio;
22
import std.string : isNumeric;
23
import std.typecons;
24
import std.uni: asLowerCase, toLower, byGrapheme;
25

26
/* The program has two main parts, command line arg processing and processing the input
27
 * files. Much of the work is in command line arg processing. This sets up the tests run
28
 * against each input line. The tests are an array of delegates (closures) run against the
29
 * fields in the line. The tests are based on command line arguments, of which there is
30
 * a lengthy set, one for each test.
31
 */
32

33
/* Embeds a D runtime option string in the executable: "gcopt=cleanup:none" is a garbage
 * collector configuration setting. The declaration is only compiled for compiler
 * front-end versions 2.085 and later.
 * NOTE(review): presumably this makes the runtime skip the GC collection/finalization
 * pass at program exit - confirm against the druntime GC configuration documentation.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
34

35
/** Main program. Invokes command line arg processing and tsv-filter to perform
36
 * the real work. Any errors are caught and reported.
37
 */
38
int main(string[] cmdArgs)
{
    /* Under DMD code coverage builds, merge coverage reports across runs. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvFilterOptions cmdopt;
    const argsResult = cmdopt.processArgs(cmdArgs);
    if (!argsResult[0])
    {
        /* Arg processing says not to continue; its second element is the exit status. */
        return argsResult[1];
    }

    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    /* Run the filter. Any exception is reported on stderr and mapped to exit status 1. */
    int exitStatus = 0;
    try
    {
        tsvFilter(cmdopt);
    }
    catch (Exception e)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, e.msg);
        exitStatus = 1;
    }
    return exitStatus;
}
63

64
/* Concise help text: synopsis, global options and a summary of the test operators.
 * The field-to-field difference example uses '--ff-absdiff-le', one of the operators
 * listed on its syntax line.
 */
immutable helpText = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter tab-delimited files for matching lines via comparison tests against
individual fields. Use '--help-verbose' for a more detailed description.

Fields are specified using field number or field name. Field names require
that the input file has a header line. Use '--help-fields' for details.

Global options:
  --help-verbose      Print full help.
  --help-options      Print the options list by itself.
  --help-fields       Print help on specifying fields.
  --V|version         Print version information and exit.
  --H|header          Treat the first line of each file as a header.
  --or                Evaluate tests as an OR rather than an AND clause.
  --v|invert          Invert the filter, printing lines that do not match.
  --d|delimiter CHR   Field delimiter. Default: TAB.

Operators:
* Test if a field is empty (no characters) or blank (empty or whitespace only).
  Syntax:  --empty|not-empty|blank|not-blank  FIELD
  Example: --empty name               # True if the 'name' field is empty

* Test if a field is numeric, finite, NaN, or infinity
  Syntax:  --is-numeric|is-finite|is-nan|is-infinity FIELD
  Example: --is-numeric 5 --gt 5:100  # Ensure field 5 is numeric before --gt test.

* Compare a field to a number (integer or float)
  Syntax:  --eq|ne|lt|le|gt|ge  FIELD:NUM
  Example: --lt size:1000 --gt weight:0.5  # ('size' < 1000) and ('weight' > 0.5)

* Compare a field to a string
  Syntax:  --str-eq|str-ne|istr-eq|istr-ne  FIELD:STR
  Example: --str-eq color:red         # True if 'color' field is "red"

* Test if a field contains a string (substring search)
  Syntax:  --str-in-fld|str-not-in-fld|istr-in-fld|istr-not-in-fld  FIELD:STR
  Example: --str-in-fld color:dark    # True if 'color' field contains "dark"

* Test if a field matches a regular expression.
  Syntax:  --regex|iregex|not-regex|not-iregex  FIELD:REGEX
  Example: --regex '3:ab*c'     # True if field 3 contains "ac", "abc", "abbc", etc.

* Test a field's character or byte length
  Syntax:  --char-len-[le|lt|ge|gt|eq|ne] FIELD:NUM
           --byte-len-[le|lt|ge|gt|eq|ne] FIELD:NUM
  Example: --char-len-lt 2:10   # True if field 2 is less than 10 characters long.
           --byte-len-gt 2:10   # True if field 2 is greater than 10 bytes long.

* Field to field comparisons - Similar to field vs literal comparisons, but field vs field.
  Syntax:  --ff-eq|ff-ne|ff-lt|ff-le|ff-gt|ff-ge  FIELD1:FIELD2
           --ff-str-eq|ff-str-ne|ff-istr-eq|ff-istr-ne  FIELD1:FIELD2
  Example: --ff-eq 2:4          # True if fields 2 and 4 are numerically equivalent
           --ff-str-eq 2:4      # True if fields 2 and 4 are the same strings

* Field to field difference comparisons - Absolute and relative difference
  Syntax:  --ff-absdiff-le|ff-absdiff-gt FIELD1:FIELD2:NUM
           --ff-reldiff-le|ff-reldiff-gt FIELD1:FIELD2:NUM
  Example: --ff-absdiff-le 1:3:0.25   # True if abs(field1 - field3) <= 0.25

EOS";
126

127
/* Detailed help text. It ends with an "Options:" heading, so the option list is
 * expected to be printed after it by the caller.
 */
immutable helpTextVerbose = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter lines of tab-delimited files via comparison tests against fields.
Multiple tests can be specified, by default they are evaluated as an AND
clause. Lines satisfying the tests are written to standard output.

Typical test syntax is '--op field:value', where 'op' is an operator,
'field' is either a field name or a field number, and 'value' is the
comparison basis. For example, '--lt length:500' tests if the 'length'
field is less than 500. A more complete example:

  tsv-filter --header --gt length:50 --lt length:100 --le width:200 data.tsv

This outputs all lines from file data.tsv where the 'length' field is
greater than 50 and less than 100, and the 'width' field is less than or
equal to 200. The header line is also output.

Field numbers can also be used to identify fields, and must be used when
the input file doesn't have a header line. For example:

  tsv-filter --gt 1:50 --lt 1:100 --le 2:200 data.tsv

Field lists can be used to specify multiple fields at once. For example:

  tsv-filter --not-blank 1-10 --str-ne 1,2,5:'--' data.tsv

tests that fields 1-10 are not blank and fields 1,2,5 are not "--".

Wildcarded field names can also be used to specify multiple fields. The
following finds lines where any field name ending in '_id' is empty:

  tsv-filter -H --or --empty '*_id'

Use '--help-fields' for details on using field names.

Tests available include:
  * Test if a field is empty (no characters) or blank (empty or whitespace only).
  * Test if a field is interpretable as a number, a finite number, NaN, or Infinity.
  * Compare a field to a number - Numeric equality and relational tests.
  * Compare a field to a string - String equality and relational tests.
  * Test if a field matches a regular expression. Case sensitive or insensitive.
  * Test if a field contains a string. Sub-string search, case sensitive or insensitive.
  * Test a field's character or byte length.
  * Field to field comparisons - Similar to the other tests, except comparing
    one field to another in the same line.

Details:
  * The run is aborted if there are not enough fields in an input line.
  * Numeric tests will fail and abort the run if a field cannot be interpreted as a
    number. This includes fields with no text. To avoid this use '--is-numeric' or
    '--is-finite' prior to the numeric test. For example, '--is-numeric 5 --gt 5:100'
    ensures field 5 is numeric before running the --gt test.
  * Regular expression syntax is defined by the D programming language. They follow
    common conventions (perl, python, etc.). Most common forms work as expected.

Options:
EOS";
185

186
/* Lead-in text for an options-only help listing: just the synopsis and an "Options:"
 * heading. The option list itself is not part of this string.
 * NOTE(review): presumably printed for '--help-options' - confirm in processArgs.
 */
immutable string helpTextOptions = q"EOS
Synopsis: tsv-filter [options] [file...]

Options:
EOS";
191

192
/* The next blocks of code define the structure of the boolean tests run against input lines.
193
 * This includes function and delegate (closure) signatures, creation mechanisms, option
194
 * handlers, etc. Command line arg processing to build the test structure.
195
*/
196

197
/* FieldsPredicate delegate signature - Each input line is run against a set of boolean
198
 * tests. Each test is a 'FieldsPredicate'. A FieldsPredicate is a delegate (closure)
199
 * containing all info about the test except the field values of the line being tested.
200
 * These delegates are created as part of command line arg processing. The wrapped data
201
 * includes operation, field indexes, literal values, etc. At run-time the delegate is
202
 * passed one argument, the split input line.
203
 */
204
/* A FieldsPredicate receives the split fields of one input line and returns the
 * boolean result of its test.
 */
alias FieldsPredicate = bool delegate(const char[][] fields);
205

206
/* FieldsPredicate function signatures - These aliases represent the different function
207
 * signatures used in FieldsPredicate delegates. Each alias has a corresponding 'make'
208
 * function. The 'make' function takes a real predicate function and closure args and
209
 * returns a FieldsPredicate delegate. Predicates types are:
210
 *
211
 * - FieldUnaryPredicate - Test based on a single field. (e.g. --empty 4)
212
 * - FieldVsNumberPredicate - Test based on a field index (used to get the field value)
213
 *   and a fixed numeric value. For example, field 2 less than 100 (--lt 2:100).
214
 * - FieldVsStringPredicate - Test based on a field and a string. (e.g. --str-eq 2:abc)
215
 * - FieldVsIStringPredicate - Case-insensitive test based on a field and a string.
216
 *   (e.g. --istr-eq 2:abc)
217
 * - FieldVsRegexPredicate - Test based on a field and a regex. (e.g. --regex '2:ab*c')
218
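 * - FieldVsFieldPredicate - Test based on two fields. (e.g. --ff-le 2:4).
 * - FieldFieldNumPredicate - Test based on two fields and a fixed numeric value.
 *   (e.g. --ff-absdiff-le 2:4:0.25)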
219
 *
220
 * An actual FieldsPredicate takes the fields from the line and the closure args and
221
 * runs the test. For example, a function testing if a field is less than a specific
222
 * value would pull the specified field from the fields array, convert the string to
223
 * a number, then run the less-than test.
224
 */
225
/// One field, no comparison value.
alias FieldUnaryPredicate     = bool function(const char[][] fields, size_t index);

/// One field compared against a number.
alias FieldVsNumberPredicate  = bool function(const char[][] fields, size_t index, double value);

/// One field compared against a string.
alias FieldVsStringPredicate  = bool function(const char[][] fields, size_t index, string value);

/// One field compared against a dstring (used by the case-insensitive string tests).
alias FieldVsIStringPredicate = bool function(const char[][] fields, size_t index, dstring value);

/// One field matched against a regular expression.
alias FieldVsRegexPredicate   = bool function(const char[][] fields, size_t index, Regex!char value);

/// Two fields compared against each other.
alias FieldVsFieldPredicate   = bool function(const char[][] fields, size_t index1, size_t index2);

/// Two fields plus a number.
alias FieldFieldNumPredicate  = bool function(const char[][] fields, size_t index1, size_t index2, double value);
232

233
/* Each 'make' function below captures a predicate function together with its fixed
 * arguments (field indexes, comparison value) and returns a FieldsPredicate closure
 * that needs only the fields of the line being tested.
 */

/// Closure over a single-field predicate.
FieldsPredicate makeFieldUnaryDelegate(FieldUnaryPredicate fn, size_t index)
{
    return delegate bool(const char[][] fields) { return fn(fields, index); };
}

/// Closure over a field-vs-number predicate.
FieldsPredicate makeFieldVsNumberDelegate(FieldVsNumberPredicate fn, size_t index, double value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

/// Closure over a field-vs-string predicate.
FieldsPredicate makeFieldVsStringDelegate(FieldVsStringPredicate fn, size_t index, string value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

/// Closure over a field-vs-dstring predicate.
FieldsPredicate makeFieldVsIStringDelegate(FieldVsIStringPredicate fn, size_t index, dstring value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

/// Closure over a field-vs-regex predicate.
FieldsPredicate makeFieldVsRegexDelegate(FieldVsRegexPredicate fn, size_t index, Regex!char value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

/// Closure over a field-vs-field predicate.
FieldsPredicate makeFieldVsFieldDelegate(FieldVsFieldPredicate fn, size_t index1, size_t index2)
{
    return delegate bool(const char[][] fields) { return fn(fields, index1, index2); };
}

/// Closure over a two-fields-plus-number predicate.
FieldsPredicate makeFieldFieldNumDelegate(FieldFieldNumPredicate fn, size_t index1, size_t index2, double value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index1, index2, value); };
}
267

268
/* Predicate functions - These are the actual functions used in a FieldsPredicate. They
269
 * are a direct reflection of the operators available via command line args. Each matches
270
 * one of the FieldsPredicate function aliases defined above.
271
 */
272 1
/// True if the field has no characters.
bool fldEmpty(const char[][] fields, size_t index)
{
    return fields[index].empty;
}

/// True if the field has at least one character.
bool fldNotEmpty(const char[][] fields, size_t index)
{
    return !fields[index].empty;
}

/// True if the field is empty or consists only of whitespace.
bool fldBlank(const char[][] fields, size_t index)
{
    auto blankMatch = matchFirst(fields[index], ctRegex!`^\s*$`);
    return !blankMatch.empty;
}

/// True if the field contains at least one non-whitespace character.
bool fldNotBlank(const char[][] fields, size_t index)
{
    auto blankMatch = matchFirst(fields[index], ctRegex!`^\s*$`);
    return blankMatch.empty;
}
276

277 1
/// True if the field text passes std.string.isNumeric.
bool fldIsNumeric(const char[][] fields, size_t index)
{
    return isNumeric(fields[index]);
}

/// True if the field is numeric and converts to a finite double.
bool fldIsFinite(const char[][] fields, size_t index)
{
    const field = fields[index];
    return isNumeric(field) && isFinite(to!double(field));
}

/// True if the field is numeric and converts to NaN.
bool fldIsNaN(const char[][] fields, size_t index)
{
    const field = fields[index];
    return isNumeric(field) && isNaN(to!double(field));
}

/// True if the field is numeric and converts to positive or negative infinity.
bool fldIsInfinity(const char[][] fields, size_t index)
{
    const field = fields[index];
    return isNumeric(field) && isInfinity(to!double(field));
}
281

282 1
/* Numeric comparisons. The field is converted to a double on every call; the conversion
 * throws if the field text is not a valid number.
 */
bool numLE(const char[][] fields, size_t index, double val)
{
    const fieldValue = to!double(fields[index]);
    return fieldValue <= val;
}

bool numLT(const char[][] fields, size_t index, double val)
{
    const fieldValue = to!double(fields[index]);
    return fieldValue < val;
}

bool numGE(const char[][] fields, size_t index, double val)
{
    const fieldValue = to!double(fields[index]);
    return fieldValue >= val;
}

bool numGT(const char[][] fields, size_t index, double val)
{
    const fieldValue = to!double(fields[index]);
    return fieldValue > val;
}

bool numEQ(const char[][] fields, size_t index, double val)
{
    const fieldValue = to!double(fields[index]);
    return fieldValue == val;
}

bool numNE(const char[][] fields, size_t index, double val)
{
    const fieldValue = to!double(fields[index]);
    return fieldValue != val;
}
288

289 1
/* String comparisons: the relational tests use D's built-in array comparison
 * operators, the containment tests use canFind (sub-string search).
 */
bool strLE(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field <= val;
}

bool strLT(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field < val;
}

bool strGE(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field >= val;
}

bool strGT(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field > val;
}

bool strEQ(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field == val;
}

bool strNE(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field != val;
}

bool strInFld(const char[][] fields, size_t index, string val)
{
    return canFind(fields[index], val);
}

bool strNotInFld(const char[][] fields, size_t index, string val)
{
    return !canFind(fields[index], val);
}
297

298
/* Note: For istr predicates, the command line value has been lower-cased by fieldVsIStringOptionHandler.
299
 */
300 1
/* Case-insensitive string tests. Only the field is lower-cased here (lazily, via
 * asLowerCase); 'val' is compared exactly as passed in.
 */
bool istrEQ(const char[][] fields, size_t index, dstring val)
{
    return equal(asLowerCase(fields[index]), val);
}

bool istrNE(const char[][] fields, size_t index, dstring val)
{
    return !equal(asLowerCase(fields[index]), val);
}

bool istrInFld(const char[][] fields, size_t index, dstring val)
{
    return canFind(asLowerCase(fields[index]), val);
}

bool istrNotInFld(const char[][] fields, size_t index, dstring val)
{
    return !canFind(asLowerCase(fields[index]), val);
}
304

305
/* Note: Case-sensitivity is built into the regex value, so these regex predicates are
306
 * used for both case-sensitive and case-insensitive regex operators.
307
 */
308 1
/// True if the regex matches somewhere in the field.
bool regexMatch(const char[][] fields, size_t index, Regex!char val)
{
    auto firstMatch = matchFirst(fields[index], val);
    return !firstMatch.empty;
}

/// True if the regex matches nowhere in the field.
bool regexNotMatch(const char[][] fields, size_t index, Regex!char val)
{
    auto firstMatch = matchFirst(fields[index], val);
    return firstMatch.empty;
}
310

311 1
/* Character length tests. Length is the number of graphemes in the field, obtained by
 * walking the field with byGrapheme.
 */
bool charLenLE(const char[][] fields, size_t index, double val)
{
    const charCount = walkLength(byGrapheme(fields[index]));
    return charCount <= val;
}

bool charLenLT(const char[][] fields, size_t index, double val)
{
    const charCount = walkLength(byGrapheme(fields[index]));
    return charCount < val;
}

bool charLenGE(const char[][] fields, size_t index, double val)
{
    const charCount = walkLength(byGrapheme(fields[index]));
    return charCount >= val;
}

bool charLenGT(const char[][] fields, size_t index, double val)
{
    const charCount = walkLength(byGrapheme(fields[index]));
    return charCount > val;
}

bool charLenEQ(const char[][] fields, size_t index, double val)
{
    const charCount = walkLength(byGrapheme(fields[index]));
    return charCount == val;
}

bool charLenNE(const char[][] fields, size_t index, double val)
{
    const charCount = walkLength(byGrapheme(fields[index]));
    return charCount != val;
}
317

318 1
/* Byte length tests. Length is the field's array length, i.e. its number of char
 * code units.
 */
bool byteLenLE(const char[][] fields, size_t index, double val)
{
    const byteCount = fields[index].length;
    return byteCount <= val;
}

bool byteLenLT(const char[][] fields, size_t index, double val)
{
    const byteCount = fields[index].length;
    return byteCount < val;
}

bool byteLenGE(const char[][] fields, size_t index, double val)
{
    const byteCount = fields[index].length;
    return byteCount >= val;
}

bool byteLenGT(const char[][] fields, size_t index, double val)
{
    const byteCount = fields[index].length;
    return byteCount > val;
}

bool byteLenEQ(const char[][] fields, size_t index, double val)
{
    const byteCount = fields[index].length;
    return byteCount == val;
}

bool byteLenNE(const char[][] fields, size_t index, double val)
{
    const byteCount = fields[index].length;
    return byteCount != val;
}
324

325 1
/* Field vs field numeric comparisons. Both fields are converted to double on each call;
 * the conversion throws if either field is not a valid number.
 */
bool ffLE(const char[][] fields, size_t index1, size_t index2)
{
    return to!double(fields[index1]) <= to!double(fields[index2]);
}

bool ffLT(const char[][] fields, size_t index1, size_t index2)
{
    return to!double(fields[index1]) < to!double(fields[index2]);
}

bool ffGE(const char[][] fields, size_t index1, size_t index2)
{
    return to!double(fields[index1]) >= to!double(fields[index2]);
}

bool ffGT(const char[][] fields, size_t index1, size_t index2)
{
    return to!double(fields[index1]) > to!double(fields[index2]);
}

bool ffEQ(const char[][] fields, size_t index1, size_t index2)
{
    return to!double(fields[index1]) == to!double(fields[index2]);
}

bool ffNE(const char[][] fields, size_t index1, size_t index2)
{
    return to!double(fields[index1]) != to!double(fields[index2]);
}

/* Field vs field string comparisons, exact and case-insensitive. */
bool ffStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    return fields[index1] == fields[index2];
}

bool ffStrNE(const char[][] fields, size_t index1, size_t index2)
{
    return fields[index1] != fields[index2];
}

bool ffIStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    auto lowered1 = fields[index1].asLowerCase;
    auto lowered2 = fields[index2].asLowerCase;
    return equal(lowered1, lowered2);
}

bool ffIStrNE(const char[][] fields, size_t index1, size_t index2)
{
    auto lowered1 = fields[index1].asLowerCase;
    auto lowered2 = fields[index2].asLowerCase;
    return !equal(lowered1, lowered2);
}
341

342 1
/// Absolute difference of two values.
auto AbsDiff(double v1, double v2)
{
    return abs(v1 - v2);
}

/// Absolute difference divided by the smaller of the two absolute values.
auto RelDiff(double v1, double v2)
{
    const smallerMagnitude = min(abs(v1), abs(v2));
    return abs(v1 - v2) / smallerMagnitude;
}

/* Field vs field difference tests. Both fields are converted to double on each call
 * and the resulting difference is compared against 'value'.
 */
bool ffAbsDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    const diff = AbsDiff(to!double(fields[index1]), to!double(fields[index2]));
    return diff <= value;
}

bool ffAbsDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    const diff = AbsDiff(to!double(fields[index1]), to!double(fields[index2]));
    return diff > value;
}

bool ffRelDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    const diff = RelDiff(to!double(fields[index1]), to!double(fields[index2]));
    return diff <= value;
}

bool ffRelDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    const diff = RelDiff(to!double(fields[index1]), to!double(fields[index2]));
    return diff > value;
}
361

362
/* Command line option handlers - There is a command line option handler for each
363
 * predicate type. That is, one each for FieldUnaryPredicate, FieldVsNumberPredicate,
364
 * etc. Option handlers are passed the tests array, the predicate function, and the
365
 * command line option arguments. A FieldsPredicate delegate is created and appended to
366
 * the tests array. An exception is thrown if errors are detected while processing the
367
 * option, the error text is intended for the end user.
368
 *
369
 * All the option handlers have similar functionality, differing in option processing and
370
 * error message generation. fieldVsNumberOptionHandler is described as an example. It
371
 * handles command options such as '--le 3:1000', which tests field 3 for values less
 * than or equal to 1000. It is passed the tests array, the 'numLE' predicate function used for the
373
 * test, and the string "3:1000" representing the option value. It is also passed the
374
 * header line from the first input file and an indication of whether header processing
375
 * is enabled (--H|header). parseFieldList (fieldlist module) is used to parse the
376
 * field-list component of the option ("3" in the example). The comparison value ("1000")
377
 * is converted to a double. These are wrapped in a FieldsPredicate delegate which is
378
 * added to the tests array. An error is signaled if the option string is invalid.
379
 *
380
 * During processing, fields indexes are converted from one-based to zero-based. As an
381
 * optimization, the maximum field index is also tracked. This allows early termination of
382
 * line splitting.
383
 *
384
 * The header line from the input file is not available when std.getopt processes the
385
 * command line option. The processing described above must be deferred. This is done
386
 * using a 'CmdOptionHandler' delegate. There is a 'make' function for every Command line
387
 * option handler that creates these. These are created during std.getopt processing.
388
 * They are run when the header line becomes available.
389
 *
390
 * The final setup for the '--le' (numeric less-than-or-equal) operator is as follows:
391
 *   - Function 'handlerNumLE' (in TsvFilterOptions.processArgs) is associated with the
392
 *     command line option "--le <val>". When called by std.getopt it creates an option
 *     handler delegate via 'makeFieldVsNumberOptionHandler'. This is appended to an
394
 *     array of delegates.
395
 *   - 'fieldVsNumberOptionHandler' is invoked via the delegate after the header line
396
 *     becomes available (in TsvFilterOptions.processArgs). If args are valid,
397
 *     'makeFieldVsNumberDelegate' is used to create a delegate invoking the 'numLE'
398
 *     predicate function. This delegate is added to the set of run-time tests.
399
 *
400
 * Note that in the above setup the 'numLE' predicate is specified in 'handlerNumLE'
401
 * and passed through all the steps. This is how the command line option gets
402
 * associated with the predicate function.
403
 */
404

405
/* CmdOptionHandler delegate signature - This is the call made to process the command
406
 * line option arguments after the header line has been read.
407
 */
408
/* A CmdOptionHandler appends to 'tests' and may raise 'maxFieldIndex'; both are passed
 * by reference. 'hasHeader' and 'headerFields' describe the header line.
 */
alias CmdOptionHandler =
    void delegate(ref FieldsPredicate[] tests,
                  ref size_t maxFieldIndex,
                  bool hasHeader,
                  string[] headerFields);
410

411
CmdOptionHandler makeFieldUnaryOptionHandler(FieldUnaryPredicate predicateFn, string option, string optionVal)
{
    /* Capture the predicate and option text now; fieldUnaryOptionHandler runs only when
     * the returned delegate is invoked.
     */
    return delegate void(ref FieldsPredicate[] tests, ref size_t maxFieldIndex,
                         bool hasHeader, string[] headerFields)
    {
        fieldUnaryOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
    };
}
417

418
void fieldUnaryOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldUnaryPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    try
    {
        /* The whole option value is a field list; one test is added per listed field. */
        auto fieldIndices =
            optionVal.parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, headerFields);

        foreach (fieldIndex; fieldIndices)
        {
            tests ~= makeFieldUnaryDelegate(fn, fieldIndex);
            maxFieldIndex = max(maxFieldIndex, fieldIndex);
        }
    }
    catch (Exception e)
    {
        /* Re-throw the same exception with the option text and expected syntax added. */
        e.msg = format("Invalid option: [--%s %s]. %s\n   Expected: '--%s <field>' or '--%s <field-list>'.",
                       option, optionVal, e.msg, option, option);
        throw e;
    }
}
439

440
CmdOptionHandler makeFieldVsNumberOptionHandler(FieldVsNumberPredicate predicateFn, string option, string optionVal)
{
    /* Capture the predicate and option text now; fieldVsNumberOptionHandler runs only
     * when the returned delegate is invoked.
     */
    return delegate void(ref FieldsPredicate[] tests, ref size_t maxFieldIndex,
                         bool hasHeader, string[] headerFields)
    {
        fieldVsNumberOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
    };
}
446

447
/* Processes a '<field-list>:<number>' option value. One field-vs-number test is appended
 * to 'tests' for every field in the list, and 'maxFieldIndex' is raised if a larger
 * (zero-based) field index is seen.
 *
 * Throws: the exception from parsing the field list or converting the number, with its
 * message replaced by one naming the option and the expected syntax.
 */
void fieldVsNumberOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldVsNumberPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    /* Builds the user-facing message. A space separates the option text from the
     * underlying error only when there is an underlying error message.
     */
    auto formatErrorMsg(string option, string optionVal, string errorMessage="")
    {
        string optionalSpace = (errorMessage.length == 0) ? "" : " ";
        return format(
            "Invalid option: [--%s %s].%s%s\n   Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a number.",
            option, optionVal, optionalSpace, errorMessage, option, option);
    }

    try
    {
        /* Parse only the leading field list; the rest of the string is the value. */
        auto optionValParse =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        auto fieldIndices = optionValParse.array;

        /* At least two characters must remain after the field list: one character that
         * is skipped (presumably the ':' separator - not checked here) and a non-empty value.
         */
        enforce(optionVal.length - optionValParse.consumed > 1, "No value after field list.");
        double value = optionVal[optionValParse.consumed + 1 .. $].to!double;

        foreach (fieldIndex; fieldIndices)
        {
            tests ~= makeFieldVsNumberDelegate(fn, fieldIndex, value);
            maxFieldIndex = (fieldIndex > maxFieldIndex) ? fieldIndex : maxFieldIndex;
        }
    }
    catch (Exception e)
    {
        e.msg = formatErrorMsg(option, optionVal, e.msg);
        throw e;
    }
}
484

485
CmdOptionHandler makeFieldVsStringOptionHandler(FieldVsStringPredicate predicateFn, string option, string optionVal)
{
    /* Capture the predicate and option text now; fieldVsStringOptionHandler runs only
     * when the returned delegate is invoked.
     */
    return delegate void(ref FieldsPredicate[] tests, ref size_t maxFieldIndex,
                         bool hasHeader, string[] headerFields)
    {
        fieldVsStringOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
    };
}
491

492
void fieldVsStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldVsStringPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    try
    {
        /* Parse only the leading field list; the rest of the string is the value. */
        auto fieldListParse =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        auto targetFields = fieldListParse.array;

        /* One character after the field list is skipped; a non-empty value must follow. */
        enforce(optionVal.length - fieldListParse.consumed > 1, "No value after field list.");
        string value = optionVal[fieldListParse.consumed + 1 .. $].idup;

        foreach (targetField; targetFields)
        {
            tests ~= makeFieldVsStringDelegate(fn, targetField, value);
            maxFieldIndex = max(maxFieldIndex, targetField);
        }
    }
    catch (Exception e)
    {
        /* Re-throw the same exception with the option text and expected syntax added. */
        e.msg = format(
            "[--%s %s]. %s\n   Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a string.",
            option, optionVal, e.msg, option, option);
        throw e;
    }
}
524

525
CmdOptionHandler makeFieldVsIStringOptionHandler(FieldVsIStringPredicate predicateFn, string option, string optionVal)
{
    /* Capture the predicate and option text now; fieldVsIStringOptionHandler runs only
     * when the returned delegate is invoked.
     */
    return delegate void(ref FieldsPredicate[] tests, ref size_t maxFieldIndex,
                         bool hasHeader, string[] headerFields)
    {
        fieldVsIStringOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
    };
}
531

532
/* The fieldVsIStringOptionHandler lower-cases the command line argument, assuming the
533
 * case-insensitive comparison will be done on lower-cased values.
534
 */
535
/* Processes a '<field-list>:<string>' option value for the case-insensitive tests. The
 * string is converted to a lower-cased dstring once, and one test using it is appended
 * to 'tests' for every field in the list. 'maxFieldIndex' is raised if a larger
 * (zero-based) field index is seen.
 *
 * Throws: the exception from parsing or conversion, with its message replaced by one
 * naming the option and the expected syntax.
 */
void fieldVsIStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldVsIStringPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    try
    {
        /* Parse only the leading field list; the rest of the string is the value. */
        auto optionValParse =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        auto fieldIndices = optionValParse.array;

        /* One character after the field list is skipped; a non-empty value must follow. */
        enforce(optionVal.length - optionValParse.consumed > 1, "No value after field list.");
        string value = optionVal[optionValParse.consumed + 1 .. $].idup;

        /* The lower-cased value is the same for every field, so compute it once rather
         * than converting and allocating on each loop iteration.
         */
        immutable dstring lowerCasedValue = value.to!dstring.toLower;

        foreach (fieldIndex; fieldIndices)
        {
            tests ~= makeFieldVsIStringDelegate(fn, fieldIndex, lowerCasedValue);
            maxFieldIndex = (fieldIndex > maxFieldIndex) ? fieldIndex : maxFieldIndex;
        }
    }
    catch (Exception e)
    {
        e.msg = format(
            "[--%s %s]. %s\n   Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a string.",
            option, optionVal, e.msg, option, option);
        throw e;
    }
}
566

567
CmdOptionHandler makeFieldVsRegexOptionHandler(FieldVsRegexPredicate predicateFn, string option, string optionVal, bool caseSensitive)
{
    /* Capture the predicate, option text and case flag now; fieldVsRegexOptionHandler
     * runs only when the returned delegate is invoked.
     */
    return delegate void(ref FieldsPredicate[] tests, ref size_t maxFieldIndex,
                         bool hasHeader, string[] headerFields)
    {
        fieldVsRegexOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal, caseSensitive);
    };
}
573

574
void fieldVsRegexOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldVsRegexPredicate fn, string option, string optionVal, bool caseSensitive)
{
    import tsv_utils.common.fieldlist;

    try
    {
        /* Parse only the leading field list; the rest of the string is the pattern. */
        auto fieldListParse =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        auto targetFields = fieldListParse.array;

        /* One character after the field list is skipped; a non-empty pattern must follow. */
        enforce(optionVal.length - fieldListParse.consumed > 1, "No value after field list.");

        /* Case-insensitivity is requested through the regex "i" flag. */
        immutable regexFlags = caseSensitive ? "" : "i";
        auto pattern = optionVal[fieldListParse.consumed + 1 .. $];
        Regex!char value = regex(pattern, regexFlags);

        foreach (targetField; targetFields)
        {
            tests ~= makeFieldVsRegexDelegate(fn, targetField, value);
            maxFieldIndex = max(maxFieldIndex, targetField);
        }
    }
    catch (RegexException e)
    {
        /* Pattern compilation failed: say so explicitly in the message. */
        e.msg = format(
            "[--%s %s]. Invalid regular expression: %s\n   Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a regular expression.",
            option, optionVal, e.msg, option, option);
        throw e;
    }
    catch (Exception e)
    {
        /* Any other failure (e.g. field list parsing, missing value). */
        e.msg = format(
            "[--%s %s]. %s\n   Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a regular expression.",
            option, optionVal, e.msg, option, option);
        throw e;
    }
}
616

617

618
CmdOptionHandler makeFieldVsFieldOptionHandler(FieldVsFieldPredicate predicateFn, string option, string optionVal)
{
    /* Capture the predicate and option text now; fieldVsFieldOptionHandler runs only
     * when the returned delegate is invoked.
     */
    return delegate void(ref FieldsPredicate[] tests, ref size_t maxFieldIndex,
                         bool hasHeader, string[] headerFields)
    {
        fieldVsFieldOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
    };
}
624

625
/** Processes the value of a field-vs-field test option ('<field1>:<field2>').
 *
 * Each side is parsed with parseFieldList and must yield exactly one field index, and
 * the two indices must differ. On success a test built from fn is appended to tests
 * and maxFieldIndex is raised to cover both fields.
 *
 * Any Exception raised along the way is rethrown after its message has been extended
 * with the option text and a description of the expected format.
 */
void fieldVsFieldOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldVsFieldPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    try
    {
        /* First field. Parsing is allowed to stop before the end of the string. */
        auto firstParse =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        auto firstFields = firstParse.array;

        enforce(firstFields.length != 0, "First field argument is empty.");
        enforce(firstFields.length == 1, "First field argument references multiple fields.");
        enforce(optionVal.length - firstParse.consumed > 1, " Second field argument is empty.");

        /* Second field. Skip the one character following the first field list; the
         * remainder must be consumed entirely.
         */
        auto secondSegment = optionVal[firstParse.consumed + 1 .. $];
        auto secondFields =
            secondSegment
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, Yes.consumeEntireFieldListString)
            (hasHeader, headerFields)
            .array;

        enforce(secondFields.length != 0, "Second field argument is empty.");
        enforce(secondFields.length == 1, "Second field argument references multiple fields.");

        size_t field1 = firstFields[0];
        size_t field2 = secondFields[0];

        enforce(field1 != field2,
                format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal));

        tests ~= makeFieldVsFieldDelegate(fn, field1, field2);
        maxFieldIndex = max(maxFieldIndex, field1, field2);
    }
    catch (Exception e)
    {
        /* Add the option context to the message, then propagate the same exception. */
        e.msg = format(
            "[--%s %s]. %s\n   Expected: '--%s <field1>:<field2>' where <field1> and <field2> are individual fields.",
            option, optionVal, e.msg, option);
        throw e;
    }
}
667

668
/** Creates the option handler for a field-field-number test.
 *
 * The returned delegate closes over the predicate, the option name and the raw option
 * value. When invoked it forwards them, together with its own arguments, to
 * fieldFieldNumOptionHandler.
 */
CmdOptionHandler makeFieldFieldNumOptionHandler(FieldFieldNumPredicate predicateFn, string option, string optionVal)
{
    return delegate (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields)
    {
        fieldFieldNumOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
    };
}
674

675
/** Processes the value of a field-field-number test option ('<field1>:<field2>:<num>').
 *
 * The first two parts are parsed with parseFieldList and must each yield exactly one
 * field index; the remaining text is converted to a double. The two field indices
 * must differ. On success a test built from fn is appended to tests and
 * maxFieldIndex is raised to cover both fields.
 *
 * Any Exception raised along the way is rethrown after its message has been extended
 * with the option text and a description of the expected format.
 */
void fieldFieldNumOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldFieldNumPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    try
    {
        /* First field. Parsing is allowed to stop before the end of the string. */
        auto firstParse =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        auto firstFields = firstParse.array;

        enforce(firstFields.length != 0, "First field argument is empty.");
        enforce(firstFields.length == 1, "First field argument references multiple fields.");
        enforce(optionVal.length - firstParse.consumed > 1, " Second field argument is empty.");

        /* Second field. Starts one character past the end of the first field list. */
        auto remainder = optionVal[firstParse.consumed + 1 .. $];
        auto secondParse =
            remainder
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        auto secondFields = secondParse.array;

        enforce(secondFields.length != 0, "Second field argument is empty.");
        enforce(secondFields.length == 1, "Second field argument references multiple fields.");
        enforce(remainder.length - secondParse.consumed > 1, "Number argument is empty.");

        /* Number. Converted before the same-field check, so a conversion failure is
         * reported first.
         */
        auto numberText = remainder[secondParse.consumed + 1 .. $];
        double value = to!double(numberText);

        size_t field1 = firstFields[0];
        size_t field2 = secondFields[0];

        enforce(field1 != field2,
                format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal));

        tests ~= makeFieldFieldNumDelegate(fn, field1, field2, value);
        maxFieldIndex = max(maxFieldIndex, field1, field2);
    }
    catch (Exception e)
    {
        /* Add the option context to the message, then propagate the same exception. */
        e.msg = format(
            "[--%s %s]. %s\n   Expected: '--%s <field1>:<field2>:<num>' where <field1> and <field2> are individual fields.",
            option, optionVal, e.msg, option);
        throw e;
    }
}
724

725
/** Command line options - Holds the results of command line option processing.
 *
 * The processArgs method runs command line arg processing and fills in the members.
 */
struct TsvFilterOptions
{
    import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader;

    string programName;              /// Derived from the first command line arg
    InputSourceRange inputSources;   /// Input files
    FieldsPredicate[] tests;         /// Filled in by the test option handlers
    size_t maxFieldIndex;            /// Filled in by the test option handlers
    bool hasHeader = false;          /// --H|header
    bool invert = false;             /// --v|invert
    bool disjunct = false;           /// --or
    char delim = '\t';               /// --d|delimiter

    /* Processes the command line arguments.
     *
     * Returns a tuple. The first value is true when the arguments were processed
     * successfully and execution should continue. It is false when an error occurred
     * or one of the help/version options was given; the second value is then the exit
     * code to use (0 for help/version, 1 for an error).
     *
     * A true result means the args have been validated, the input sources have been
     * set up and the tests array has been established.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.array : split;
        import std.conv : to;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import tsv_utils.common.getopt_inorder;
        import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

        bool wantHelpVerbose = false;    // --help-verbose
        bool wantHelpOptions = false;    // --help-options
        bool wantHelpFields = false;     // --help-fields
        bool wantVersion = false;        // --V|version

        programName = "Unknown_program_name";
        if (cmdArgs.length > 0) programName = cmdArgs[0].stripExtension.baseName;

        /* Command option handlers - One per test option. Each has the handler signature
         * getopt requires and keeps knowledge of the specific command option text
         * separate from the option processing. A handler does not process its value
         * immediately; it queues a CmdOptionHandler that is run once it is known
         * whether a header line is available.
         */
        CmdOptionHandler[] pendingTestOptions;

        void handlerFldEmpty(string opt, string val)    { pendingTestOptions ~= makeFieldUnaryOptionHandler(&fldEmpty, opt, val); }
        void handlerFldNotEmpty(string opt, string val) { pendingTestOptions ~= makeFieldUnaryOptionHandler(&fldNotEmpty, opt, val); }
        void handlerFldBlank(string opt, string val)    { pendingTestOptions ~= makeFieldUnaryOptionHandler(&fldBlank, opt, val); }
        void handlerFldNotBlank(string opt, string val) { pendingTestOptions ~= makeFieldUnaryOptionHandler(&fldNotBlank, opt, val); }

        void handlerFldIsNumeric(string opt, string val)  { pendingTestOptions ~= makeFieldUnaryOptionHandler(&fldIsNumeric, opt, val); }
        void handlerFldIsFinite(string opt, string val)   { pendingTestOptions ~= makeFieldUnaryOptionHandler(&fldIsFinite, opt, val); }
        void handlerFldIsNaN(string opt, string val)      { pendingTestOptions ~= makeFieldUnaryOptionHandler(&fldIsNaN, opt, val); }
        void handlerFldIsInfinity(string opt, string val) { pendingTestOptions ~= makeFieldUnaryOptionHandler(&fldIsInfinity, opt, val); }

        void handlerNumLE(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&numLE, opt, val); }
        void handlerNumLT(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&numLT, opt, val); }
        void handlerNumGE(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&numGE, opt, val); }
        void handlerNumGT(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&numGT, opt, val); }
        void handlerNumEQ(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&numEQ, opt, val); }
        void handlerNumNE(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&numNE, opt, val); }

        void handlerStrLE(string opt, string val) { pendingTestOptions ~= makeFieldVsStringOptionHandler(&strLE, opt, val); }
        void handlerStrLT(string opt, string val) { pendingTestOptions ~= makeFieldVsStringOptionHandler(&strLT, opt, val); }
        void handlerStrGE(string opt, string val) { pendingTestOptions ~= makeFieldVsStringOptionHandler(&strGE, opt, val); }
        void handlerStrGT(string opt, string val) { pendingTestOptions ~= makeFieldVsStringOptionHandler(&strGT, opt, val); }
        void handlerStrEQ(string opt, string val) { pendingTestOptions ~= makeFieldVsStringOptionHandler(&strEQ, opt, val); }
        void handlerStrNE(string opt, string val) { pendingTestOptions ~= makeFieldVsStringOptionHandler(&strNE, opt, val); }

        void handlerStrInFld(string opt, string val)    { pendingTestOptions ~= makeFieldVsStringOptionHandler(&strInFld, opt, val); }
        void handlerStrNotInFld(string opt, string val) { pendingTestOptions ~= makeFieldVsStringOptionHandler(&strNotInFld, opt, val); }

        void handlerIStrEQ(string opt, string val)       { pendingTestOptions ~= makeFieldVsIStringOptionHandler(&istrEQ, opt, val); }
        void handlerIStrNE(string opt, string val)       { pendingTestOptions ~= makeFieldVsIStringOptionHandler(&istrNE, opt, val); }
        void handlerIStrInFld(string opt, string val)    { pendingTestOptions ~= makeFieldVsIStringOptionHandler(&istrInFld, opt, val); }
        void handlerIStrNotInFld(string opt, string val) { pendingTestOptions ~= makeFieldVsIStringOptionHandler(&istrNotInFld, opt, val); }

        /* The final argument selects case-sensitive (true) or case-insensitive (false) matching. */
        void handlerRegexMatch(string opt, string val)     { pendingTestOptions ~= makeFieldVsRegexOptionHandler(&regexMatch, opt, val, true); }
        void handlerRegexNotMatch(string opt, string val)  { pendingTestOptions ~= makeFieldVsRegexOptionHandler(&regexNotMatch, opt, val, true); }
        void handlerIRegexMatch(string opt, string val)    { pendingTestOptions ~= makeFieldVsRegexOptionHandler(&regexMatch, opt, val, false); }
        void handlerIRegexNotMatch(string opt, string val) { pendingTestOptions ~= makeFieldVsRegexOptionHandler(&regexNotMatch, opt, val, false); }

        void handlerCharLenLE(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&charLenLE, opt, val); }
        void handlerCharLenLT(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&charLenLT, opt, val); }
        void handlerCharLenGE(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&charLenGE, opt, val); }
        void handlerCharLenGT(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&charLenGT, opt, val); }
        void handlerCharLenEQ(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&charLenEQ, opt, val); }
        void handlerCharLenNE(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&charLenNE, opt, val); }

        void handlerByteLenLE(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenLE, opt, val); }
        void handlerByteLenLT(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenLT, opt, val); }
        void handlerByteLenGE(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenGE, opt, val); }
        void handlerByteLenGT(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenGT, opt, val); }
        void handlerByteLenEQ(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenEQ, opt, val); }
        void handlerByteLenNE(string opt, string val) { pendingTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenNE, opt, val); }

        void handlerFFLE(string opt, string val) { pendingTestOptions ~= makeFieldVsFieldOptionHandler(&ffLE, opt, val); }
        void handlerFFLT(string opt, string val) { pendingTestOptions ~= makeFieldVsFieldOptionHandler(&ffLT, opt, val); }
        void handlerFFGE(string opt, string val) { pendingTestOptions ~= makeFieldVsFieldOptionHandler(&ffGE, opt, val); }
        void handlerFFGT(string opt, string val) { pendingTestOptions ~= makeFieldVsFieldOptionHandler(&ffGT, opt, val); }
        void handlerFFEQ(string opt, string val) { pendingTestOptions ~= makeFieldVsFieldOptionHandler(&ffEQ, opt, val); }
        void handlerFFNE(string opt, string val) { pendingTestOptions ~= makeFieldVsFieldOptionHandler(&ffNE, opt, val); }

        void handlerFFStrEQ(string opt, string val)  { pendingTestOptions ~= makeFieldVsFieldOptionHandler(&ffStrEQ, opt, val); }
        void handlerFFStrNE(string opt, string val)  { pendingTestOptions ~= makeFieldVsFieldOptionHandler(&ffStrNE, opt, val); }
        void handlerFFIStrEQ(string opt, string val) { pendingTestOptions ~= makeFieldVsFieldOptionHandler(&ffIStrEQ, opt, val); }
        void handlerFFIStrNE(string opt, string val) { pendingTestOptions ~= makeFieldVsFieldOptionHandler(&ffIStrNE, opt, val); }

        void handlerFFAbsDiffLE(string opt, string val) { pendingTestOptions ~= makeFieldFieldNumOptionHandler(&ffAbsDiffLE, opt, val); }
        void handlerFFAbsDiffGT(string opt, string val) { pendingTestOptions ~= makeFieldFieldNumOptionHandler(&ffAbsDiffGT, opt, val); }
        void handlerFFRelDiffLE(string opt, string val) { pendingTestOptions ~= makeFieldFieldNumOptionHandler(&ffRelDiffLE, opt, val); }
        void handlerFFRelDiffGT(string opt, string val) { pendingTestOptions ~= makeFieldFieldNumOptionHandler(&ffRelDiffGT, opt, val); }

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getoptInorder(
                cmdArgs,
                "help-verbose", "     Print full help.", &wantHelpVerbose,
                "help-options", "     Print the options list by itself.", &wantHelpOptions,
                "help-fields", "     Print help on specifying fields.", &wantHelpFields,
                std.getopt.config.caseSensitive,
                "V|version", "     Print version information and exit.", &wantVersion,
                "H|header", "     Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "or", "     Evaluate tests as an OR rather than an AND.", &disjunct,
                std.getopt.config.caseSensitive,
                "v|invert", "     Invert the filter, printing lines that do not match.", &invert,
                std.getopt.config.caseInsensitive,
                "d|delimiter", "CHR  Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,

                "empty", "<field-list>       True if FIELD is empty.", &handlerFldEmpty,
                "not-empty", "<field-list>       True if FIELD is not empty.", &handlerFldNotEmpty,
                "blank", "<field-list>       True if FIELD is empty or all whitespace.", &handlerFldBlank,
                "not-blank", "<field-list>       True if FIELD contains a non-whitespace character.", &handlerFldNotBlank,

                "is-numeric", "<field-list>       True if FIELD is interpretable as a number.", &handlerFldIsNumeric,
                "is-finite", "<field-list>       True if FIELD is interpretable as a number and is not NaN or infinity.", &handlerFldIsFinite,
                "is-nan", "<field-list>       True if FIELD is NaN.", &handlerFldIsNaN,
                "is-infinity", "<field-list>       True if FIELD is infinity.", &handlerFldIsInfinity,

                "le", "<field-list>:NUM   FIELD <= NUM (numeric).", &handlerNumLE,
                "lt", "<field-list>:NUM   FIELD <  NUM (numeric).", &handlerNumLT,
                "ge", "<field-list>:NUM   FIELD >= NUM (numeric).", &handlerNumGE,
                "gt", "<field-list>:NUM   FIELD >  NUM (numeric).", &handlerNumGT,
                "eq", "<field-list>:NUM   FIELD == NUM (numeric).", &handlerNumEQ,
                "ne", "<field-list>:NUM   FIELD != NUM (numeric).", &handlerNumNE,

                "str-le", "<field-list>:STR   FIELD <= STR (string).", &handlerStrLE,
                "str-lt", "<field-list>:STR   FIELD <  STR (string).", &handlerStrLT,
                "str-ge", "<field-list>:STR   FIELD >= STR (string).", &handlerStrGE,
                "str-gt", "<field-list>:STR   FIELD >  STR (string).", &handlerStrGT,
                "str-eq", "<field-list>:STR   FIELD == STR (string).", &handlerStrEQ,
                "istr-eq", "<field-list>:STR   FIELD == STR (string, case-insensitive).", &handlerIStrEQ,
                "str-ne", "<field-list>:STR   FIELD != STR (string).", &handlerStrNE,
                "istr-ne", "<field-list>:STR   FIELD != STR (string, case-insensitive).", &handlerIStrNE,
                "str-in-fld", "<field-list>:STR   FIELD contains STR (substring search).", &handlerStrInFld,
                "istr-in-fld", "<field-list>:STR   FIELD contains STR (substring search, case-insensitive).", &handlerIStrInFld,
                "str-not-in-fld", "<field-list>:STR   FIELD does not contain STR (substring search).", &handlerStrNotInFld,
                "istr-not-in-fld", "<field-list>:STR   FIELD does not contain STR (substring search, case-insensitive).", &handlerIStrNotInFld,

                "regex", "<field-list>:REGEX   FIELD matches regular expression.", &handlerRegexMatch,
                "iregex", "<field-list>:REGEX   FIELD matches regular expression, case-insensitive.", &handlerIRegexMatch,
                "not-regex", "<field-list>:REGEX   FIELD does not match regular expression.", &handlerRegexNotMatch,
                "not-iregex", "<field-list>:REGEX   FIELD does not match regular expression, case-insensitive.", &handlerIRegexNotMatch,

                "char-len-le", "<field-list>:NUM   character-length(FIELD) <= NUM.", &handlerCharLenLE,
                "char-len-lt", "<field-list>:NUM   character-length(FIELD) < NUM.", &handlerCharLenLT,
                "char-len-ge", "<field-list>:NUM   character-length(FIELD) >= NUM.", &handlerCharLenGE,
                "char-len-gt", "<field-list>:NUM   character-length(FIELD) > NUM.", &handlerCharLenGT,
                "char-len-eq", "<field-list>:NUM   character-length(FIELD) == NUM.", &handlerCharLenEQ,
                "char-len-ne", "<field-list>:NUM   character-length(FIELD) != NUM.", &handlerCharLenNE,

                "byte-len-le", "<field-list>:NUM   byte-length(FIELD) <= NUM.", &handlerByteLenLE,
                "byte-len-lt", "<field-list>:NUM   byte-length(FIELD) < NUM.", &handlerByteLenLT,
                "byte-len-ge", "<field-list>:NUM   byte-length(FIELD) >= NUM.", &handlerByteLenGE,
                "byte-len-gt", "<field-list>:NUM   byte-length(FIELD) > NUM.", &handlerByteLenGT,
                "byte-len-eq", "<field-list>:NUM   byte-length(FIELD) == NUM.", &handlerByteLenEQ,
                "byte-len-ne", "<field-list>:NUM   byte-length(FIELD) != NUM.", &handlerByteLenNE,

                "ff-le", "FIELD1:FIELD2   FIELD1 <= FIELD2 (numeric).", &handlerFFLE,
                "ff-lt", "FIELD1:FIELD2   FIELD1 <  FIELD2 (numeric).", &handlerFFLT,
                "ff-ge", "FIELD1:FIELD2   FIELD1 >= FIELD2 (numeric).", &handlerFFGE,
                "ff-gt", "FIELD1:FIELD2   FIELD1 >  FIELD2 (numeric).", &handlerFFGT,
                "ff-eq", "FIELD1:FIELD2   FIELD1 == FIELD2 (numeric).", &handlerFFEQ,
                "ff-ne", "FIELD1:FIELD2   FIELD1 != FIELD2 (numeric).", &handlerFFNE,
                "ff-str-eq", "FIELD1:FIELD2   FIELD1 == FIELD2 (string).", &handlerFFStrEQ,
                "ff-istr-eq", "FIELD1:FIELD2   FIELD1 == FIELD2 (string, case-insensitive).", &handlerFFIStrEQ,
                "ff-str-ne", "FIELD1:FIELD2   FIELD1 != FIELD2 (string).", &handlerFFStrNE,
                "ff-istr-ne", "FIELD1:FIELD2   FIELD1 != FIELD2 (string, case-insensitive).", &handlerFFIStrNE,

                "ff-absdiff-le", "FIELD1:FIELD2:NUM   abs(FIELD1 - FIELD2) <= NUM", &handlerFFAbsDiffLE,
                "ff-absdiff-gt", "FIELD1:FIELD2:NUM   abs(FIELD1 - FIELD2)  > NUM", &handlerFFAbsDiffGT,
                "ff-reldiff-le", "FIELD1:FIELD2:NUM   abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) <= NUM", &handlerFFRelDiffLE,
                "ff-reldiff-gt", "FIELD1:FIELD2:NUM   abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2))  > NUM", &handlerFFRelDiffGT,
                );

            /* Help and version requests, checked in priority order. Each one prints
             * its text and stops further processing with exit code 0. Both help texts
             * are a bit long, so "regular" help prints only the text, which summarizes
             * the options, and not the options list.
             */
            if (r.helpWanted)
            {
                stdout.write(helpText);
                return tuple(false, 0);
            }
            if (wantHelpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            if (wantHelpOptions)
            {
                defaultGetoptPrinter(helpTextOptions, r.options);
                return tuple(false, 0);
            }
            if (wantHelpFields)
            {
                import tsv_utils.common.fieldlist : fieldListHelpText;
                writeln(fieldListHelpText);
                return tuple(false, 0);
            }
            if (wantVersion)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-filter"));
                return tuple(false, 0);
            }

            /* Input files. Remaining command line args are files. Read "-" when none
             * are given.
             */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;

            string[] headerFields;

            /* Runs the queued test option handlers. Called before reading the header
             * line when headers are not in use, and after it when they are, so that
             * headerFields is populated by the time the handlers see it.
             */
            void fieldListArgProcessing()
            {
                foreach (handler; pendingTestOptions)
                {
                    handler(tests, maxFieldIndex, hasHeader, headerFields);
                }
            }

            if (!hasHeader) fieldListArgProcessing();

            ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
            inputSources = inputSourceRange(filepaths, readHeader);

            if (hasHeader)
            {
                auto headerLine = inputSources.front.header;
                throwIfWindowsNewlineOnUnix(headerLine, inputSources.front.name, 1);
                headerFields = headerLine.split(delim).to!(string[]);
                fieldListArgProcessing();
            }
        }
        catch (Exception e)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, e.msg);
            return tuple(false, 1);
        }

        return tuple(true, 0);
    }
}
990

991
/** tsvFilter reads the input files line by line, runs the tests against each line,
 * and writes the lines that pass to standard output.
 */
void tsvFilter(ref TsvFilterOptions cmdopt)
{
    import std.algorithm : all, any, splitter;
    import std.range;
    import tsv_utils.common.utils : BufferedOutputRange, bufferedByLine, InputSourceRange,
        throwIfWindowsNewlineOnUnix;

    /* inputSources must be an InputSourceRange and include at least stdin. */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* BufferedOutputRange improves performance on narrow files with high percentages
     * of writes. Output should still be responsive when matches are rare, so the
     * counter starts above the limit (forcing the first matched line out) and a flush
     * is forced whenever a match follows a long stretch of input without one.
     */
    enum maxInputLinesWithoutBufferFlush = 1024;
    size_t linesSinceFlush = maxInputLinesWithoutBufferFlush + 1;

    auto output = BufferedOutputRange!(typeof(stdout))(stdout);

    /* The first header was read during command line argument processing. Write and
     * flush it immediately so subsequent processes in a unix command pipeline see it
     * early. This helps provide timely error messages.
     */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto firstSource = cmdopt.inputSources.front;
        output.appendln(firstSource.header);
        output.flush;
    }

    /* Process each input file, one line at a time. */
    immutable size_t firstBodyLineNum = cmdopt.hasHeader ? 2 : 1;
    immutable long lastNeededField = cast(long) cmdopt.maxFieldIndex;
    auto lineFields = new char[][](cmdopt.maxFieldIndex + 1);

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        foreach (lineNum, line; inputStream.file.bufferedByLine.enumerate(firstBodyLineNum))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum);

            /* Copy fields into lineFields, stopping once the highest field needed by
             * any test has been stored. fieldIndex is the last slot filled.
             */
            int fieldIndex = -1;
            foreach (fieldValue; line.splitter(cmdopt.delim))
            {
                if (fieldIndex == lastNeededField) break;
                lineFields[++fieldIndex] = fieldValue;
            }

            if (fieldIndex == -1)
            {
                assert(line.length == 0);
                /* Bug work-around. Currently empty lines are not handled properly by splitter.
                 *   Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                 *   Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                 * Work-around: Point to the line. It's an empty string.
                 */
                fieldIndex = 0;
                lineFields[0] = line;
            }

            enforce(fieldIndex >= lastNeededField,
                    format("Not enough fields in line. File: %s, Line: %s",
                           inputStream.name, lineNum));

            /* Run the tests. Tests will fail (throw) if a field cannot be converted
             * to the expected type.
             */
            try
            {
                linesSinceFlush++;

                bool matched;
                if (cmdopt.disjunct) matched = cmdopt.tests.any!(test => test(lineFields));
                else matched = cmdopt.tests.all!(test => test(lineFields));

                /* --invert flips the outcome. */
                if (matched != cmdopt.invert)
                {
                    const bool flushedByAppend = output.appendln(line);
                    const bool flushOverdue =
                        !flushedByAppend && linesSinceFlush > maxInputLinesWithoutBufferFlush;

                    if (flushOverdue) output.flush;
                    if (flushedByAppend || flushOverdue) linesSinceFlush = 0;
                }
            }
            catch (Exception e)
            {
                throw new Exception(
                    format("Could not process line or field: %s\n  File: %s Line: %s%s",
                           e.msg, inputStream.name, lineNum,
                           (lineNum == 1) ? "\n  Is this a header line? Use --header to skip." : ""));
            }
        }
    }
}

Read our documentation on viewing source code .

Loading