TRAVIS_OS_NAME=linux <<<<<< ENV ./.codecov.yml bash_completion/tsv-utils buildtools/aggregate-codecov.d buildtools/codecov-to-relative-paths.d buildtools/diff-test-result-dirs.d buildtools/dircat.d buildtools/makefile common/dub.json common/makefile common/src/tsv_utils/common/fieldlist.d common/src/tsv_utils/common/getopt_inorder.d common/src/tsv_utils/common/numerics.d common/src/tsv_utils/common/package.d common/src/tsv_utils/common/tsvutils_version.d common/src/tsv_utils/common/unittest_utils.d common/src/tsv_utils/common/utils.d csv2tsv/dub.json csv2tsv/makefile csv2tsv/profile_data/collect_profile_data.sh csv2tsv/src/tsv_utils/csv2tsv.d csv2tsv/src_v1/tsv_utils/csv2tsv.d csv2tsv/tests/tests.sh dub.json dub_build.d extras/scripts/tsv-sort extras/scripts/tsv-sort-fast keep-header/dub.json keep-header/makefile keep-header/src/tsv_utils/keep-header.d keep-header/tests/tests.sh makeapp.mk makedefs.mk makefile number-lines/dub.json number-lines/makefile number-lines/src/tsv_utils/number-lines.d number-lines/tests/tests.sh tsv-append/dub.json tsv-append/makefile tsv-append/src/tsv_utils/tsv-append.d tsv-append/tests/input1x3.tsv tsv-append/tests/input1x4.tsv tsv-append/tests/input3x2.tsv tsv-append/tests/input3x5.tsv tsv-append/tests/tests.sh tsv-filter/dub.json tsv-filter/makefile tsv-filter/profile_data/collect_profile_data.sh tsv-filter/profile_data/profile_data_1.tsv tsv-filter/profile_data/profile_data_2.tsv tsv-filter/profile_data/profile_data_3.tsv tsv-filter/profile_data/profile_data_4.tsv tsv-filter/profile_data/profile_data_5.tsv tsv-filter/src/tsv_utils/tsv-filter.d tsv-filter/tests/input1.tsv tsv-filter/tests/input1_dos.tsv tsv-filter/tests/input1_noheader.tsv tsv-filter/tests/input2.tsv tsv-filter/tests/input2_pipe-sep.tsv tsv-filter/tests/input4.tsv tsv-filter/tests/input_3x0.tsv tsv-filter/tests/input_3x1.tsv tsv-filter/tests/input_3x2.tsv tsv-filter/tests/input_3x3.tsv tsv-filter/tests/input_emptyfile.tsv tsv-filter/tests/input_num_or_empty.tsv 
tsv-filter/tests/input_numeric_tests.tsv tsv-filter/tests/input_unicode.tsv tsv-filter/tests/test-config.json tsv-filter/tests/tests.sh tsv-join/dub.json tsv-join/makefile tsv-join/src/tsv_utils/tsv-join.d tsv-join/tests/input1.tsv tsv-join/tests/input1_dos.tsv tsv-join/tests/input1_noheader.tsv tsv-join/tests/input1_rotated.tsv tsv-join/tests/input2.tsv tsv-join/tests/input2_dos.tsv tsv-join/tests/input2_noheader.tsv tsv-join/tests/input_1x5.tsv tsv-join/tests/input_2x3_colon.tsv tsv-join/tests/input_5x4_colon.tsv tsv-join/tests/input_emptyfile.tsv tsv-join/tests/tests.sh tsv-pretty/dub.json tsv-pretty/makefile tsv-pretty/src/tsv_utils/tsv-pretty.d tsv-pretty/tests/emptyfile.tsv tsv-pretty/tests/input_5x1.tsv tsv-pretty/tests/input_5x1_alltext.tsv tsv-pretty/tests/input_5x1_noheader.tsv tsv-pretty/tests/input_5x1_noheader_preamble1.tsv tsv-pretty/tests/input_5x1_noheader_preamble2.tsv tsv-pretty/tests/input_5x1_preamble1.tsv tsv-pretty/tests/input_5x1_preamble2.tsv tsv-pretty/tests/input_5x2.tsv tsv-pretty/tests/input_5x2_noheader.tsv tsv-pretty/tests/input_5x2_noheader_preamble1.tsv tsv-pretty/tests/input_5x2_noheader_preamble2.tsv tsv-pretty/tests/input_5x2_preamble1.tsv tsv-pretty/tests/input_5x2_preamble2.tsv tsv-pretty/tests/input_5x3.tsv tsv-pretty/tests/input_5x3_preamble1.tsv tsv-pretty/tests/input_5x3_preamble2.tsv tsv-pretty/tests/input_5x4_noheader.tsv tsv-pretty/tests/input_5x5.tsv tsv-pretty/tests/input_comma_delim.tsv tsv-pretty/tests/input_mixed_1.tsv tsv-pretty/tests/input_mixed_2.tsv tsv-pretty/tests/input_numbers_1.tsv tsv-pretty/tests/input_numbers_2.tsv tsv-pretty/tests/input_numbers_3.tsv tsv-pretty/tests/input_numbers_4.tsv tsv-pretty/tests/input_numbers_noheader_1.tsv tsv-pretty/tests/input_numbers_noheader_2.tsv tsv-pretty/tests/input_numbers_noheader_3.tsv tsv-pretty/tests/input_numbers_noheader_4.tsv tsv-pretty/tests/input_sample_preamble.tsv tsv-pretty/tests/input_text_1.tsv tsv-pretty/tests/input_unicode.tsv 
tsv-pretty/tests/invalid_unicode.tsv tsv-pretty/tests/tests.sh tsv-sample/dub.json tsv-sample/makefile tsv-sample/profile_data/collect_profile_data.sh tsv-sample/profile_data/profile_data_1.tsv tsv-sample/profile_data/profile_data_2.tsv tsv-sample/profile_data/profile_data_3.tsv tsv-sample/src/tsv_utils/tsv-sample.d tsv-sample/tests/input2x10_noheader.tsv tsv-sample/tests/input2x1_noheader.tsv tsv-sample/tests/input2x5_noheader.tsv tsv-sample/tests/input2x5_noheader_dos.tsv tsv-sample/tests/input2x7_atsign.tsv tsv-sample/tests/input3x0.tsv tsv-sample/tests/input3x10.tsv tsv-sample/tests/input3x25.tsv tsv-sample/tests/input3x25_dos.tsv tsv-sample/tests/input3x25_negative_wt.tsv tsv-sample/tests/input3x3.tsv tsv-sample/tests/input3x4.tsv tsv-sample/tests/input4x15.tsv tsv-sample/tests/input4x50.tsv tsv-sample/tests/test-config.json tsv-sample/tests/tests.sh tsv-select/dub.json tsv-select/makefile tsv-select/profile_data/collect_profile_data.sh tsv-select/profile_data/profile_data_1.tsv tsv-select/profile_data/profile_data_2.tsv tsv-select/profile_data/profile_data_3.tsv tsv-select/src/tsv_utils/tsv-select.d tsv-select/tests/input1.tsv tsv-select/tests/input1_dos.tsv tsv-select/tests/input_1field.tsv tsv-select/tests/input_2fields.tsv tsv-select/tests/input_2plus_hat_delim.tsv tsv-select/tests/input_3plus_fields.tsv tsv-select/tests/input_3x0.tsv tsv-select/tests/input_3x1.tsv tsv-select/tests/input_3x2.tsv tsv-select/tests/input_3x3.tsv tsv-select/tests/input_8xlong.tsv tsv-select/tests/input_emptyfile.tsv tsv-select/tests/input_header1.tsv tsv-select/tests/input_header2.tsv tsv-select/tests/input_header3.tsv tsv-select/tests/input_header4.tsv tsv-select/tests/input_header_variants.tsv tsv-select/tests/tests.sh tsv-split/dub.json tsv-split/makefile tsv-split/profile_data/collect_profile_data.sh tsv-split/profile_data/profile_data_1.tsv tsv-split/profile_data/profile_data_2.tsv tsv-split/profile_data/profile_data_3.tsv tsv-split/src/tsv_utils/tsv-split.d 
tsv-split/tests/input4x18.tsv tsv-split/tests/input4x58.tsv tsv-split/tests/input4x58_colon-delim.tsv tsv-split/tests/tests.sh tsv-summarize/dub.json tsv-summarize/makefile tsv-summarize/profile_data/collect_profile_data.sh tsv-summarize/profile_data/profile_data_1.tsv tsv-summarize/profile_data/profile_data_2.tsv tsv-summarize/profile_data/profile_data_3.tsv tsv-summarize/src/tsv_utils/tsv-summarize.d tsv-summarize/tests/empty_file.tsv tsv-summarize/tests/input_1field_a.tsv tsv-summarize/tests/input_1field_a_dos.tsv tsv-summarize/tests/input_1field_b.tsv tsv-summarize/tests/input_2field_a.tsv tsv-summarize/tests/input_2field_b.tsv tsv-summarize/tests/input_5field_a.tsv tsv-summarize/tests/input_5field_b.tsv tsv-summarize/tests/input_5field_c.tsv tsv-summarize/tests/input_5field_d.tsv tsv-summarize/tests/input_5field_header_only.tsv tsv-summarize/tests/test-config.json tsv-summarize/tests/tests.sh tsv-uniq/dub.json tsv-uniq/makefile tsv-uniq/profile_data/collect_profile_data.sh tsv-uniq/profile_data/profile_data_1.tsv tsv-uniq/profile_data/profile_data_2.tsv tsv-uniq/profile_data/profile_data_3.tsv tsv-uniq/src/tsv_utils/tsv-uniq.d tsv-uniq/tests/input1.tsv tsv-uniq/tests/input1_noheader.tsv tsv-uniq/tests/input2.tsv tsv-uniq/tests/input3.tsv tsv-uniq/tests/input_delim_underscore.tsv tsv-uniq/tests/tests.sh <<<<<< network # path=./keep-header-src-tsv_utils-keep-header.lst |/** |Command line tool that executes a command while preserving header lines. | |Copyright (c) 2018-2020, eBay Inc. |Initially written by Jon Degenhardt | |License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) |*/ |module tsv_utils.keep_header; | |auto helpText = q"EOS |Execute a command against one or more files in a header aware fashion. |The first line of each file is assumed to be a header. The first header |is output unchanged. Remaining lines are sent to the given command via |standard input, excluding the header lines of subsequent files. 
Output |from the command is appended to the initial header line. | |A double dash (--) delimits the command, similar to how the pipe |operator (|) delimits commands. Examples: | | $ keep-header file1.txt -- sort | $ keep-header file1.txt file2.txt -- sort -k1,1nr | |These sort the files as usual, but preserve the header as the first line |output. Data can also be read from from standard input. Example: | | $ cat file1.txt | keep-header -- grep red | |Options: | |-V --version Print version information and exit. |-h --help This help information. |EOS"; | |static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; | |/** keep-header is a simple program, it is implemented entirely in main. | */ |int main(string[] args) |{ | import std.algorithm : findSplit, joiner; | import std.path : baseName, stripExtension; | import std.process : pipeProcess, ProcessPipes, Redirect, wait; | import std.range; | import std.stdio; | import std.typecons : tuple; | | /* When running in DMD code coverage mode, turn on report merging. */ | version(D_Coverage) version(DigitalMars) | { | import core.runtime : dmd_coverSetMerge; 41| dmd_coverSetMerge(true); | } | 82| auto programName = (args.length > 0) ? args[0].stripExtension.baseName : "Unknown_program_name"; 41| auto splitArgs = findSplit(args, ["--"]); | 75| if (splitArgs[1].length == 0 || splitArgs[2].length == 0) | { 9| auto cmdArgs = splitArgs[0][1 .. $]; 9| stderr.writefln("Synopsis: %s [file...] 
-- program [args]", programName); 9| if (cmdArgs.length > 0 && 18| (cmdArgs[0] == "-h" || cmdArgs[0] == "--help" || cmdArgs[0] == "--help-verbose")) | { 2| stderr.writeln(); 2| stderr.writeln(helpText); | } 7| else if (cmdArgs.length > 0 && 12| (cmdArgs[0] == "-V" || cmdArgs[0] == "--V" || cmdArgs[0] == "--version")) | { | import tsv_utils.common.tsvutils_version; 3| stderr.writeln(); 3| stderr.writeln(tsvutilsVersionNotice("keep-header")); | } 9| return 0; | } | 64| ProcessPipes pipe; 32| try pipe = pipeProcess(splitArgs[2], Redirect.stdin); | catch (Exception exc) | { 1| stderr.writefln("[%s] Command failed: '%s'", programName, splitArgs[2].joiner(" ")); 1| stderr.writeln(exc.msg); 1| return 1; | } | 31| int status = 0; | { | scope(exit) | { 31| auto pipeStatus = wait(pipe.pid); 31| if (pipeStatus != 0) status = pipeStatus; | } | 31| bool headerWritten = false; 281| foreach (filename; splitArgs[0].length > 1 ? splitArgs[0][1..$] : ["-"]) | { 53| bool isStdin = (filename == "-"); 106| File inputStream; | 63| if (isStdin) inputStream = stdin; | else | { 43| try inputStream = filename.File(); | catch (Exception exc) | { 1| stderr.writefln("[%s] Unable to open file: '%s'", programName, filename); 1| stderr.writeln(exc.msg); 1| status = 1; 1| break; | } | } | 52| auto firstLine = inputStream.readln(); | 74| if (inputStream.eof && firstLine.length == 0) continue; | 41| if (!headerWritten) | { 27| write(firstLine); 27| stdout.flush; 27| headerWritten = true; | } | 41| if (isStdin) | { 84| foreach (line; inputStream.byLine(KeepTerminator.yes)) | { 20| pipe.stdin.write(line); | } | } | else | { 33| ubyte[1024 * 128] readBuffer; 162| foreach (ubyte[] chunk; inputStream.byChunk(readBuffer)) | { 21| pipe.stdin.write(cast(char[])chunk); | } | } 41| pipe.stdin.flush; | } 31| pipe.stdin.close; | } 31| return status; |} keep-header/src/tsv_utils/keep-header.d is 100% covered <<<<<< EOF # path=./tsv-uniq-src-tsv_utils-tsv-uniq.lst |/** |Command line tool that identifies 
equivalent lines in an input stream. Equivalent |lines are identified using either the full line or a set of fields as the key. By |default, input is written to standard output, retaining only the first occurrence of |equivalent lines. There are also options for marking and numbering equivalent lines |rather, without filtering out duplicates. | |This tool is similar in spirit to the Unix 'uniq' tool, with some key differences. |First, the key can be composed of individual fields, not just the full line. Second, |input does not need to be sorted. (Unix 'uniq' only detects equivalent lines when |they are adjacent, hence the usual need for sorting.) | |There are a couple alternative to uniq'ing the input lines. One is to mark lines with |an equivalence ID, which is a one-upped counter. The other is to number lines, with |each unique key have its own set of numbers. | |Copyright (c) 2015-2020, eBay Inc. |Initially written by Jon Degenhardt | |License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) |*/ |module tsv_utils.tsv_uniq; | |import std.exception : enforce; |import std.format : format; |import std.range; |import std.stdio; |import std.typecons : tuple; | |auto helpText = q"EOS |Synopsis: tsv-uniq [options] [file...] | |tsv-uniq filters out duplicate lines using fields as a key. Filtering is |based on the entire line when key fields are not provided. Options are |also available for assigning a unique id to each key and numbering the |occurrences of each key. Fields are specified using field number or field |name. Field names can be used when the input file has a header line. | |Use '--help-verbose' for more details. |Options: |EOS"; | |auto helpTextVerbose = q"EOS |Synopsis: tsv-uniq [options] [file...] | |tsv-uniq identifies equivalent lines in tab-separated value files. Input |is read line by line, recording a key for each line based on one or more |of the fields. Two lines are equivalent if they have the same key. 
The |first time a key is seen its line is written to standard output. |Subsequent lines containing the same key are discarded. This command |uniq's a file on fields 2 and 3: | | tsv-uniq -f 2,3 file.tsv | |This is similar to the Unix 'uniq' program, but based on individual |fields and without requiring sorted data. | |Field names can be used if the input file has a header line. This command |uniq's a file based on the 'time' and 'date' fields: | | tsv-uniq -H -f time,date file.tsv | |Use '--help-fields' for details about field names. | |tsv-uniq can be run without specifying a key field. In this case the |whole line is used as a key, same as the Unix 'uniq' program. This works |on any line-oriented text file, not just TSV files. | |The above is the default behavior ('uniq' mode). The alternates to 'uniq' |mode are 'number' mode and 'equiv-class' mode. In 'equiv-class' mode, all |lines are written to standard output, but with a field appended marking |equivalent entries with an ID. The ID is a one-upped counter. Example: | | tsv-uniq --header -f 2,3 --equiv file.tsv | |'Number' mode also writes all lines to standard output, but with a field |appended numbering the occurrence count for the line's key. The first line |with a specific key is assigned the number '1', the second with the key is |assigned number '2', etc. 'Number' and 'equiv-class' modes can be combined. | |The '--r|repeated' option can be used to print only lines occurring more |than once. Specifically, the second occurrence of a key is printed. The |'--a|at-least N' option is similar, printing lines occurring at least N |times. (Like repeated, the Nth line with the key is printed.) | |The '--m|max MAX' option changes the behavior to output the first MAX |lines for each key, rather than just the first line for each key. | |If both '--a|at-least' and '--m|max' are specified, the occurrences |starting with 'at-least' and ending with 'max' are output. 
| |Options: |EOS"; | |/** Container for command line options. | */ |struct TsvUniqOptions |{ | import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader; | | enum defaultEquivHeader = "equiv_id"; | enum defaultEquivStartID = 1; | enum defaultNumberHeader = "equiv_line"; | | string programName; | InputSourceRange inputSources; /// Input files | size_t[] fields; /// Derived: --f|fields | bool hasHeader = false; /// --H|header | bool onlyRepeated = false; /// --r|repeated. Shorthand for '--atleast 2' | size_t atLeast = 0; /// --a|at-least. Zero implies default behavior. | size_t max = 0; /// --m|max. Zero implies default behavior. | bool numberMode = false; /// --z|number | string numberHeader = defaultNumberHeader; /// --number-header | bool equivMode = false; /// --e|equiv | string equivHeader = defaultEquivHeader; /// --equiv-header | long equivStartID = defaultEquivStartID; /// --equiv-start | bool ignoreCase = false; /// --i|ignore-case | char delim = '\t'; /// --d|delimiter | bool keyIsFullLine = false; /// Derived. True if no fields specified or '--f|fields 0' | | /* Returns a tuple. First value is true if command line arguments were successfully | * processed and execution should continue, or false if an error occurred or the user | * asked for help. If false, the second value is the appropriate exit code (0 or 1). | * | * Returning true (execution continues) means args have been validated and derived | * values calculated. In addition, field indices have been converted to zero-based. | * If the whole line is the key, the individual fields list will be cleared. | * | * Repeat count control variables 'atLeast' and max' - These values are left at zero | * if no repeat count options are specified. They are set if repeat count options | * are specified, as follows: | * * atLeast - Will be zero unless --r|repeated or --a|at-least is specified. | * --r|repeated option sets it 2, --a|at-least sets it to the specified value. 
| * * max - Default to zero. Is set to the --m|max value if provided. Is set to | * 'atLeast' if --r|repeated or --a|at-least is provided. | * | * An exception to the above: If --e|equiv-mode is specified, then (max == 0) | * represents the default "output all values" case. In this case max may be less | * than the at-least value. | */ | auto processArgs (ref string[] cmdArgs) | { | import std.algorithm : all, each; | import std.conv : to; | import std.getopt; | import std.path : baseName, stripExtension; | import std.typecons : Yes, No; | import tsv_utils.common.fieldlist; | import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix; | 135| bool helpVerbose = false; // --h|help-verbose 135| bool helpFields = false; // --help-fields 135| bool versionWanted = false; // --V|version 135| string fieldsArg; // --f|fields | 135| string fieldsOptionString = "f|fields"; | 270| programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; | | try | { 135| arraySep = ","; // Use comma to separate values in command line options 135| auto r = getopt( | cmdArgs, | "help-verbose", " Print full help.", &helpVerbose, | "help-fields", " Print help on specifying fields.", &helpFields, | std.getopt.config.caseSensitive, | "V|version", " Print version information and exit.", &versionWanted, | "H|header", " Treat the first line of each file as a header.", &hasHeader, | std.getopt.config.caseInsensitive, | | fieldsOptionString, " Fields to use as the key. Default: 0 (entire line).", &fieldsArg, | | "i|ignore-case", " Ignore case when comparing keys.", &ignoreCase, | "r|repeated", " Output only lines that are repeated (based on the key).", &onlyRepeated, | "a|at-least", "INT Output only lines that are repeated INT times (based on the key). 
Zero and one are ignored.", &atLeast, | "m|max", "INT Max number of each unique key to output (zero is ignored).", &max, | "e|equiv", " Output equivalence class IDs rather than uniq'ing entries.", &equivMode, | "equiv-header", "STR Use STR as the equiv-id field header (when using '-H --equiv'). Default: 'equiv_id'.", &equivHeader, | "equiv-start", "INT Use INT as the first equiv-id. Default: 1.", &equivStartID, | "z|number", " Output equivalence class occurrence counts rather than uniq'ing entries.", &numberMode, | "number-header", "STR Use STR as the '--number' field header (when using '-H --number)'. Default: 'equiv_line'.", &numberHeader, | "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim, | ); | 131| if (r.helpWanted) | { 1| defaultGetoptPrinter(helpText, r.options); 1| return tuple(false, 0); | } 130| else if (helpVerbose) | { 1| defaultGetoptPrinter(helpTextVerbose, r.options); 1| return tuple(false, 0); | } 129| else if (helpFields) | { 1| writeln(fieldListHelpText); 1| return tuple(false, 0); | } 128| else if (versionWanted) | { | import tsv_utils.common.tsvutils_version; 2| writeln(tsvutilsVersionNotice("tsv-uniq")); 2| return tuple(false, 0); | } | | /* Input files. Remaining command line args are files. */ 252| string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"]; 126| cmdArgs.length = 1; | | /* Validation - Do as much validation prior to header line processing as | * possible (avoids waiting on stdin). | */ 126| if (!equivMode) | { 108| enforce(equivHeader == defaultEquivHeader, "--equiv-header requires --e|equiv"); 106| enforce(equivStartID == defaultEquivStartID, "--equiv-start requires --e|equiv"); | } | 230| enforce(numberMode || numberHeader == defaultNumberHeader, 2| "--number-header requires --z|number"); | 120| string[] headerFields; | | /* fieldListArgProcessing encapsulates the field list processing. 
It is | * called prior to reading the header line if headers are not being used, | * and after if headers are being used. | */ | void fieldListArgProcessing() | { 120| if (!fieldsArg.empty) | { 74| fields = | fieldsArg | .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) | (hasHeader, headerFields, fieldsOptionString) | .array; | } | 302| enforce(fields.length <= 1 || fields.all!(x => x != 0), 3| "Whole line as key (--f|field 0) cannot be combined with multiple fields."); | 108| if (fields.length == 0) | { 46| keyIsFullLine = true; | } 84| else if (fields.length == 1 && fields[0] == 0) | { 2| keyIsFullLine = true; 2| fields.length = 0; | } | 130| if (onlyRepeated && atLeast <= 1) atLeast = 2; 138| if (atLeast >= 2 && max < atLeast) | { | // Don't modify max if it is zero and equivMode or numberMode is in effect. 62| if (max != 0 || (!equivMode && !numberMode)) max = atLeast; | } | 330| if (!keyIsFullLine) fields.each!((ref x) => --x); // Convert to 0-based indexing. | } | 147| if (!hasHeader) fieldListArgProcessing(); | | /* | * Create the inputSourceRange and perform header line processing. | */ 230| ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader; 115| inputSources = inputSourceRange(filepaths, readHeader); | 115| if (hasHeader) | { 93| throwIfWindowsNewlineOnUnix(inputSources.front.header, inputSources.front.name, 1); 93| headerFields = inputSources.front.header.split(delim).to!(string[]); 93| fieldListArgProcessing(); | } | } | catch (Exception exc) | { 22| stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 22| return tuple(false, 1); | } 108| return tuple(true, 0); | } |} | |static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; | |/** Main program. Processes command line arguments and calls tsvUniq which implements | * the main processing logic. 
| */ |int main(string[] cmdArgs) |{ | /* When running in DMD code coverage mode, turn on report merging. */ | version(D_Coverage) version(DigitalMars) | { | import core.runtime : dmd_coverSetMerge; 135| dmd_coverSetMerge(true); | } | 135| TsvUniqOptions cmdopt; 135| auto r = cmdopt.processArgs(cmdArgs); 162| if (!r[0]) return r[1]; | | version(LDC_Profile) | { | import ldc.profile : resetAll; | resetAll(); | } | 108| try tsvUniq(cmdopt); | catch (Exception exc) | { 4| stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 4| return 1; | } 104| return 0; |} | |/** Outputs the unique lines from all the input files. | * | * Processes the lines in each input file. All lines are added to an associated array. | * The first time a line is seen it is output. If key fields are being used these are | * used as the basis for the associative array entries rather than the full line. | */ |void tsvUniq(ref TsvUniqOptions cmdopt) |{ | import tsv_utils.common.utils : bufferedByLine, BufferedOutputRange, | InputFieldReordering, InputSourceRange, joinAppend, throwIfWindowsNewlineOnUnix; | import std.algorithm : splitter; | import std.array : appender; | import std.conv : to; | import std.uni : asLowerCase; | import std.utf : byChar; | | /* inputSources must be an InputSourceRange and include at least stdin. */ 108| assert(!cmdopt.inputSources.empty); | static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); | | /* InputFieldReordering maps the key fields from an input line to a separate buffer. */ 216| auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.fields); | | /* BufferedOutputRange is a performance enhancement for writing to stdout. */ 216| auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); | | /* The master hash. The key is the specified fields concatenated together (including | * separators). The value is a struct with the equiv-id and occurrence count. 
| */ | static struct EquivEntry { size_t equivID; size_t count; } 108| EquivEntry[string] equivHash; | | /* Reusable buffers for multi-field keys and case-insensitive keys. */ 108| auto multiFieldKeyBuffer = appender!(char[]); 108| auto lowerKeyBuffer = appender!(char[]); | 108| const size_t numKeyFields = cmdopt.fields.length; 108| long nextEquivID = cmdopt.equivStartID; | | /* First header is read during command line arg processing. Flush it immediately | * so subsequent processes in a unix command pipeline see it early. This helps | * provide timely error messages. | */ 194| if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) | { 84| auto inputStream = cmdopt.inputSources.front; | 84| bufferedOutput.append(inputStream.header); | 84| if (cmdopt.equivMode) | { 17| bufferedOutput.append(cmdopt.delim); 17| bufferedOutput.append(cmdopt.equivHeader); | } | 84| if (cmdopt.numberMode) | { 10| bufferedOutput.append(cmdopt.delim); 10| bufferedOutput.append(cmdopt.numberHeader); | } | 84| bufferedOutput.appendln(); 84| bufferedOutput.flush(); | } | 216| immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; | 628| foreach (inputStream; cmdopt.inputSources) | { 257| if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); | 11622| foreach (lineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine)) | { 2291| if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum); | | /* Start by finding the key. */ 2270| typeof(line) key; 2270| if (cmdopt.keyIsFullLine) | { 1121| key = line; | } | else | { 1149| assert(keyFieldsReordering !is null); | | /* Copy the key fields to a new buffer. 
*/ 1149| keyFieldsReordering.initNewLine; 19758| foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) | { 3950| keyFieldsReordering.processNextField(fieldIndex, fieldValue); 5095| if (keyFieldsReordering.allFieldsFilled) break; | } | 1149| enforce(keyFieldsReordering.allFieldsFilled, 4| format("Not enough fields in line. File: %s, Line: %s", | inputStream.name, lineNum)); | 1145| if (numKeyFields == 1) | { 406| key = keyFieldsReordering.outputFields[0]; | } | else | { 739| multiFieldKeyBuffer.clear(); 739| keyFieldsReordering.outputFields.joinAppend(multiFieldKeyBuffer, cmdopt.delim); 739| key = multiFieldKeyBuffer.data; | } | } | 2266| if (cmdopt.ignoreCase) | { | /* Equivalent to key = key.toLower, but without memory allocation. */ 601| lowerKeyBuffer.clear(); 601| lowerKeyBuffer.put(key.asLowerCase.byChar); 601| key = lowerKeyBuffer.data; | } | 2266| bool isOutput = false; 2266| EquivEntry currEntry; 2266| EquivEntry* priorEntry = (key in equivHash); 2266| if (priorEntry is null) | { 1402| isOutput = (cmdopt.atLeast <= 1); 1402| currEntry.equivID = nextEquivID; 1402| currEntry.count = 1; 1402| equivHash[key.to!string] = currEntry; 1402| nextEquivID++; | } | else | { 864| (*priorEntry).count++; 864| currEntry = *priorEntry; | 1193| if ((currEntry.count <= cmdopt.max && currEntry.count >= cmdopt.atLeast) || 762| (cmdopt.equivMode && cmdopt.max == 0) || 529| (cmdopt.numberMode && cmdopt.max == 0)) | { 399| isOutput = true; | } | } | 2266| if (isOutput) | { 1396| bufferedOutput.append(line); | 1396| if (cmdopt.equivMode) | { 387| bufferedOutput.append(cmdopt.delim); 387| bufferedOutput.append(currEntry.equivID.to!string); | } | 1396| if (cmdopt.numberMode) | { 270| bufferedOutput.append(cmdopt.delim); 270| bufferedOutput.append(currEntry.count.to!string); | } | 1396| bufferedOutput.appendln(); | } | } | } |} tsv-uniq/src/tsv_utils/tsv-uniq.d is 100% covered <<<<<< EOF # path=./common-src-tsv_utils-common-unittest_utils.lst |/** |Helper functions 
for tsv-utils unit tests. | |Copyright (c) 2017-2020, eBay Inc. |Initially written by Jon Degenhardt | |License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) |*/ | |module tsv_utils.common.unittest_utils; | |version(unittest) |{ | /* Creates a temporary directory for writing unit test files. The path of the created | * directory is returned. The 'toolDirName' argument will be included in the directory | * name, and should consist of generic filename characters. e.g. "tsv_append". This | * name will also be used in assert error messages. | * | * The caller should delete the temporary directory and all its contents when tests | * are finished. This can be done using std.file.rmdirRecurse. For example: | * | * unittest | * { | * import std.file : rmdirRecurse; | * auto testDir = makeUnittestTempDir("tsv_append"); | * scope(exit) testDir.rmdirRecurse; | * ... test code | * } | * | * An assert is triggered if the directory cannot be created. There are two typical | * reasons: | * - Unable to find an available directory name. A number of unique names are tried | * (currently 1000). If they are all taken, it will normally be because the directories | * haven't been properly cleaned up from previous unit test runs. | * - Directory creation failed. e.g. Permission denied. | * | * This routine is intended to be run in 'unittest' mode, so that an assert is triggered | * on failure. However, if run with asserts disabled, the returned path will be empty in | * event of a failure. 
| */ | string makeUnittestTempDir(string toolDirName) @safe | { | import std.conv : to; | import std.file : exists, mkdir, tempDir; | import std.format : format; | import std.path : buildPath; | import std.range; | 31| string dirNamePrefix = "ebay_tsv_utils__" ~ toolDirName ~ "_unittest_"; 31| string systemTempDirPath = tempDir(); 31| string newTempDirPath = ""; | 186| for (auto i = 0; i < 1000 && newTempDirPath.empty; i++) | { 31| string path = buildPath(systemTempDirPath, dirNamePrefix ~ i.to!string); 62| if (!path.exists) newTempDirPath = path; | } 31| assert (!newTempDirPath.empty, | format("Unable to obtain a new temp directory, paths tried already exist.\nPath prefix: %s", | buildPath(systemTempDirPath, dirNamePrefix))); | 31| if (!newTempDirPath.empty) | { 31| try mkdir(newTempDirPath); | catch (Exception exc) | { 0000000| assert(false, format("Failed to create temp directory: %s\n Error: %s", | newTempDirPath, exc.msg)); | } | } | 31| return newTempDirPath; | } | | /* Write a TSV file. The 'tsvData' argument is a 2-dimensional array of rows and | * columns. Asserts if the file cannot be written. | * | * This routine is intended to be run in 'unittest' mode, so that it will assert | * if the write fails. However, if run in a mode with asserts disabled, it will | * return false if the write failed. | */ | bool writeUnittestTsvFile(string filepath, string[][] tsvData, char delimiter = '\t') @safe | { | import std.algorithm : each, joiner, map; | import std.conv : to; | import std.format: format; | import std.stdio : File; | | try | { 60| auto file = File(filepath, "w"); 30| tsvData 669| .map!(row => row.joiner(delimiter.to!string)) 1338| .each!(str => file.writeln(str)); | } | catch (Exception exc) | { 0000000| assert(false, format("Failed to write TSV file: %s.\n Error: %s", | filepath, exc.msg)); 0000000| return false; | } | 30| return true; | } | | /* Convert a 2-dimensional array of values to an in-memory string. 
*/ | string tsvDataToString(string[][] tsvData, char delimiter = '\t') @safe | { | import std.algorithm : joiner, map; | import std.conv : to; | 539| return tsvData 11856| .map!(row => row.joiner(delimiter.to!string).to!string ~ "\n") | .joiner | .to!string; | } | } common/src/tsv_utils/common/unittest_utils.d is 85% covered <<<<<< EOF # path=./common-src-tsv_utils-common-utils.lst |/** |Utilities used by tsv-utils applications. InputFieldReordering, BufferedOutputRange, |and a several others. | |Utilities in this file: |$(LIST | * [InputFieldReordering] - A class that creates a reordered subset of fields from | an input line. Fields in the subset are accessed by array indicies. This is | especially useful when processing the subset in a specific order, such as the | order listed on the command-line at run-time. | | * [BufferedOutputRange] - An OutputRange with an internal buffer used to buffer | output. Intended for use with stdout, it is a significant performance benefit. | | * [isFlushableOutputRange] - Tests if something is an OutputRange with a flush | member. | | * [bufferedByLine] - An input range that reads from a File handle line by line. | It is similar to the standard library method std.stdio.File.byLine, but quite a | bit faster. This is achieved by reading in larger blocks and buffering. | | * [InputSourceRange] - An input range that provides open file access to a set of | files. It is used to iterate over files passed as command line arguments. This | enable reading header line of a file during command line argument process, then | passing the open file to the main processing functions. | | * [ByLineSourceRange] - Similar to an InputSourceRange, except that it provides | access to a byLine iterator (bufferedByLine) rather than an open file. This is | used by tools that run the same processing logic both header non-header lines. | | * [joinAppend] - A function that performs a join, but appending the join output to | an output stream. 
It is a performance improvement over using join or joiner with | writeln. | | * [getTsvFieldValue] - A convenience function when only a single value is needed | from an input line. | | * [throwIfWindowsNewlineOnUnix] - A utility for Unix platform builds to detecting | Windows newlines in input. |) | |Copyright (c) 2015-2020, eBay Inc. |Initially written by Jon Degenhardt | |License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) |*/ | |module tsv_utils.common.utils; | |import std.range; |import std.traits : isIntegral, isSomeChar, isSomeString, isUnsigned, ReturnType; |import std.typecons : Flag, No, Yes; | |// InputFieldReording class. | |/** Flag used by the InputFieldReordering template. */ |alias EnablePartialLines = Flag!"enablePartialLines"; | |/** |InputFieldReordering - Move select fields from an input line to an output array, |reordering along the way. | |The InputFieldReordering class is used to reorder a subset of fields from an input line. |The caller instantiates an InputFieldReordering object at the start of input processing. |The instance contains a mapping from input index to output index, plus a buffer holding |the reordered fields. The caller processes each input line by calling initNewLine, |splitting the line into fields, and calling processNextField on each field. The output |buffer is ready when the allFieldsFilled method returns true. | |Fields are not copied, instead the output buffer points to the fields passed by the caller. |The caller needs to use or copy the output buffer while the fields are still valid, which |is normally until reading the next input line. The program below illustrates the basic use |case. It reads stdin and outputs fields [3, 0, 2], in that order. (See also joinAppend, |below, which has a performance improvement over join used here.) 
| |--- |int main(string[] args) |{ | import tsv_utils.common.utils; | import std.algorithm, std.array, std.range, std.stdio; | size_t[] fieldIndicies = [3, 0, 2]; | auto fieldReordering = new InputFieldReordering!char(fieldIndicies); | foreach (line; stdin.byLine) | { | fieldReordering.initNewLine; | foreach(fieldIndex, fieldValue; line.splitter('\t').enumerate) | { | fieldReordering.processNextField(fieldIndex, fieldValue); | if (fieldReordering.allFieldsFilled) break; | } | if (fieldReordering.allFieldsFilled) | { | writeln(fieldReordering.outputFields.join('\t')); | } | else | { | writeln("Error: Insufficient number of field on the line."); | } | } | return 0; |} |--- | |Field indicies are zero-based. An individual field can be listed multiple times. The |outputFields array is not valid until all the specified fields have been processed. The |allFieldsFilled method tests this. If a line does not have enough fields the outputFields |buffer cannot be used. For most TSV applications this is okay, as it means the line is |invalid and cannot be used. However, if partial lines are okay, the template can be |instantiated with EnablePartialLines.yes. This will ensure that any fields not filled-in |are empty strings in the outputFields return. |*/ |final class InputFieldReordering(C, EnablePartialLines partialLinesOk = EnablePartialLines.no) |if (isSomeChar!C) |{ | /* Implementation: The class works by creating an array of tuples mapping the input | * field index to the location in the outputFields array. The 'fromToMap' array is | * sorted in input field order, enabling placement in the outputFields buffer during a | * pass over the input fields. The map is created by the constructor. An example: | * | * inputFieldIndicies: [3, 0, 7, 7, 1, 0, 9] | * fromToMap: [<0,1>, <0,5>, <1,4>, <3,0>, <7,2>, <7,3>, <9,6>] | * | * During processing of an a line, an array slice, mapStack, is used to track how | * much of the fromToMap remains to be processed. 
| */ | import std.range; | import std.typecons : Tuple; | | alias TupleFromTo = Tuple!(size_t, "from", size_t, "to"); | | private C[][] outputFieldsBuf; | private TupleFromTo[] fromToMap; | private TupleFromTo[] mapStack; | 970| final this(const ref size_t[] inputFieldIndicies, size_t start = 0) pure nothrow @safe | { | import std.algorithm : sort; | 970| outputFieldsBuf = new C[][](inputFieldIndicies.length); 970| fromToMap.reserve(inputFieldIndicies.length); | 9355| foreach (to, from; inputFieldIndicies.enumerate(start)) | { 1483| fromToMap ~= TupleFromTo(from, to); | } | 970| sort(fromToMap); 970| initNewLine; | } | | /** initNewLine initializes the object for a new line. */ | final void initNewLine() pure nothrow @safe | { 21603| mapStack = fromToMap; | static if (partialLinesOk) | { | import std.algorithm : each; 84| outputFieldsBuf.each!((ref s) => s.length = 0); | } | } | | /** processNextField maps an input field to the correct locations in the | * outputFields array. | * | * processNextField should be called once for each field on the line, in the order | * found. The processing of the line can terminate once allFieldsFilled returns | * true. | * | * The return value is the number of output fields the input field maps to. Zero | * means the field is not mapped to the output fields array. | * | * If, prior to allFieldsProcessed returning true, any fields on the input line | * are not passed to processNextField, the caller should either ensure the fields | * are not part of the output fields or have partial lines enabled. | */ | final size_t processNextField(size_t fieldIndex, C[] fieldValue) pure nothrow @safe @nogc | { 65840| size_t numFilled = 0; 169086| while (!mapStack.empty && fieldIndex == mapStack.front.from) | { 32108| outputFieldsBuf[mapStack.front.to] = fieldValue; 32108| mapStack.popFront; 32108| numFilled++; | } 65840| return numFilled; | } | | /** allFieldsFilled returned true if all fields expected have been processed. 
*/ | final bool allFieldsFilled() const pure nothrow @safe @nogc | { 80954| return mapStack.empty; | } | | /** outputFields is the assembled output fields. Unless partial lines are enabled, | * it is only valid after allFieldsFilled is true. | */ | final C[][] outputFields() pure nothrow @safe @nogc | { 20571| return outputFieldsBuf[]; | } |} | |// InputFieldReordering - Tests using different character types. |@safe unittest |{ | import std.conv : to; | 7| auto inputLines = [["r1f0", "r1f1", "r1f2", "r1f3"], | ["r2f0", "abc", "ÀBCßßZ", "ghi"], | ["r3f0", "123", "456", "789"]]; | 7| size_t[] fields_2_0 = [2, 0]; | 7| auto expected_2_0 = [["r1f2", "r1f0"], | ["ÀBCßßZ", "r2f0"], | ["456", "r3f0"]]; | 7| char[][][] charExpected_2_0 = to!(char[][][])(expected_2_0); 7| wchar[][][] wcharExpected_2_0 = to!(wchar[][][])(expected_2_0); 7| dchar[][][] dcharExpected_2_0 = to!(dchar[][][])(expected_2_0); 7| dstring[][] dstringExpected_2_0 = to!(dstring[][])(expected_2_0); | 7| auto charIFR = new InputFieldReordering!char(fields_2_0); 7| auto wcharIFR = new InputFieldReordering!wchar(fields_2_0); 7| auto dcharIFR = new InputFieldReordering!dchar(fields_2_0); | 105| foreach (lineIndex, line; inputLines) | { 21| charIFR.initNewLine; 21| wcharIFR.initNewLine; 21| dcharIFR.initNewLine; | 399| foreach (fieldIndex, fieldValue; line) | { 84| charIFR.processNextField(fieldIndex, to!(char[])(fieldValue)); 84| wcharIFR.processNextField(fieldIndex, to!(wchar[])(fieldValue)); 84| dcharIFR.processNextField(fieldIndex, to!(dchar[])(fieldValue)); | 84| assert ((fieldIndex >= 2) == charIFR.allFieldsFilled); 84| assert ((fieldIndex >= 2) == wcharIFR.allFieldsFilled); 84| assert ((fieldIndex >= 2) == dcharIFR.allFieldsFilled); | } 21| assert(charIFR.allFieldsFilled); 21| assert(wcharIFR.allFieldsFilled); 21| assert(dcharIFR.allFieldsFilled); | 21| assert(charIFR.outputFields == charExpected_2_0[lineIndex]); 21| assert(wcharIFR.outputFields == wcharExpected_2_0[lineIndex]); 21| 
assert(dcharIFR.outputFields == dcharExpected_2_0[lineIndex]); | } |} | |// InputFieldReordering - Test of partial line support. |@safe unittest |{ | import std.conv : to; | 7| auto inputLines = [["r1f0", "r1f1", "r1f2", "r1f3"], | ["r2f0", "abc", "ÀBCßßZ", "ghi"], | ["r3f0", "123", "456", "789"]]; | 7| size_t[] fields_2_0 = [2, 0]; | | // The expected states of the output field while each line and field are processed. 7| auto expectedBylineByfield_2_0 = | [ | [["", "r1f0"], ["", "r1f0"], ["r1f2", "r1f0"], ["r1f2", "r1f0"]], | [["", "r2f0"], ["", "r2f0"], ["ÀBCßßZ", "r2f0"], ["ÀBCßßZ", "r2f0"]], | [["", "r3f0"], ["", "r3f0"], ["456", "r3f0"], ["456", "r3f0"]], | ]; | 7| char[][][][] charExpectedBylineByfield_2_0 = to!(char[][][][])(expectedBylineByfield_2_0); | 7| auto charIFR = new InputFieldReordering!(char, EnablePartialLines.yes)(fields_2_0); | 105| foreach (lineIndex, line; inputLines) | { 21| charIFR.initNewLine; 399| foreach (fieldIndex, fieldValue; line) | { 84| charIFR.processNextField(fieldIndex, to!(char[])(fieldValue)); 84| assert(charIFR.outputFields == charExpectedBylineByfield_2_0[lineIndex][fieldIndex]); | } | } |} | |// InputFieldReordering - Field combination tests. 
|@safe unittest |{ | import std.conv : to; | import std.stdio; | 7| auto inputLines = [["00", "01", "02", "03"], | ["10", "11", "12", "13"], | ["20", "21", "22", "23"]]; | 7| size_t[] fields_0 = [0]; 7| size_t[] fields_3 = [3]; 7| size_t[] fields_01 = [0, 1]; 7| size_t[] fields_10 = [1, 0]; 7| size_t[] fields_03 = [0, 3]; 7| size_t[] fields_30 = [3, 0]; 7| size_t[] fields_0123 = [0, 1, 2, 3]; 7| size_t[] fields_3210 = [3, 2, 1, 0]; 7| size_t[] fields_03001 = [0, 3, 0, 0, 1]; | 7| auto expected_0 = to!(char[][][])([["00"], | ["10"], | ["20"]]); | 7| auto expected_3 = to!(char[][][])([["03"], | ["13"], | ["23"]]); | 7| auto expected_01 = to!(char[][][])([["00", "01"], | ["10", "11"], | ["20", "21"]]); | 7| auto expected_10 = to!(char[][][])([["01", "00"], | ["11", "10"], | ["21", "20"]]); | 7| auto expected_03 = to!(char[][][])([["00", "03"], | ["10", "13"], | ["20", "23"]]); | 7| auto expected_30 = to!(char[][][])([["03", "00"], | ["13", "10"], | ["23", "20"]]); | 7| auto expected_0123 = to!(char[][][])([["00", "01", "02", "03"], | ["10", "11", "12", "13"], | ["20", "21", "22", "23"]]); | 7| auto expected_3210 = to!(char[][][])([["03", "02", "01", "00"], | ["13", "12", "11", "10"], | ["23", "22", "21", "20"]]); | 7| auto expected_03001 = to!(char[][][])([["00", "03", "00", "00", "01"], | ["10", "13", "10", "10", "11"], | ["20", "23", "20", "20", "21"]]); | 7| auto ifr_0 = new InputFieldReordering!char(fields_0); 7| auto ifr_3 = new InputFieldReordering!char(fields_3); 7| auto ifr_01 = new InputFieldReordering!char(fields_01); 7| auto ifr_10 = new InputFieldReordering!char(fields_10); 7| auto ifr_03 = new InputFieldReordering!char(fields_03); 7| auto ifr_30 = new InputFieldReordering!char(fields_30); 7| auto ifr_0123 = new InputFieldReordering!char(fields_0123); 7| auto ifr_3210 = new InputFieldReordering!char(fields_3210); 7| auto ifr_03001 = new InputFieldReordering!char(fields_03001); | 105| foreach (lineIndex, line; inputLines) | { 21| ifr_0.initNewLine; 21| 
ifr_3.initNewLine; 21| ifr_01.initNewLine; 21| ifr_10.initNewLine; 21| ifr_03.initNewLine; 21| ifr_30.initNewLine; 21| ifr_0123.initNewLine; 21| ifr_3210.initNewLine; 21| ifr_03001.initNewLine; | 399| foreach (fieldIndex, fieldValue; line) | { 84| ifr_0.processNextField(fieldIndex, to!(char[])(fieldValue)); 84| ifr_3.processNextField(fieldIndex, to!(char[])(fieldValue)); 84| ifr_01.processNextField(fieldIndex, to!(char[])(fieldValue)); 84| ifr_10.processNextField(fieldIndex, to!(char[])(fieldValue)); 84| ifr_03.processNextField(fieldIndex, to!(char[])(fieldValue)); 84| ifr_30.processNextField(fieldIndex, to!(char[])(fieldValue)); 84| ifr_0123.processNextField(fieldIndex, to!(char[])(fieldValue)); 84| ifr_3210.processNextField(fieldIndex, to!(char[])(fieldValue)); 84| ifr_03001.processNextField(fieldIndex, to!(char[])(fieldValue)); | } | 21| assert(ifr_0.outputFields == expected_0[lineIndex]); 21| assert(ifr_3.outputFields == expected_3[lineIndex]); 21| assert(ifr_01.outputFields == expected_01[lineIndex]); 21| assert(ifr_10.outputFields == expected_10[lineIndex]); 21| assert(ifr_03.outputFields == expected_03[lineIndex]); 21| assert(ifr_30.outputFields == expected_30[lineIndex]); 21| assert(ifr_0123.outputFields == expected_0123[lineIndex]); 21| assert(ifr_3210.outputFields == expected_3210[lineIndex]); 21| assert(ifr_03001.outputFields == expected_03001[lineIndex]); | } |} | | |import std.stdio : File, isFileHandle, KeepTerminator; |import std.range : isOutputRange; |import std.traits : Unqual; | |/** |BufferedOutputRange is a performance enhancement over writing directly to an output |stream. It holds a File open for write or an OutputRange. Ouput is accumulated in an |internal buffer and written to the output stream as a block. | |Writing to stdout is a key use case. BufferedOutputRange is often dramatically faster |than writing to stdout directly. This is especially noticable for outputs with short |lines, as it blocks many writes together in a single write. 
| |The internal buffer is written to the output stream after flushSize has been reached. |This is checked at newline boundaries, when appendln is called or when put is called |with a single newline character. Other writes check maxSize, which is used to avoid |runaway buffers. | |BufferedOutputRange has a put method allowing it to be used a range. It has a number |of other methods providing additional control. | |$(LIST | * `this(outputStream [, flushSize, reserveSize, maxSize])` - Constructor. Takes the | output stream, e.g. stdout. Other arguments are optional, defaults normally suffice. | | * `append(stuff)` - Append to the internal buffer. | | * `appendln(stuff)` - Append to the internal buffer, followed by a newline. The buffer | is flushed to the output stream if is has reached flushSize. | | * `appendln()` - Append a newline to the internal buffer. The buffer is flushed to the | output stream if is has reached flushSize. | | * `joinAppend(inputRange, delim)` - An optimization of `append(inputRange.joiner(delim))`. | For reasons that are not clear, joiner is quite slow. | | * `flushIfFull()` - Flush the internal buffer to the output stream if flushSize has been | reached. | | * `flush()` - Write the internal buffer to the output stream. | | * `put(stuff)` - Appends to the internal buffer. Acts as `appendln()` if passed a single | newline character, '\n' or "\n". |) | |The internal buffer is automatically flushed when the BufferedOutputRange goes out of |scope. |*/ |struct BufferedOutputRange(OutputTarget) |if (isFileHandle!(Unqual!OutputTarget) || isOutputRange!(Unqual!OutputTarget, char)) |{ | import std.range : isOutputRange; | import std.array : appender; | import std.format : format; | | /* Identify the output element type. Only supporting char and ubyte for now. 
*/ | static if (isFileHandle!OutputTarget || isOutputRange!(OutputTarget, char)) | { | alias C = char; | } | else static if (isOutputRange!(OutputTarget, ubyte)) | { | alias C = ubyte; | } | else static assert(false); | | private enum defaultReserveSize = 11264; | private enum defaultFlushSize = 10240; | private enum defaultMaxSize = 4194304; | | private OutputTarget _outputTarget; | private auto _outputBuffer = appender!(C[]); | private immutable size_t _flushSize; | private immutable size_t _maxSize; | 1034| this(OutputTarget outputTarget, | size_t flushSize = defaultFlushSize, | size_t reserveSize = defaultReserveSize, | size_t maxSize = defaultMaxSize) | { 1034| assert(flushSize <= maxSize); | 1034| _outputTarget = outputTarget; 1034| _flushSize = flushSize; 2068| _maxSize = (flushSize <= maxSize) ? maxSize : flushSize; 1034| _outputBuffer.reserve(reserveSize); | } | | ~this() | { 1051| flush(); | } | | void flush() | { 2060| static if (isFileHandle!OutputTarget) _outputTarget.write(_outputBuffer.data); 112| else _outputTarget.put(_outputBuffer.data); | 2172| _outputBuffer.clear; | } | | bool flushIfFull() | { 41752| bool isFull = _outputBuffer.data.length >= _flushSize; 41785| if (isFull) flush(); 41752| return isFull; | } | | /* flushIfMaxSize is a safety check to avoid runaway buffer growth. */ | void flushIfMaxSize() | { 4099| if (_outputBuffer.data.length >= _maxSize) flush(); | } | | /* maybeFlush is intended for the case where put is called with a trailing newline. | * | * Flushing occurs if the buffer has a trailing newline and has reached flush size. | * Flushing also occurs if the buffer has reached max size. 
| */ | private bool maybeFlush() | { 12949| immutable bool doFlush = | _outputBuffer.data.length >= _flushSize && 140| (_outputBuffer.data[$-1] == '\n' || _outputBuffer.data.length >= _maxSize); | 12963| if (doFlush) flush(); 12949| return doFlush; | } | | | private void appendRaw(T)(T stuff) pure | { | import std.range : rangePut = put; 103685| rangePut(_outputBuffer, stuff); | } | | void append(T)(T stuff) | { 12949| appendRaw(stuff); 12949| maybeFlush(); | } | | bool appendln() | { 41745| appendRaw('\n'); 41745| return flushIfFull(); | } | | bool appendln(T)(T stuff) | { 32280| appendRaw(stuff); 32280| return appendln(); | } | | /* joinAppend is an optimization of append(inputRange.joiner(delimiter). | * This form is quite a bit faster, 40%+ on some benchmarks. | */ | void joinAppend(InputRange, E)(InputRange inputRange, E delimiter) | if (isInputRange!InputRange && | is(ElementType!InputRange : const C[]) && | (is(E : const C[]) || is(E : const C))) | { 4092| if (!inputRange.empty) | { 3764| appendRaw(inputRange.front); 3764| inputRange.popFront; | } 30454| foreach (x; inputRange) | { 6064| appendRaw(delimiter); 6064| appendRaw(x); | } 4092| flushIfMaxSize(); | } | | /* Make this an output range. */ | void put(T)(T stuff) | { | import std.traits; | import std.stdio; | | static if (isSomeChar!T) | { 1615| if (stuff == '\n') appendln(); 819| else appendRaw(stuff); | } | else static if (isSomeString!T) | { 5056| if (stuff == "\n") appendln(); 2454| else append(stuff); | } 507| else append(stuff); | } |} | |// BufferedOutputRange. |unittest |{ | import tsv_utils.common.unittest_utils; | import std.file : rmdirRecurse, readText; | import std.path : buildPath; | 7| auto testDir = makeUnittestTempDir("tsv_utils_buffered_output"); 7| scope(exit) testDir.rmdirRecurse; | | import std.algorithm : map, joiner; | import std.range : iota; | import std.conv : to; | | /* Basic test. Note that exiting the scope triggers flush. 
*/ 7| string filepath1 = buildPath(testDir, "file1.txt"); | { | import std.stdio : File; | 14| auto ostream = BufferedOutputRange!File(filepath1.File("w")); 7| ostream.append("file1: "); 7| ostream.append("abc"); 7| ostream.append(["def", "ghi", "jkl"]); 7| ostream.appendln(100.to!string); 77| ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" ")); 7| ostream.appendln(); | } 7| assert(filepath1.readText == "file1: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n"); | | /* Test with no reserve and no flush at every line. */ 7| string filepath2 = buildPath(testDir, "file2.txt"); | { | import std.stdio : File; | 14| auto ostream = BufferedOutputRange!File(filepath2.File("w"), 0, 0); 7| ostream.append("file2: "); 7| ostream.append("abc"); 7| ostream.append(["def", "ghi", "jkl"]); 7| ostream.appendln("100"); 77| ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" ")); 7| ostream.appendln(); | } 7| assert(filepath2.readText == "file2: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n"); | | /* With a locking text writer. Requires version 2.078.0 | See: https://issues.dlang.org/show_bug.cgi?id=9661 | */ | static if (__VERSION__ >= 2078) | { 7| string filepath3 = buildPath(testDir, "file3.txt"); | { | import std.stdio : File; | 14| auto ltw = filepath3.File("w").lockingTextWriter; | { 14| auto ostream = BufferedOutputRange!(typeof(ltw))(ltw); 7| ostream.append("file3: "); 7| ostream.append("abc"); 7| ostream.append(["def", "ghi", "jkl"]); 7| ostream.appendln("100"); 77| ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" ")); 7| ostream.appendln(); | } | } 7| assert(filepath3.readText == "file3: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n"); | } | | /* With an Appender. 
*/ | import std.array : appender; 7| auto app1 = appender!(char[]); | { 14| auto ostream = BufferedOutputRange!(typeof(app1))(app1); 7| ostream.append("appender1: "); 7| ostream.append("abc"); 7| ostream.append(["def", "ghi", "jkl"]); 7| ostream.appendln("100"); 77| ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" ")); 7| ostream.appendln(); | } 7| assert(app1.data == "appender1: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n"); | | /* With an Appender, but checking flush boundaries. */ 7| auto app2 = appender!(char[]); | { 14| auto ostream = BufferedOutputRange!(typeof(app2))(app2, 10, 0); // Flush if 10+ 7| bool wasFlushed = false; | 7| assert(app2.data == ""); | 7| ostream.append("12345678"); // Not flushed yet. 7| assert(app2.data == ""); | 7| wasFlushed = ostream.appendln; // Nineth char, not flushed yet. 7| assert(!wasFlushed); 7| assert(app2.data == ""); | 7| wasFlushed = ostream.appendln; // Tenth char, now flushed. 7| assert(wasFlushed); 7| assert(app2.data == "12345678\n\n"); | 7| app2.clear; 7| assert(app2.data == ""); | 7| ostream.append("12345678"); | 7| wasFlushed = ostream.flushIfFull; 7| assert(!wasFlushed); 7| assert(app2.data == ""); | 7| ostream.flush; 7| assert(app2.data == "12345678"); | 7| app2.clear; 7| assert(app2.data == ""); | 7| ostream.append("123456789012345"); 7| assert(app2.data == ""); | } 7| assert(app2.data == "123456789012345"); | | /* Using joinAppend. 
*/ 7| auto app1b = appender!(char[]); | { 14| auto ostream = BufferedOutputRange!(typeof(app1b))(app1b); 7| ostream.append("appenderB: "); 7| ostream.joinAppend(["a", "bc", "def"], '-'); 7| ostream.append(':'); 7| ostream.joinAppend(["g", "hi", "jkl"], '-'); 7| ostream.appendln("*100*"); 49| ostream.joinAppend(iota(0, 6).map!(x => x.to!string), ' '); 7| ostream.append(' '); 35| ostream.joinAppend(iota(6, 10).map!(x => x.to!string), " "); 7| ostream.appendln(); | } 7| assert(app1b.data == "appenderB: a-bc-def:g-hi-jkl*100*\n0 1 2 3 4 5 6 7 8 9\n", | "app1b.data: |" ~app1b.data ~ "|"); | | /* Operating as an output range. When passed to a function as a ref, exiting | * the function does not flush. When passed as a value, it get flushed when | * the function returns. Also test both UCFS and non-UFCS styles. | */ | | void outputStuffAsRef(T)(ref T range) | if (isOutputRange!(T, char)) | { 14| range.put('1'); 14| put(range, "23"); 14| range.put('\n'); 14| range.put(["5", "67"]); 42| put(range, iota(8, 10).map!(x => x.to!string)); 14| put(range, "\n"); | } | | void outputStuffAsVal(T)(T range) | if (isOutputRange!(T, char)) | { 14| put(range, '1'); 14| range.put("23"); 14| put(range, '\n'); 14| put(range, ["5", "67"]); 42| range.put(iota(8, 10).map!(x => x.to!string)); 14| range.put("\n"); | } | 7| auto app3 = appender!(char[]); | { 14| auto ostream = BufferedOutputRange!(typeof(app3))(app3, 12, 0); 7| outputStuffAsRef(ostream); 7| assert(app3.data == "", "app3.data: |" ~app3.data ~ "|"); 7| outputStuffAsRef(ostream); 7| assert(app3.data == "123\n56789\n123\n", "app3.data: |" ~app3.data ~ "|"); | } 7| assert(app3.data == "123\n56789\n123\n56789\n", "app3.data: |" ~app3.data ~ "|"); | 7| auto app4 = appender!(char[]); | { 14| auto ostream = BufferedOutputRange!(typeof(app4))(app4, 12, 0); 7| outputStuffAsVal(ostream); 7| assert(app4.data == "123\n56789\n", "app4.data: |" ~app4.data ~ "|"); 7| outputStuffAsVal(ostream); 7| assert(app4.data == "123\n56789\n123\n56789\n", 
"app4.data: |" ~app4.data ~ "|"); | } 7| assert(app4.data == "123\n56789\n123\n56789\n", "app4.data: |" ~app4.data ~ "|"); | | /* Test maxSize. */ 7| auto app5 = appender!(char[]); | { 14| auto ostream = BufferedOutputRange!(typeof(app5))(app5, 5, 0, 10); // maxSize 10 7| assert(app5.data == ""); | 7| ostream.append("1234567"); // Not flushed yet (no newline). 7| assert(app5.data == ""); | 7| ostream.append("89012"); // Flushed by maxSize 7| assert(app5.data == "123456789012"); | 7| ostream.put("1234567"); // Not flushed yet (no newline). 7| assert(app5.data == "123456789012"); | 7| ostream.put("89012"); // Flushed by maxSize 7| assert(app5.data == "123456789012123456789012"); | 7| ostream.joinAppend(["ab", "cd"], '-'); // Not flushed yet 7| ostream.joinAppend(["de", "gh", "ij"], '-'); // Flushed by maxSize 7| assert(app5.data == "123456789012123456789012ab-cdde-gh-ij"); | } 7| assert(app5.data == "123456789012123456789012ab-cdde-gh-ij"); |} | |/** |isFlushableOutputRange returns true if R is an output range with a flush member. 
|*/ |enum bool isFlushableOutputRange(R, E=char) = isOutputRange!(R, E) | && is(ReturnType!((R r) => r.flush) == void); | |@safe unittest |{ | import std.array; 7| auto app = appender!(char[]); 14| auto ostream = BufferedOutputRange!(typeof(app))(app, 5, 0, 10); // maxSize 10 | | static assert(isOutputRange!(typeof(app), char)); | static assert(!isFlushableOutputRange!(typeof(app), char)); | static assert(!isFlushableOutputRange!(typeof(app))); | | static assert(isOutputRange!(typeof(ostream), char)); | static assert(isFlushableOutputRange!(typeof(ostream), char)); | static assert(isFlushableOutputRange!(typeof(ostream))); | | static assert(isOutputRange!(Appender!string, string)); | static assert(!isFlushableOutputRange!(Appender!string, string)); | static assert(!isFlushableOutputRange!(Appender!string)); | | static assert(isOutputRange!(Appender!(char[]), char)); | static assert(!isFlushableOutputRange!(Appender!(char[]), char)); | static assert(!isFlushableOutputRange!(Appender!(char[]))); | | static assert(isOutputRange!(BufferedOutputRange!(Appender!(char[])), char)); | static assert(isFlushableOutputRange!(BufferedOutputRange!(Appender!(char[])))); | static assert(isFlushableOutputRange!(BufferedOutputRange!(Appender!(char[])), char)); |} | | |/** |bufferedByLine is a performance enhancement over std.stdio.File.byLine. It works by |reading a large buffer from the input stream rather than just a single line. | |The file argument needs to be a File object open for reading, typically a filesystem |file or standard input. Use the Yes.keepTerminator template parameter to keep the |newline. This is similar to stdio.File.byLine, except specified as a template paramter |rather than a runtime parameter. | |Reading in blocks does mean that input is not read until a full buffer is available or |end-of-file is reached. For this reason, bufferedByLine is not appropriate for |interactive input. 
|*/ | |auto bufferedByLine(KeepTerminator keepTerminator = No.keepTerminator, Char = char, | ubyte terminator = '\n', size_t readSize = 1024 * 128, size_t growSize = 1024 * 16) | (File file) |if (is(Char == char) || is(Char == ubyte)) |{ | static assert(0 < growSize && growSize <= readSize); | | static final class BufferedByLineImpl | { | /* Buffer state variables | * - _buffer.length - Full length of allocated buffer. | * - _dataEnd - End of currently valid data (end of last read). | * - _lineStart - Start of current line. | * - _lineEnd - End of current line. | */ | private File _file; | private ubyte[] _buffer; | private size_t _lineStart = 0; | private size_t _lineEnd = 0; | private size_t _dataEnd = 0; | 2726| this (File f) | { 2726| _file = f; 2726| _buffer = new ubyte[readSize + growSize]; | } | | bool empty() const pure | { 3448227| return _file.eof && _lineStart == _dataEnd; | } | | Char[] front() pure | { 811044| assert(!empty, "Attempt to take the front of an empty bufferedByLine."); | | static if (keepTerminator == Yes.keepTerminator) | { 224259| return cast(Char[]) _buffer[_lineStart .. _lineEnd]; | } | else | { 586785| assert(_lineStart < _lineEnd); 1173570| immutable end = (_buffer[_lineEnd - 1] == terminator) ? _lineEnd - 1 : _lineEnd; 586785| return cast(Char[]) _buffer[_lineStart .. end]; | } | } | | /* Note: Call popFront at initialization to do the initial read. */ | void popFront() | { | import std.algorithm: copy, find; 812930| assert(!empty, "Attempt to popFront an empty bufferedByLine."); | | /* Pop the current line. */ 812930| _lineStart = _lineEnd; | | /* Set up the next line if more data is available, either in the buffer or | * the file. The next line ends at the next newline, if there is one. | * | * Notes: | * - 'find' returns the slice starting with the character searched for, or | * an empty range if not found. | * - _lineEnd is set to _dataEnd both when the current buffer does not have | * a newline and when it ends with one. 
| */ 812930| auto found = _buffer[_lineStart .. _dataEnd].find(terminator); 1625860| _lineEnd = found.empty ? _dataEnd : _dataEnd - found.length + 1; | 1129626| if (found.empty && !_file.eof) | { | /* No newline in current buffer. Read from the file until the next | * newline is found. | */ 314409| assert(_lineEnd == _dataEnd); | 314409| if (_lineStart > 0) | { | /* Move remaining data to the start of the buffer. */ 311708| immutable remainingLength = _dataEnd - _lineStart; 311708| copy(_buffer[_lineStart .. _dataEnd], _buffer[0 .. remainingLength]); 311708| _lineStart = 0; 311708| _lineEnd = _dataEnd = remainingLength; | } | | do | { | /* Grow the buffer if necessary. */ 21362317| immutable availableSize = _buffer.length - _dataEnd; 21362317| if (availableSize < readSize) | { 51947| size_t growBy = growSize; 76391| while (availableSize + growBy < readSize) growBy += growSize; 51947| _buffer.length += growBy; | } | | /* Read the next block. */ 21362317| _dataEnd += | _file.rawRead(_buffer[_dataEnd .. _dataEnd + readSize]) | .length; | 21362317| found = _buffer[_lineEnd .. _dataEnd].find(terminator); 42724634| _lineEnd = found.empty ? _dataEnd : _dataEnd - found.length + 1; | 42410680| } while (found.empty && !_file.eof); | } | } | } | 2726| assert(file.isOpen, "bufferedByLine passed a closed file."); | 2726| auto r = new BufferedByLineImpl(file); 5427| if (!r.empty) r.popFront; 2726| return r; |} | |// BufferedByLine. |unittest |{ | import std.array : appender; | import std.conv : to; | import std.file : rmdirRecurse, readText; | import std.path : buildPath; | import std.range : lockstep; | import std.stdio; | import tsv_utils.common.unittest_utils; | 7| auto testDir = makeUnittestTempDir("tsv_utils_buffered_byline"); 7| scope(exit) testDir.rmdirRecurse; | | /* Create two data files with the same data. Read both in parallel with byLine and | * bufferedByLine and compare each line. | */ 7| auto data1 = appender!(char[])(); | 28021| foreach (i; 1 .. 
1001) data1.put('\n'); 28021| foreach (i; 1 .. 1001) data1.put("a\n"); 35021| foreach (i; 1 .. 1001) { data1.put(i.to!string); data1.put('\n'); } 21021| foreach (i; 1 .. 1001) | { 14035000| foreach (j; 1 .. i+1) data1.put('x'); 7000| data1.put('\n'); | } | 7| string file1a = buildPath(testDir, "file1a.txt"); 7| string file1b = buildPath(testDir, "file1b.txt"); | { | 7| file1a.File("w").write(data1.data); 7| file1b.File("w").write(data1.data); | } | | /* Default parameters. */ | { 7| auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator); 14| auto f1bIn = file1b.File().byLine(No.keepTerminator); 28007| foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); | } | { 7| auto f1aIn = file1a.File().bufferedByLine!(Yes.keepTerminator); 14| auto f1bIn = file1b.File().byLine(Yes.keepTerminator); 28007| foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); | } | | /* Smaller read size. This will trigger buffer growth. */ | { 7| auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator, char, '\n', 512, 256); 14| auto f1bIn = file1b.File().byLine(No.keepTerminator); 28007| foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); | } | | /* Exercise boundary cases in buffer growth. | * Note: static-foreach requires DMD 2.076 / LDC 1.6 | */ | static foreach (readSize; [1, 2, 4]) | { | static foreach (growSize; 1 .. readSize + 1) | {{ 49| auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize); 98| auto f1bIn = file1b.File().byLine(No.keepTerminator); 196049| foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); | }} | static foreach (growSize; 1 .. 
readSize + 1) | {{ 49| auto f1aIn = file1a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize); 98| auto f1bIn = file1b.File().byLine(Yes.keepTerminator); 196049| foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); | }} | } | | | /* Files that do not end in a newline. */ | 7| string file2a = buildPath(testDir, "file2a.txt"); 7| string file2b = buildPath(testDir, "file2b.txt"); 7| string file3a = buildPath(testDir, "file3a.txt"); 7| string file3b = buildPath(testDir, "file3b.txt"); 7| string file4a = buildPath(testDir, "file4a.txt"); 7| string file4b = buildPath(testDir, "file4b.txt"); | { 7| file1a.File("w").write("a"); 7| file1b.File("w").write("a"); 7| file2a.File("w").write("ab"); 7| file2b.File("w").write("ab"); 7| file3a.File("w").write("abc"); 7| file3b.File("w").write("abc"); | } | | static foreach (readSize; [1, 2, 4]) | { | static foreach (growSize; 1 .. readSize + 1) | {{ 49| auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize); 98| auto f1bIn = file1b.File().byLine(No.keepTerminator); 98| foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); | 49| auto f2aIn = file2a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize); 98| auto f2bIn = file2b.File().byLine(No.keepTerminator); 98| foreach (a, b; lockstep(f2aIn, f2bIn, StoppingPolicy.requireSameLength)) assert(a == b); | 49| auto f3aIn = file3a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize); 98| auto f3bIn = file3b.File().byLine(No.keepTerminator); 98| foreach (a, b; lockstep(f3aIn, f3bIn, StoppingPolicy.requireSameLength)) assert(a == b); | }} | static foreach (growSize; 1 .. 
readSize + 1) | {{ 49| auto f1aIn = file1a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize); 98| auto f1bIn = file1b.File().byLine(Yes.keepTerminator); 98| foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); | 49| auto f2aIn = file2a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize); 98| auto f2bIn = file2b.File().byLine(Yes.keepTerminator); 98| foreach (a, b; lockstep(f2aIn, f2bIn, StoppingPolicy.requireSameLength)) assert(a == b); | 49| auto f3aIn = file3a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize); 98| auto f3bIn = file3b.File().byLine(Yes.keepTerminator); 98| foreach (a, b; lockstep(f3aIn, f3bIn, StoppingPolicy.requireSameLength)) assert(a == b); | }} | } |} | |/** |joinAppend performs a join operation on an input range, appending the results to |an output range. | |joinAppend was written as a performance enhancement over using std.algorithm.joiner |or std.array.join with writeln. Using joiner with writeln is quite slow, 3-4x slower |than std.array.join with writeln. The joiner performance may be due to interaction |with writeln, this was not investigated. Using joiner with stdout.lockingTextWriter |is better, but still substantially slower than join. Using join works reasonably well, |but is allocating memory unnecessarily. | |Using joinAppend with Appender is a bit faster than join, and allocates less memory. |The Appender re-uses the underlying data buffer, saving memory. The example below |illustrates. It is a modification of the InputFieldReordering example. The role |Appender plus joinAppend are playing is to buffer the output. BufferedOutputRange |uses a similar technique to buffer multiple lines. | |Note: The original uses joinAppend have been replaced by BufferedOutputRange, which has |its own joinAppend method. However, joinAppend remains useful when constructing internal |buffers where BufferedOutputRange is not appropriate. 
| |--- |int main(string[] args) |{ | import tsvutil; | import std.algorithm, std.array, std.range, std.stdio; | size_t[] fieldIndicies = [3, 0, 2]; | auto fieldReordering = new InputFieldReordering!char(fieldIndicies); | auto outputBuffer = appender!(char[]); | foreach (line; stdin.byLine) | { | fieldReordering.initNewLine; | foreach(fieldIndex, fieldValue; line.splitter('\t').enumerate) | { | fieldReordering.processNextField(fieldIndex, fieldValue); | if (fieldReordering.allFieldsFilled) break; | } | if (fieldReordering.allFieldsFilled) | { | outputBuffer.clear; | writeln(fieldReordering.outputFields.joinAppend(outputBuffer, ('\t'))); | } | else | { | writeln("Error: Insufficient number of field on the line."); | } | } | return 0; |} |--- |*/ |OutputRange joinAppend(InputRange, OutputRange, E) | (InputRange inputRange, ref OutputRange outputRange, E delimiter) |if (isInputRange!InputRange && | (is(ElementType!InputRange : const E[]) && | isOutputRange!(OutputRange, E[])) | || | (is(ElementType!InputRange : const E) && | isOutputRange!(OutputRange, E)) | ) |{ 788| if (!inputRange.empty) | { 788| outputRange.put(inputRange.front); 788| inputRange.popFront; | } 5337| foreach (x; inputRange) | { 991| outputRange.put(delimiter); 991| outputRange.put(x); | } 788| return outputRange; |} | |// joinAppend. 
|@safe unittest |{ | import std.array : appender; | import std.algorithm : equal; | 7| char[] c1 = ['a', 'b', 'c']; 7| char[] c2 = ['d', 'e', 'f']; 7| char[] c3 = ['g', 'h', 'i']; 7| auto cvec = [c1, c2, c3]; | 7| auto s1 = "abc"; 7| auto s2 = "def"; 7| auto s3 = "ghi"; 7| auto svec = [s1, s2, s3]; | 7| auto charAppender = appender!(char[])(); | 7| assert(cvec.joinAppend(charAppender, '_').data == "abc_def_ghi"); 7| assert(equal(cvec, [c1, c2, c3])); | 7| charAppender.put('$'); 7| assert(svec.joinAppend(charAppender, '|').data == "abc_def_ghi$abc|def|ghi"); 7| assert(equal(cvec, [s1, s2, s3])); | 7| charAppender.clear; 7| assert(svec.joinAppend(charAppender, '|').data == "abc|def|ghi"); | 7| auto intAppender = appender!(int[])(); | 7| auto i1 = [100, 101, 102]; 7| auto i2 = [200, 201, 202]; 7| auto i3 = [300, 301, 302]; 7| auto ivec = [i1, i2, i3]; | 7| assert(ivec.joinAppend(intAppender, 0).data == | [100, 101, 102, 0, 200, 201, 202, 0, 300, 301, 302]); | 7| intAppender.clear; 7| assert(i1.joinAppend(intAppender, 0).data == | [100, 0, 101, 0, 102]); 7| assert(i2.joinAppend(intAppender, 1).data == | [100, 0, 101, 0, 102, | 200, 1, 201, 1, 202]); 7| assert(i3.joinAppend(intAppender, 2).data == | [100, 0, 101, 0, 102, | 200, 1, 201, 1, 202, | 300, 2, 301, 2, 302]); |} | |/** |getTsvFieldValue extracts the value of a single field from a delimited text string. | |This is a convenience function intended for cases when only a single field from an |input line is needed. If multiple values are needed, it will be more efficient to |work directly with std.algorithm.splitter or the InputFieldReordering class. | |The input text is split by a delimiter character. The specified field is converted |to the desired type and the value returned. | |An exception is thrown if there are not enough fields on the line or if conversion |fails. Conversion is done with std.conv.to, it throws a std.conv.ConvException on |failure. 
If not enough fields, the exception text is generated referencing 1-upped |field numbers as would be provided by command line users. | */ |T getTsvFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim) |if (isSomeChar!C) |{ | import std.algorithm : splitter; | import std.conv : to; | import std.format : format; | import std.range; | 1556| auto splitLine = line.splitter(delim); 1556| size_t atField = 0; | 5822| while (atField < fieldIndex && !splitLine.empty) | { 2129| splitLine.popFront; 2129| atField++; | } | 1556| T val; 1556| if (splitLine.empty) | { 45| if (fieldIndex == 0) | { | /* This is a workaround to a splitter special case - If the input is empty, | * the returned split range is empty. This doesn't properly represent a single | * column file. More correct mathematically, and for this case, would be a | * single value representing an empty string. The input line is a convenient | * source of an empty line. Info: | * Bug: https://issues.dlang.org/show_bug.cgi?id=15735 | * Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030 | */ 21| assert(line.empty); 21| val = line.to!T; | } | else | { 24| throw new Exception( | format("Not enough fields on line. Number required: %d; Number found: %d", | fieldIndex + 1, atField)); | } | } | else | { 1511| val = splitLine.front.to!T; | } | 1493| return val; |} | |// getTsvFieldValue. |@safe unittest |{ | import std.conv : ConvException, to; | import std.exception; | | /* Common cases. 
*/ 7| assert(getTsvFieldValue!double("123", 0, '\t') == 123.0); 7| assert(getTsvFieldValue!double("-10.5", 0, '\t') == -10.5); 7| assert(getTsvFieldValue!size_t("abc|123", 1, '|') == 123); 7| assert(getTsvFieldValue!int("紅\t红\t99", 2, '\t') == 99); 7| assert(getTsvFieldValue!int("紅\t红\t99", 2, '\t') == 99); 7| assert(getTsvFieldValue!string("紅\t红\t99", 2, '\t') == "99"); 7| assert(getTsvFieldValue!string("紅\t红\t99", 1, '\t') == "红"); 7| assert(getTsvFieldValue!string("紅\t红\t99", 0, '\t') == "紅"); 7| assert(getTsvFieldValue!string("红色和绿色\tred and green\t赤と緑\t10.5", 2, '\t') == "赤と緑"); 7| assert(getTsvFieldValue!double("红色和绿色\tred and green\t赤と緑\t10.5", 3, '\t') == 10.5); | | /* The empty field cases. */ 7| assert(getTsvFieldValue!string("", 0, '\t') == ""); 7| assert(getTsvFieldValue!string("\t", 0, '\t') == ""); 7| assert(getTsvFieldValue!string("\t", 1, '\t') == ""); 7| assert(getTsvFieldValue!string("", 0, ':') == ""); 7| assert(getTsvFieldValue!string(":", 0, ':') == ""); 7| assert(getTsvFieldValue!string(":", 1, ':') == ""); | | /* Tests with different data types. 
*/ 7| string stringLine = "orange and black\tნარინჯისფერი და შავი\t88.5"; 7| char[] charLine = "orange and black\tნარინჯისფერი და შავი\t88.5".to!(char[]); 7| dchar[] dcharLine = stringLine.to!(dchar[]); 7| wchar[] wcharLine = stringLine.to!(wchar[]); | 7| assert(getTsvFieldValue!string(stringLine, 0, '\t') == "orange and black"); 7| assert(getTsvFieldValue!string(stringLine, 1, '\t') == "ნარინჯისფერი და შავი"); 7| assert(getTsvFieldValue!wstring(stringLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring); 7| assert(getTsvFieldValue!double(stringLine, 2, '\t') == 88.5); | 7| assert(getTsvFieldValue!string(charLine, 0, '\t') == "orange and black"); 7| assert(getTsvFieldValue!string(charLine, 1, '\t') == "ნარინჯისფერი და შავი"); 7| assert(getTsvFieldValue!wstring(charLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring); 7| assert(getTsvFieldValue!double(charLine, 2, '\t') == 88.5); | 7| assert(getTsvFieldValue!string(dcharLine, 0, '\t') == "orange and black"); 7| assert(getTsvFieldValue!string(dcharLine, 1, '\t') == "ნარინჯისფერი და შავი"); 7| assert(getTsvFieldValue!wstring(dcharLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring); 7| assert(getTsvFieldValue!double(dcharLine, 2, '\t') == 88.5); | 7| assert(getTsvFieldValue!string(wcharLine, 0, '\t') == "orange and black"); 7| assert(getTsvFieldValue!string(wcharLine, 1, '\t') == "ნარინჯისფერი და შავი"); 7| assert(getTsvFieldValue!wstring(wcharLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring); 7| assert(getTsvFieldValue!double(wcharLine, 2, '\t') == 88.5); | | /* Conversion errors. */ 14| assertThrown!ConvException(getTsvFieldValue!double("", 0, '\t')); 14| assertThrown!ConvException(getTsvFieldValue!double("abc", 0, '|')); 14| assertThrown!ConvException(getTsvFieldValue!size_t("-1", 0, '|')); 14| assertThrown!ConvException(getTsvFieldValue!size_t("a23|23.4", 1, '|')); 14| assertThrown!ConvException(getTsvFieldValue!double("23.5|def", 1, '|')); | | /* Not enough field errors. 
These should throw, but not a ConvException.*/ 21| assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("", 1, '\t'))); 21| assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("abc", 1, '\t'))); 21| assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("abc\tdef", 2, '\t'))); |} | |/** [Yes|No.newlineWasRemoved] is a template parameter to throwIfWindowsNewlineOnUnix. | * A Yes value indicates the Unix newline was already removed, as might be done via | * std.File.byLine or similar mechanism. | */ |alias NewlineWasRemoved = Flag!"newlineWasRemoved"; | |/** |throwIfWindowsLineNewlineOnUnix is used to throw an exception if a Windows/DOS |line ending is found on a build compiled for a Unix platform. This is used by |the TSV Utilities to detect Window/DOS line endings and terminate processing |with an error message to the user. | */ |void throwIfWindowsNewlineOnUnix | (NewlineWasRemoved nlWasRemoved = Yes.newlineWasRemoved) | (const char[] line, const char[] filename, size_t lineNum) |{ | version(Posix) | { | static if (nlWasRemoved) | { 6815| immutable bool hasWindowsLineEnding = line.length != 0 && line[$ - 1] == '\r'; | } | else | { 63| immutable bool hasWindowsLineEnding = | line.length > 1 && 56| line[$ - 2] == '\r' && 35| line[$ - 1] == '\n'; | } | 3581| if (hasWindowsLineEnding) | { | import std.format; 118| throw new Exception( | format("Windows/DOS line ending found. Convert file to Unix newlines before processing (e.g. 'dos2unix').\n File: %s, Line: %s", 118| (filename == "-") ? "Standard Input" : filename, lineNum)); | } | } |} | |// throwIfWindowsNewlineOnUnix |@safe unittest |{ | /* Note: Currently only building on Posix. Need to add non-Posix test cases | * if Windows builds are ever done. 
| */ | version(Posix) | { | import std.exception; | 14| assertNotThrown(throwIfWindowsNewlineOnUnix("", "afile.tsv", 1)); 14| assertNotThrown(throwIfWindowsNewlineOnUnix("a", "afile.tsv", 2)); 14| assertNotThrown(throwIfWindowsNewlineOnUnix("ab", "afile.tsv", 3)); 14| assertNotThrown(throwIfWindowsNewlineOnUnix("abc", "afile.tsv", 4)); | 14| assertThrown(throwIfWindowsNewlineOnUnix("\r", "afile.tsv", 1)); 14| assertThrown(throwIfWindowsNewlineOnUnix("a\r", "afile.tsv", 2)); 14| assertThrown(throwIfWindowsNewlineOnUnix("ab\r", "afile.tsv", 3)); 14| assertThrown(throwIfWindowsNewlineOnUnix("abc\r", "afile.tsv", 4)); | 14| assertNotThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("\n", "afile.tsv", 1)); 14| assertNotThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("a\n", "afile.tsv", 2)); 14| assertNotThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("ab\n", "afile.tsv", 3)); 14| assertNotThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("abc\n", "afile.tsv", 4)); | 14| assertThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("\r\n", "afile.tsv", 5)); 14| assertThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("a\r\n", "afile.tsv", 6)); 14| assertThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("ab\r\n", "afile.tsv", 7)); 14| assertThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("abc\r\n", "afile.tsv", 8)); | | /* Standard Input formatting. 
*/ | import std.algorithm : endsWith; 7| bool exceptionCaught = false; | 7| try (throwIfWindowsNewlineOnUnix("\r", "-", 99)); | catch (Exception e) | { 7| assert(e.msg.endsWith("File: Standard Input, Line: 99")); 7| exceptionCaught = true; | } | finally | { 7| assert(exceptionCaught); 7| exceptionCaught = false; | } | 7| try (throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("\r\n", "-", 99)); | catch (Exception e) | { 7| assert(e.msg.endsWith("File: Standard Input, Line: 99")); 7| exceptionCaught = true; | } | finally | { 7| assert(exceptionCaught); 7| exceptionCaught = false; | } | } |} | |/** Flag used by InputSourceRange to determine if the header line should be when |opening a file. |*/ |alias ReadHeader = Flag!"readHeader"; | |/** |inputSourceRange is a helper function for creating new InputSourceRange objects. |*/ |InputSourceRange inputSourceRange(string[] filepaths, ReadHeader readHeader) |{ 2142| return new InputSourceRange(filepaths, readHeader); |} | |/** |InputSourceRange is an input range that iterates over a set of input files. | |InputSourceRange is used to iterate over a set of files passed on the command line. |Files are automatically opened and closed during iteration. The caller can choose to |have header lines read automatically. | |The range is created from a set of filepaths. These filepaths are mapped to |InputSource objects during the iteration. This is what enables automatically opening |and closing files and reading the header line. | |The motivation for an InputSourceRange is to provide a standard way to look at the |header line of the first input file during command line argument processing, and then |pass the open input file and the header line along to the main processing functions. |This enables a features like named fields to be implemented in a standard way. | |Both InputSourceRange and InputSource are reference objects. This keeps their use |limited to a single iteration over the set of files. 
The files can be iterated again |by creating a new InputSourceRange against the same filepaths. | |Currently, InputSourceRange supports files and standard input. It is possible other |types of input sources will be added in the future. | */ |final class InputSourceRange |{ | import std.range; | | private string[] _filepaths; | private ReadHeader _readHeader; | private InputSource _front; | 2127| this(string[] filepaths, ReadHeader readHeader) | { 2142| _filepaths = filepaths.dup; 2142| _readHeader = readHeader; 2142| _front = null; | 2142| if (!_filepaths.empty) | { 2128| _front = new InputSource(_filepaths.front, _readHeader); 2128| _front.open; 2113| _filepaths.popFront; | } | } | | size_t length() const pure nothrow @safe | { 504| return empty ? 0 : _filepaths.length + 1; | } | | bool empty() const pure nothrow @safe | { 16083| return _front is null; | } | | InputSource front() pure @safe | { 7010| assert(!empty, "Attempt to take the front of an empty InputSourceRange"); 7010| return _front; | } | | void popFront() | { 2223| assert(!empty, "Attempt to popFront an empty InputSourceRange"); | 2223| _front.close; | 2223| if (!_filepaths.empty) | { 386| _front = new InputSource(_filepaths.front, _readHeader); 386| _front.open; 376| _filepaths.popFront; | } | else | { 1837| _front = null; | } | } |} | |/** |InputSource is a class of objects produced by iterating over an InputSourceRange. | |An InputSource object provides access to the open file currently the front element |of an InputSourceRange. The main methods application code is likely to need are: | |$(LIST | * `file()` - Returns the File object. The file will be open for reading as long | InputSource instance is the front element of the InputSourceRange it came from. | | * `header(KeepTerminator keepTerminator = No.keepTerminator)` - Returns the | header line from the file. An empty string is returned if InputSource range | was created with readHeader=false. | | * `name()` - The name of the input source. 
The name returned is intended for | user error messages. For files, this is the filepath that was passed to | InputSourceRange. For standard input, it is "Standard Input". |) | |An InputSource is a reference object, so the copies will retain the state of the |InputSourceRange front element. In particular, all copies will have the open |state of the front element of the InputSourceRange. | |This class is not intended for use outside the context of an InputSourceRange. |*/ |final class InputSource |{ | import std.range; | import std.stdio; | | private immutable string _filepath; | private immutable bool _isStdin; | private bool _isOpen; | private ReadHeader _readHeader; | private bool _hasBeenOpened; | private string _header; | private File _file; | 2514| private this(string filepath, ReadHeader readHeader) pure nothrow @safe | { 2514| _filepath = filepath; 2514| _isStdin = filepath == "-"; 2514| _isOpen = false; 2514| _readHeader = readHeader; 2514| _hasBeenOpened = false; | } | | /** file returns the File object held by the InputSource. | * | * The File will be open for reading as long as the InputSource instance is the | * front element of the InputSourceRange it came from. | */ | File file() nothrow @safe | { 2639| return _file; | } | | /** isReadHeaderEnabled returns true if the header line is being read. | */ | bool isReadHeaderEnabled() const pure nothrow @safe | { 140| return _readHeader == Yes.readHeader; | } | | /** header returns the header line from the input file. | * | * An empty string is returned if InputSource range was created with | * readHeader=false. | */ | string header(KeepTerminator keepTerminator = No.keepTerminator) const pure nothrow @safe | { 4342| assert(_hasBeenOpened); 4342| return (keepTerminator == Yes.keepTerminator || 4167| _header.length == 0 || 3899| _header[$ - 1] != '\n') ? 4342| _header : _header[0 .. $-1]; | } | | /** isHeaderEmpty returns true if there is no data for a header, including the | * terminator. 
| * | * When headers are being read, this true only if the file is empty. | */ | bool isHeaderEmpty() const pure nothrow @safe | { 848| assert(_hasBeenOpened); 848| return _header.empty; | } | | /** name returns a user friendly name representing the input source. | * | * For files, it is the filepath provided to InputSourceRange. For standard | * input, it is "Standard Input". (Use isStdin() to test for standard input, | * not name(). | */ | string name() const pure nothrow @safe | { 7260| return _isStdin ? "Standard Input" : _filepath; | } | | /** isStdin returns true if the input source is Standard Input, false otherwise. | */ | bool isStdin() const pure nothrow @safe | { 2872| return _isStdin; | } | | /** isOpen returns true if the input source is open for reading, false otherwise. | * | * "Open" in this context is whether the InputSource object is currently open, | * meaning that it is the front element of the InputSourceRange that created it. | * | * For files, this is also reflected in the state of the underlying File object. | * However, standard input is never actually closed. | */ | bool isOpen() const pure nothrow @safe | { 455| return _isOpen; | } | | private void open() | { 2514| assert(!_isOpen); 2514| assert(!_hasBeenOpened); | 5028| _file = isStdin ? 
stdin : _filepath.File("rb"); 3803| if (_readHeader) _header = _file.readln; 2489| _isOpen = true; 2489| _hasBeenOpened = true; | } | | private void close() | { 4392| if (!_isStdin) _file.close; 2223| _isOpen = false; | } |} | |// InputSourceRange and InputSource |unittest |{ | import std.algorithm : all, each; | import std.array : appender; | import std.exception : assertThrown; | import std.file : rmdirRecurse; | import std.path : buildPath; | import std.range; | import std.stdio; | import tsv_utils.common.unittest_utils; | 7| auto testDir = makeUnittestTempDir("tsv_utils_input_source_range"); 7| scope(exit) testDir.rmdirRecurse; | 7| string file0 = buildPath(testDir, "file0.txt"); 7| string file1 = buildPath(testDir, "file1.txt"); 7| string file2 = buildPath(testDir, "file2.txt"); 7| string file3 = buildPath(testDir, "file3.txt"); | 7| string file0Header = ""; 7| string file1Header = "file 1 header\n"; 7| string file2Header = "file 2 header\n"; 7| string file3Header = "file 3 header\n"; | 7| string file0Body = ""; 7| string file1Body = ""; 7| string file2Body = "file 2 line 1\n"; 7| string file3Body = "file 3 line 1\nfile 3 line 2\n"; | 7| string file0Data = file0Header ~ file0Body; 7| string file1Data = file1Header ~ file1Body; 7| string file2Data = file2Header ~ file2Body; 7| string file3Data = file3Header ~ file3Body; | | { 7| file0.File("w").write(file0Data); 7| file1.File("w").write(file1Data); 7| file2.File("w").write(file2Data); 7| file3.File("w").write(file3Data); | } | 7| auto inputFiles = [file0, file1, file2, file3]; 7| auto fileHeaders = [file0Header, file1Header, file2Header, file3Header]; 7| auto fileBodies = [file0Body, file1Body, file2Body, file3Body]; 7| auto fileData = [file0Data, file1Data, file2Data, file3Data]; | 7| auto readSources = appender!(InputSource[]); 7| auto buffer = new char[1024]; // Must be large enough to hold the test files. | | /* Tests without standard input. 
Don't want to count on state of standard | * input or modifying it when doing unit tests, so avoid reading from it. | */ | 105| foreach(numFiles; 1 .. inputFiles.length + 1) | { | /* Reading headers. */ | 28| readSources.clear; 28| auto inputSourcesYesHeader = inputSourceRange(inputFiles[0 .. numFiles], Yes.readHeader); 28| assert(inputSourcesYesHeader.length == numFiles); | 406| foreach(fileNum, source; inputSourcesYesHeader.enumerate) | { 70| readSources.put(source); 70| assert(source.isOpen); 70| assert(source.file.isOpen); 140| assert(readSources.data[0 .. fileNum].all!(s => !s.isOpen)); 70| assert(readSources.data[fileNum].isOpen); | 70| assert(source.header(Yes.keepTerminator) == fileHeaders[fileNum]); | 70| auto headerNoTerminatorLength = fileHeaders[fileNum].length; 112| if (headerNoTerminatorLength > 0) --headerNoTerminatorLength; 70| assert(source.header(No.keepTerminator) == | fileHeaders[fileNum][0 .. headerNoTerminatorLength]); | 70| assert(source.name == inputFiles[fileNum]); 70| assert(!source.isStdin); 70| assert(source.isReadHeaderEnabled); | 70| assert(source.file.rawRead(buffer) == fileBodies[fileNum]); | } | | /* The InputSourceRange is a reference range, consumed by the foreach. */ 28| assert(inputSourcesYesHeader.empty); | | /* Without reading headers. */ | 28| readSources.clear; 28| auto inputSourcesNoHeader = inputSourceRange(inputFiles[0 .. numFiles], No.readHeader); 28| assert(inputSourcesNoHeader.length == numFiles); | 406| foreach(fileNum, source; inputSourcesNoHeader.enumerate) | { 70| readSources.put(source); 70| assert(source.isOpen); 70| assert(source.file.isOpen); 140| assert(readSources.data[0 .. 
fileNum].all!(s => !s.isOpen)); 70| assert(readSources.data[fileNum].isOpen); | 70| assert(source.header(Yes.keepTerminator).empty); 70| assert(source.header(No.keepTerminator).empty); | 70| assert(source.name == inputFiles[fileNum]); 70| assert(!source.isStdin); 70| assert(!source.isReadHeaderEnabled); | 70| assert(source.file.rawRead(buffer) == fileData[fileNum]); | } | | /* The InputSourceRange is a reference range, consumed by the foreach. */ 28| assert(inputSourcesNoHeader.empty); | } | | /* Tests with standard input. No actual reading in these tests. | */ | 7| readSources.clear; 84| foreach(fileNum, source; inputSourceRange(["-", "-"], No.readHeader).enumerate) | { 14| readSources.put(source); 14| assert(source.isOpen); 14| assert(source.file.isOpen); 21| assert(readSources.data[0 .. fileNum].all!(s => !s.isOpen)); // InputSource objects are "closed". 21| assert(readSources.data[0 .. fileNum].all!(s => s.file.isOpen)); // Actual stdin should not be closed. 14| assert(readSources.data[fileNum].isOpen); | 14| assert(source.header(Yes.keepTerminator).empty); 14| assert(source.header(No.keepTerminator).empty); | 14| assert(source.name == "Standard Input"); 14| assert(source.isStdin); | } | | /* Empty filelist. */ 7| string[] nofiles; | { 7| auto sources = inputSourceRange(nofiles, No.readHeader); 7| assert(sources.empty); | } | { 7| auto sources = inputSourceRange(nofiles, Yes.readHeader); 7| assert(sources.empty); | } | | /* Error cases. */ 14| assertThrown(inputSourceRange([file0, "no_such_file.txt"], No.readHeader).each); 14| assertThrown(inputSourceRange(["no_such_file.txt", file1], Yes.readHeader).each); |} | |/** |byLineSourceRange is a helper function for creating new byLineSourceRange objects. 
|*/ |auto byLineSourceRange( | KeepTerminator keepTerminator = No.keepTerminator, Char = char, ubyte terminator = '\n') |(string[] filepaths) |if (is(Char == char) || is(Char == ubyte)) |{ 623| return new ByLineSourceRange!(keepTerminator, Char, terminator)(filepaths); |} | |/** |ByLineSourceRange is an input range that iterates over a set of input files. It |provides bufferedByLine access to each file. | |A ByLineSourceRange is used to iterate over a set of files passed on the command line. |Files are automatically opened and closed during iteration. The front element of the |range provides access to a bufferedByLine for iterating over the lines in the file. | |The range is created from a set of filepaths. These filepaths are mapped to |ByLineSource objects during the iteration. This is what enables automatically opening |and closing files and providing bufferedByLine access. | |The motivation behind ByLineSourceRange is to provide a standard way to look at the |header line of the first input file during command line argument processing, and then |pass the open input file along to the main processing functions. This enables |features like named fields to be implemented in a standard way. | |Access to the first line of the first file is available after creating the |ByLineSourceRange instance. The first file is opened and a bufferedByLine created. |The first line of the first file is via byLine.front (after checking !byLine.empty). | |Both ByLineSourceRange and ByLineSource are reference objects. This keeps their use |limited to a single iteration over the set of files. The files can be iterated again |by creating a new InputSourceRange against the same filepaths. | |Currently, ByLineSourceRange supports files and standard input. It is possible other |types of input sources will be added in the future. 
| */ |final class ByLineSourceRange( | KeepTerminator keepTerminator = No.keepTerminator, Char = char, ubyte terminator = '\n') |if (is(Char == char) || is(Char == ubyte)) |{ | import std.range; | | alias ByLineSourceType = ByLineSource!(keepTerminator, char, terminator); | | private string[] _filepaths; | private ByLineSourceType _front; | 611| this(string[] filepaths) | { 623| _filepaths = filepaths.dup; 623| _front = null; | 623| if (!_filepaths.empty) | { 609| _front = new ByLineSourceType(_filepaths.front); 609| _front.open; 597| _filepaths.popFront; | } | } | | size_t length() const pure nothrow @safe | { 882| return empty ? 0 : _filepaths.length + 1; | } | | bool empty() const pure nothrow @safe | { 4511| return _front is null; | } | | ByLineSourceType front() pure @safe | { 1885| assert(!empty, "Attempt to take the front of an empty ByLineSourceRange"); 1885| return _front; | } | | void popFront() | { 603| assert(!empty, "Attempt to popFront an empty ByLineSourceRange"); | 603| _front.close; | 603| if (!_filepaths.empty) | { 190| _front = new ByLineSourceType(_filepaths.front); 190| _front.open; 182| _filepaths.popFront; | } | else | { 413| _front = null; | } | } |} | |/** |ByLineSource is a class of objects produced by iterating over an ByLineSourceRange. | |A ByLineSource instance provides a bufferedByLine range for the current the front |element of a ByLineSourceRange. The main methods application code is likely to |need are: | |$(LIST | * `byLine()` - Returns the bufferedByLine range accessing the open file. The file | will be open for reading (using the bufferedByLine range) as long as the | ByLineSource instance is the front element of the ByLineSourceRange | it came from. | | * `name()` - The name of the input source. The name returned is intended for | user error messages. For files, this is the filepath that was passed to | ByLineSourceRange. For standard input, it is "Standard Input". 
|) | |A ByLineSource is a reference object, so the copies have the same state as the |ByLineSourceRange front element. In particular, all copies will have the open |state of the front element of the ByLineSourceRange. | |This class is not intended for use outside the context of an ByLineSourceRange. |*/ |final class ByLineSource( | KeepTerminator keepTerminator, Char = char, ubyte terminator = '\n') |if (is(Char == char) || is(Char == ubyte)) |{ | import std.range; | import std.stdio; | import std.traits : ReturnType; | | alias newByLineFn = bufferedByLine!(keepTerminator, char, terminator); | alias ByLineType = ReturnType!newByLineFn; | | private immutable string _filepath; | private immutable bool _isStdin; | private bool _isOpen; | private bool _hasBeenOpened; | private File _file; | private ByLineType _byLineRange; | 799| private this(string filepath) pure nothrow @safe | { 799| _filepath = filepath; 799| _isStdin = filepath == "-"; 799| _isOpen = false; 799| _hasBeenOpened = false; | } | | /** byLine returns the bufferedByLine object held by the ByLineSource instance. | * | * The File underlying the BufferedByLine object is open for reading as long as | * the ByLineSource instance is the front element of the ByLineSourceRange it | * came from. | */ | ByLineType byLine() nothrow @safe | { 1787| return _byLineRange; | } | | /** name returns a user friendly name representing the underlying input source. | * | * For files, it is the filepath provided to ByLineSourceRange. For standard | * input, it is "Standard Input". (Use isStdin() to test for standard input, | * compare against name().) | */ | string name() const pure nothrow @safe | { 1822| return _isStdin ? "Standard Input" : _filepath; | } | | /** isStdin returns true if the underlying input source is Standard Input, false | * otherwise. 
| */ | bool isStdin() const pure nothrow @safe | { 939| return _isStdin; | } | | /** isOpen returns true if the ByLineSource instance is open for reading, false | * otherwise. | * | * "Open" in this context is whether the ByLineSource object is currently "open". | * The underlying input source backing it does not necessarily have the same | * state. The ByLineSource instance is "open" if is the front element of the | * ByLineSourceRange that created it. | * | * The underlying input source object follows the same open/close state as makes | * sense. In particular, real files are closed when the ByLineSource object is | * closed. The exception is standard input, which is never actually closed. | */ | bool isOpen() const pure nothrow @safe | { 420| return _isOpen; | } | | private void open() | { 799| assert(!_isOpen); 799| assert(!_hasBeenOpened); | 1598| _file = isStdin ? stdin : _filepath.File("rb"); 779| _byLineRange = newByLineFn(_file); 779| _isOpen = true; 779| _hasBeenOpened = true; | } | | private void close() | { 1200| if (!_isStdin) _file.close; 603| _isOpen = false; | } |} | |// ByLineSourceRange and ByLineSource |unittest |{ | import std.algorithm : all, each; | import std.array : appender; | import std.exception : assertThrown; | import std.file : rmdirRecurse; | import std.path : buildPath; | import std.range; | import std.stdio; | import tsv_utils.common.unittest_utils; | 7| auto testDir = makeUnittestTempDir("tsv_utils_byline_input_source_range"); 7| scope(exit) testDir.rmdirRecurse; | 7| string file0 = buildPath(testDir, "file0.txt"); 7| string file1 = buildPath(testDir, "file1.txt"); 7| string file2 = buildPath(testDir, "file2.txt"); 7| string file3 = buildPath(testDir, "file3.txt"); | 7| string file0Header = ""; 7| string file1Header = "file 1 header\n"; 7| string file2Header = "file 2 header\n"; 7| string file3Header = "file 3 header\n"; | 7| string file0Body = ""; 7| string file1Body = ""; 7| string file2Body = "file 2 line 1\n"; 7| string 
file3Body = "file 3 line 1\nfile 3 line 2\n"; | 7| string file0Data = file0Header ~ file0Body; 7| string file1Data = file1Header ~ file1Body; 7| string file2Data = file2Header ~ file2Body; 7| string file3Data = file3Header ~ file3Body; | | { 7| file0.File("w").write(file0Data); 7| file1.File("w").write(file1Data); 7| file2.File("w").write(file2Data); 7| file3.File("w").write(file3Data); | } | 7| auto inputFiles = [file0, file1, file2, file3]; 7| auto fileHeaders = [file0Header, file1Header, file2Header, file3Header]; 7| auto fileBodies = [file0Body, file1Body, file2Body, file3Body]; 7| auto fileData = [file0Data, file1Data, file2Data, file3Data]; | 7| auto buffer = new char[1024]; // Must be large enough to hold the test files. | | /* Tests without standard input. Don't want to count on state of standard | * input or modifying it when doing unit tests, so avoid reading from it. | */ | 7| auto readSourcesNoTerminator = appender!(ByLineSource!(No.keepTerminator)[]); 7| auto readSourcesYesTerminator = appender!(ByLineSource!(Yes.keepTerminator)[]); | 105| foreach(numFiles; 1 .. inputFiles.length + 1) | { | /* Using No.keepTerminator. */ 28| readSourcesNoTerminator.clear; 28| auto inputSourcesNoTerminator = byLineSourceRange!(No.keepTerminator)(inputFiles[0 .. numFiles]); 28| assert(inputSourcesNoTerminator.length == numFiles); | 406| foreach(fileNum, source; inputSourcesNoTerminator.enumerate) | { 70| readSourcesNoTerminator.put(source); 70| assert(source.isOpen); 70| assert(source._file.isOpen); 140| assert(readSourcesNoTerminator.data[0 .. fileNum].all!(s => !s.isOpen)); 70| assert(readSourcesNoTerminator.data[fileNum].isOpen); | 70| auto headerNoTerminatorLength = fileHeaders[fileNum].length; 112| if (headerNoTerminatorLength > 0) --headerNoTerminatorLength; | 70| assert(source.byLine.empty || 42| source.byLine.front == fileHeaders[fileNum][0 .. 
headerNoTerminatorLength]); | 70| assert(source.name == inputFiles[fileNum]); 70| assert(!source.isStdin); | 70| auto readFileData = appender!(char[]); 350| foreach(line; source.byLine) | { 70| readFileData.put(line); 70| readFileData.put('\n'); | } | 70| assert(readFileData.data == fileData[fileNum]); | } | | /* The ByLineSourceRange is a reference range, consumed by the foreach. */ 28| assert(inputSourcesNoTerminator.empty); | | /* Using Yes.keepTerminator. */ 28| readSourcesYesTerminator.clear; 28| auto inputSourcesYesTerminator = byLineSourceRange!(Yes.keepTerminator)(inputFiles[0 .. numFiles]); 28| assert(inputSourcesYesTerminator.length == numFiles); | 406| foreach(fileNum, source; inputSourcesYesTerminator.enumerate) | { 70| readSourcesYesTerminator.put(source); 70| assert(source.isOpen); 70| assert(source._file.isOpen); 140| assert(readSourcesYesTerminator.data[0 .. fileNum].all!(s => !s.isOpen)); 70| assert(readSourcesYesTerminator.data[fileNum].isOpen); | 112| assert(source.byLine.empty || source.byLine.front == fileHeaders[fileNum]); | 70| assert(source.name == inputFiles[fileNum]); 70| assert(!source.isStdin); | 70| auto readFileData = appender!(char[]); 350| foreach(line; source.byLine) | { 70| readFileData.put(line); | } | 70| assert(readFileData.data == fileData[fileNum]); | } | | /* The ByLineSourceRange is a reference range, consumed by the foreach. */ 28| assert(inputSourcesYesTerminator.empty); | } | | /* Empty filelist. */ 7| string[] nofiles; | { 7| auto sources = byLineSourceRange!(No.keepTerminator)(nofiles); 7| assert(sources.empty); | } | { 7| auto sources = byLineSourceRange!(Yes.keepTerminator)(nofiles); 7| assert(sources.empty); | } | | /* Error cases. 
*/ 14| assertThrown(byLineSourceRange!(No.keepTerminator)([file0, "no_such_file.txt"]).each); 14| assertThrown(byLineSourceRange!(Yes.keepTerminator)(["no_such_file.txt", file1]).each); |} common/src/tsv_utils/common/utils.d is 100% covered <<<<<< EOF # path=./common-src-tsv_utils-common-getopt_inorder.lst |/** |A cover for D standard library 'getopt' routine (std.getopt.getopt) function that preserves |command line argument processing order. | |This is a work-around to a limitation in getopt, in that getopt does not process arguments |in command line order. Instead, getopt processes options in the order specified in the call |to getopt. That is, the order in the text of the code. This prevents using command line |options in ways where order specified by the user is taken into account. | |More details here: https://issues.dlang.org/show_bug.cgi?id=16539 | |This should only be used when retaining order is important. Though minimized, there are |cases that don't work as expected, the most important involving option arguments starting |with a dash. See the getoptInorder function comments for specifics. | |Copyright (c) 2016-2020, eBay Inc. |Initially written by Jon Degenhardt | |License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) | |Acknowledgments: | |- Unit tests in this file have been adopted from unit tests for the D programming language | std.getopt standard library modules (https://dlang.org/phobos/std_getopt.html). | | License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) | Copyright: 2008-2015 Andrei Alexandrescu |*/ | |module tsv_utils.common.getopt_inorder; | |import std.getopt; | |/** checkForUnsupportedConfigOptions walks the option list looking for unsupported config | * options. | * | * Currently everything except std.getopt.config.required is supported. An exception is | * thrown if an unsupported config parameter is found. | * | * Note: A compile-time check would be ideal. 
That does not appear doable, as values of | * parameters cannot be read at compile-type, only the data type (template parameter part). | * The generated code creates a test against each 'config' parameter in the options list. | * (An option list contains both config and non-config parameters.) | */ |private void checkForUnsupportedConfigOptions(T...)(T opts) |{ | static if (opts.length > 0) | { | /* opts contains a mixture of types (varadic template parameter). Can | * only place tests on config option tests. | */ | static if (is(typeof(opts[0]) : std.getopt.config)) | { 2368| if (opts[0] == std.getopt.config.required) | { 14| throw new Exception( | "getoptInorder does not support std.getopt.config.required"); | } | } | 92816| checkForUnsupportedConfigOptions(opts[1..$]); | } |} | |/** hasStopOnFirstNotOption walks the config list returns true if one of the | * options in is std.getopt.config.stopOnFirstNonOption. | */ |private bool hasStopOnFirstNonOption(T...)(T opts) |{ | static if (opts.length > 0) | { | static if (is(typeof(opts[0]) : std.getopt.config)) | { 2417| if (opts[0] == std.getopt.config.stopOnFirstNonOption) return true; | } | 92900| return hasStopOnFirstNonOption(opts[1..$]); | } | else | { 998| return false; | } |} | |unittest |{ 7| int[] vals; | 7| assert(!hasStopOnFirstNonOption( | "a|aa", "aaa VAL", &vals, | "b|bb", "bbb VAL", &vals, | "c|cc", "ccc VAL", &vals, | )); | 7| assert(hasStopOnFirstNonOption( | std.getopt.config.stopOnFirstNonOption, | "a|aa", "aaa VAL", &vals, | "b|bb", "bbb VAL", &vals, | "c|cc", "ccc VAL", &vals, | )); | 7| assert(hasStopOnFirstNonOption( | "a|aa", "aaa VAL", &vals, | std.getopt.config.stopOnFirstNonOption, | "b|bb", "bbb VAL", &vals, | "c|cc", "ccc VAL", &vals, | )); | 7| assert(hasStopOnFirstNonOption( | "a|aa", "aaa VAL", &vals, | "b|bb", "bbb VAL", &vals, | std.getopt.config.stopOnFirstNonOption, | "c|cc", "ccc VAL", &vals, | )); | 7| assert(hasStopOnFirstNonOption( | "a|aa", "aaa VAL", &vals, | "b|bb", "bbb 
VAL", &vals, | "c|cc", "ccc VAL", &vals, | std.getopt.config.stopOnFirstNonOption, | )); |} | |/** getoptInorder is a cover to std.getopt that processes command line options in the | * order on the command. | * | * This is intended for command line argument processing where the order of arguments | * on the command line is important. The standard library std.getopt routine processes | * options in the order listed in call to getopt. Behavioral changes involve order of | * callback processing and array filling. | * | * Other changes from std.getopt: | * $(LIST | * * The std.getopt.config.required option is not supported. | * * Single digits cannot be used as short options. e.g. '-1' cannot be an option. | * * Non-numeric option arguments starting with a dash are not interpreted correctly, | * unless it looks like a negative number or is a single dash. Some examples, | * assuming ("--val") takes one argument: | * $(LIST | * * `["--val", "-9"]` - Okay, "-9" is arg | * * `["--val", "-"]` - Okay, "-" is arg | * * `["--val", "-a"]` - Not okay, "-a" is treated as separate option. | * ) | * ) | */ |GetoptResult getoptInorder(T...)(ref string[] args, T opts) |{ | import std.algorithm : min, remove; | import std.typecons : tuple; | | debug import std.stdio; | debug writeln("\n=========================\n"); | debug writeln("[getoptInorder] args: ", args, " opts: ", opts); | 1026| checkForUnsupportedConfigOptions(opts); 1012| bool configHasStopOnFirstNonOption = hasStopOnFirstNonOption(opts); | | bool isOption(string arg) | { | import std.string : isNumeric; | import std.ascii : isDigit; | 2717| return | (arg == std.getopt.endOfOptions) || 2680| (arg.length >= 2 && 2294| arg[0] == std.getopt.optionChar && 1052| !(arg[1].isDigit && arg.isNumeric)); | } | | /* Walk input args, passing one command option at a time to getopt. 
| * Example - Assume the args array is: | * | * ["program_name", "--foo", "--bar", "1", "--baz", "2", "3", "--goo"] | * | * The above array is passed to getopt in the following calls: | * | * ["program_name", "--foo"] | * ["program_name", "--bar", "1"] | * ["program_name", "--baz", "2", "3"] | * ["program_name", "--goo"] | * | * The same output variable references are passed each time, with the result that they | * are filled in command option order. The result of the last call to getopt is | * returned. This works because getopt is returning two pieces of info: the help | * options and whether help was wanted. The first is the same on all calls, so the | * last call is fine. The 'help wanted' status needs to be tracked, as it could issued | * any point in the command line. | * | * getopt will remove all arguments accounted for by option processing, but others will | * be passed through. These are kept as part of the command args as they are encountered. | */ 1012| GetoptResult result; 1012| bool helpWanted = false; // Need to track if help is ever requested. 1012| size_t argStart = 1; // Start at 1, index zero is program name. 1012| bool isLastCall = false; | 3626| while (!isLastCall) | { | /* This is the last call to getopt if: | * - There are zero or one args left | * - The next arg is '--' (endOfOptions), which terminates the arg string. | */ 4424| isLastCall = (args.length <= argStart + 1) || (args[argStart] == std.getopt.endOfOptions); | 2678| size_t argEnd = args.length; 2678| if (!isLastCall) | { | /* Find the next option. */ 6738| for (size_t i = argStart + 1; i < args.length; i++) | { 2717| if (isOption(args[i])) | { 1057| argEnd = i; 1057| break; | } | } | } | 2678| auto currArg = args[0..argEnd].dup; 2678| size_t currArgLength = currArg.length; | debug writeln("[getoptInorder] Calling getopt. 
args: ", currArg, " opts: ", opts); | 2678| result = getopt(currArg, opts); 2614| helpWanted |= result.helpWanted; | | debug writeln("[getoptInorder] After getopt call"); | 2614| size_t numRemoved = currArgLength - currArg.length; | 2614| if (numRemoved > 0) | { | debug import std.conv; | /* Current arg array was modified. Repeat the modification against the full | * array. Assumption in this code is that the removal occurs at the start. | * e.g. Assume the args passed to getopt are [program --foo abc def ghi]. If | * two args are consumed, assumption is the two consumed are [--foo abc] and | * [def ghi] are left as pass-through. This code could go be enhanced to | * validate the specific args removed, at present does not do this. | */ | debug writefln("[getoptInorder] Arg modified. argStart: %d, argEnd: %d, currArgLength: %d, currArg.length: %d, numRemoved: %d, currArg: %s", | argStart, argEnd, currArgLength, currArg.length, numRemoved, currArg.to!string); 1953| args = args.remove(tuple(argStart, argStart + numRemoved)); | debug writeln("[getoptInorder] Updated args: ", args); | } | 2614| size_t numPassThrough = currArgLength - (argStart + numRemoved); | 2614| if (numPassThrough > 0) | { 616| argStart += numPassThrough; 616| isLastCall |= configHasStopOnFirstNonOption; | debug writeln("[getoptInorder] argStart moved forward: ", numPassThrough, " postions."); | } | } | 948| result.helpWanted = helpWanted; | 948| return result; |} | |version(unittest) |{ | import std.exception; |} | |unittest // Issue 16539 |{ | | // Callback order 7| auto args = ["program", | "-a", "1", "-b", "2", "-c", "3", | "--cc", "4", "--bb", "5", "--aa", "6", | "-a", "7", "-b", "8", "-c", "9"]; | 7| string optionHandlerResult; | | void optionHandler(string option, string optionVal) | { 119| if (optionHandlerResult.length > 0) optionHandlerResult ~= "; "; 63| optionHandlerResult ~= option ~ "=" ~ optionVal; | } | 7| getoptInorder( | args, | "a|aa", "aaa VAL", &optionHandler, | "b|bb", "bbb 
VAL", &optionHandler, | "c|cc", "ccc VAL", &optionHandler, | ); | 7| assert(optionHandlerResult == "a|aa=1; b|bb=2; c|cc=3; c|cc=4; b|bb=5; a|aa=6; a|aa=7; b|bb=8; c|cc=9"); | | // Array population order 7| string[] cmdvals; | 7| args = ["program", | "-a", "1", "-b", "2", "-c", "3", | "--cc", "4", "--bb", "5", "--aa", "6", | "-a", "7", "-b", "8", "-c", "9"]; | 7| getoptInorder( | args, | "a|aa", "aaa VAL", &cmdvals, | "b|bb", "bbb VAL", &cmdvals, | "c|cc", "ccc VAL", &cmdvals, | ); | 7| assert(cmdvals == ["1", "2", "3", "4", "5", "6", "7", "8", "9"]); |} | |unittest // Dashes |{ 7| auto args = ["program", "-m", "-5", "-n", "-50", "-c", "-"]; | 7| int m; 7| int n; 7| char c; | 7| getoptInorder( | args, | "m|mm", "integer", &m, | "n|nn", "integer", &n, | "c|cc", "character", &c, | ); | 7| assert(m == -5); 7| assert(n == -50); 7| assert(c == '-'); |} | | |/* NOTE: The following unit tests have been adapted from unit tests in std.getopt.d | * See https://github.com/dlang/phobos/blob/master/std/getopt.d and | * https://dlang.org/phobos/std_getopt.html. 
| */ | |@system unittest |{ 7| auto args = ["prog", "--foo", "-b"]; | 7| bool foo; 7| bool bar; 7| auto rslt = getoptInorder(args, "foo|f", "Some information about foo.", &foo, "bar|b", | "Some help message about bar.", &bar); | 7| if (rslt.helpWanted) | { 0000000| defaultGetoptPrinter("Some information about the program.", | rslt.options); | } |} | |@system unittest // bugzilla 15914 |{ 7| bool opt; 7| string[] args = ["program", "-a"]; 7| getoptInorder(args, config.passThrough, 'a', &opt); 7| assert(opt); 7| opt = false; 7| args = ["program", "-a"]; 7| getoptInorder(args, 'a', &opt); 7| assert(opt); 7| opt = false; 7| args = ["program", "-a"]; 7| getoptInorder(args, 'a', "help string", &opt); 7| assert(opt); 7| opt = false; 7| args = ["program", "-a"]; 7| getoptInorder(args, config.caseSensitive, 'a', "help string", &opt); 7| assert(opt); | | version(none) | { | /* About version(none) - This case crashes, whether calling getoptInorder or simply | * getopt. Not clear why. Even converting the whole test case to getopt still results | * in failure at this line. (Implies getoptInorder is not itself the cause, but could | * involve context in which the test is run.) 
| */ | assertThrown(getoptInorder(args, "", "forgot to put a string", &opt)); | } |} | |// 5316 - arrays with arraySep |@system unittest |{ | import std.conv; | 7| arraySep = ","; 7| scope (exit) arraySep = ""; | 7| string[] names; 7| auto args = ["program.name", "-nfoo,bar,baz"]; 7| getoptInorder(args, "name|n", &names); 7| assert(names == ["foo", "bar", "baz"], to!string(names)); | 7| names = names.init; 7| args = ["program.name", "-n", "foo,bar,baz"]; 7| getoptInorder(args, "name|n", &names); 7| assert(names == ["foo", "bar", "baz"], to!string(names)); | 7| names = names.init; 7| args = ["program.name", "--name=foo,bar,baz"]; 7| getoptInorder(args, "name|n", &names); 7| assert(names == ["foo", "bar", "baz"], to!string(names)); | 7| names = names.init; 7| args = ["program.name", "--name", "foo,bar,baz"]; 7| getoptInorder(args, "name|n", &names); 7| assert(names == ["foo", "bar", "baz"], to!string(names)); |} | |// 5316 - associative arrays with arraySep |@system unittest |{ | import std.conv; | 7| arraySep = ","; 7| scope (exit) arraySep = ""; | 7| int[string] values; 7| values = values.init; 7| auto args = ["program.name", "-vfoo=0,bar=1,baz=2"]; 7| getoptInorder(args, "values|v", &values); 7| assert(values == ["foo":0, "bar":1, "baz":2], to!string(values)); | 7| values = values.init; 7| args = ["program.name", "-v", "foo=0,bar=1,baz=2"]; 7| getoptInorder(args, "values|v", &values); 7| assert(values == ["foo":0, "bar":1, "baz":2], to!string(values)); | 7| values = values.init; 7| args = ["program.name", "--values=foo=0,bar=1,baz=2"]; 7| getoptInorder(args, "values|t", &values); 7| assert(values == ["foo":0, "bar":1, "baz":2], to!string(values)); | 7| values = values.init; 7| args = ["program.name", "--values", "foo=0,bar=1,baz=2"]; 7| getoptInorder(args, "values|v", &values); 7| assert(values == ["foo":0, "bar":1, "baz":2], to!string(values)); |} | |@system unittest |{ | import std.conv; | import std.math; | 7| uint paranoid = 2; 7| string[] args = 
["program.name", "--paranoid", "--paranoid", "--paranoid"]; 7| getoptInorder(args, "paranoid+", ¶noid); 7| assert(paranoid == 5, to!(string)(paranoid)); | | enum Color { no, yes } 7| Color color; 7| args = ["program.name", "--color=yes",]; 7| getoptInorder(args, "color", &color); 7| assert(color, to!(string)(color)); | 7| color = Color.no; 7| args = ["program.name", "--color", "yes",]; 7| getoptInorder(args, "color", &color); 7| assert(color, to!(string)(color)); | 7| string data = "file.dat"; 7| int length = 24; 7| bool verbose = false; 7| args = ["program.name", "--length=5", "--file", "dat.file", "--verbose"]; 7| getoptInorder( | args, | "length", &length, | "file", &data, | "verbose", &verbose); 7| assert(args.length == 1); 7| assert(data == "dat.file"); 7| assert(length == 5); 7| assert(verbose); | | // 7| string[] outputFiles; 7| args = ["program.name", "--output=myfile.txt", "--output", "yourfile.txt"]; 7| getoptInorder(args, "output", &outputFiles); 7| assert(outputFiles.length == 2 14| && outputFiles[0] == "myfile.txt" && outputFiles[1] == "yourfile.txt"); | 7| outputFiles = []; 7| arraySep = ","; 7| args = ["program.name", "--output", "myfile.txt,yourfile.txt"]; 7| getoptInorder(args, "output", &outputFiles); 7| assert(outputFiles.length == 2 14| && outputFiles[0] == "myfile.txt" && outputFiles[1] == "yourfile.txt"); 7| arraySep = ""; | 84| foreach (testArgs; | [["program.name", "--tune=alpha=0.5", "--tune", "beta=0.6"], | ["program.name", "--tune=alpha=0.5,beta=0.6"], | ["program.name", "--tune", "alpha=0.5,beta=0.6"]]) | { 21| arraySep = ","; 21| double[string] tuningParms; 21| getoptInorder(testArgs, "tune", &tuningParms); 21| assert(testArgs.length == 1); 21| assert(tuningParms.length == 2); 21| assert(approxEqual(tuningParms["alpha"], 0.5)); 21| assert(approxEqual(tuningParms["beta"], 0.6)); 21| arraySep = ""; | } | 7| uint verbosityLevel = 1; | void myHandler(string option) | { 14| if (option == "quiet") | { 7| verbosityLevel = 0; | } | else | { 7| 
assert(option == "verbose"); 7| verbosityLevel = 2; | } | } 7| args = ["program.name", "--quiet"]; 7| getoptInorder(args, "verbose", &myHandler, "quiet", &myHandler); 7| assert(verbosityLevel == 0); 7| args = ["program.name", "--verbose"]; 7| getoptInorder(args, "verbose", &myHandler, "quiet", &myHandler); 7| assert(verbosityLevel == 2); | 7| verbosityLevel = 1; | void myHandler2(string option, string value) | { 7| assert(option == "verbose"); 7| verbosityLevel = 2; | } 7| args = ["program.name", "--verbose", "2"]; 7| getoptInorder(args, "verbose", &myHandler2); 7| assert(verbosityLevel == 2); | 7| verbosityLevel = 1; | void myHandler3() | { 7| verbosityLevel = 2; | } 7| args = ["program.name", "--verbose"]; 7| getoptInorder(args, "verbose", &myHandler3); 7| assert(verbosityLevel == 2); | 14| bool foo, bar; 7| args = ["program.name", "--foo", "--bAr"]; 7| getoptInorder(args, | std.getopt.config.caseSensitive, | std.getopt.config.passThrough, | "foo", &foo, | "bar", &bar); 7| assert(args[1] == "--bAr"); | | // test stopOnFirstNonOption | 7| args = ["program.name", "--foo", "nonoption", "--bar"]; 7| foo = bar = false; 7| getoptInorder(args, | std.getopt.config.stopOnFirstNonOption, | "foo", &foo, | "bar", &bar); 28| assert(foo && !bar && args[1] == "nonoption" && args[2] == "--bar"); | 7| args = ["program.name", "--foo", "nonoption", "--zab"]; 7| foo = bar = false; 7| getoptInorder(args, | std.getopt.config.stopOnFirstNonOption, | "foo", &foo, | "bar", &bar); 28| assert(foo && !bar && args[1] == "nonoption" && args[2] == "--zab"); | 7| args = ["program.name", "--fb1", "--fb2=true", "--tb1=false"]; 14| bool fb1, fb2; 7| bool tb1 = true; 7| getoptInorder(args, "fb1", &fb1, "fb2", &fb2, "tb1", &tb1); 21| assert(fb1 && fb2 && !tb1); | | // test keepEndOfOptions | 7| args = ["program.name", "--foo", "nonoption", "--bar", "--", "--baz"]; 7| getoptInorder(args, | std.getopt.config.keepEndOfOptions, | "foo", &foo, | "bar", &bar); 7| assert(args == ["program.name", 
"nonoption", "--", "--baz"]); | | // Ensure old behavior without the keepEndOfOptions | 7| args = ["program.name", "--foo", "nonoption", "--bar", "--", "--baz"]; 7| getoptInorder(args, | "foo", &foo, | "bar", &bar); 7| assert(args == ["program.name", "nonoption", "--baz"]); | | // test function callbacks | | static class MyEx : Exception | { 42| this() { super(""); } 42| this(string option) { this(); this.option = option; } 21| this(string option, string value) { this(option); this.value = value; } | | string option; | string value; | } | 7| static void myStaticHandler1() { throw new MyEx(); } 7| args = ["program.name", "--verbose"]; 7| try { getoptInorder(args, "verbose", &myStaticHandler1); assert(0); } 14| catch (MyEx ex) { assert(ex.option is null && ex.value is null); } | 7| static void myStaticHandler2(string option) { throw new MyEx(option); } 7| args = ["program.name", "--verbose"]; 7| try { getoptInorder(args, "verbose", &myStaticHandler2); assert(0); } 14| catch (MyEx ex) { assert(ex.option == "verbose" && ex.value is null); } | 7| static void myStaticHandler3(string option, string value) { throw new MyEx(option, value); } 7| args = ["program.name", "--verbose", "2"]; 7| try { getoptInorder(args, "verbose", &myStaticHandler3); assert(0); } 14| catch (MyEx ex) { assert(ex.option == "verbose" && ex.value == "2"); } |} | |@system unittest |{ | // From bugzilla 2142 14| bool f_linenum, f_filename; 7| string[] args = [ "", "-nl" ]; 7| getoptInorder | ( | args, | std.getopt.config.bundling, | //std.getopt.config.caseSensitive, | "linenum|l", &f_linenum, | "filename|n", &f_filename | ); 7| assert(f_linenum); 7| assert(f_filename); |} | |@system unittest |{ | // From bugzilla 6887 7| string[] p; 7| string[] args = ["", "-pa"]; 7| getoptInorder(args, "p", &p); 7| assert(p.length == 1); 7| assert(p[0] == "a"); |} | |@system unittest |{ | // From bugzilla 6888 7| int[string] foo; 7| auto args = ["", "-t", "a=1"]; 7| getoptInorder(args, "t", &foo); 7| assert(foo == 
["a":1]); |} | |@system unittest |{ | // From bugzilla 9583 7| int opt; 7| auto args = ["prog", "--opt=123", "--", "--a", "--b", "--c"]; 7| getoptInorder(args, "opt", &opt); 7| assert(args == ["prog", "--a", "--b", "--c"]); |} | |@system unittest |{ 14| string foo, bar; 7| auto args = ["prog", "-thello", "-dbar=baz"]; 7| getoptInorder(args, "t", &foo, "d", &bar); 7| assert(foo == "hello"); 7| assert(bar == "bar=baz"); | | // From bugzilla 5762 7| string a; 7| args = ["prog", "-a-0x12"]; 7| getoptInorder(args, config.bundling, "a|addr", &a); 7| assert(a == "-0x12", a); 7| args = ["prog", "--addr=-0x12"]; 7| getoptInorder(args, config.bundling, "a|addr", &a); 7| assert(a == "-0x12"); | | // From https://d.puremagic.com/issues/show_bug.cgi?id=11764 7| args = ["main", "-test"]; 7| bool opt; 7| args.getoptInorder(config.passThrough, "opt", &opt); 7| assert(args == ["main", "-test"]); | | // From https://issues.dlang.org/show_bug.cgi?id=15220 7| args = ["main", "-o=str"]; 7| string o; 7| args.getoptInorder("o", &o); 7| assert(o == "str"); | 7| args = ["main", "-o=str"]; 7| o = null; 7| args.getoptInorder(config.bundling, "o", &o); 7| assert(o == "str"); |} | |@system unittest // 5228 |{ | import std.exception; | import std.conv; | 7| auto args = ["prog", "--foo=bar"]; 7| int abc; 14| assertThrown!GetOptException(getoptInorder(args, "abc", &abc)); | 7| args = ["prog", "--abc=string"]; 14| assertThrown!ConvException(getoptInorder(args, "abc", &abc)); |} | |@system unittest // From bugzilla 7693 |{ | import std.exception; | | enum Foo { | bar, | baz | } | 7| auto args = ["prog", "--foo=barZZZ"]; 7| Foo foo; 14| assertThrown(getoptInorder(args, "foo", &foo)); 7| args = ["prog", "--foo=bar"]; 14| assertNotThrown(getoptInorder(args, "foo", &foo)); 7| args = ["prog", "--foo", "barZZZ"]; 14| assertThrown(getoptInorder(args, "foo", &foo)); 7| args = ["prog", "--foo", "baz"]; 14| assertNotThrown(getoptInorder(args, "foo", &foo)); |} | |@system unittest // same bug as 7693 only for 
bool |{ | import std.exception; | 7| auto args = ["prog", "--foo=truefoobar"]; 7| bool foo; 14| assertThrown(getoptInorder(args, "foo", &foo)); 7| args = ["prog", "--foo"]; 7| getoptInorder(args, "foo", &foo); 7| assert(foo); |} | |@system unittest |{ 7| bool foo; 7| auto args = ["prog", "--foo"]; 7| getoptInorder(args, "foo", &foo); 7| assert(foo); |} | |@system unittest |{ 7| bool foo; 7| bool bar; 7| auto args = ["prog", "--foo", "-b"]; 7| getoptInorder(args, config.caseInsensitive,"foo|f", "Some foo", &foo, | config.caseSensitive, "bar|b", "Some bar", &bar); 7| assert(foo); 7| assert(bar); |} | |@system unittest |{ 7| bool foo; 7| bool bar; 7| auto args = ["prog", "-b", "--foo", "-z"]; 7| assertThrown( 7| getoptInorder(args, config.caseInsensitive, config.required, "foo|f", "Some foo", | &foo, config.caseSensitive, "bar|b", "Some bar", &bar, | config.passThrough)); | version(none) // These tests only appy if config.required is supported. | { | assert(foo); | assert(bar); | } |} | |@system unittest |{ | import std.exception; | 7| bool foo; 7| bool bar; 7| auto args = ["prog", "-b", "-z"]; 14| assertThrown(getoptInorder(args, config.caseInsensitive, config.required, "foo|f", | "Some foo", &foo, config.caseSensitive, "bar|b", "Some bar", &bar, | config.passThrough)); |} | |@system unittest |{ | version(none) // No point running this test without config.required support. 
| { | import std.exception; | | bool foo; | bool bar; | auto args = ["prog", "--foo", "-z"]; | assertNotThrown(getoptInorder(args, config.caseInsensitive, config.required, | "foo|f", "Some foo", &foo, config.caseSensitive, "bar|b", "Some bar", | &bar, config.passThrough)); | assert(foo); | assert(!bar); | } |} | |@system unittest |{ 7| bool foo; 7| auto args = ["prog", "-f"]; 7| auto r = getoptInorder(args, config.caseInsensitive, "help|f", "Some foo", &foo); 7| assert(foo); 7| assert(!r.helpWanted); |} | |@safe unittest // implicit help option without config.passThrough |{ 7| string[] args = ["program", "--help"]; 7| auto r = getoptInorder(args); 7| assert(r.helpWanted); |} | |// Issue 13316 - std.getopt: implicit help option breaks the next argument |@system unittest |{ 7| string[] args = ["program", "--help", "--", "something"]; 7| getoptInorder(args); 7| assert(args == ["program", "something"]); | 7| args = ["program", "--help", "--"]; 7| getoptInorder(args); 7| assert(args == ["program"]); | 7| bool b; 7| args = ["program", "--help", "nonoption", "--option"]; 7| getoptInorder(args, config.stopOnFirstNonOption, "option", &b); 7| assert(args == ["program", "nonoption", "--option"]); |} | |// Issue 13317 - std.getopt: endOfOptions broken when it doesn't look like an option |@system unittest |{ 7| auto endOfOptionsBackup = endOfOptions; 7| scope(exit) endOfOptions = endOfOptionsBackup; 7| endOfOptions = "endofoptions"; 7| string[] args = ["program", "endofoptions", "--option"]; 7| bool b = false; 7| getoptInorder(args, "option", &b); 7| assert(!b); 7| assert(args == ["program", "--option"]); |} | |@system unittest |{ | import std.conv; | | import std.array; | import std.string; 7| bool a; 7| auto args = ["prog", "--foo"]; 7| auto t = getoptInorder(args, "foo|f", "Help", &a); 7| string s; 7| auto app = appender!string(); 7| defaultGetoptFormatter(app, "Some Text", t.options); | 7| string helpMsg = app.data; | //writeln(helpMsg); 7| assert(helpMsg.length); 7| 
assert(helpMsg.count("\n") == 3, to!string(helpMsg.count("\n")) ~ " " | ~ helpMsg); 7| assert(helpMsg.indexOf("--foo") != -1); 7| assert(helpMsg.indexOf("-f") != -1); 7| assert(helpMsg.indexOf("-h") != -1); 7| assert(helpMsg.indexOf("--help") != -1); 7| assert(helpMsg.indexOf("Help") != -1); | 7| string wanted = "Some Text\n-f --foo Help\n-h --help This help " | ~ "information.\n"; 7| assert(wanted == helpMsg); |} | |@system unittest |{ | version(none) // No point in running this unit test without config.required support | { | import std.conv; | import std.string; | import std.array ; | bool a; | auto args = ["prog", "--foo"]; | auto t = getoptInorder(args, config.required, "foo|f", "Help", &a); | string s; | auto app = appender!string(); | defaultGetoptFormatter(app, "Some Text", t.options); | | string helpMsg = app.data; | //writeln(helpMsg); | assert(helpMsg.length); | assert(helpMsg.count("\n") == 3, to!string(helpMsg.count("\n")) ~ " " | ~ helpMsg); | assert(helpMsg.indexOf("Required:") != -1); | assert(helpMsg.indexOf("--foo") != -1); | assert(helpMsg.indexOf("-f") != -1); | assert(helpMsg.indexOf("-h") != -1); | assert(helpMsg.indexOf("--help") != -1); | assert(helpMsg.indexOf("Help") != -1); | | string wanted = "Some Text\n-f --foo Required: Help\n-h --help " | ~ " This help information.\n"; | assert(wanted == helpMsg, helpMsg ~ wanted); | } |} | |@system unittest // Issue 14724 |{ | version(none) // No point running this unit test without config.required support | { | bool a; | auto args = ["prog", "--help"]; | GetoptResult rslt; | try | { | rslt = getoptInorder(args, config.required, "foo|f", "bool a", &a); | } | catch (Exception e) | { | enum errorMsg = "If the request for help was passed required options" ~ | "must not be set."; | assert(false, errorMsg); | } | | assert(rslt.helpWanted); | } |} common/src/tsv_utils/common/getopt_inorder.d is 99% covered <<<<<< EOF # path=./common-src-tsv_utils-common-tsvutils_version.lst |/** tsv-utils version file. 
| */ | |module tsv_utils.common.tsvutils_version; | |enum string tsvutilsVersion = "v2.1.0"; | |string tsvutilsVersionNotice (string toolName) @safe pure nothrow |{ 32| return toolName ~ " (eBay/tsv-utils) " ~ tsvutilsVersion ~ "\n" ~ q"EOS |Copyright (c) 2015-2020, eBay Inc. |https://github.com/eBay/tsv-utils |EOS"; |} | |@safe unittest |{ 7| string programName = "program.name"; 7| assert(tsvutilsVersionNotice(programName).length > programName.length); |} common/src/tsv_utils/common/tsvutils_version.d is 100% covered <<<<<< EOF # path=./tsv-join-src-tsv_utils-tsv-join.lst |/** |Command line tool that joins tab-separated value files based on a common key. | |This tool joins lines from tab-delimited files based on a common key. One file, the 'filter' |file, contains the records (lines) being matched. The other input files are searched for |matching records. Matching records are written to standard output, along with any designated |fields from the 'filter' file. In database parlance this is a 'hash semi-join'. | |Copyright (c) 2015-2020, eBay Inc. |Initially written by Jon Degenhardt | |License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) |*/ |module tsv_utils.tsv_join; | |import std.exception : enforce; |import std.stdio; |import std.format : format; |import std.range; |import std.typecons : tuple; | |auto helpText = q"EOS |Synopsis: tsv-join --filter-file file [options] [file...] | |tsv-join matches input lines (the 'data stream') against lines from a |'filter' file. The match is based on individual fields or the entire |line. Fields can be specified either by field number or field name. |Use '--help-verbose' for details. | |Options: |EOS"; | |auto helpTextVerbose = q"EOS |Synopsis: tsv-join --filter-file file [options] [file...] | |tsv-join matches input lines (the 'data stream') against lines from a |'filter' file. The match is based on exact match comparison of one or more |'key' fields. Fields are TAB delimited by default. 
Input lines are read |from files or standard input. Matching lines are written to standard |output, along with any additional fields from the filter file that have |been specified. For example: | | tsv-join --filter-file filter.tsv --key-fields 1 --append-fields 5,6 data.tsv | |This reads filter.tsv, creating a hash table keyed on field 1. Lines from |data.tsv are read one at a time. If field 1 is found in the hash table, |the line is written to standard output with fields 5 and 6 from the filter |file appended. In database parlance this is a "hash semi join". Note the |asymmetric relationship: Records in the filter file should be unique, but |lines in the data stream (data.tsv) can repeat. | |Field names can be used instead of field numbers if the files have header |lines. The following command is similar to the previous example, except |using field names: | | tsv-join -H -f filter.tsv -k ID --append-fields Date,Time data.tsv | |tsv-join can also work as a simple filter based on the whole line. This is |the default behavior. Example: | | tsv-join -f filter.tsv data.tsv | |This outputs all lines from data.tsv found in filter.tsv. | |Multiple fields can be specified as keys and append fields. Field numbers |start at one, zero represents the whole line. Fields are comma separated |and ranges can be used. Example: | | tsv-join -f filter.tsv -k 1,2 --append-fields 3-7 data.tsv | |The --e|exclude option can be used to exclude matched lines rather than |keep them. | |The joins supported are similar to the "stream-static" joins available in |Spark Structured Streaming and "KStream-KTable" joins in Kafka. The filter |file plays the same role as the Spark static dataset or Kafka KTable. | |Options: |EOS"; | |/** Container for command line options. | */ |struct TsvJoinOptions |{ | import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange, | inputSourceRange, InputSourceRange, ReadHeader; | | /* Data available the main program. 
Variables used only command line argument | * processing are local to processArgs. | */ | string programName; /// Program name | InputSourceRange inputSources; /// Input Files | ByLineSourceRange!() filterSource; /// Derived: --filter | size_t[] keyFields; /// Derived: --key-fields | size_t[] dataFields; /// Derived: --data-fields | size_t[] appendFields; /// Derived: --append-fields | bool hasHeader = false; /// --H|header | string appendHeaderPrefix = ""; /// --append-header-prefix | bool writeAll = false; /// --write-all | string writeAllValue; /// --write-all | bool exclude = false; /// --exclude | char delim = '\t'; /// --delimiter | bool allowDupliateKeys = false; /// --allow-duplicate-keys | bool keyIsFullLine = false; /// Derived: --key-fields 0 | bool dataIsFullLine = false; /// Derived: --data-fields 0 | bool appendFullLine = false; /// Derived: --append-fields 0 | | /* Returns a tuple. First value is true if command line arguments were successfully | * processed and execution should continue, or false if an error occurred or the user | * asked for help. If false, the second value is the appropriate exit code (0 or 1). | * | * Returning true (execution continues) means args have been validated and derived | * values calculated. In addition, field indices have been converted to zero-based. | * If the whole line is the key, the individual fields lists will be cleared. 
| */ | auto processArgs (ref string[] cmdArgs) | { | import std.array : split; | import std.conv : to; | import std.getopt; | import std.path : baseName, stripExtension; | import std.typecons : Yes, No; | import tsv_utils.common.fieldlist; | import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix; | 267| bool helpVerbose = false; // --help-verbose 267| bool helpFields = false; // --help-fields 267| bool versionWanted = false; // --V|version 267| string filterFile; // --filter 267| string keyFieldsArg; // --key-fields 267| string dataFieldsArg; // --data-fields 267| string appendFieldsArg; // --append-fields | 267| string keyFieldsOptionString = "k|key-fields"; 267| string dataFieldsOptionString = "d|data-fields"; 267| string appendFieldsOptionString = "a|append-fields"; | 534| programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; | | /* Handler for --write-all. Special handler so two values can be set. */ | void writeAllHandler(string option, string value) | { | debug stderr.writeln("[writeAllHandler] |", option, "| |", value, "|"); 15| writeAll = true; 15| writeAllValue = value; | } | | try | { 267| arraySep = ","; // Use comma to separate values in command line options 267| auto r = getopt( | cmdArgs, | "help-verbose", " Print full help.", &helpVerbose, | "help-fields", " Print help on specifying fields.", &helpFields, | | "f|filter-file", "FILE (Required) File with records to use as a filter.", &filterFile, | | keyFieldsOptionString, | " Fields to use as join key. 
Default: 0 (entire line).", | &keyFieldsArg, | | dataFieldsOptionString, | " Data stream fields to use as join key, if different than --key-fields.", | &dataFieldsArg, | | appendFieldsOptionString, | " Filter file fields to append to matched data stream records.", | &appendFieldsArg, | | std.getopt.config.caseSensitive, | "H|header", " Treat the first line of each file as a header.", &hasHeader, | std.getopt.config.caseInsensitive, | "p|prefix", "STR String to use as a prefix for --append-fields when writing a header line.", &appendHeaderPrefix, | "w|write-all", "STR Output all data stream records. STR is the --append-fields value when writing unmatched records.", &writeAllHandler, | "e|exclude", " Exclude matching records.", &exclude, | "delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim, | "z|allow-duplicate-keys", | " Allow duplicate keys with different append values (last entry wins).", &allowDupliateKeys, | std.getopt.config.caseSensitive, | "V|version", " Print version information and exit.", &versionWanted, | std.getopt.config.caseInsensitive, | ); | 267| if (r.helpWanted) | { 1| defaultGetoptPrinter(helpText, r.options); 1| return tuple(false, 0); | } 266| else if (helpVerbose) | { 1| defaultGetoptPrinter(helpTextVerbose, r.options); 1| return tuple(false, 0); | } 265| else if (helpFields) | { 1| writeln(fieldListHelpText); 1| return tuple(false, 0); | } 264| else if (versionWanted) | { | import tsv_utils.common.tsvutils_version; 2| writeln(tsvutilsVersionNotice("tsv-join")); 2| return tuple(false, 0); | } | | /* File arguments. | * * --filter-file required, converted to a one-element ByLineSourceRange | * * Remaining command line args are input files. 
| */ 262| enforce(filterFile.length != 0, 2| "Required option --f|filter-file was not supplied."); | 264| enforce(!(filterFile == "-" && cmdArgs.length == 1), 2| "A data file is required when standard input is used for the filter file (--f|filter-file -)."); | 516| string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"]; 258| cmdArgs.length = 1; | | /* Validation and derivations - Do as much validation prior to header line | * processing as possible (avoids waiting on stdin). | * | * Note: In tsv-join, when header processing is on, there is very little | * validatation that can be done prior to reading the header line. All the | * logic is in the fieldListArgProcessing function. | */ | 258| string[] filterFileHeaderFields; 258| string[] inputSourceHeaderFields; | | /* fieldListArgProcessing encapsulates the field list dependent processing. | * It is called prior to reading the header line if headers are not being used, | * and after if headers are being used. | */ | void fieldListArgProcessing() | { | import std.algorithm : all, each; | | /* field list parsing. 
*/ 252| if (!keyFieldsArg.empty) | { 235| keyFields = | keyFieldsArg | .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) | (hasHeader, filterFileHeaderFields, keyFieldsOptionString) | .array; | } | 242| if (!dataFieldsArg.empty) | { 86| dataFields = | dataFieldsArg | .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) | (hasHeader, inputSourceHeaderFields, dataFieldsOptionString) | .array; | } 156| else if (!keyFieldsArg.empty) | { 139| dataFields = | keyFieldsArg | .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) | (hasHeader, inputSourceHeaderFields, dataFieldsOptionString) | .array; | } | 238| if (!appendFieldsArg.empty) | { 109| appendFields = | appendFieldsArg | .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) | (hasHeader, filterFileHeaderFields, appendFieldsOptionString) | .array; | } | | /* Validations */ 234| if (writeAll) | { 15| enforce(appendFields.length != 0, 3| "Use --a|append-fields when using --w|write-all."); | 23| enforce(!(appendFields.length == 1 && appendFields[0] == 0), 3| "Cannot use '--a|append-fields 0' (whole line) when using --w|write-all."); | } | 330| enforce(!(appendFields.length > 0 && exclude), 6| "--e|exclude cannot be used with --a|append-fields."); | 257| enforce(appendHeaderPrefix.length == 0 || hasHeader, 1| "Use --header when using --p|prefix."); | 425| enforce(dataFields.length == 0 || keyFields.length == dataFields.length, 8| "Different number of --k|key-fields and --d|data-fields."); | 213| enforce(keyFields.length != 1 || 129| dataFields.length != 1 || 136| (keyFields[0] == 0 && dataFields[0] == 0) || 247| (keyFields[0] != 0 && dataFields[0] != 0), 6| "If either --k|key-field or --d|data-field is zero both must be zero."); | 412| enforce((keyFields.length <= 1 || all!(a => a != 0)(keyFields)) && 388| (dataFields.length <= 1 || all!(a => a != 0)(dataFields)) && 401| (appendFields.length <= 1 || all!(a => a != 
0)(appendFields)), 18| "Field 0 (whole line) cannot be combined with individual fields (non-zero)."); | | /* Derivations. */ | | // Convert 'full-line' field indexes (index zero) to boolean flags. 189| if (keyFields.length == 0) | { 17| assert(dataFields.length == 0); 17| keyIsFullLine = true; 17| dataIsFullLine = true; | } 289| else if (keyFields.length == 1 && keyFields[0] == 0) | { 4| keyIsFullLine = true; 4| keyFields.popFront; 4| dataIsFullLine = true; | 4| if (dataFields.length == 1) | { 4| assert(dataFields[0] == 0); 4| dataFields.popFront; | } | } | 233| if (appendFields.length == 1 && appendFields[0] == 0) | { 9| appendFullLine = true; 9| appendFields.popFront; | } | 210| assert(!(keyIsFullLine && keyFields.length > 0)); 210| assert(!(dataIsFullLine && dataFields.length > 0)); 198| assert(!(appendFullLine && appendFields.length > 0)); | | // Switch to zero-based field indexes. 419| keyFields.each!((ref a) => --a); 419| dataFields.each!((ref a) => --a); 369| appendFields.each!((ref a) => --a); | | } // End fieldListArgProcessing() | | 337| if (!hasHeader) fieldListArgProcessing(); | | /* | * Create the input source ranges for the filter file and data stream files | * and perform header line processing. | */ | 236| filterSource = byLineSourceRange([filterFile]); 466| ReadHeader readHeader = hasHeader ? 
Yes.readHeader : No.readHeader; 233| inputSources = inputSourceRange(filepaths, readHeader); | 230| if (hasHeader) | { 175| if (!filterSource.front.byLine.empty) | { 174| throwIfWindowsNewlineOnUnix(filterSource.front.byLine.front, filterSource.front.name, 1); 173| filterFileHeaderFields = filterSource.front.byLine.front.split(delim).to!(string[]); | } 174| throwIfWindowsNewlineOnUnix(inputSources.front.header, inputSources.front.name, 1); 173| inputSourceHeaderFields = inputSources.front.header.split(delim).to!(string[]); 173| fieldListArgProcessing(); | } | } | catch (Exception exc) | { 75| stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 75| return tuple(false, 1); | } 187| return tuple(true, 0); | } |} | |static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; | |/** Main program. | */ |int main(string[] cmdArgs) |{ | /* When running in DMD code coverage mode, turn on report merging. */ | version(D_Coverage) version(DigitalMars) | { | import core.runtime : dmd_coverSetMerge; 267| dmd_coverSetMerge(true); | } | 267| TsvJoinOptions cmdopt; 267| auto r = cmdopt.processArgs(cmdArgs); 347| if (!r[0]) return r[1]; 187| try tsvJoin(cmdopt); | catch (Exception exc) | { 16| stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 16| return 1; | } 171| return 0; |} | |/** tsvJoin does the primary work of the tsv-join program. | */ |void tsvJoin(ref TsvJoinOptions cmdopt) |{ | import tsv_utils.common.utils : ByLineSourceRange, bufferedByLine, BufferedOutputRange, | isFlushableOutputRange, InputFieldReordering, InputSourceRange, throwIfWindowsNewlineOnUnix; | import std.algorithm : splitter; | import std.array : join; | import std.range; | import std.conv : to; | | /* Check that the input files were setup correctly. Should have one filter file as a | * ByLineSourceRange. There should be at least one input file as an InputSourceRange. 
| */ 187| assert(cmdopt.filterSource.length == 1); | static assert(is(typeof(cmdopt.filterSource) == ByLineSourceRange!(No.keepTerminator))); | 187| assert(!cmdopt.inputSources.empty); | static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); | | /* State, variables, and convenience derivations. | * | * Combinations of individual fields and whole line (field zero) are convenient for the | * user, but create complexities for the program. Many combinations are disallowed by | * command line processing, but the remaining combos still leave several states. Also, | * this code optimizes by doing only necessary operations, further complicating state | * Here's a guide to variables and state. | * - cmdopt.keyFields, cmdopt.dataFields arrays - Individual field indexes used as keys. | * Empty if the whole line is used as a key. Must be the same length. | * - cmdopt.keyIsFullLine, cmdopt.dataIsFullLine - True when the whole line is used key. | * - cmdopt.appendFields array - Indexes of individual filter file fields being appended. | * Empty if appending the full line, or if not appending anything. | * - cmdopt.appendFullLine - True when the whole line is being appended. | * - isAppending - True is something is being appended. | * - cmdopt.writeAll - True if all lines are being written | */ | /* Convenience derivations. */ 187| auto numKeyFields = cmdopt.keyFields.length; 187| auto numAppendFields = cmdopt.appendFields.length; 365| bool isAppending = (cmdopt.appendFullLine || numAppendFields > 0); | | /* Mappings from field indexes in the input lines to collection arrays. */ 187| auto filterKeysReordering = new InputFieldReordering!char(cmdopt.keyFields); 187| auto dataKeysReordering = (cmdopt.dataFields.length == 0) ? 187| filterKeysReordering : new InputFieldReordering!char(cmdopt.dataFields); 187| auto appendFieldsReordering = new InputFieldReordering!char(cmdopt.appendFields); | | /* The master filter hash. 
The key is the delimited fields concatenated together | * (including separators). The value is the appendFields concatenated together, as | * they will be appended to the input line. Both the keys and append fields are | * assembled in the order specified, though this only required for append fields. | */ 187| string[string] filterHash; | | /* The append values for unmatched records. */ 187| char[] appendFieldsUnmatchedValue; | 187| if (cmdopt.writeAll) | { 6| assert(cmdopt.appendFields.length > 0); // Checked in consistencyValidations | | // reserve space for n values and n-1 delimiters 6| appendFieldsUnmatchedValue.reserve(cmdopt.appendFields.length * (cmdopt.writeAllValue.length + 1) - 1); | 6| appendFieldsUnmatchedValue ~= cmdopt.writeAllValue; 14| for (size_t i = 1; i < cmdopt.appendFields.length; ++i) | { 1| appendFieldsUnmatchedValue ~= cmdopt.delim; 1| appendFieldsUnmatchedValue ~= cmdopt.writeAllValue; | } | } | | /* Buffered output range for the final output. Setup here because the header line | * (if any) gets written while reading the filter file. | */ 374| auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); | | /* Read the filter file. */ | { 208| bool needPerFieldProcessing = (numKeyFields > 0) || (numAppendFields > 0); 187| auto filterStream = cmdopt.filterSource.front; 17693| foreach (lineNum, line; filterStream.byLine.enumerate(1)) | { | debug writeln("[filter line] |", line, "|"); 3469| if (needPerFieldProcessing) | { 3263| filterKeysReordering.initNewLine; 3263| appendFieldsReordering.initNewLine; | 54388| foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) | { 10876| filterKeysReordering.processNextField(fieldIndex,fieldValue); 10876| appendFieldsReordering.processNextField(fieldIndex,fieldValue); | 16645| if (filterKeysReordering.allFieldsFilled && appendFieldsReordering.allFieldsFilled) | { 3259| break; | } | } | | // Processed all fields in the line. 
6524| enforce(filterKeysReordering.allFieldsFilled && appendFieldsReordering.allFieldsFilled, 4| format("Not enough fields in line. File: %s, Line: %s", | filterStream.name, lineNum)); | } | 3465| string key = cmdopt.keyIsFullLine ? 3465| line.to!string : filterKeysReordering.outputFields.join(cmdopt.delim).to!string; 3465| string appendValues = cmdopt.appendFullLine ? 3465| line.to!string : appendFieldsReordering.outputFields.join(cmdopt.delim).to!string; | | debug writeln(" --> [key]:[append] => [", key, "]:[", appendValues, "]"); | 3646| if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filterStream.name, lineNum); | 3644| if (lineNum == 1 && cmdopt.hasHeader) | { | /* When the input has headers, the header line from the first data | * file is read during command line argument processing. Output the | * header now to push it to the next tool in the unix pipeline. This | * enables earlier error detection in downstream tools. | * | * If the input data is empty there will be no header. 
| */ 129| auto inputStream = cmdopt.inputSources.front; | 129| if (!inputStream.isHeaderEmpty) | { 128| string appendFieldsHeader; | 128| if (cmdopt.appendHeaderPrefix.length == 0) | { 94| appendFieldsHeader = appendValues; | } | else | { 603| foreach (fieldIndex, fieldValue; appendValues.splitter(cmdopt.delim).enumerate) | { 180| if (fieldIndex > 0) appendFieldsHeader ~= cmdopt.delim; 107| appendFieldsHeader ~= cmdopt.appendHeaderPrefix; 107| appendFieldsHeader ~= fieldValue; | } | } | 128| bufferedOutput.append(inputStream.header); 128| if (isAppending) | { 62| bufferedOutput.append(cmdopt.delim); 62| bufferedOutput.append(appendFieldsHeader); | } 128| bufferedOutput.appendln; 128| bufferedOutput.flush; | } | } | else | { 4902| if (isAppending && !cmdopt.allowDupliateKeys) | { 707| string* currAppendValues = (key in filterHash); | 715| enforce(currAppendValues is null || *currAppendValues == appendValues, 8| format("Duplicate keys with different append values (use --z|allow-duplicate-keys to ignore)\n [key 1][values]: [%s][%s]\n [key 2][values]: [%s][%s]", | key, *currAppendValues, key, appendValues)); | } 3327| filterHash[key] = appendValues; | } | } | | /* popFront here closes the filter file. */ 174| cmdopt.filterSource.popFront; | } | | /* Now process each input file, one line at a time. */ | 348| immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; | 879| foreach (inputStream; cmdopt.inputSources) | { 305| if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); | 26767| foreach (lineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine)) | { | debug writeln("[input line] |", line, "|"); | 5335| if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum); | | /* | * Next block checks if the input line matches a hash entry. Two cases: | * a) The whole line is the key. Simply look it up in the hash. | * b) Individual fields are used as the key - Assemble key and look it up. 
| * | * At the end of the appendFields will contain the result of hash lookup. | */ 5282| string* appendFields; 5282| if (cmdopt.keyIsFullLine) | { 609| appendFields = (line in filterHash); | } | else | { 4673| dataKeysReordering.initNewLine; 61754| foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) | { 12350| dataKeysReordering.processNextField(fieldIndex, fieldValue); 17021| if (dataKeysReordering.allFieldsFilled) break; | } | // Processed all fields in the line. 4673| enforce(dataKeysReordering.allFieldsFilled, 2| format("Not enough fields in line. File: %s, Line: %s", | inputStream.name, lineNum)); | 4671| appendFields = (dataKeysReordering.outputFields.join(cmdopt.delim) in filterHash); | } | 5280| bool matched = (appendFields !is null); | debug writeln(" --> matched? ", matched); 18267| if (cmdopt.writeAll || (matched && !cmdopt.exclude) || (!matched && cmdopt.exclude)) | { 2704| bufferedOutput.append(line); 2704| if (isAppending) | { 1593| bufferedOutput.append(cmdopt.delim); 3186| bufferedOutput.append(matched ? *appendFields : appendFieldsUnmatchedValue); | } 2704| bufferedOutput.appendln(); | } | } | } |} tsv-join/src/tsv_utils/tsv-join.d is 100% covered <<<<<< EOF # path=./tsv-pretty-src-tsv_utils-tsv-pretty.lst |/** |Command line tool that prints TSV data aligned for easier reading on consoles |and traditional command-line environments. | |Copyright (c) 2017-2020, eBay Inc. |Initially written by Jon Degenhardt | |License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) |*/ |module tsv_utils.tsv_pretty; | |import std.exception : enforce; |import std.range; |import std.stdio; |import std.typecons : Flag, Yes, No, tuple; | |static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; | |version(unittest) |{ | // When running unit tests, use main from -main compiler switch. |} |else |{ | /** Main program. Invokes command line arg processing and tsv-pretty to perform | * the real work. 
Any errors are caught and reported. | */ | int main(string[] cmdArgs) | { | /* When running in DMD code coverage mode, turn on report merging. */ | version(D_Coverage) version(DigitalMars) | { | import core.runtime : dmd_coverSetMerge; 441| dmd_coverSetMerge(true); | } | 441| TsvPrettyOptions options; 441| auto r = options.processArgs(cmdArgs); 457| if (!r[0]) return r[1]; 425| try tsvPretty(options, cmdArgs[1 .. $]); | catch (Exception exc) | { 1| stderr.writefln("Error [%s]: %s", options.programName, exc.msg); 1| return 1; | } 424| return 0; | } |} | |auto helpTextVerbose = q"EOS |Synopsis: tsv-pretty [options] [file...] | |tsv-pretty outputs TSV data in a format intended to be more human readable when |working on the command line. This is done primarily by lining up data into |fixed-width columns. Text is left aligned, numbers are right aligned. Floating |points numbers are aligned on the decimal point when feasible. | |Processing begins by reading the initial set of lines into memory to determine |the field widths and data types of each column. This look-ahead buffer is used |for header detection as well. Output begins after this processing is complete. | |By default, only the alignment is changed, the actual values are not modified. |Several of the formatting options do modify the values. | |Features: | |* Floating point numbers: Floats can be printed in fixed-width precision, using | the same precision for all floats in a column. This makes then line up nicely. | Precision is determined by values seen during look-ahead processing. The max | precision defaults to 9, this can be changed when smaller or larger values are | desired. See the '--f|format-floats' and '--p|precision' options. | |* Header lines: Headers are detected automatically when possible. This can be | overridden when automatic detection doesn't work as desired. Headers can be | underlined and repeated at regular intervals. | |* Missing values: A substitute value can be used for empty fields. 
This is often | less confusing than spaces. See '--e|replace-empty' and '--E|empty-replacement'. | |* Exponential notion: As part float formatting, '--f|format-floats' re-formats | columns where exponential notation is found so all the values in the column | are displayed using exponential notation with the same precision. | |* Preamble: A number of initial lines can be designated as a preamble and output | unchanged. The preamble is before the header, if a header is present. Preamble | lines can be auto-detected via the heuristic that they lack field delimiters. | This works well when the field delimiter is a TAB. | |* Fonts: Fixed-width fonts are assumed. CJK characters are assumed to be double | width. This is not always correct, but works well in most cases. | |Options: |EOS"; | |auto helpText = q"EOS |Synopsis: tsv-pretty [options] [file...] | |tsv-pretty outputs TSV data in a more human readable format. This is done by lining |up data into fixed-width columns. Text is left aligned, numbers are right aligned. |Floating points numbers are aligned on the decimal point when feasible. | |Options: |EOS"; | |/** TsvPrettyOptions is used to process and store command line options. */ |struct TsvPrettyOptions |{ | string programName; | bool helpVerbose = false; // --help-verbose | bool hasHeader = false; // --H|header (Note: Default false assumed by validation code) | bool autoDetectHeader = true; // Derived (Note: Default true assumed by validation code) | bool noHeader = false; // --x|no-header (Note: Default false assumed by validation code) | size_t lookahead = 1000; // --l|lookahead | size_t repeatHeader = 0; // --r|repeat-header num (zero means no repeat) | bool underlineHeader = false; // --u|underline-header | bool formatFloats = false; // --f|format-floats | size_t floatPrecision = 9; // --p|precision num (max precision when formatting floats.) 
| bool replaceEmpty = false; // --e|replace-empty | string emptyReplacement = ""; // --E|empty-replacement | size_t emptyReplacementPrintWidth = 0; // Derived | char delim = '\t'; // --d|delimiter | size_t spaceBetweenFields = 2; // --s|space-between-fields num | size_t maxFieldPrintWidth = 40; // --m|max-text-width num; Max width for variable width text fields. | bool autoDetectPreamble = false; // --a|auto-preamble | size_t preambleLines = 0; // --b|preamble; Number of preamble lines. | bool versionWanted = false; // --V|version | | /* Returns a tuple. First value is true if command line arguments were successfully | * processed and execution should continue, or false if an error occurred or the user | * asked for help. If false, the second value is the appropriate exit code (0 or 1). | * | * Returning true (execution continues) means args have been validated and derived | * values calculated. In addition, field indices have been converted to zero-based. | * If the whole line is the key, the individual fields list will be cleared. | */ | auto processArgs (ref string[] cmdArgs) | { | import std.algorithm : any, each; | import std.getopt; | import std.path : baseName, stripExtension; | 882| programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; | | try | { 441| arraySep = ","; // Use comma to separate values in command line options 441| auto r = getopt( | cmdArgs, | "help-verbose", " Print full help.", &helpVerbose, | std.getopt.config.caseSensitive, | "H|header", " Treat the first line of each file as a header.", &hasHeader, | std.getopt.config.caseInsensitive, | "x|no-header", " Assume no header. Turns off automatic header detection.", &noHeader, | "l|lookahead", "NUM Lines to read to interpret data before generating output. Default: 1000", &lookahead, | | "r|repeat-header", "NUM Lines to print before repeating the header. 
Default: No repeating header", &repeatHeader, | | "u|underline-header", " Underline the header.", &underlineHeader, | "f|format-floats", " Format floats for better readability. Default: No", &formatFloats, | "p|precision", "NUM Max floating point precision. Implies --format-floats. Default: 9", &floatPrecisionOptionHandler, | std.getopt.config.caseSensitive, | "e|replace-empty", " Replace empty fields with '--'.", &replaceEmpty, | "E|empty-replacement", "STR Replace empty fields with a string.", &emptyReplacement, | std.getopt.config.caseInsensitive, | "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim, | "s|space-between-fields", "NUM Spaces between each field (Default: 2)", &spaceBetweenFields, | "m|max-text-width", "NUM Max reserved field width for variable width text fields. Default: 40", &maxFieldPrintWidth, | "a|auto-preamble", " Treat initial lines in a file as a preamble if the line contains no field delimiters.", &autoDetectPreamble, | "b|preamble", "NUM Treat the first NUM lines as a preamble and output them unchanged.", &preambleLines, | std.getopt.config.caseSensitive, | "V|version", " Print version information and exit.", &versionWanted, | std.getopt.config.caseInsensitive, | ); | 434| if (r.helpWanted) | { 2| defaultGetoptPrinter(helpText, r.options); 2| return tuple(false, 0); | } 432| else if (helpVerbose) | { 2| defaultGetoptPrinter(helpTextVerbose, r.options); 2| return tuple(false, 0); | } 430| else if (versionWanted) | { | import tsv_utils.common.tsvutils_version; 2| writeln(tsvutilsVersionNotice("tsv-pretty")); 2| return tuple(false, 0); | } | | /* Validation and derivations. */ 532| enforce(!(noHeader && hasHeader), 1| "Cannot specify both --H|header and --x|no-header."); | 949| if (noHeader || hasHeader) autoDetectHeader = false; | | /* Zero look-ahead has limited utility unless the first line is known to | * be a header. Good chance the user will get an unintended behavior. 
| */ 439| if (lookahead == 0 && autoDetectHeader) | { 2| enforce(noHeader || hasHeader, 1| "Cannot auto-detect header with zero look-ahead. Specify either '--H|header' or '--x|no-header' when using '--l|lookahead 0'."); | } | 466| enforce(!(autoDetectPreamble && preambleLines != 0), 1| "Do not use '--b|preamble NUM' and '--a|auto-preamble' together. ('--b|preamble 0' is okay.)"); | 435| if (emptyReplacement.length != 0) replaceEmpty = true; 423| else if (replaceEmpty) emptyReplacement = "--"; | 425| if (emptyReplacement.length != 0) | { 18| emptyReplacementPrintWidth = emptyReplacement.monospacePrintWidth; | } | } | catch (Exception exc) | { 10| stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 10| return tuple(false, 1); | } 425| return tuple(true, 0); | } | | /* Option handler for --p|precision. It also sets --f|format-floats. */ | private void floatPrecisionOptionHandler(string option, string optionVal) @safe pure | { | import std.conv : to; 57| floatPrecision = optionVal.to!size_t; 56| formatFloats = true; | } |} | |/** tsvPretty is the main loop, operating on input files and passing control to a | * TSVPrettyProccessor instance. | * | * This separates physical I/O sources and sinks from the underlying processing | * algorithm, which operates on generic ranges. A lockingTextWriter is created and | * released on every input line. This has effect flushing standard output every line, | * desirable in command line tools. | * | * This routine also handles identification of preamble lines. This is mostly for | * simplification of the TsvPrettyProcessor code. | */ |void tsvPretty(const ref TsvPrettyOptions options, const string[] files) |{ | import std.algorithm : canFind; | 425| auto firstNonPreambleLine = options.preambleLines + 1; 425| auto tpp = TsvPrettyProcessor(options); 3357| foreach (filename; (files.length > 0) ? files : ["-"]) | { 553| bool autoDetectPreambleDone = false; 1658| auto inputStream = (filename == "-") ? 
stdin : filename.File(); 15861| foreach (lineNum, line; inputStream.byLine.enumerate(1)) | { 2841| bool isPreambleLine = false; 2841| bool isFirstNonPreambleLine = false; | 2841| if (options.autoDetectPreamble) | { 222| if (!autoDetectPreambleDone) | { 123| if (line.canFind(options.delim)) | { 55| autoDetectPreambleDone = true; 55| isFirstNonPreambleLine = true; | } | else | { 68| isPreambleLine = true; | } | } | } 2619| else if (lineNum < firstNonPreambleLine) | { 63| isPreambleLine = true; | } 2556| else if (lineNum == firstNonPreambleLine) | { 489| isFirstNonPreambleLine = true; | } | | 2841| if (isPreambleLine) | { 131| tpp.processPreambleLine(outputRangeObject!(char, char[])(stdout.lockingTextWriter), line); | } 2710| else if (isFirstNonPreambleLine) | { 544| tpp.processFileFirstLine(outputRangeObject!(char, char[])(stdout.lockingTextWriter), line); | } | else | { 2166| tpp.processLine(outputRangeObject!(char, char[])(stdout.lockingTextWriter), line); | } | } | } 424| tpp.finish(outputRangeObject!(char, char[])(stdout.lockingTextWriter)); |} | |/** TsvPrettyProcessor maintains state of processing and exposes operations for | * processing individual input lines. | * | * TsvPrettyProcessor knows that input is file-based, but doesn't deal with actual | * files or reading lines from input. That is the job of the caller. Output is | * written to an output range. The caller is expected to pass each line to in the | * order received, that is an assumption built-into the its processing. | * | * In addition to the constructor, there are four API methods: | * - processPreambleLine - Called to process a preamble line occurring before | * the header line or first line of data. | * - processFileFirstLine - Called to process the first line of each file. This | * enables header processing. | * - processLine - Called to process all lines except for the first line a file. | * - finish - Called at the end of all processing. 
This is needed in case the | * look-ahead cache is still being filled when input terminates. | */ | |struct TsvPrettyProcessor |{ | import std.array : appender; | |private: | private enum AutoDetectHeaderResult { none, hasHeader, noHeader }; | | private TsvPrettyOptions _options; | private size_t _fileCount = 0; | private size_t _dataLineOutputCount = 0; | private bool _stillCaching = true; | private string _candidateHeaderLine; | private auto _lookaheadCache = appender!(string[])(); | private FieldFormat[] _fieldVector; | private AutoDetectHeaderResult _autoDetectHeaderResult = AutoDetectHeaderResult.none; | | /** Constructor. */ 850| this(const TsvPrettyOptions options) @safe pure nothrow @nogc | { 425| _options = options; 534| if (options.noHeader && options.lookahead == 0) _stillCaching = false; | } | | invariant | { 982| assert(_options.hasHeader || _options.noHeader || _options.autoDetectHeader); 436| assert((_options.lookahead == 0 && _lookaheadCache.data.length == 0) || 414| _lookaheadCache.data.length < _options.lookahead); | } | | /** Called to process a preamble line occurring before the header line or first | * line of data. | */ | void processPreambleLine(OutputRange!char outputStream, const char[] line) | { 131| if (_fileCount == 0) | { 83| put(outputStream, line); 83| put(outputStream, '\n'); | } | } | | /** Called to process the first line of each file. This enables header processing. 
*/ | void processFileFirstLine(OutputRange!char outputStream, const char[] line) | { | import std.conv : to; | 544| _fileCount++; | 544| if (_options.noHeader) | { 127| processLine(outputStream, line); | } 417| else if (_options.hasHeader) | { 107| if (_fileCount == 1) | { 95| setHeaderLine(line); 100| if (_options.lookahead == 0) outputLookaheadCache(outputStream); | } | } | else | { 310| assert(_options.autoDetectHeader); | 310| final switch (_autoDetectHeaderResult) | { 10| case AutoDetectHeaderResult.noHeader: 10| assert(_fileCount > 1); 10| processLine(outputStream, line); 10| break; | 22| case AutoDetectHeaderResult.hasHeader: 22| assert(_fileCount > 1); 22| break; | 278| case AutoDetectHeaderResult.none: 278| if (_fileCount == 1) | { 225| assert(_candidateHeaderLine.length == 0); 225| _candidateHeaderLine = line.to!string; | } 53| else if (_fileCount == 2) | { 53| if (_candidateHeaderLine == line) | { 39| _autoDetectHeaderResult = AutoDetectHeaderResult.hasHeader; 39| setHeaderLine(_candidateHeaderLine); | | /* Edge case: First file has only a header line and look-ahead set to zero. */ 78| if (_stillCaching && _options.lookahead == 0) outputLookaheadCache(outputStream); | } | else | { 14| _autoDetectHeaderResult = AutoDetectHeaderResult.noHeader; 14| updateFieldFormatsForLine(_candidateHeaderLine); 14| processLine(outputStream, line); | } | } 278| break; | } | } | } | | /** Called to process all lines except for the first line a file. */ | void processLine(OutputRange!char outputStream, const char[] line) | { 4061| if (_stillCaching) cacheDataLine(outputStream, line); 573| else outputDataLine(outputStream, line); | } | | /** Called at the end of all processing. This is needed in case the look-ahead cache | * is still being filled when input terminates. | */ | void finish(OutputRange!char outputStream) | { 701| if (_stillCaching) outputLookaheadCache(outputStream); | } | |private: | /* outputLookaheadCache finalizes processing of the lookahead cache. 
This includes | * Setting the type and width of each field, finalizing the auto-detect header | * decision, and outputing all lines in the cache. | */ | void outputLookaheadCache(OutputRange!char outputStream) | { | import std.algorithm : splitter; | 418| assert(_stillCaching); | 418| if (_options.autoDetectHeader && 226| _autoDetectHeaderResult == AutoDetectHeaderResult.none && 173| _candidateHeaderLine.length != 0) | { 172| if (candidateHeaderLooksLikeHeader()) | { 89| _autoDetectHeaderResult = AutoDetectHeaderResult.hasHeader; 89| setHeaderLine(_candidateHeaderLine); | } | else | { 83| _autoDetectHeaderResult = AutoDetectHeaderResult.noHeader; | } | } | | 418| if (_options.hasHeader || 549| (_options.autoDetectHeader && _autoDetectHeaderResult == AutoDetectHeaderResult.hasHeader)) | { 223| finalizeFieldFormatting(); 223| outputHeader(outputStream); | } 293| else if (_options.autoDetectHeader && _autoDetectHeaderResult == AutoDetectHeaderResult.noHeader && 97| _candidateHeaderLine.length != 0) | { 97| updateFieldFormatsForLine(_candidateHeaderLine); 97| finalizeFieldFormatting(); 97| outputDataLine(outputStream, _candidateHeaderLine); | } | else | { 98| finalizeFieldFormatting(); | } | 8230| foreach(line; _lookaheadCache.data) outputDataLine(outputStream, line); 418| _lookaheadCache.clear; 418| _stillCaching = false; | } | | bool candidateHeaderLooksLikeHeader() @safe | { | import std.algorithm : splitter; | | /* The candidate header is declared as the header if the look-ahead cache has at least | * one numeric field that is text in the candidate header. 
| */ 2656| foreach(fieldIndex, fieldValue; _candidateHeaderLine.splitter(_options.delim).enumerate) | { 498| auto candidateFieldFormat = FieldFormat(fieldIndex); 498| candidateFieldFormat.updateForFieldValue(fieldValue, _options); 498| if (_fieldVector.length > fieldIndex && 463| candidateFieldFormat.fieldType == FieldType.text && 319| (_fieldVector[fieldIndex].fieldType == FieldType.integer || 266| _fieldVector[fieldIndex].fieldType == FieldType.floatingPoint || 230| _fieldVector[fieldIndex].fieldType == FieldType.exponent)) | { 89| return true; | } | } | 83| return false; | } | | void setHeaderLine(const char[] line) @safe | { | import std.algorithm : splitter; | 5541| foreach(fieldIndex, header; line.splitter(_options.delim).enumerate) | { 1539| if (_fieldVector.length == fieldIndex) _fieldVector ~= FieldFormat(fieldIndex); 1019| assert(_fieldVector.length > fieldIndex); 1019| _fieldVector[fieldIndex].setHeader(header); | } | } | | void cacheDataLine(OutputRange!char outputStream, const char[] line) | { | import std.conv : to; | 1744| assert(_lookaheadCache.data.length < _options.lookahead); | 1744| _lookaheadCache ~= line.to!string; 1744| updateFieldFormatsForLine(line); 1880| if (_lookaheadCache.data.length == _options.lookahead) outputLookaheadCache(outputStream); | } | | void updateFieldFormatsForLine(const char[] line) @safe | { | import std.algorithm : splitter; | 44640| foreach(fieldIndex, fieldValue; line.splitter(_options.delim).enumerate) | { 9521| if (_fieldVector.length == fieldIndex) _fieldVector ~= FieldFormat(fieldIndex); 8186| assert(_fieldVector.length > fieldIndex); 8186| _fieldVector[fieldIndex].updateForFieldValue(fieldValue, _options); | } | | } | | void finalizeFieldFormatting() @safe pure @nogc nothrow | { 418| size_t nextFieldStart = 0; 6819| foreach(ref field; _fieldVector) | { 1855| nextFieldStart = field.finalizeFormatting(nextFieldStart, _options) + _options.spaceBetweenFields; | } | } | | void outputHeader(OutputRange!char 
outputStream) | { 229| size_t nextOutputPosition = 0; 5783| foreach(fieldIndex, ref field; _fieldVector.enumerate) | { 1065| size_t spacesNeeded = field.startPosition - nextOutputPosition; 1065| put(outputStream, repeat(" ", spacesNeeded)); 1065| nextOutputPosition += spacesNeeded; 1065| nextOutputPosition += field.writeHeader(outputStream, _options); | } 229| put(outputStream, '\n'); | 229| if (_options.underlineHeader) | { 53| nextOutputPosition = 0; 1476| foreach(fieldIndex, ref field; _fieldVector.enumerate) | { 274| size_t spacesNeeded = field.startPosition - nextOutputPosition; 274| put(outputStream, repeat(" ", spacesNeeded)); 274| nextOutputPosition += spacesNeeded; 274| nextOutputPosition += field.writeHeader!(Yes.writeUnderline)(outputStream, _options); | } 53| put(outputStream, '\n'); | } | } | | void outputDataLine(OutputRange!char outputStream, const char[] line) | { | import std.algorithm : splitter; | | /* Repeating header option. */ 2453| if (_options.repeatHeader != 0 && _dataLineOutputCount != 0 && 57| (_options.hasHeader || (_options.autoDetectHeader && 9| _autoDetectHeaderResult == AutoDetectHeaderResult.hasHeader)) && 20| _dataLineOutputCount % _options.repeatHeader == 0) | { 6| put(outputStream, '\n'); 6| outputHeader(outputStream); | } | 2414| _dataLineOutputCount++; | 2414| size_t nextOutputPosition = 0; 57958| foreach(fieldIndex, fieldValue; line.splitter(_options.delim).enumerate) | { 10626| if (fieldIndex == _fieldVector.length) | { | /* Line is longer than any seen while caching. Add a new FieldFormat entry | * and set the line formatting based on this field value. | */ 39| _fieldVector ~= FieldFormat(fieldIndex); 39| size_t startPosition = (fieldIndex == 0) ? 
6| 0 : 33| _fieldVector[fieldIndex - 1].endPosition + _options.spaceBetweenFields; | 39| _fieldVector[fieldIndex].updateForFieldValue(fieldValue, _options); 39| _fieldVector[fieldIndex].finalizeFormatting(startPosition, _options); | } | 10626| assert(fieldIndex < _fieldVector.length); | 10626| FieldFormat fieldFormat = _fieldVector[fieldIndex]; 10626| size_t nextFieldStart = fieldFormat.startPosition; 10626| size_t spacesNeeded = (nextOutputPosition < nextFieldStart) ? 7518| nextFieldStart - nextOutputPosition : 6216| (fieldIndex == 0) ? 0 : 1; // Previous field went long. One space between fields | 10626| put(outputStream, repeat(" ", spacesNeeded)); 10626| nextOutputPosition += spacesNeeded; 10626| nextOutputPosition += fieldFormat.writeFieldValue(outputStream, nextOutputPosition, fieldValue, _options); | } 2414| put(outputStream, '\n'); | } |} | |/** Field types recognized and tracked by tsv-pretty processing. */ |enum FieldType { unknown, text, integer, floatingPoint, exponent }; | |/** Field alignments used by tsv-pretty processing. */ |enum FieldAlignment { left, right }; | |/** FieldFormat holds all the formatting info needed to format data values in a specific | * column. e.g. Field 1 may be text, field 2 may be a float, etc. This is calculated | * during the caching phase. Each FieldFormat instance is part of a vector representing | * the full row, so each includes the start position on the line and similar data. | * | * APIs used during the caching phase to gather field value samples | * - this - Initial construction. Takes the field index. | * - setHeader - Used to set the header text. | * - updateForFieldValue - Used to add the next field value sample. | * - finalizeFormatting - Used at the end of caching to finalize the format choices. | * | * APIs used after caching is finished (after finalizeFormatting): | * - startPosition - Returns the expected start position for the field. | * - endPosition - Returns the expected end position for the field. 
| * - writeHeader - Outputs the header, properly aligned. | * - writeFieldValue - Outputs the current field value, properly aligned. | */ | |struct FieldFormat |{ |private: | size_t _fieldIndex; // Zero-based index in the line | string _header = ""; // Original field header | size_t _headerPrintWidth = 0; | FieldType _type = FieldType.unknown; | FieldAlignment _alignment = FieldAlignment.left; | size_t _startPosition = 0; | size_t _printWidth = 0; | size_t _precision = 0; // Number of digits after the decimal point | | /* These are used while doing initial type and print format detection. */ | size_t _minRawPrintWidth = 0; | size_t _maxRawPrintWidth = 0; | size_t _maxDigitsBeforeDecimal = 0; | size_t _maxDigitsAfterDecimal = 0; | size_t _maxSignificantDigits = 0; // Digits to include in exponential notation | |public: | | /** Initial construction. Takes a field index. */ 2392| this(size_t fieldIndex) @safe pure nothrow @nogc | { 2392| _fieldIndex = fieldIndex; | } | | /** Sets the header text. */ | void setHeader(const char[] header) @safe | { | import std.conv : to; | 1019| _header = header.to!string; 1019| _headerPrintWidth = _header.monospacePrintWidth; | } | | /** Returns the expected start position for the field. */ | size_t startPosition() nothrow pure @safe @property | { 11965| return _startPosition; | } | | /** Returns the expected end position for the field. */ | size_t endPosition() nothrow pure @safe @property | { 33| return _startPosition + _printWidth; | } | | /** Returns the type of field. */ | FieldType fieldType() nothrow pure @safe @property | { 1278| return _type; | } | | /** Writes the field header or underline characters to the output stream. | * | * The current output position should have been written up to the field's start position, | * including any spaces between fields. Unlike data fields, there is no need to correct | * for previous fields that have run long. This routine does not output trailing spaces. 
| * This makes it simpler for lines to avoid unnecessary trailing spaces. | * | * Underlines can either be written the full width of the field or the just under the | * text of the header. At present this is a template parameter (compile-time). | * | * The print width of the output is returned. | */ | size_t writeHeader (Flag!"writeUnderline" writeUnderline = No.writeUnderline, | Flag!"fullWidthUnderline" fullWidthUnderline = No.fullWidthUnderline) | (OutputRange!char outputStream, const ref TsvPrettyOptions options) | { | import std.range : repeat; | 1339| size_t positionsWritten = 0; 1339| if (_headerPrintWidth > 0) | { | static if (writeUnderline) | { | static if (fullWidthUnderline) | { | put(outputStream, repeat("-", _printWidth)); | positionsWritten += _printWidth; | } | else // Underline beneath the header text only | { 274| if (_alignment == FieldAlignment.right) | { 72| put(outputStream, repeat(" ", _printWidth - _headerPrintWidth)); 72| positionsWritten += _printWidth - _headerPrintWidth; | } 274| put(outputStream, repeat("-", _headerPrintWidth)); 274| positionsWritten += _headerPrintWidth; | } | } | else | { 1049| if (_alignment == FieldAlignment.right) | { 488| put(outputStream, repeat(" ", _printWidth - _headerPrintWidth)); 488| positionsWritten += _printWidth - _headerPrintWidth; | } 1049| put(outputStream, _header); 1049| positionsWritten += _headerPrintWidth; | } | } 1339| return positionsWritten; | } | | /** Writes the field value for the current column. | * | * The caller needs to generate output at least to the column's start position, but | * can go beyond if previous fields have run long. | * | * The field value is aligned properly in the field. Either left aligned (text) or | * right aligned (numeric). Floating point fields are both right aligned and | * decimal point aligned. The number of bytes written is returned. Trailing spaces | * are not added, the caller must add any necessary trailing spaces prior to | * printing the next field. 
| */ | size_t writeFieldValue(OutputRange!char outputStream, size_t currPosition, | const char[] fieldValue, in ref TsvPrettyOptions options) | in | { 10626| assert(currPosition >= _startPosition); // Caller resposible for advancing to field start position. 15456| assert(_type == FieldType.text || _type == FieldType.integer || 3099| _type == FieldType.floatingPoint || _type == FieldType.exponent); | } | do | { | import std.algorithm : find, max, min; | import std.conv : to, ConvException; | import std.format : format; | | /* Create the print version of the string. Either the raw value or a formatted | * version of a float. | */ 10626| string printValue; 14900| if (!options.formatFloats || _type == FieldType.text || _type == FieldType.integer) | { 9610| printValue = fieldValue.to!string; | } | else | { 1016| assert(options.formatFloats); 1848| assert(_type == FieldType.exponent || _type == FieldType.floatingPoint); | 1016| if (_type == FieldType.exponent) | { 184| printValue = fieldValue.formatExponentValue(_precision); | } | else | { 832| printValue = fieldValue.formatFloatingPointValue(_precision); | } | } | 11041| if (printValue.length == 0 && options.replaceEmpty) printValue = options.emptyReplacement; 10626| size_t printValuePrintWidth = printValue.monospacePrintWidth; | | /* Calculate leading spaces needed for right alignment. */ 10626| size_t leadingSpaces = 0; 10626| if (_alignment == FieldAlignment.right) | { | /* Target width adjusts the column width to account for overrun by the previous field. */ 4830| size_t targetWidth; 4830| if (currPosition == _startPosition) | { 4495| targetWidth = _printWidth; | } | else | { 335| size_t startGap = currPosition - _startPosition; 335| targetWidth = max(printValuePrintWidth, 335| startGap < _printWidth ? _printWidth - startGap : 0); | } | 4830| leadingSpaces = (printValuePrintWidth < targetWidth) ? 
4830| targetWidth - printValuePrintWidth : 0; | | /* The above calculation assumes the print value is fully right aligned. | * This is not correct when raw value floats are being used rather than | * formatted floats, as different values will have different precision. | * The next adjustment accounts for this, dropping leading spaces as | * needed to align the decimal point. Note that text and exponential | * values get aligned strictly against right boundaries. | */ 7881| if (leadingSpaces > 0 && _precision > 0 && 2883| _type == FieldType.floatingPoint && !options.formatFloats) | { | import std.algorithm : canFind, findSplit; | import std.string : isNumeric; | 11132| if (printValue.isNumeric && !printValue.canFind!(x => x == 'e' || x == 'E')) | { 990| size_t decimalAndDigitsLength = printValue.find(".").length; 990| size_t trailingSpaces = 271| (decimalAndDigitsLength == 0) ? _precision + 1 : 1057| (decimalAndDigitsLength > _precision) ? 0 : 381| _precision + 1 - decimalAndDigitsLength; | 990| leadingSpaces = (leadingSpaces > trailingSpaces) ? 990| leadingSpaces - trailingSpaces : 0; | } | } | } 10626| put(outputStream, repeat(' ', leadingSpaces)); 10626| put(outputStream, printValue); 10626| return printValuePrintWidth + leadingSpaces; | } | | /** Updates type and format given a new field value. | * | * This is called during look-ahead caching to register a new sample value for the | * column. The key components updates are field type and print width. | */ | void updateForFieldValue(const char[] fieldValue, const ref TsvPrettyOptions options) @safe | { | import std.algorithm : findAmong, findSplit, max, min; | import std.conv : to, ConvException; | import std.string : isNumeric; | 8723| size_t fieldValuePrintWidth = fieldValue.monospacePrintWidth; 8723| size_t fieldValuePrintWidthWithEmpty = 225| (fieldValuePrintWidth == 0 && options.replaceEmpty) ? 
40| options.emptyReplacementPrintWidth : 8683| fieldValuePrintWidth; | 8723| _maxRawPrintWidth = max(_maxRawPrintWidth, fieldValuePrintWidthWithEmpty); 8723| _minRawPrintWidth = (_minRawPrintWidth == 0) ? 2493| fieldValuePrintWidthWithEmpty : 6230| min(_minRawPrintWidth, fieldValuePrintWidthWithEmpty); | 8723| if (_type == FieldType.text) | { | /* Already text, can't become anything else. */ | } 5046| else if (fieldValuePrintWidth == 0) | { | /* Don't let an empty field override a numeric field type. */ | } 5046| else if (!fieldValue.isNumeric) | { | /* Not parsable as a number. Switch from unknown or numeric type to text. */ 1296| _type = FieldType.text; | } | else | { | /* Field type is currently unknown or numeric, and current field parses as numeric. | * See if it parses as integer or float. Integers will parse as floats, so try | * integer types first. | */ 3750| FieldType parsesAs = FieldType.unknown; 3750| long longValue; 3750| ulong ulongValue; 3750| double doubleValue; | try | { 3750| longValue = fieldValue.to!long; 1881| parsesAs = FieldType.integer; | } | catch (ConvException) | { | try | { 1869| ulongValue = fieldValue.to!ulong; 0000000| parsesAs = FieldType.integer; | } | catch (ConvException) | { | try | { 1869| doubleValue = fieldValue.to!double; | import std.algorithm : findAmong; 1869| parsesAs = (fieldValue.findAmong("eE").length == 0) ? 1869| FieldType.floatingPoint : FieldType.exponent; | } | catch (ConvException) | { | /* Note: This means isNumeric thinks it's a number, but conversions all failed. */ 0000000| parsesAs = FieldType.text; | } | } | } | 3750| if (parsesAs == FieldType.text) | { | /* Not parsable as a number (despite isNumeric result). Switch to text type. */ 0000000| _type = FieldType.text; | } 3750| else if (parsesAs == FieldType.exponent) | { | /* Exponential notion supersedes both vanilla floats and integers. 
*/ 95| _type = FieldType.exponent; 95| _maxSignificantDigits = max(_maxSignificantDigits, fieldValue.significantDigits); | 95| if (auto decimalSplit = fieldValue.findSplit(".")) | { 80| auto fromExponent = decimalSplit[2].findAmong("eE"); 80| size_t numDigitsAfterDecimal = decimalSplit[2].length - fromExponent.length; 80| _maxDigitsBeforeDecimal = max(_maxDigitsBeforeDecimal, decimalSplit[0].length); 80| _maxDigitsAfterDecimal = max(_maxDigitsAfterDecimal, numDigitsAfterDecimal); | } | else | { | /* Exponent without a decimal point. */ 15| auto fromExponent = fieldValue.findAmong("eE"); 15| assert(fromExponent.length > 0); 15| size_t numDigits = fieldValue.length - fromExponent.length; 15| _maxDigitsBeforeDecimal = max(_maxDigitsBeforeDecimal, numDigits); | } | } 3655| else if (parsesAs == FieldType.floatingPoint) | { | /* Floating point supercedes integer but not exponential. */ 3422| if (_type != FieldType.exponent) _type = FieldType.floatingPoint; 1774| _maxSignificantDigits = max(_maxSignificantDigits, fieldValue.significantDigits); | 1774| if (auto decimalSplit = fieldValue.findSplit(".")) | { 1705| _maxDigitsBeforeDecimal = max(_maxDigitsBeforeDecimal, decimalSplit[0].length); 1705| _maxDigitsAfterDecimal = max(_maxDigitsAfterDecimal, decimalSplit[2].length); | } | } | else | { 1881| assert(parsesAs == FieldType.integer); 3634| if (_type != FieldType.floatingPoint) _type = FieldType.integer; 1881| _maxSignificantDigits = max(_maxSignificantDigits, fieldValue.significantDigits); 1881| _maxDigitsBeforeDecimal = max(_maxDigitsBeforeDecimal, fieldValue.length); | } | } | } | | /** Updates field formatting info based on the current state. It is expected to be | * called after adding field entries via updateForFieldValue(). It returns its new | * end position. 
| */ | size_t finalizeFormatting (size_t startPosition, const ref TsvPrettyOptions options) @safe pure @nogc nothrow | { | import std.algorithm : max, min; 1894| _startPosition = startPosition; 1976| if (_type == FieldType.unknown) _type = FieldType.text; 3369| _alignment = (_type == FieldType.integer || _type == FieldType.floatingPoint 1091| || _type == FieldType.exponent) ? 868| FieldAlignment.right : 1026| FieldAlignment.left; | 1894| if (_type == FieldType.floatingPoint) | { 384| size_t precision = min(options.floatPrecision, _maxDigitsAfterDecimal); 384| size_t maxValueWidth = _maxDigitsBeforeDecimal + precision; 722| if (precision > 0) maxValueWidth++; // Account for the decimal point. 384| _printWidth = max(1, _headerPrintWidth, maxValueWidth); 384| _precision = precision; | } 1510| else if (_type == FieldType.exponent) | { 130| size_t maxPrecision = (_maxSignificantDigits > 0) ? _maxSignificantDigits - 1 : 0; 65| _precision = min(options.floatPrecision, maxPrecision); | 130| size_t maxValuePrintWidth = !options.formatFloats ? _maxRawPrintWidth : _precision + 7; 65| _printWidth = max(1, _headerPrintWidth, maxValuePrintWidth); | } 1445| else if (_type == FieldType.integer) | { 419| _printWidth = max(1, _headerPrintWidth, _minRawPrintWidth, _maxRawPrintWidth); 419| _precision = 0; | } | else | { 1026| _printWidth = max(1, _headerPrintWidth, _minRawPrintWidth, | min(options.maxFieldPrintWidth, _maxRawPrintWidth)); 1026| _precision = 0; | } | 1894| return _startPosition + _printWidth; | } |} | |/** formatFloatingPointValue returns the printed representation of a raw value | * formatted as a fixed precision floating number. This includes zero padding or | * truncation of trailing digits as necessary to meet the desired precision. | * | * If the value cannot be interpreted as a double then the raw value is returned. | * Similarly, values in exponential notion are returned without reformatting. 
| * | * This routine is used to format values in columns identified as floating point. | */ |string formatFloatingPointValue(const char[] value, size_t precision) @safe |{ | import std.algorithm : canFind, find; | import std.array : join; | import std.conv : to, ConvException; | import std.format : format; | import std.math : isFinite; | import std.range : repeat; | 858| string printValue; | 10858| if (value.canFind!(x => x == 'e' || x == 'E')) | { | /* Exponential notion. Use the raw value. */ 19| printValue = value.to!string; | } | else | { | try | { 839| double doubleValue = value.to!double; 836| if (doubleValue.isFinite) | { 808| size_t numPrecisionDigits = value.precisionDigits; 808| if (numPrecisionDigits >= precision) | { 614| printValue = format("%.*f", precision, doubleValue); | } 194| else if (numPrecisionDigits == 0) | { 84| printValue = format("%.*f", numPrecisionDigits, doubleValue) ~ "." ~ repeat("0", precision).join; | } | else | { 110| printValue = format("%.*f", numPrecisionDigits, doubleValue) ~ repeat("0", precision - numPrecisionDigits).join; | } | } 28| else printValue = value.to!string; // NaN or Infinity | } 3| catch (ConvException) printValue = value.to!string; | } 858| return printValue; |} | |@safe unittest |{ 1| assert("".formatFloatingPointValue(3) == ""); 1| assert(" ".formatFloatingPointValue(3) == " "); 1| assert("abc".formatFloatingPointValue(3) == "abc"); 1| assert("nan".formatFloatingPointValue(3) == "nan"); 1| assert("0".formatFloatingPointValue(0) == "0"); 1| assert("1".formatFloatingPointValue(0) == "1"); 1| assert("1.".formatFloatingPointValue(0) == "1"); 1| assert("1".formatFloatingPointValue(3) == "1.000"); 1| assert("1000".formatFloatingPointValue(3) == "1000.000"); 1| assert("1000.001".formatFloatingPointValue(5) == "1000.00100"); 1| assert("1000.001".formatFloatingPointValue(3) == "1000.001"); 1| assert("1000.001".formatFloatingPointValue(2) == "1000.00"); 1| assert("1000.006".formatFloatingPointValue(2) == "1000.01"); 1| 
assert("-0.1".formatFloatingPointValue(1) == "-0.1"); 1| assert("-0.1".formatFloatingPointValue(3) == "-0.100"); 1| assert("-0.001".formatFloatingPointValue(3) == "-0.001"); 1| assert("-0.006".formatFloatingPointValue(2) == "-0.01"); 1| assert("-0.001".formatFloatingPointValue(1) == "-0.0"); 1| assert("-0.001".formatFloatingPointValue(0) == "-0"); 1| assert("0e+00".formatFloatingPointValue(0) == "0e+00"); 1| assert("0.00e+00".formatFloatingPointValue(0) == "0.00e+00"); 1| assert("1e+06".formatFloatingPointValue(1) == "1e+06"); 1| assert("1e+06".formatFloatingPointValue(2) == "1e+06"); 1| assert("1E-06".formatFloatingPointValue(1) == "1E-06"); 1| assert("1.1E+6".formatFloatingPointValue(2) == "1.1E+6"); 1| assert("1.1E+100".formatFloatingPointValue(2) == "1.1E+100"); |} | |/** formatExponentValue returns the printed representation of a raw value formatted | * using exponential notation and a specific precision. If the value cannot be interpreted | * as a double then the a copy of the original value is returned. | * | * This routine is used to format values in columns identified as having exponent format. | */ |string formatExponentValue(const char[] value, size_t precision) @safe |{ | import std.algorithm : canFind, find, findSplit; | import std.array : join; | import std.conv : to, ConvException; | import std.format : format; | import std.math : isFinite; | import std.range : repeat; | 208| string printValue; | try | { 208| double doubleValue = value.to!double; 205| if (doubleValue.isFinite) | { 192| size_t numSignificantDigits = value.significantDigits; 384| size_t numPrecisionDigits = (numSignificantDigits == 0) ? 0 : numSignificantDigits - 1; 192| if (numPrecisionDigits >= precision) | { 140| printValue = format("%.*e", precision, doubleValue); | } | else | { 52| string unpaddedPrintValue = format("%.*e", numPrecisionDigits, doubleValue); 52| auto exponentSplit = unpaddedPrintValue.findSplit("e"); // Uses the same exponent case as format call. 
52| if (numPrecisionDigits == 0) | { 19| assert(precision != 0); 19| assert(!exponentSplit[0].canFind(".")); 19| printValue = exponentSplit[0] ~ "." ~ repeat("0", precision).join ~ exponentSplit[1] ~ exponentSplit[2]; | } | else | { 33| printValue = exponentSplit[0] ~ repeat("0", precision - numPrecisionDigits).join ~ exponentSplit[1] ~ exponentSplit[2]; | } | } | } 13| else printValue = value.to!string; // NaN or Infinity | } 3| catch (ConvException) printValue = value.to!string; | 208| return printValue; |} | |@safe unittest |{ 1| assert("".formatExponentValue(3) == ""); 1| assert(" ".formatExponentValue(3) == " "); 1| assert("abc".formatExponentValue(3) == "abc"); 1| assert("nan".formatExponentValue(3) == "nan"); 1| assert("0".formatExponentValue(0) == "0e+00"); 1| assert("1".formatExponentValue(0) == "1e+00"); 1| assert("1.".formatExponentValue(0) == "1e+00"); 1| assert("1".formatExponentValue(3) == "1.000e+00"); 1| assert("1000".formatExponentValue(3) == "1.000e+03"); 1| assert("1000.001".formatExponentValue(5) == "1.00000e+03"); 1| assert("1000.001".formatExponentValue(3) == "1.000e+03"); 1| assert("1000.001".formatExponentValue(6) == "1.000001e+03"); 1| assert("1000.006".formatExponentValue(5) == "1.00001e+03"); 1| assert("-0.1".formatExponentValue(1) == "-1.0e-01"); 1| assert("-0.1".formatExponentValue(3) == "-1.000e-01"); 1| assert("-0.001".formatExponentValue(3) == "-1.000e-03"); 1| assert("-0.001".formatExponentValue(1) == "-1.0e-03"); 1| assert("-0.001".formatExponentValue(0) == "-1e-03"); 1| assert("0e+00".formatExponentValue(0) == "0e+00"); 1| assert("0.00e+00".formatExponentValue(0) == "0e+00"); 1| assert("1e+06".formatExponentValue(1) == "1.0e+06"); 1| assert("1e+06".formatExponentValue(2) == "1.00e+06"); 1| assert("1.0001e+06".formatExponentValue(1) == "1.0e+06"); 1| assert("1.0001e+06".formatExponentValue(5) == "1.00010e+06"); |} | |/** Returns the number of significant digits in a numeric string. 
| * | * Significant digits are those needed to represent a number in exponential notation. | * Examples: | * 22.345 - 5 digits | * 10.010 - 4 digits | * 0.0032 - 2 digits | */ |size_t significantDigits(const char[] numericString) @safe pure |{ | import std.algorithm : canFind, find, findAmong, findSplit, stripRight; | import std.ascii : isDigit; | import std.math : isFinite; | import std.string : isNumeric; | import std.conv : to; | 3954| assert (numericString.isNumeric); | 3954| size_t significantDigits = 0; 3954| if (numericString.to!double.isFinite) | { 14223| auto digitsPart = numericString.find!(x => x.isDigit && x != '0'); 3885| auto exponentPart = digitsPart.findAmong("eE"); 3885| digitsPart = digitsPart[0 .. $ - exponentPart.length]; | 3885| if (digitsPart.canFind('.')) | { 1460| digitsPart = digitsPart.stripRight('0'); 1460| significantDigits = digitsPart.length - 1; | } | else | { 2425| significantDigits = digitsPart.length; | } | 4181| if (significantDigits == 0) significantDigits = 1; | } | 3954| return significantDigits; |} | |@safe pure unittest |{ 1| assert("0".significantDigits == 1); 1| assert("10".significantDigits == 2); 1| assert("0.0".significantDigits == 1); 1| assert("-10.0".significantDigits == 2); 1| assert("-.01".significantDigits == 1); 1| assert("-.5401".significantDigits == 4); 1| assert("1010.010".significantDigits == 6); 1| assert("0.0003003".significantDigits == 4); 1| assert("6e+06".significantDigits == 1); 1| assert("6.0e+06".significantDigits == 1); 1| assert("6.5e+06".significantDigits == 2); 1| assert("6.005e+06".significantDigits == 4); |} | |/** Returns the number of digits to the right of the decimal point in a numeric string. | * This routine includes trailing zeros in the count. 
| */ |size_t precisionDigits(const char[] numericString) @safe pure |{ | import std.algorithm : canFind, find, findAmong, findSplit, stripRight; | import std.ascii : isDigit; | import std.math : isFinite; | import std.string : isNumeric; | import std.conv : to; | 814| assert (numericString.isNumeric); | 814| size_t precisionDigits = 0; 814| if (numericString.to!double.isFinite) | { 814| if (auto decimalSplit = numericString.findSplit(".")) | { 701| auto exponentPart = decimalSplit[2].findAmong("eE"); 701| precisionDigits = decimalSplit[2].length - exponentPart.length; | } | } | 814| return precisionDigits; |} | |@safe pure unittest |{ 1| assert("0".precisionDigits == 0); 1| assert("10".precisionDigits == 0); 1| assert("0.0".precisionDigits == 1); 1| assert("-10.0".precisionDigits == 1); 1| assert("-.01".precisionDigits == 2); 1| assert("-.5401".precisionDigits == 4); |} | |/** Calculates the expected print width of a string in monospace (fixed-width) fonts. | */ |size_t monospacePrintWidth(const char[] str) @safe nothrow |{ | bool isCJK(dchar c) | { 109236| return c >= '\u3000' && c <= '\u9fff'; | } | | import std.uni : byGrapheme; | 20393| size_t width = 0; 704281| try foreach (g; str.byGrapheme) width += isCJK(g[0]) ? 2 : 1; 2| catch (Exception) width = str.length; // Invalid utf-8 sequence. Catch avoids program failure. | 20393| return width; |} | |unittest |{ 1| assert("".monospacePrintWidth == 0); 1| assert(" ".monospacePrintWidth == 1); 1| assert("abc".monospacePrintWidth == 3); 1| assert("林檎".monospacePrintWidth == 4); 1| assert("æble".monospacePrintWidth == 4); 1| assert("ვაშლი".monospacePrintWidth == 5); 1| assert("größten".monospacePrintWidth == 7); |} tsv-pretty/src/tsv_utils/tsv-pretty.d is 99% covered <<<<<< EOF # path=./tsv-sample-src-tsv_utils-tsv-sample.lst |/** |Command line tool for shuffling or sampling lines from input streams. 
Several methods |are available, including weighted and unweighted shuffling, simple and weighted random |sampling, sampling with replacement, Bernoulli sampling, and distinct sampling. | |Copyright (c) 2017-2020, eBay Inc. |Initially written by Jon Degenhardt | |License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) |*/ |module tsv_utils.tsv_sample; | |import std.array : appender, Appender, RefAppender; |import std.exception : enforce; |import std.format : format; |import std.range; |import std.stdio; |import std.typecons : tuple, Flag; | |static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; | |version(unittest) |{ | // When running unit tests, use main from -main compiler switch. |} |else |{ | /** Main program. | * | * Invokes command line argument processing and calls tsvSample to do the real | * work. Errors occurring during processing are caught and reported to the user. | */ | int main(string[] cmdArgs) | { | /* When running in DMD code coverage mode, turn on report merging. */ | version(D_Coverage) version(DigitalMars) | { | import core.runtime : dmd_coverSetMerge; 154| dmd_coverSetMerge(true); | } | 154| TsvSampleOptions cmdopt; 154| const r = cmdopt.processArgs(cmdArgs); 202| if (!r[0]) return r[1]; | version(LDC_Profile) | { | import ldc.profile : resetAll; | resetAll(); | } | try | { | import tsv_utils.common.utils : BufferedOutputRange; 212| auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); | 106| tsvSample(cmdopt, bufferedOutput); | } | catch (Exception exc) | { 29| stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 29| return 1; | } 77| return 0; | } |} | |immutable helpText = q"EOS |Synopsis: tsv-sample [options] [file...] | |Sample input lines or randomize their order. Several modes of operation |are available: |* Shuffling (the default): All input lines are output in random order. All | orderings are equally likely. 
|* Random sampling (--n|num N): A random sample of N lines are selected and | written to standard output. By default, selected lines are written in | random order. All sample sets and orderings are equally likely. Use | --i|inorder to write the selected lines in the original input order. |* Weighted random sampling (--n|num N, --w|weight-field F): A weighted | sample of N lines is produced. Weights are taken from field F. Lines are | output in weighted selection order. Use --i|inorder to write in original | input order. Omit --n|num to shuffle all lines (weighted shuffling). |* Sampling with replacement (--r|replace, --n|num N): All input lines are | read in, then lines are repeatedly selected at random and written out. | This continues until N lines are output. Individual lines can be written | multiple times. Output continues forever if N is zero or not provided. |* Bernoulli sampling (--p|prob P): A random subset of lines is selected | based on probability P, a 0.0-1.0 value. This is a streaming operation. | A decision is made on each line as it is read. Line order is not changed. |* Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled | based on the values in the key fields. A subset of keys are chosen based | on the inclusion probability (a 'distinct' set of keys). All lines with | one of the selected keys are output. Line order is not changed. | |Fields are specified using field number or field name. Field names require |that the input file has a header line. | |Use '--help-verbose' for detailed information. | |Options: |EOS"; | |immutable helpTextVerbose = q"EOS |Synopsis: tsv-sample [options] [file...] | |Sample input lines or randomize their order. Several modes of operation |are available: |* Shuffling (the default): All input lines are output in random order. All | orderings are equally likely. |* Random sampling (--n|num N): A random sample of N lines are selected and | written to standard output. 
By default, selected lines are written in | random order. All sample sets and orderings are equally likely. Use | --i|inorder to write the selected lines in the original input order. |* Weighted random sampling (--n|num N, --w|weight-field F): A weighted | sample of N lines is produced. Weights are taken from field F. Lines are | output in weighted selection order. Use --i|inorder to write in original | input order. Omit --n|num to shuffle all lines (weighted shuffling). |* Sampling with replacement (--r|replace, --n|num N): All input lines are | read in, then lines are repeatedly selected at random and written out. | This continues until N lines are output. Individual lines can be written | multiple times. Output continues forever if N is zero or not provided. |* Bernoulli sampling (--p|prob P): A random subset of lines is selected | based on probability P, a 0.0-1.0 value. This is a streaming operation. | A decision is made on each line as it is read. Line order is not changed. |* Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled | based on the values in the key fields. A subset of keys are chosen based | on the inclusion probability (a 'distinct' set of keys). All lines with | one of the selected keys are output. Line order is not changed. | |Fields: Fields are specified by field number or name. Field names require |the input file to have a header line. Use '--help-fields' for details. | |Sample size: The '--n|num' option controls the sample size for all |sampling methods. In the case of simple and weighted random sampling it |also limits the amount of memory required. | |Controlling the random seed: By default, each run produces a different |randomization or sampling. Using '--s|static-seed' changes this so |multiple runs produce the same results. This works by using the same |random seed each run. The random seed can be specified using |'--v|seed-value'. This takes a non-zero, 32-bit positive integer. 
(A zero |value is a no-op and ignored.) | |Memory use: Bernoulli sampling and distinct sampling make decisions on |each line as it is read, there is no memory accumulation. These algorithms |can run on arbitrary size inputs. Sampling with replacement reads all |lines into memory and is limited by available memory. Shuffling also reads |all lines into memory and is similarly limited. Random sampling uses |reservoir sampling, and only needs to hold the sample size (--n|num) in |memory. The input data can be of any length. | |Weighted sampling: Weighted random sampling is done using an algorithm |described by Pavlos Efraimidis and Paul Spirakis. Weights should be |positive values representing the relative weight of the entry in the |collection. Counts and similar can be used as weights, it is *not* |necessary to normalize to a [0,1] interval. Negative values are not |meaningful and given the value zero. Input order is not retained, instead |lines are output ordered by the randomized weight that was assigned. This |means that a smaller valid sample can be produced by taking the first N |lines of output. For more info on the sampling approach see: |* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling |* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis | (https://arxiv.org/abs/1012.0256) | |Printing random values: Most of the sampling algorithms work by generating |a random value for each line. (See "Compatibility mode" below.) The nature |of these values depends on the sampling algorithm. They are used for both |line selection and output ordering. The '--p|print-random' option can be |used to print these values. The random value is prepended to the line |separated by the --d|delimiter char (TAB by default). The |'--gen-random-inorder' option takes this one step further, generating |random values for all input lines without changing the input order. 
The |types of values currently used by these sampling algorithms: |* Unweighted sampling: Uniform random value in the interval [0,1]. This | includes Bernoulli sampling and unweighted line order randomization. |* Weighted sampling: Value in the interval [0,1]. Distribution depends on | the values in the weight field. It is used as a partial ordering. |* Distinct sampling: An integer, zero and up, representing a selection | group. The inclusion probability determines the number of selection groups. |* Sampling with replacement: Random value printing is not supported. | |The specifics behind these random values are subject to change in future |releases. | |Compatibility mode: As described above, many of the sampling algorithms |assign a random value to each line. This is useful when printing random |values. It has another occasionally useful property: repeated runs with |the same static seed but different selection parameters are more |compatible with each other, as each line gets assigned the same random |value on every run. For example, if Bernoulli sampling is run with |'--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed', |all the lines selected in the first run will be selected in the second. |This comes at a cost: in some cases there are faster algorithms that don't |preserve this property. By default, tsv-sample will use faster algorithms |when available. However, the '--compatibility-mode' option switches to |algorithms that assign a random value per line. Printing random values |also engages compatibility mode. | |Options: |EOS"; | |/** Container for command line options and derived data. | * | * TsvSampleOptions handles several aspects of command line options. On the input side, | * it defines the command line options available, performs validation, and sets up any | * derived state based on the options provided. These activities are handled by the | * processArgs() member. 
| * | * Once argument processing is complete, TsvSampleOptions is used as a container | * holding the specific processing options used by the different sampling routines. | */ |struct TsvSampleOptions |{ | import tsv_utils.common.utils : InputSourceRange; | | string programName; /// Program name | InputSourceRange inputSources; /// Input files | bool hasHeader = false; /// --H|header | ulong sampleSize = 0; /// --n|num - Size of the desired sample | double inclusionProbability = double.nan; /// --p|prob - Inclusion probability | size_t[] keyFields; /// Derived: --k|key-fields - Used with inclusion probability | size_t weightField = 0; /// Derived: --w|weight-field - Field holding the weight | bool srsWithReplacement = false; /// --r|replace | bool preserveInputOrder = false; /// --i|inorder | bool staticSeed = false; /// --s|static-seed | uint seedValueOptionArg = 0; /// --v|seed-value | bool printRandom = false; /// --print-random | bool genRandomInorder = false; /// --gen-random-inorder | string randomValueHeader = "random_value"; /// --random-value-header | bool compatibilityMode = false; /// --compatibility-mode | char delim = '\t'; /// --d|delimiter | bool preferSkipSampling = false; /// --prefer-skip-sampling | bool preferAlgorithmR = false; /// --prefer-algorithm-r | bool hasWeightField = false; /// Derived. | bool useBernoulliSampling = false; /// Derived. | bool useDistinctSampling = false; /// Derived. | bool distinctKeyIsFullLine = false; /// Derived. True if '--k|key-fields 0' is specfied. | bool usingUnpredictableSeed = true; /// Derived from --static-seed, --seed-value | uint seed = 0; /// Derived from --static-seed, --seed-value | | /** Process tsv-sample command line arguments. | * | * Defines the command line options, performs validation, and derives additional | * state. std.getopt.getopt is called to do the main option processing followed | * additional validation and derivation. 
| * | * Help text is printed to standard output if help was requested. Error text is | * written to stderr if invalid input is encountered. | * | * A tuple is returned. First value is true if command line arguments were | * successfully processed and execution should continue, or false if an error | * occurred or the user asked for help. If false, the second value is the | * appropriate exit code (0 or 1). | * | * Returning true (execution continues) means args have been validated and derived | * values calculated. Field indices will have been converted to zero-based. | */ | auto processArgs(ref string[] cmdArgs) | { | import std.algorithm : all, canFind, each; | import std.conv : to; | import std.getopt; | import std.math : isNaN; | import std.path : baseName, stripExtension; | import std.typecons : Yes, No; | import tsv_utils.common.utils : inputSourceRange, ReadHeader, throwIfWindowsNewlineOnUnix; | import tsv_utils.common.fieldlist; | 683| bool helpVerbose = false; // --help-verbose 683| bool helpFields = false; // --help-fields 683| bool versionWanted = false; // --V|version 683| string keyFieldsArg; // --k|key-fields 683| string weightFieldArg; // --w|weight-field | 683| string keyFieldsOptionString = "k|key-fields"; 683| string weightFieldOptionString = "w|weight-field"; | 1366| programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; | | try | { 683| arraySep = ","; // Use comma to separate values in command line options 683| auto r = getopt( | cmdArgs, | "help-verbose", " Print more detailed help.", &helpVerbose, | "help-fields", " Print help on specifying fields.", &helpFields, | | std.getopt.config.caseSensitive, | "H|header", " Treat the first line of each file as a header.", &hasHeader, | std.getopt.config.caseInsensitive, | | "n|num", "NUM Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize, | "p|prob", "NUM Inclusion probability (0.0 < NUM <= 1.0). 
For Bernoulli sampling, the probability each line is selected output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability, | | keyFieldsOptionString, | " Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.", | &keyFieldsArg, | | weightFieldOptionString, | "NUM Field containing weights. All lines get equal weight if not provided.", | &weightFieldArg, | | "r|replace", " Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement, | "i|inorder", " Output random samples in original input order. Requires use of --n|num.", &preserveInputOrder, | "s|static-seed", " Use the same random seed every run.", &staticSeed, | | std.getopt.config.caseSensitive, | "v|seed-value", "NUM Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg, | std.getopt.config.caseInsensitive, | | "print-random", " Include the assigned random value (prepended) when writing output lines.", &printRandom, | "gen-random-inorder", " Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder, | "random-value-header", " Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader, | "compatibility-mode", " Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode, | | "d|delimiter", "CHR Field delimiter.", &delim, | | std.getopt.config.caseSensitive, | "V|version", " Print version information and exit.", &versionWanted, | std.getopt.config.caseInsensitive, | | "prefer-skip-sampling", " (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.", | &preferSkipSampling, | | "prefer-algorithm-r", " (Internal) Prefer Algorithm R for unweighted line order randomization. 
Used for testing and diagnostics.", | &preferAlgorithmR, | ); | 679| if (r.helpWanted) | { 1| defaultGetoptPrinter(helpText, r.options); 1| return tuple(false, 0); | } 678| else if (helpVerbose) | { 1| defaultGetoptPrinter(helpTextVerbose, r.options); 1| return tuple(false, 0); | } 677| else if (helpFields) | { 1| writeln(fieldListHelpText); 1| return tuple(false, 0); | } 676| else if (versionWanted) | { | import tsv_utils.common.tsvutils_version; 2| writeln(tsvutilsVersionNotice("tsv-sample")); 2| return tuple(false, 0); | } | | /* Input files. Remaining command line args are files. */ 1348| string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"]; 674| cmdArgs.length = 1; | | /* Validation and derivations - Do as much validation prior to header line | * processing as possible (avoids waiting on stdin). | * | * Note: keyFields and weightField depend on header line processing, but | * keyFieldsArg and weightFieldArg can be used to detect whether the | * command line argument was specified. | */ | | /* Set hasWeightField here so it can be used in other validation checks. | * Field validity checked after reading file header. | */ 674| hasWeightField = !weightFieldArg.empty; | | /* Sampling with replacement checks (--r|replace). 
*/ 674| if (srsWithReplacement) | { 48| enforce(!hasWeightField, 1| "Sampling with replacement (--r|replace) does not support weights (--w|weight-field)."); | 47| enforce(inclusionProbability.isNaN, 1| "Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob)."); | 46| enforce(keyFieldsArg.empty, 1| "Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields)."); | 89| enforce(!printRandom && !genRandomInorder, 2| "Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder)."); | 43| enforce(!preserveInputOrder, 1| "Sampling with replacement (--r|replace) does not support input order preservation (--i|inorder option)."); | } | | /* Distinct sampling checks (--k|key-fields --p|prob). */ 668| enforce(keyFieldsArg.empty | !inclusionProbability.isNaN, 1| "--p|prob is required when using --k|key-fields."); | | /* Inclusion probability (--p|prob) is used for both Bernoulli sampling | * and distinct sampling. | */ 667| if (!inclusionProbability.isNaN) | { 427| enforce(inclusionProbability > 0.0 && inclusionProbability <= 1.0, 4| format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability)); | 310| if (!keyFieldsArg.empty) useDistinctSampling = true; 112| else useBernoulliSampling = true; | 213| enforce(!hasWeightField, "--w|weight-field and --p|prob cannot be used together."); | 220| enforce(!genRandomInorder || useDistinctSampling, 1| "--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used." ~ | "\nUse --gen-random-inorder alone to print probabilities for all lines." ~ | "\nUse --p|prob and --print-random to print probabilities for lines satisfying the probability threshold."); | } 486| else if (genRandomInorder && !hasWeightField) | { 26| useBernoulliSampling = true; | } | | /* randomValueHeader (--random-value-header) validity. 
Note that | randomValueHeader is initialized to a valid, non-empty string. | */ 1320| enforce(!randomValueHeader.empty && !randomValueHeader.canFind('\n') && 660| !randomValueHeader.canFind(delim), 1| "--randomValueHeader must be at least one character and not contain field delimiters or newlines."); | | /* Check for incompatible use of (--i|inorder) and shuffling of the full | * data set. Sampling with replacement is also incompatible, this is | * detected earlier. Shuffling is the default operation, so it identified | * by eliminating the other modes of operation. | */ 659| enforce(!preserveInputOrder || 90| sampleSize != 0 || 5| useBernoulliSampling || 4| useDistinctSampling, 2| "Preserving input order (--i|inorder) is not compatible with full data set shuffling. Switch to random sampling with a sample size (--n|num) to use --i|inorder."); | | | /* Compatibility mode checks: | * - Random value printing implies compatibility-mode, otherwise user's | * selection is used. | * - Distinct sampling doesn't support compatibility-mode. The routines | * don't care, but users might expect larger probabilities to be a | * superset of smaller probabilities. This would be confusing, so | * flag it as an error. | */ 760| enforce(!(compatibilityMode && useDistinctSampling), 1| "Distinct sampling (--k|key-fields --p|prob) does not support --compatibility-mode."); | 1355| if (printRandom || genRandomInorder) compatibilityMode = true; | | | /* Seed. */ | import std.random : unpredictableSeed; | 777| usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0); | 718| if (usingUnpredictableSeed) seed = unpredictableSeed; 656| else if (seedValueOptionArg != 0) seed = seedValueOptionArg; 1064| else if (staticSeed) seed = 2438424139; 0000000| else assert(0, "Internal error, invalid seed option states."); | 656| string[] headerFields; | | /* fieldListArgProcessing encapsulates the field list processing. 
It is | * called prior to reading the header line if headers are not being used, | * and after if headers are being used. | */ | void fieldListArgProcessing() | { 646| if (!weightFieldArg.empty) | { 132| auto fieldIndices = | weightFieldArg | .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero) | (hasHeader, headerFields, weightFieldOptionString) | .array; | 129| enforce(fieldIndices.length == 1, 3| format("'--%s' must be a single field.", weightFieldOptionString)); | 126| weightField = fieldIndices[0]; | } | 640| if (!keyFieldsArg.empty) | { 96| keyFields = | keyFieldsArg | .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) | (hasHeader, headerFields, keyFieldsOptionString) | .array; | 94| assert(keyFields.length > 0); | 94| if (keyFields.length > 0) | { 144| if (keyFields.length == 1 && keyFields[0] == 0) | { 8| distinctKeyIsFullLine = true; | } | else | { 221| enforce(keyFields.length <= 1 || keyFields.all!(x => x != 0), 2| "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields."); | 213| keyFields.each!((ref x) => --x); // Convert to zero-based indexing. | } | } | } | } | 928| if (!hasHeader) fieldListArgProcessing(); | | /* | * Create the inputSourceRange and perform header line processing. | */ 1302| ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader; 651| inputSources = inputSourceRange(filepaths, readHeader); | 650| if (hasHeader) | { 384| throwIfWindowsNewlineOnUnix(inputSources.front.header, inputSources.front.name, 1); 374| headerFields = inputSources.front.header.split(delim).to!(string[]); 374| fieldListArgProcessing(); | } | | } | catch (Exception exc) | { 43| stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 43| return tuple(false, 1); | } 635| return tuple(true, 0); | } |} |/** Invokes the appropriate sampling routine based on the command line arguments. 
| * | * tsvSample is the top-level routine handling the different tsv-sample use cases. | * Its primary role is to invoke the correct routine for type of sampling requested. | */ |void tsvSample(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) |if (isOutputRange!(OutputRange, char)) |{ 633| if (cmdopt.srsWithReplacement) | { 41| simpleRandomSamplingWithReplacement(cmdopt, outputStream); | } 592| else if (cmdopt.useBernoulliSampling) | { 133| bernoulliSamplingCommand(cmdopt, outputStream); | } 459| else if (cmdopt.useDistinctSampling) | { 102| if (cmdopt.genRandomInorder) distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream); 82| else distinctSampling!(No.generateRandomAll)(cmdopt, outputStream); | } 367| else if (cmdopt.genRandomInorder) | { | /* Note that the preceding cases handle gen-random-inorder themselves (Bernoulli, | * Distinct), or don't handle it (SRS w/ Replacement). | */ 8| assert(cmdopt.hasWeightField); 8| generateWeightedRandomValuesInorder(cmdopt, outputStream); | } 359| else if (cmdopt.sampleSize != 0) | { 267| randomSamplingCommand(cmdopt, outputStream); | } | else | { 92| shuffleCommand(cmdopt, outputStream); | } |} | |/** Bernoulli sampling command handler. Invokes the appropriate Bernoulli sampling | * routine based on the command line arguments. | * | * This routine selects the appropriate Bernoulli sampling function and template | * instantiation to use based on the command line arguments. | * | * One of the basic choices is whether to use the vanilla algorithm or skip sampling. | * Skip sampling is a little bit faster when the inclusion probability is small but | * doesn't support compatibility mode. See the bernoulliSkipSampling documentation | * for a discussion of the skipSamplingProbabilityThreshold used here. 
| */ |void bernoulliSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) |if (isOutputRange!(OutputRange, char)) |{ 133| assert(!cmdopt.hasWeightField); | 133| immutable double skipSamplingProbabilityThreshold = 0.04; | 133| if (cmdopt.compatibilityMode || 101| (cmdopt.inclusionProbability > skipSamplingProbabilityThreshold && !cmdopt.preferSkipSampling)) | { 94| if (cmdopt.genRandomInorder) | { 25| bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream); | } | else | { 69| bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream); | } | } | else | { 39| bernoulliSkipSampling(cmdopt, outputStream); | } |} | |/** Bernoulli sampling of lines from the input stream. | * | * Each input line is a assigned a random value and output if less than | * cmdopt.inclusionProbability. The order of the lines is not changed. | * | * This routine supports random value printing and gen-random-inorder value printing. | */ |void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange) | (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) |if (isOutputRange!(OutputRange, char)) |{ | import std.random : Random = Mt19937, uniform01; | import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, | InputSourceRange, throwIfWindowsNewlineOnUnix; | 25| static if (generateRandomAll) assert(cmdopt.genRandomInorder); 69| else assert(!cmdopt.genRandomInorder); | 94| assert(!cmdopt.inputSources.empty); | static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); | 94| auto randomGenerator = Random(cmdopt.seed); | | /* First header is read during command line argument processing. 
*/ 143| if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) | { 49| auto inputStream = cmdopt.inputSources.front; | | static if (generateRandomAll) | { 14| outputStream.put(cmdopt.randomValueHeader); 14| outputStream.put(cmdopt.delim); | } 35| else if (cmdopt.printRandom) | { 15| outputStream.put(cmdopt.randomValueHeader); 15| outputStream.put(cmdopt.delim); | } | 49| outputStream.put(inputStream.header); 49| outputStream.put("\n"); | | /* Immediately flush the header so subsequent processes in a unix command | * pipeline see it early. This helps provide timely error messages. | */ 7| static if (isFlushableOutputRange!OutputRange) outputStream.flush; | } | | /* Process each line. */ 188| immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 94| ulong numLinesWritten = 0; | 507| foreach (inputStream; cmdopt.inputSources) | { 198| if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); | 9060| foreach (ulong fileLineNum, line; | inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) | { 1831| if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); | 1772| immutable double lineScore = uniform01(randomGenerator); | | static if (generateRandomAll) | { 170| outputStream.formatRandomValue(lineScore); 170| outputStream.put(cmdopt.delim); 170| outputStream.put(line); 170| outputStream.put("\n"); | 170| if (cmdopt.sampleSize != 0) | { 93| ++numLinesWritten; 106| if (numLinesWritten == cmdopt.sampleSize) return; | } | } 1602| else if (lineScore < cmdopt.inclusionProbability) | { 295| if (cmdopt.printRandom) | { 159| outputStream.formatRandomValue(lineScore); 159| outputStream.put(cmdopt.delim); | } 295| outputStream.put(line); 295| outputStream.put("\n"); | 295| if (cmdopt.sampleSize != 0) | { 130| ++numLinesWritten; 148| if (numLinesWritten == cmdopt.sampleSize) return; | } | } | } | } |} | |/** bernoulliSkipSampling is an implementation of Bernoulli sampling 
using skips. | * | * Skip sampling works by skipping a random number of lines between selections. This | * can be faster than assigning a random value to each line when the inclusion | * probability is low, as it reduces the number of calls to the random number | * generator. Both the random number generator and the log() function are called when | * calculating the next skip size. These additional log() calls add up as the | * inclusion probability increases. | * | * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for | * file-oriented line sampling. This is obviously environment specific. In the | * environments this implementation has been tested in the performance improvements | * remain small, less than 7%, even with an inclusion probability as low as 0.0001. | * | * The algorithm does not assign random values to individual lines. This makes it | * incompatible with random value printing. It is not suitable for compatibility mode | * either. As an example, in compatibility mode a line selected with '--prob 0.2' should | * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling | * does not have this property. | * | * The algorithm for calculating the skip size has been described by multiple sources. | * There are two key variants depending on whether the total number of lines in the | * data set is known in advance. (This implementation does not know the total.) | * Useful references: | * $(LIST | * * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling", | * ACM Trans on Mathematical Software, 1987. On-line: | * http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf | * * P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book | * "Data Stream Management", Springer-Verlag, 2016. On-line: | * https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf | * * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. 
On-line: | * http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/ | * ) | */ |void bernoulliSkipSampling(OutputRange)(ref TsvSampleOptions cmdopt, OutputRange outputStream) | if (isOutputRange!(OutputRange, char)) |{ | import std.conv : to; | import std.math : log, trunc; | import std.random : Random = Mt19937, uniform01; | import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, | InputSourceRange, throwIfWindowsNewlineOnUnix; | 78| assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0); 39| assert(!cmdopt.printRandom); 39| assert(!cmdopt.compatibilityMode); | 39| assert(!cmdopt.inputSources.empty); | static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); | 39| auto randomGenerator = Random(cmdopt.seed); | 39| immutable double discardRate = 1.0 - cmdopt.inclusionProbability; 39| immutable double logDiscardRate = log(discardRate); | | /* Note: The '1.0 - uniform01(randomGenerator)' expression flips the half closed | * interval to (0.0, 1.0], excluding 0.0. | */ 39| size_t remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t; | | /* First header is read during command line argument processing. */ 60| if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) | { 20| auto inputStream = cmdopt.inputSources.front; | 20| outputStream.put(inputStream.header); 20| outputStream.put("\n"); | | /* Immediately flush the header so subsequent processes in a unix command | * pipeline see it early. This helps provide timely error messages. | */ 2| static if (isFlushableOutputRange!OutputRange) outputStream.flush; | } | | /* Process each line. */ 78| immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 
2 : 1; 39| ulong numLinesWritten = 0; 189| foreach (inputStream; cmdopt.inputSources) | { 76| if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); | 24962| foreach (ulong fileLineNum, line; | inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) | { 5001| if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); | 4980| if (remainingSkips > 0) | { 4769| --remainingSkips; | } | else | { 211| outputStream.put(line); 211| outputStream.put("\n"); | 211| if (cmdopt.sampleSize != 0) | { 144| ++numLinesWritten; 162| if (numLinesWritten == cmdopt.sampleSize) return; | } | 193| remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t; | } | } | } |} | |/** Sample lines by choosing a random set of distinct keys formed from one or more | * fields on each line. | * | * Distinct sampling is a streaming form of sampling, similar to Bernoulli sampling. | * However, instead of each line being subject to an independent trial, lines are | * selected based on a key from each line. A portion of keys are randomly selected for | * output, and every line containing a selected key is included in the output. | * | * An example use-case is a query log having triples. It is | * often useful to sample records for portion of the users, but including all records | * for the users selected. Distinct sampling supports this by selecting a subset of | * users to include in the output. | * | * Distinct sampling is done by hashing the key and mapping the hash value into | * buckets sized to hold the inclusion probability. Records having a key mapping to | * bucket zero are output. Buckets are equal size and therefore may be larger than the | * inclusion probability. (The other approach would be to have the caller specify the | * the number of buckets. More correct, but less convenient.) 
| */ |void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange) | (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) |if (isOutputRange!(OutputRange, char)) |{ | import std.algorithm : splitter; | import std.conv : to; | import std.digest.murmurhash; | import std.math : lrint; | import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, | InputFieldReordering, InputSourceRange, throwIfWindowsNewlineOnUnix; | 10| static if (generateRandomAll) assert(cmdopt.genRandomInorder); 82| else assert(!cmdopt.genRandomInorder); | 92| assert(cmdopt.keyFields.length > 0); 184| assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0); | 92| assert(!cmdopt.inputSources.empty); | static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); | | static if (generateRandomAll) | { | import std.format : formatValue, singleSpec; 10| immutable randomValueFormatSpec = singleSpec("%d"); | } | 92| immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys. | 92| uint numBuckets = (1.0 / cmdopt.inclusionProbability).lrint.to!uint; | | /* Create a mapping for the key fields. */ 184| auto keyFieldsReordering = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields); | | /* First header is read during command line argument processing. */ 151| if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) | { 58| auto inputStream = cmdopt.inputSources.front; | | static if (generateRandomAll) | { 7| outputStream.put(cmdopt.randomValueHeader); 7| outputStream.put(cmdopt.delim); | } 51| else if (cmdopt.printRandom) | { 9| outputStream.put(cmdopt.randomValueHeader); 9| outputStream.put(cmdopt.delim); | } | 58| outputStream.put(inputStream.header); 58| outputStream.put("\n"); | | /* Immediately flush the header so subsequent processes in a unix command | * pipeline see it early. This helps provide timely error messages. 
| */ 15| static if (isFlushableOutputRange!OutputRange) outputStream.flush; | } | | /* Process each line. */ 184| immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 92| ulong numLinesWritten = 0; | 512| foreach (inputStream; cmdopt.inputSources) | { 199| if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); | 8973| foreach (ulong fileLineNum, line; | inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) | { 1793| if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); | | /* Murmurhash works by successively adding individual keys, then finalizing. | * Adding individual keys is simpler if the full-line-as-key and individual | * fields as keys cases are separated. | */ 1751| auto hasher = MurmurHash3!32(cmdopt.seed); | 1751| if (cmdopt.distinctKeyIsFullLine) | { 200| hasher.put(cast(ubyte[]) line); | } | else | { 1551| assert(keyFieldsReordering !is null); | | /* Gather the key field values and assemble the key. */ 1551| keyFieldsReordering.initNewLine; 18579| foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) | { 3715| keyFieldsReordering.processNextField(fieldIndex, fieldValue); 5264| if (keyFieldsReordering.allFieldsFilled) break; | } | 1551| enforce(keyFieldsReordering.allFieldsFilled, 2| format("Not enough fields in line. 
File: %s, Line: %s", | inputStream.name, fileLineNum)); | 15893| foreach (count, key; keyFieldsReordering.outputFields.enumerate) | { 3569| if (count > 0) hasher.put(delimArray); 2559| hasher.put(cast(ubyte[]) key); | } | } | 1749| hasher.finish; | | static if (generateRandomAll) | { | import std.conv : to; 182| outputStream.formatValue(hasher.get % numBuckets, randomValueFormatSpec); 182| outputStream.put(cmdopt.delim); 182| outputStream.put(line); 182| outputStream.put("\n"); | 182| if (cmdopt.sampleSize != 0) | { 20| ++numLinesWritten; 22| if (numLinesWritten == cmdopt.sampleSize) return; | } | } 1567| else if (hasher.get % numBuckets == 0) | { 576| if (cmdopt.printRandom) | { 76| outputStream.put('0'); 76| outputStream.put(cmdopt.delim); | } 576| outputStream.put(line); 576| outputStream.put("\n"); | 576| if (cmdopt.sampleSize != 0) | { 58| ++numLinesWritten; 66| if (numLinesWritten == cmdopt.sampleSize) return; | } | } | } | } |} | |/** Random sampling command handler. Invokes the appropriate sampling routine based on | * the command line arguments. | * | * Random sampling selects a fixed size random sample from the input stream. Both | * simple random sampling (equal likelihood) and weighted random sampling are | * supported. Selected lines are output either in random order or original input order. | * For weighted sampling the random order is the weighted selection order. | * | * Two algorithms are used, reservoir sampling via a heap and reservoir sampling via | * Algorithm R. This routine selects the appropriate reservoir sampling function and | * template instantiation to based on the command line arguments. | * | * Weighted sampling always uses the heap approach. Compatibility mode does as well, | * as it is the method that uses per-line random value assignments. The implication | * of compatibility mode is that a larger sample size includes all the results from | * a smaller sample, assuming the same random seed is used. 
| * | * For unweighted sampling there is a performance tradeoff between implementations. | * Heap-based sampling is faster for small sample sizes. Algorithm R is faster for | * large sample sizes. The threshold used was chosen based on performance tests. See | * the reservoirSamplingAlgorithmR documentation for more information. | */ | |void randomSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) |if (isOutputRange!(OutputRange, char)) |{ 267| assert(cmdopt.sampleSize != 0); | 267| immutable size_t algorithmRSampleSizeThreshold = 128 * 1024; | 267| if (cmdopt.hasWeightField) | { 77| if (cmdopt.preserveInputOrder) | { 2| reservoirSamplingViaHeap!(Yes.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream); | } | else | { 75| reservoirSamplingViaHeap!(Yes.isWeighted, No.preserveInputOrder)(cmdopt, outputStream); | } | } 190| else if (cmdopt.compatibilityMode || 232| (cmdopt.sampleSize < algorithmRSampleSizeThreshold && !cmdopt.preferAlgorithmR)) | { 137| if (cmdopt.preserveInputOrder) | { 56| reservoirSamplingViaHeap!(No.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream); | } | else | { 81| reservoirSamplingViaHeap!(No.isWeighted, No.preserveInputOrder)(cmdopt, outputStream); | } | } 53| else if (cmdopt.preserveInputOrder) | { 27| reservoirSamplingAlgorithmR!(Yes.preserveInputOrder)(cmdopt, outputStream); | } | else | { 26| reservoirSamplingAlgorithmR!(No.preserveInputOrder)(cmdopt, outputStream); | } |} | |/** Reservoir sampling using a heap. Both weighted and unweighted random sampling are | * supported. | * | * The algorithm used here is based on the one-pass algorithm described by Pavlos | * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S. | * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are | * simply set to one. | * | * The implementation uses a heap (priority queue) large enough to hold the desired | * number of lines. 
Input is read line-by-line, assigned a random value, and added to | * the heap. The role of the heap is to identify the lines with the highest assigned | * random values. Once the heap is full, adding a new line means dropping the line with | * the lowest score. A "min" heap used for this reason. | * | * When done reading all lines, the "min" heap is in reverse of weighted selection | * order. Weighted selection order is obtained by removing each element one at at time | * from the heap. The underlying data store will have the elements in weighted selection | * order (largest weights first). | * | * Generating output in weighted order is useful for several reasons: | * - For weighted sampling, it preserves the property that smaller valid subsets can be | * created by taking the first N lines. | * - For unweighted sampling, it ensures that all output permutations are possible, and | * are not influenced by input order or the heap data structure used. | * - Order consistency is maintained when making repeated use of the same random seed, | * but with different sample sizes. | * | * The other choice is preserving input order. This is supporting by recording line | * numbers and sorting the selected sample. | * | * There are use cases where only the selection set matters. For these some performance | * could be gained by skipping the reordering and simply printing the backing store | * array in-order. Performance tests indicate only a minor benefit, so this is not | * supported. | * | * Notes: | * $(LIST | * * In tsv-sample versions 1.2.1 and earlier this routine also supported | * randomization of all input lines. This was dropped in version 1.2.2 in favor | * of the approach used in randomizeLines. The latter has significant advantages | * given that all data must be read into memory. | * * For large reservoir sizes better performance can be achieved using Algorithm R. | * See the reservoirSamplingAlgorithmR documentation for details. 
| * ) | */ |void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, Flag!"preserveInputOrder" preserveInputOrder, OutputRange) | (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) |if (isOutputRange!(OutputRange, char)) |{ | import std.algorithm : sort; | import std.container.array; | import std.container.binaryheap; | import std.meta : AliasSeq; | import std.random : Random = Mt19937, uniform01; | import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, | InputSourceRange, throwIfWindowsNewlineOnUnix; | 77| static if (isWeighted) assert(cmdopt.hasWeightField); 137| else assert(!cmdopt.hasWeightField); | 214| assert(cmdopt.sampleSize > 0); | 214| assert(!cmdopt.inputSources.empty); | static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); | 214| auto randomGenerator = Random(cmdopt.seed); | | static struct Entry(Flag!"preserveInputOrder" preserveInputOrder) | { | double score; | const(char)[] line; | static if (preserveInputOrder) ulong lineNumber; | } | | /* Create the heap and backing data store. | * | * Note: An std.container.array is used as the backing store to avoid some issues in | * the standard library (Phobos) binaryheap implementation. Specifically, when an | * std.container.array is used as backing store, the heap can efficiently reversed by | * removing the heap elements. This leaves the backing store in the reversed order. | * However, the current binaryheap implementation does not support this for all | * backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094. | */ | 428| Array!(Entry!preserveInputOrder) dataStore; 214| dataStore.reserve(cmdopt.sampleSize); 428| auto reservoir = dataStore.heapify!("a.score > b.score")(0); // Min binaryheap | | /* First header is read during command line argument processing. 
*/ 333| if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) | { 117| auto inputStream = cmdopt.inputSources.front; | 117| if (cmdopt.printRandom) | { 34| outputStream.put(cmdopt.randomValueHeader); 34| outputStream.put(cmdopt.delim); | } 117| outputStream.put(inputStream.header); 117| outputStream.put("\n"); | | /* Immediately flush the header so subsequent processes in a unix command | * pipeline see it early. This helps provide timely error messages. | */ 19| static if (isFlushableOutputRange!OutputRange) outputStream.flush; | } | | /* Process each line. */ 428| immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 58| static if (preserveInputOrder) ulong totalLineNum = 0; | 1107| foreach (inputStream; cmdopt.inputSources) | { 362| if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); | 9283| foreach (ulong fileLineNum, line; | inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) | { 1865| if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); | | static if (!isWeighted) | { 1007| immutable double lineScore = uniform01(randomGenerator); | } | else | { 756| immutable double lineWeight = | getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, inputStream.name, fileLineNum); 756| immutable double lineScore = | (lineWeight > 0.0) 754| ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 2| : 0.0; | } | | static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum); | else alias entryCTArgs = AliasSeq!(); | 1763| if (reservoir.length < cmdopt.sampleSize) | { 1144| reservoir.insert(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs)); | } 619| else if (reservoir.front.score < lineScore) | { 460| reservoir.replaceFront(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs)); | } | 379| static if (preserveInputOrder) ++totalLineNum; | } | } | | /* Done with input, all entries are in the reservoir. 
*/ | | /* The asserts here avoid issues with the current binaryheap implementation. They | * detect use of backing stores having a length not synchronized to the reservoir. | */ 207| immutable ulong numLines = reservoir.length; 207| assert(numLines == dataStore.length); | | /* Update the backing store so it is in the desired output order. | */ | static if (preserveInputOrder) | { 776| dataStore[].sort!((a, b) => a.lineNumber < b.lineNumber); | } | else | { | /* Output in weighted selection order. The heap is in reverse order of assigned | * weights. Reversing order is done by removing all elements from the heap. This | * leaves the backing store in the correct order. | */ 1989| while (!reservoir.empty) reservoir.removeFront; | } | 207| assert(numLines == dataStore.length); | 4047| foreach (entry; dataStore) | { 1142| if (cmdopt.printRandom) | { 309| outputStream.formatRandomValue(entry.score); 309| outputStream.put(cmdopt.delim); | } 1142| outputStream.put(entry.line); 1142| outputStream.put("\n"); | } | } | |/** Generate weighted random values for all input lines, preserving input order. | * | * This complements weighted reservoir sampling, but instead of using a reservoir it | * simply iterates over the input lines generating the values. The weighted random | * values are generated with the same formula used by reservoirSampling. | */ |void generateWeightedRandomValuesInorder(OutputRange) | (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) |if (isOutputRange!(OutputRange, char)) |{ | import std.random : Random = Mt19937, uniform01; | import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, | InputSourceRange, throwIfWindowsNewlineOnUnix; | 8| assert(cmdopt.hasWeightField); | 8| assert(!cmdopt.inputSources.empty); | static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); | 8| auto randomGenerator = Random(cmdopt.seed); | | /* First header is read during command line argument processing. 
*/ 15| if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) | { 7| auto inputStream = cmdopt.inputSources.front; | 7| outputStream.put(cmdopt.randomValueHeader); 7| outputStream.put(cmdopt.delim); 7| outputStream.put(inputStream.header); 7| outputStream.put("\n"); | | /* Immediately flush the header so subsequent processes in a unix command | * pipeline see it early. This helps provide timely error messages. | */ 4| static if (isFlushableOutputRange!OutputRange) outputStream.flush; | } | | /* Process each line. */ 16| immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 8| ulong numLinesWritten = 0; | 44| foreach (inputStream; cmdopt.inputSources) | { 19| if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); | 638| foreach (ulong fileLineNum, line; | inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) | { 125| if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); | 124| immutable double lineWeight = | getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, inputStream.name, fileLineNum); | 124| immutable double lineScore = | (lineWeight > 0.0) 122| ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 2| : 0.0; | 124| outputStream.formatRandomValue(lineScore); 124| outputStream.put(cmdopt.delim); 124| outputStream.put(line); 124| outputStream.put("\n"); | 124| if (cmdopt.sampleSize != 0) | { 15| ++numLinesWritten; 16| if (numLinesWritten == cmdopt.sampleSize) return; | } | } | } |} | |/** Reservoir sampling via Algorithm R | * | * This is an implementation of reservoir sampling using what is commonly known as | * "Algorithm R", credited to Alan Waterman by Donald Knuth in the "The Art of | * Computer Programming, Volume 2: Seminumerical Algorithms". 
More information about | * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with | * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling" | * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R). | * | * Algorithm R is used for unweighted sampling without replacement. The heap-based | * algorithm in reservoirSamplingViaHeap is used for weighted sampling. | * | * The classic algorithm stops after identifying the selected set of items. This | * implementation goes one step further and randomizes the order of the selected | * lines. This is consistent with shuffling (line order randomization), a primary | * tsv-sample use-case. | * | * This algorithm is faster than reservoirSamplingViaHeap when the sample size | * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size. | * Insertion in this algorithm is O(1). Similarly, generating the random order in the | * heap is O(k * log k), while in this algorithm the final randomization step is O(k). | * | * This speed advantage may be offset a certain amount by using a more expensive random | * value generator. reservoirSamplingViaHeap generates values between zero and one, | * whereas reservoirSamplingAlgorithmR generates random integers over and ever growing | * interval. The latter is expected to be more expensive. This is consistent with | * performance tests indicating that reservoirSamplingViaHeap is faster when using | * small-to-medium size reservoirs and large input streams. 
| */ |void reservoirSamplingAlgorithmR(Flag!"preserveInputOrder" preserveInputOrder, OutputRange) | (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) |if (isOutputRange!(OutputRange, char)) |{ | import std.meta : AliasSeq; | import std.random : Random = Mt19937, randomShuffle, uniform; | import std.algorithm : sort; | import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, | InputSourceRange, throwIfWindowsNewlineOnUnix; | 53| assert(cmdopt.sampleSize > 0); 53| assert(!cmdopt.hasWeightField); 53| assert(!cmdopt.compatibilityMode); 53| assert(!cmdopt.printRandom); 53| assert(!cmdopt.genRandomInorder); | 53| assert(!cmdopt.inputSources.empty); | static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); | | static struct Entry(Flag!"preserveInputOrder" preserveInputOrder) | { | const(char)[] line; | static if (preserveInputOrder) ulong lineNumber; | } | 53| Entry!preserveInputOrder[] reservoir; 53| auto reservoirAppender = appender(&reservoir); 53| reservoirAppender.reserve(cmdopt.sampleSize); | 53| auto randomGenerator = Random(cmdopt.seed); | | /* First header is read during command line argument processing. */ 82| if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) | { 25| auto inputStream = cmdopt.inputSources.front; | 25| outputStream.put(inputStream.header); 25| outputStream.put("\n"); | | /* Immediately flush the header so subsequent processes in a unix command | * pipeline see it early. This helps provide timely error messages. | */ 1| static if (isFlushableOutputRange!OutputRange) outputStream.flush; | } | | /* Process each line. */ 106| immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 
2 : 1; 53| ulong totalLineNum = 0; | 322| foreach (inputStream; cmdopt.inputSources) | { 112| if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); | 1439| foreach (ulong fileLineNum, line; | inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) | { 285| if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); | | /* Add lines to the reservoir until the reservoir is filled. | * After that lines are added with decreasing likelihood, based on | * the total number of lines seen. If added to the reservoir, the | * line replaces a randomly chosen existing line. | */ | static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum); | else alias entryCTArgs = AliasSeq!(); | 259| if (totalLineNum < cmdopt.sampleSize) | { 147| reservoirAppender ~= Entry!preserveInputOrder(line.idup, entryCTArgs); | } | else | { 112| immutable size_t i = uniform(0, totalLineNum, randomGenerator); 112| if (i < reservoir.length) | { 81| reservoir[i] = Entry!preserveInputOrder(line.idup, entryCTArgs); | } | } | 259| ++totalLineNum; | } | } | | /* Done with input. The sample is in the reservoir. Update the order and print. */ | | static if (preserveInputOrder) | { 270| reservoir.sort!((a, b) => a.lineNumber < b.lineNumber); | } | else | { 26| reservoir.randomShuffle(randomGenerator); | } | 600| foreach (ref entry; reservoir) | { 147| outputStream.put(entry.line); 147| outputStream.put("\n"); | } |} | |/** Shuffling command handler. Invokes the appropriate shuffle (line order | * randomization) routine based on the command line arguments. | * | * Shuffling has similarities to random sampling, but the algorithms used are | * different. Random sampling selects a subset, only the current subset selection | * needs to be kept in memory. This is supported by reservoir sampling. 
By contrast, | * shuffling needs to hold all input in memory, so it works better to read all lines | * into memory at once and then shuffle. | * | * Two different algorithms are used. Array shuffling is used for unweighted shuffling. | * Sorting plus random weight assignments is used for weighted shuffling and when | * compatibility mode is being used. | * | * The algorithms used here are all limited by available memory. | */ |void shuffleCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) |if (isOutputRange!(OutputRange, char)) |{ 92| if (cmdopt.hasWeightField) | { 41| randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream); | } 51| else if (cmdopt.compatibilityMode) | { 37| randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream); | } | else | { 14| randomizeLinesViaShuffle(cmdopt, outputStream); | } |} | |/** Shuffle all input lines by assigning random weights and sorting. | * | * randomizeLinesViaSort reads in all input lines and writes them out in random order. | * The algorithm works by assigning a random value to each line and sorting. Both | * weighted and unweighted shuffling are supported. | * | * Notes: | * $(LIST | * * For unweighted shuffling randomizeLinesViaShuffle is faster and should be used | * unless compatibility mode is needed. | * * This routine is significantly faster than heap-based reservoir sampling in the | * case where the entire file is being read. | * * Input data must be read entirely in memory. Disk oriented techniques are needed | * when data sizes get too large for available memory. One option is to generate | * random values for each line, e.g. --gen-random-inorder, and sort with a disk- | * backed sort program like GNU sort. 
| * ) | */ |void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange) | (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) |if (isOutputRange!(OutputRange, char)) |{ | import std.algorithm : map, sort; | 41| static if (isWeighted) assert(cmdopt.hasWeightField); 37| else assert(!cmdopt.hasWeightField); | 78| assert(cmdopt.sampleSize == 0); | | /* | * Read all file data into memory. Then split the data into lines and assign a | * random value to each line. readFileData also writes the first header line. | */ 78| const fileData = readFileData!(Yes.hasRandomValue)(cmdopt, outputStream); 76| auto inputLines = fileData.identifyInputLines!(Yes.hasRandomValue, isWeighted)(cmdopt); | | /* | * Sort by the weight and output the lines. | */ 4600| inputLines.sort!((a, b) => a.randomValue > b.randomValue); | 2385| foreach (lineEntry; inputLines) | { 724| if (cmdopt.printRandom) | { 397| outputStream.formatRandomValue(lineEntry.randomValue); 397| outputStream.put(cmdopt.delim); | } 724| outputStream.put(lineEntry.data); 724| outputStream.put("\n"); | } |} | |/** Shuffle (randomize) all input lines using a shuffling algorithm. | * | * All lines in files and/or standard input are read in and written out in random | * order. This routine uses array shuffling, which is faster than sorting. It is a | * good alternative to randomizeLinesViaSort when doing unweighted shuffling (the | * most common case). | * | * Input data size is limited by available memory. Disk oriented techniques are needed | * when data sizes are larger. For example, generating random values line-by-line (ala | * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort. | * | * This routine does not support random value printing or compatibility-mode. 
| */ |void randomizeLinesViaShuffle(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) |if (isOutputRange!(OutputRange, char)) |{ | import std.algorithm : map; | import std.random : Random = Mt19937, randomShuffle; | 14| assert(cmdopt.sampleSize == 0); 14| assert(!cmdopt.hasWeightField); 14| assert(!cmdopt.printRandom); 14| assert(!cmdopt.genRandomInorder); | | /* | * Read all file data into memory and split into lines. | */ 14| const fileData = readFileData!(No.hasRandomValue)(cmdopt, outputStream); 13| auto inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt); | | /* | * Randomly shuffle and print each line. | * | * Note: Also tried randomCover, but that was exceedingly slow. | */ | import std.random : randomShuffle; | 11| auto randomGenerator = Random(cmdopt.seed); 11| inputLines.randomShuffle(randomGenerator); | 135| foreach (ref line; inputLines) | { 34| outputStream.put(line.data); 34| outputStream.put("\n"); | } |} | |/** Simple random sampling with replacement. | * | * All lines in files and/or standard input are read in. Then random lines are selected | * one at a time and output. Lines can be selected multiple times. This process continues | * until the desired number of samples (--n|num) has been output. Output continues | * indefinitely if a sample size was not provided. | */ |void simpleRandomSamplingWithReplacement(OutputRange) | (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) |if (isOutputRange!(OutputRange, char)) |{ | import std.algorithm : map; | import std.random : Random = Mt19937, uniform; | | /* | * Read all file data into memory and split the data into lines. 
| */ 41| const fileData = readFileData!(No.hasRandomValue)(cmdopt, outputStream); 40| const inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt); | 38| if (inputLines.length > 0) | { 32| auto randomGenerator = Random(cmdopt.seed); | | /* Repeat forever is sampleSize is zero, otherwise print sampleSize lines. */ 64| size_t numLeft = (cmdopt.sampleSize == 0) ? 1 : cmdopt.sampleSize; 240| while (numLeft != 0) | { 208| immutable size_t index = uniform(0, inputLines.length, randomGenerator); 208| outputStream.put(inputLines[index].data); 208| outputStream.put("\n"); 416| if (cmdopt.sampleSize != 0) numLeft--; | } | } |} | |/** A container holding data read from a file or standard input. | * | * The InputBlock struct is used to represent a block of data read from a file or | * standard input. An array of InputBlocks is returned by readFileData. Typically one | * block per file. Multiple blocks are used for standard input and when the file size | * cannot be determined. Individual lines are not allowed to span blocks. The blocks | * allocated to an individual file are numbered starting with zero. | * | * See readFileData() for more information. | */ |static struct InputBlock |{ | string filename; /// Original filename or path. "-" denotes standard input. | size_t fileBlockNumber; /// Zero-based block number for the file. | char[] data; /// The actual data. Newline terminated or last block for the file. |} | |/** Read data from one or more files. This routine is used by algorithms needing to | * read all data into memory. | * | * readFileData reads in all data from a set of files. Data is returned as an array | * of InputBlock structs. Normally one InputBlock per file, sized to match the size | * of the file. Standard input is read in one or more blocks, as are files whose size | * cannot be determined. Multiple blocks are used in these last two cases to avoid | * expensive memory reallocations. 
This is not necessary when file size is known as | * the necessary memory can be preallocated. | * | * Individual lines never span multiple blocks, and newlines are preserved. This | * means that each block starts at the beginning of a line and ends with a newline | * unless the end of a file has been reached. | * | * Each file gets its own block. Prior to using InputSourceRange this was so header | * processing can be done. With InputSourceRange the header is read separately, so | * this could be changed. | */ |InputBlock[] readFileData(HasRandomValue hasRandomValue, OutputRange) |(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) |if (isOutputRange!(OutputRange, char)) |{ | import std.algorithm : find, min; | import std.range : retro; | import tsv_utils.common.utils : InputSourceRange, isFlushableOutputRange, | throwIfWindowsNewlineOnUnix; | 55| static if(!hasRandomValue) assert(!cmdopt.printRandom); | 133| assert(!cmdopt.inputSources.empty); | static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); | | /* First header is read during command line argument processing. */ 217| if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) | { 80| auto inputStream = cmdopt.inputSources.front; | 80| if (cmdopt.printRandom) | { 28| outputStream.put(cmdopt.randomValueHeader); 28| outputStream.put(cmdopt.delim); | } 80| outputStream.put(inputStream.header); 80| outputStream.put("\n"); | | /* Immediately flush the header so subsequent processes in a unix command | * pipeline see it early. This helps provide timely error messages. | */ 26| static if (isFlushableOutputRange!OutputRange) outputStream.flush; | } | | enum BlockSize = 1024L * 1024L * 1024L; // 1 GB. ('L' notation avoids overflow w/ 2GB+ sizes.) | enum ReadSize = 1024L * 128L; | enum NewlineSearchSize = 1024L * 16L; | 133| InputBlock[] blocks; 133| auto blocksAppender = appender(&blocks); 133| blocksAppender.reserve(cmdopt.inputSources.length); // At least one block per file. 
| 133| ubyte[] rawReadBuffer = new ubyte[ReadSize]; | 882| foreach (inputStream; cmdopt.inputSources) | { 340| if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); | | /* If the file size can be determined then read it as a single block. | * Otherwise read as multiple blocks. File.size() returns ulong.max | * if file size cannot be determined, so we'll combine that check | * with the standard input case. | */ | 600| immutable ulong filesize = inputStream.isStdin ? ulong.max : inputStream.file.size; 408| auto ifile = inputStream.file; | 204| if (filesize != ulong.max) | { 192| readFileDataAsOneBlock(inputStream.name, ifile, filesize, | blocksAppender, rawReadBuffer); | } | else | { 12| readFileDataAsMultipleBlocks( | inputStream.name, ifile, blocksAppender, rawReadBuffer, | BlockSize, NewlineSearchSize); | } | } 129| return blocks; |} | |/* readFileData() helper function. Read data from a File handle as a single block. The | * new block is appended to an existing InputBlock[] array. | * | * readFileDataAsOneBlocks is part of the readFileData logic. It handles the case | * where a file is being read as a single block. Normally initialBlockSize is passed | * as the size of the file. | * | * This routine has been separated out to enable unit testing. At present it is not | * intended as a general API. See readFileData for more info. | */ |private void readFileDataAsOneBlock( | string filename, | ref File ifile, | const ulong initialBlockSize, | ref RefAppender!(InputBlock[]) blocksAppender, | ref ubyte[] rawReadBuffer) |{ 195| blocksAppender.put(InputBlock(filename, 0)); 195| auto dataAppender = appender(&(blocksAppender.data[$-1].data)); 195| dataAppender.reserve(initialBlockSize); | 1557| foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer)) | { 324| dataAppender.put(cast(char[]) buffer); | } |} | |/* readFileData() helper function. Read data from a File handle as one or more blocks. 
| * Blocks are appended to an existing InputBlock[] array. | * | * readFileDataAsMultipleBlocks is part of the readFileData logic. It handles the case | * where a file or standard input is being read as a series of blocks. This is the | * standard approach for standard input, but also applies when the file size cannot be | * determined. | * | * This routine has been separated out to enable unit testing. At present it is not | * intended as a general API. See readFileData for more info. | */ |private void readFileDataAsMultipleBlocks( | string filename, | ref File ifile, | ref RefAppender!(InputBlock[]) blocksAppender, | ref ubyte[] rawReadBuffer, | const size_t blockSize, | const size_t newlineSearchSize) |{ | import std.algorithm : find, min; | import std.range : retro; | 252| assert(ifile.isOpen); | | /* Create a new block for the file and an Appender for writing data. | */ 252| blocksAppender.put(InputBlock(filename, 0)); 252| auto dataAppender = appender(&(blocksAppender.data[$-1].data)); 252| dataAppender.reserve(blockSize); 252| size_t blockNumber = 0; | | /* Read all the data and copy it to an InputBlock. */ 4115112| foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer)) | { 1371452| assert(blockNumber == blocksAppender.data[$-1].fileBlockNumber); | 1371452| immutable size_t remainingCapacity = dataAppender.capacity - dataAppender.data.length; | 1371452| if (buffer.length <= remainingCapacity) | { 1341173| dataAppender.put(cast(char[]) buffer); | } | else | { | /* Look for the last newline in the input buffer that fits in remaining | * capacity of the block. | */ 30279| auto searchRegion = buffer[0 .. remainingCapacity]; 30279| auto appendRegion = searchRegion.retro.find('\n').source; | 30279| if (appendRegion.length > 0) | { | /* Copy the first part of the read buffer to the block. */ 2993| dataAppender.put(cast(char[]) appendRegion); | | /* Create a new InputBlock and copy the remaining data to it. 
*/ 2993| blockNumber++; 2993| blocksAppender.put(InputBlock(filename, blockNumber)); 2993| dataAppender = appender(&(blocksAppender.data[$-1].data)); 2993| dataAppender.reserve(blockSize); 2993| dataAppender.put(cast(char[]) buffer[appendRegion.length .. $]); | 2993| assert(blocksAppender.data.length >= 2); 2993| assert(blocksAppender.data[$-2].data[$-1] == '\n'); | } | else | { | /* Search backward in the current block for a newline. If found, it | * becomes the last newline in the current block. Anything following | * it is moved to the block. If a newline is not found, simply append | * to the current block and let it grow. We'll only search backward | * so far. | */ 27286| immutable size_t currBlockLength = blocksAppender.data[$-1].data.length; 27286| immutable size_t searchLength = min(currBlockLength, newlineSearchSize); 27286| immutable size_t searchStart = currBlockLength - searchLength; 27286| auto blockSearchRegion = blocksAppender.data[$-1].data[searchStart .. $]; 27286| auto lastNewlineOffset = blockSearchRegion.retro.find('\n').source.length; | 27286| if (lastNewlineOffset != 0) | { | /* Create a new InputBlock. The previous InputBlock is then found | * at blocksAppender.data[$-2]. It may be a physically different | * struct (a copy) if the blocks array gets reallocated. | */ 15293| blockNumber++; 15293| blocksAppender.put(InputBlock(filename, blockNumber)); 15293| dataAppender = appender(&(blocksAppender.data[$-1].data)); 15293| dataAppender.reserve(blockSize); | | /* Copy data following the newline from the last block to the new | * block. Then append the current read buffer. | */ 15293| immutable size_t moveRegionStart = searchStart + lastNewlineOffset; 15293| dataAppender.put(blocksAppender.data[$-2].data[moveRegionStart .. $]); 15293| dataAppender.put(cast(char[]) buffer); | | /* Now delete the moved region from the last block. 
*/ 15293| blocksAppender.data[$-2].data.length = moveRegionStart; | 15293| assert(blocksAppender.data.length >= 2); 15293| assert(blocksAppender.data[$-2].data[$-1] == '\n'); | } | else | { | /* Give up. Allow the current block to grow. */ 11993| dataAppender.put(cast(char[]) buffer); | } | } | } | } |} | |/** HasRandomValue is a boolean flag used at compile time by identifyInputLines to | * distinguish use cases needing random value assignments from those that don't. | */ |alias HasRandomValue = Flag!"hasRandomValue"; | |/** An InputLine array is returned by identifyInputLines to represent each non-header line | * line found in a FileData array. The 'data' element contains the line. A 'randomValue' | * line is included if random values are being generated. | */ |static struct InputLine(HasRandomValue hasRandomValue) |{ | const(char)[] data; | static if (hasRandomValue) double randomValue; |} | |/** identifyInputLines is used by algorithms that read all files into memory prior to | * processing. It does the initial processing of the file data. | * | * Two main tasks are performed. One is splitting all input data into lines. The second | * is assigning a random value to the line, if random values are being generated. | * | * The key input is an InputBlock array. Normally one block for each file, but standard | * input may have multiple blocks. | * | * The return value is an array of InputLine structs. The struct will have a 'randomValue' | * member if random values are being assigned. 
| */ |InputLine!hasRandomValue[] identifyInputLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted) |(const ref InputBlock[] inputBlocks, ref TsvSampleOptions cmdopt) |{ | import std.algorithm : splitter; | import std.array : appender; | import std.random : Random = Mt19937, uniform01; | import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix; | | static assert(hasRandomValue || !isWeighted); 134| static if(!hasRandomValue) assert(!cmdopt.printRandom); | 210| InputLine!hasRandomValue[] inputLines; | 210| auto linesAppender = appender(&inputLines); 76| static if (hasRandomValue) auto randomGenerator = Random(cmdopt.seed); | | /* Note: fileLineNum is zero-based here. One-based in most other code in this file. */ 420| immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 1 : 0; 210| size_t fileLineNum = fileBodyStartLine; | 56799| foreach (block; inputBlocks) | { | /* Drop the last newline to avoid adding an extra empty line. */ 37431| const data = (block.data.length > 0 && block.data[$-1] == '\n') ? 18729| block.data[0 .. $-1] : block.data; | 19172| if (block.fileBlockNumber == 0) fileLineNum = fileBodyStartLine; | 672165| foreach (ref line; data.splitter('\n')) | { 211575| fileLineNum++; | 211887| if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, block.filename, fileLineNum); | | static if (!hasRandomValue) | { 210841| linesAppender.put(InputLine!hasRandomValue(line)); | } | else | { | static if (!isWeighted) | { 338| immutable double randomValue = uniform01(randomGenerator); | } | else | { 390| immutable double lineWeight = | getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, | block.filename, fileLineNum); 387| immutable double randomValue = | (lineWeight > 0.0) 385| ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 2| : 0.0; | } | 725| linesAppender.put(InputLine!hasRandomValue(line, randomValue)); | } | } | } | 201| return inputLines; |} | | |/* Unit tests for ReadFileData. These tests focus on multiple InputBlock scenarios. 
| * Other use paths are well tested by the tests at the end cases. | */ |unittest |{ | import tsv_utils.common.unittest_utils; | import std.algorithm : equal, find, joiner, splitter; | import std.array : appender; | import std.file : rmdirRecurse; | import std.path : buildPath; | import std.range : repeat; | 1| auto rfdTestDir = makeUnittestTempDir("tsv_sample_readFileData"); 1| scope(exit) rfdTestDir.rmdirRecurse; | 1| char[] file1Data; 1| char[] file2Data; 1| char[] file3Data; | 1| auto app1 = appender(&file1Data); 1| auto app2 = appender(&file2Data); 1| auto app3 = appender(&file3Data); | | /* File 1: 1000 short lines. */ 1| app1.put("\n".repeat(100).joiner); 1| app1.put("x\n".repeat(100).joiner); 1| app1.put("yz\n".repeat(100).joiner); 1| app1.put("pqr\n".repeat(100).joiner); 1| app1.put("a\nbc\ndef\n".repeat(100).joiner); 1| app1.put('\n'.repeat(100)); 1| app1.put("z\n".repeat(100).joiner); 1| app1.put("xy\n".repeat(100).joiner); | | /* File 2: 500 longer lines. */ 1| app2.put( | "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n" | .repeat(100) | .joiner); 1| app2.put( | "|abcdefghijklmnopqrstuv|\n|0123456789|\n|0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ|\n|abcdefghijklmnopqrstuvwxyz|\n" | .repeat(100) | .joiner); 1| app2.put( | "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n" | .repeat(100) | .joiner); | | /* File 3: 1000 mixed length lines. 
*/ 1| app3.put("\n\n|abcde|\n1\n12\n123\n|abcdefghijklmnop|\n|xyz|\n0123456789\nX\n".repeat(100).joiner); | 1| string file1Path = buildPath(rfdTestDir, "file1.txt"); 1| string file2Path = buildPath(rfdTestDir, "file2.txt"); 1| string file3Path = buildPath(rfdTestDir, "file3.txt"); | | try | { 2| auto ofile1 = File(file1Path, "w"); 1| ofile1.write(file1Data); | } 0000000| catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file1Path, e.msg)); | | try | { 2| auto ofile2 = File(file2Path, "w"); 1| ofile2.write(file2Data); | } 0000000| catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file2Path, e.msg)); | | try | { 2| auto ofile3 = File(file3Path, "w"); 1| ofile3.write(file3Data); | } 0000000| catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file3Path, e.msg)); | 1| auto allData = file1Data ~ file2Data ~ file3Data; 1| auto expectedLines = allData.splitter('\n').array[0 .. $-1]; | 1| auto file2DataNoHeader = (file2Data.find('\n'))[1 .. $]; 1| auto file3DataNoHeader = (file3Data.find('\n'))[1 .. $]; 1| auto allDataUsingHeader = file1Data ~ file2DataNoHeader ~ file3DataNoHeader; 1| auto expectedLinesUsingHeader = allDataUsingHeader.splitter('\n').array[0 .. $-1]; | 1| assert(expectedLines.length == expectedLinesUsingHeader.length + 2); | 1| TsvSampleOptions cmdoptNoHeader; 1| auto noHeaderCmdArgs = ["unittest", file1Path]; 1| auto r1 = cmdoptNoHeader.processArgs(noHeaderCmdArgs); 1| assert(r1[0], format("Invalid command lines arg: '%s'.", noHeaderCmdArgs)); | 1| TsvSampleOptions cmdoptYesHeader; 1| auto yesHeaderCmdArgs = ["unittest", "--header", file1Path]; 1| auto r2 = cmdoptYesHeader.processArgs(yesHeaderCmdArgs); 1| assert(r2[0], format("Invalid command lines arg: '%s'.", yesHeaderCmdArgs)); | 1| auto outputStream = appender!(char[])(); | | { | /* Reading as single blocks. 
*/ 1| ubyte[] rawReadBuffer = new ubyte[256]; 1| InputBlock[] blocks; 1| auto blocksAppender = appender(&blocks); 1| blocksAppender.reserve(3); 12| foreach (f; [ file1Path, file2Path, file3Path ]) | { 6| auto ifile = f.File; 3| ulong filesize = ifile.size; 3| if (filesize == ulong.max) filesize = 1000; 3| readFileDataAsOneBlock(f, ifile, filesize, blocksAppender, rawReadBuffer); | } 1| auto inputLines = | identifyInputLines!(No.hasRandomValue, No.isWeighted)( | blocks, cmdoptNoHeader); | 2601| assert(equal!((a, b) => a.data == b)(inputLines, expectedLines)); | } | | { | /* Reading as multiple blocks. */ 15| foreach (size_t searchSize; [ 0, 1, 2, 64 ]) | { 72| foreach (size_t blockSize; [ 1, 2, 16, 64, 256 ]) | { 300| foreach (size_t readSize; [ 1, 2, 8, 32 ]) | { 80| ubyte[] rawReadBuffer = new ubyte[readSize]; 80| InputBlock[] blocks; 80| auto blocksAppender = appender(&blocks); 80| blocksAppender.reserve(3); 960| foreach (f; [ file1Path, file2Path, file3Path ]) | { 480| auto ifile = f.File; 240| readFileDataAsMultipleBlocks(f, ifile, blocksAppender, | rawReadBuffer, blockSize, searchSize); | } 80| auto inputLines = | identifyInputLines!(No.hasRandomValue, No.isWeighted)( | blocks, cmdoptNoHeader); | 208080| assert(equal!((a, b) => a.data == b)(inputLines, expectedLines)); | } | } | } | } | version(none) { | { | /* Reading as multiple blocks, with header processing. 
*/ | const size_t readSize = 32; | const size_t blockSize = 48; | const size_t searchSize = 16; | | ubyte[] rawReadBuffer = new ubyte[readSize]; | InputBlock[] blocks; | auto blocksAppender = appender(&blocks); | blocksAppender.reserve(3); | foreach (f; [ file1Path, file2Path, file3Path ]) | { | auto ifile = f.File; | readFileDataAsMultipleBlocks(f, ifile, blocksAppender, | rawReadBuffer, blockSize, searchSize); | } | auto inputLines = | identifyInputLines!(No.hasRandomValue, No.isWeighted)( | blocks, cmdoptYesHeader); | | assert(outputStream.data == expectedLinesUsingHeader[0] ~ '\n'); | assert(equal!((a, b) => a.data == b)(inputLines, expectedLinesUsingHeader[1 .. $])); | } | } |} | |/** Write a floating point random value to an output stream. | * | * This routine is used for floating point random value printing. This routine writes | * 17 significant digits, the range available in doubles. This routine prefers decimal | * format, without exponents. It will generate somewhat large precision numbers, | * currently up to 28 digits, before switching to exponents. | * | * The primary reason for this approach is to enable faster sorting on random values | * by GNU sort and similar external sorting programs. GNU sort is dramatically faster | * on decimal format numeric sorts ('n' switch) than general numeric sorts ('g' switch). | * The 'general numeric' handles exponential notation. The difference is 5-10x. | * | * Random values generated by Bernoulli sampling are nearly always greater than 1e-12. | * No examples less than 1e-09 were seen in hundred of millions of trials. Similar | * results were seen with weighted sampling with integer weights. The same is not true | * with floating point weights. These produce quite large exponents. However, even | * for floating point weights this can be useful. For random weights [0,1] less than 5% | * will be less than 1e-12 and use exponential notation. 
| */ |void formatRandomValue(OutputRange)(auto ref OutputRange outputStream, double value) |if (isOutputRange!(OutputRange, char)) |{ | import std.format : formatValue, singleSpec; | 1189| immutable spec17f = singleSpec("%.17f"); 1189| immutable spec18f = singleSpec("%.18f"); 1189| immutable spec19f = singleSpec("%.19f"); 1189| immutable spec20f = singleSpec("%.20f"); 1189| immutable spec21f = singleSpec("%.21f"); 1189| immutable spec22f = singleSpec("%.22f"); 1189| immutable spec23f = singleSpec("%.23f"); 1189| immutable spec24f = singleSpec("%.24f"); 1189| immutable spec25f = singleSpec("%.25f"); 1189| immutable spec26f = singleSpec("%.26f"); 1189| immutable spec27f = singleSpec("%.27f"); 1189| immutable spec28f = singleSpec("%.28f"); | 1189| immutable spec17g = singleSpec("%.17g"); | 1189| immutable formatSpec = 1017| (value >= 1e-01) ? spec17f : 275| (value >= 1e-02) ? spec18f : 77| (value >= 1e-03) ? spec19f : 65| (value >= 1e-04) ? spec20f : 60| (value >= 1e-05) ? spec21f : 56| (value >= 1e-06) ? spec22f : 54| (value >= 1e-07) ? spec23f : 54| (value >= 1e-08) ? spec24f : 48| (value >= 1e-09) ? spec25f : 48| (value >= 1e-10) ? spec26f : 42| (value >= 1e-11) ? spec27f : 76| (value >= 1e-12) ? 
spec28f : spec17g; | 1189| outputStream.formatValue(value, formatSpec); |} | |@safe unittest |{ | void testFormatValue(double value, string expected) | { | import std.array : appender; | 30| auto s = appender!string(); 30| s.formatRandomValue(value); 30| assert(s.data == expected, | format("[testFormatValue] value: %g; expected: %s; actual: %s", value, expected, s.data)); | } | 1| testFormatValue(1.0, "1.00000000000000000"); 1| testFormatValue(0.1, "0.10000000000000001"); 1| testFormatValue(0.01, "0.010000000000000000"); 1| testFormatValue(1e-03, "0.0010000000000000000"); 1| testFormatValue(1e-04, "0.00010000000000000000"); 1| testFormatValue(1e-05, "0.000010000000000000001"); 1| testFormatValue(1e-06, "0.0000010000000000000000"); 1| testFormatValue(1e-07, "0.00000010000000000000000"); 1| testFormatValue(1e-08, "0.000000010000000000000000"); 1| testFormatValue(1e-09, "0.0000000010000000000000001"); 1| testFormatValue(1e-10, "0.00000000010000000000000000"); 1| testFormatValue(1e-11, "0.000000000009999999999999999"); 1| testFormatValue(1e-12, "0.0000000000010000000000000000"); 1| testFormatValue(1e-13, "1e-13"); 1| testFormatValue(1e-14, "1e-14"); 1| testFormatValue(12345678901234567e-15, "12.34567890123456735"); 1| testFormatValue(12345678901234567e-16, "1.23456789012345669"); 1| testFormatValue(12345678901234567e-17, "0.12345678901234566"); 1| testFormatValue(12345678901234567e-18, "0.012345678901234567"); 1| testFormatValue(12345678901234567e-19, "0.0012345678901234567"); 1| testFormatValue(12345678901234567e-20, "0.00012345678901234567"); 1| testFormatValue(12345678901234567e-21, "0.000012345678901234568"); 1| testFormatValue(12345678901234567e-22, "0.0000012345678901234567"); 1| testFormatValue(12345678901234567e-23, "0.00000012345678901234566"); 1| testFormatValue(12345678901234567e-24, "0.000000012345678901234567"); 1| testFormatValue(12345678901234567e-25, "0.0000000012345678901234566"); 1| testFormatValue(12345678901234567e-26, 
"0.00000000012345678901234568"); 1| testFormatValue(12345678901234567e-27, "0.000000000012345678901234567"); 1| testFormatValue(12345678901234567e-28, "0.0000000000012345678901234567"); 1| testFormatValue(12345678901234567e-29, "1.2345678901234566e-13"); |} | | |/** Convenience function for extracting a single field from a line. See | * [tsv_utils.common.utils.getTsvFieldValue] for details. This wrapper creates error | * text tailored for this program. | */ |import std.traits : isSomeChar; |T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, ulong lineNum) pure @safe |if (isSomeChar!C) |{ | import std.conv : ConvException, to; | import tsv_utils.common.utils : getTsvFieldValue; | 1276| T val; | try | { 1276| val = getTsvFieldValue!T(line, fieldIndex, delim); | } | catch (ConvException exc) | { 4| throw new Exception( | format("Could not process line: %s\n File: %s Line: %s%s", 4| exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum, 4| (lineNum == 1) ? "\n Is this a header line? Use --H|header to skip." : "")); | } | catch (Exception exc) | { | /* Not enough fields on the line. */ 3| throw new Exception( | format("Could not process line: %s\n File: %s Line: %s", 3| exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum)); | } | 1269| return val; |} | |@safe unittest |{ | /* getFieldValue unit tests. getTsvFieldValue has it's own tests. | * These tests make basic sanity checks on the getFieldValue wrapper. | */ | import std.exception; | 1| assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123); 1| assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4); 2| assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 1)); 2| assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 2)); 2| assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 1)); 2| assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 2)); |} | |/* Unit tests for the main program start here. 
| * | * Portability note: Many of the tests here rely on generating consistent random numbers | * across different platforms when using the same random seed. So far this has succeeded | * on several different platform, compiler, and library versions. However, it is certainly | * possible this condition will not hold on other platforms. | * | * For tsv-sample, this portability implies generating the same results on different | * platforms when using the same random seed. This is NOT part of tsv-sample guarantees, | * but it is convenient for testing. If platforms are identified that do not generate | * the same results these tests will need to be adjusted. | */ |version(unittest) |{ | /* Unit test helper functions. */ | | import tsv_utils.common.unittest_utils; // tsv unit test helpers, from common/src/. | import std.conv : to; | | void testTsvSample(string[] cmdArgs, string[][] expected) | { | import std.array : appender; | 527| assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty."); | | auto formatAssertMessage(T...)(string msg, T formatArgs) | { 0000000| auto formatString = "[testTsvSample] %s: " ~ msg; 0000000| return format(formatString, cmdArgs[0], formatArgs); | } | 527| TsvSampleOptions cmdopt; 527| auto savedCmdArgs = cmdArgs.to!string; 527| auto r = cmdopt.processArgs(cmdArgs); 527| assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs)); 527| auto output = appender!(char[])(); | 527| tsvSample(cmdopt, output); // This invokes the main code line. 
| 527| auto expectedOutput = expected.tsvDataToString; | 527| assert(output.data == expectedOutput, | formatAssertMessage( | "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================", | expectedOutput.to!string, output.data.to!string)); | } | } | |unittest |{ | import std.path : buildPath; | import std.file : rmdirRecurse; | 1| auto testDir = makeUnittestTempDir("tsv_sample"); 1| scope(exit) testDir.rmdirRecurse; | | /* Tabular data sets and expected results use the built-in static seed. | * Tests are run by writing the data set to a file, then calling the main | * routine to process. The function testTsvSample plays the role of the | * main program. Rather than writing to expected output, the results are | * matched against expected. The expected results were verified by hand | * prior to inclusion in the test. | * | * The initial part of this section is simply setting up data files and | * expected results. | * | * Expected results naming conventions: | * - Prefix: dataNxMExpected. N and M are numbers. e.g. data3x6Expected | * - Sampling Type (required): Permute (Shuffle), Sample, Replace, Bernoulli, Distinct | * - Compatibility: Compat, AlgoR, Skip, Swap, Inorder | * - Weight Field: Wt, e.g. Wt3 | * - Sample Size: Num, eg. Num3 | * - Seed Value: V, eg. V77 | * - Key Field: K, e.g. K2 | * - Probability: P, e.g P05 (5%) | * - Printing Probabilities: Probs | * - Printing Probs in order: ProbsInorder | * - Printing Probs with custom header: RVCustom | */ | | /* Empty file. */ 1| string[][] dataEmpty = []; 1| string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv"); 1| writeUnittestTsvFile(fpath_dataEmpty, dataEmpty); | | /* 3x0, header only. 
*/ 1| string[][] data3x0 = [["field_a", "field_b", "field_c"]]; 1| string fpath_data3x0 = buildPath(testDir, "data3x0.tsv"); 1| writeUnittestTsvFile(fpath_data3x0, data3x0); | | /* 3x1 */ 1| string[][] data3x1 = | [["field_a", "field_b", "field_c"], | ["tan", "タン", "8.5"]]; | 1| string fpath_data3x1 = buildPath(testDir, "data3x1.tsv"); 1| string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv"); 1| writeUnittestTsvFile(fpath_data3x1, data3x1); 1| writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1 .. $]); | 1| string[][] data3x1ExpectedReplaceNum3 = | [["field_a", "field_b", "field_c"], | ["tan", "タン", "8.5"], | ["tan", "タン", "8.5"], | ["tan", "タン", "8.5"]]; | | /* 3x2 */ 1| string[][] data3x2 = | [["field_a", "field_b", "field_c"], | ["brown", "褐色", "29.2"], | ["gray", "グレー", "6.2"]]; | 1| string fpath_data3x2 = buildPath(testDir, "data3x2.tsv"); 1| string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv"); 1| writeUnittestTsvFile(fpath_data3x2, data3x2); 1| writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1 .. $]); | 1| string[][] data3x2PermuteCompat = | [["field_a", "field_b", "field_c"], | ["gray", "グレー", "6.2"], | ["brown", "褐色", "29.2"]]; | 1| string[][] data3x2PermuteShuffle = | [["field_a", "field_b", "field_c"], | ["gray", "グレー", "6.2"], | ["brown", "褐色", "29.2"]]; | | /* 3x3 */ 1| string[][] data3x3 = | [["field_a", "field_b", "field_c"], | ["orange", "オレンジ", "2.5"], | ["pink", "ピンク", "1.1"], | ["purple", "紫の", "42"]]; | 1| string fpath_data3x3 = buildPath(testDir, "data3x3.tsv"); 1| string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv"); 1| writeUnittestTsvFile(fpath_data3x3, data3x3); 1| writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1 .. 
$]); | 1| string[][] data3x3ExpectedPermuteCompat = | [["field_a", "field_b", "field_c"], | ["purple", "紫の", "42"], | ["pink", "ピンク", "1.1"], | ["orange", "オレンジ", "2.5"]]; | 1| string[][] data3x3ExpectedPermuteSwap = | [["field_a", "field_b", "field_c"], | ["purple", "紫の", "42"], | ["orange", "オレンジ", "2.5"], | ["pink", "ピンク", "1.1"]]; | | /* 3x6 */ 1| string[][] data3x6 = | [["field_a", "field_b", "field_c"], | ["red", "赤", "23.8"], | ["green", "緑", "0.0072"], | ["white", "白", "1.65"], | ["yellow", "黄", "12"], | ["blue", "青", "12"], | ["black", "黒", "0.983"]]; 1| string fpath_data3x6 = buildPath(testDir, "data3x6.tsv"); 1| string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv"); 1| writeUnittestTsvFile(fpath_data3x6, data3x6); 1| writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1 .. $]); | | // Randomization, all lines 1| string[][] data3x6ExpectedPermuteCompat = | [["field_a", "field_b", "field_c"], | ["yellow", "黄", "12"], | ["black", "黒", "0.983"], | ["blue", "青", "12"], | ["white", "白", "1.65"], | ["green", "緑", "0.0072"], | ["red", "赤", "23.8"]]; | 1| string[][] data3x6ExpectedPermuteSwap = | [["field_a", "field_b", "field_c"], | ["black", "黒", "0.983"], | ["green", "緑", "0.0072"], | ["red", "赤", "23.8"], | ["yellow", "黄", "12"], | ["white", "白", "1.65"], | ["blue", "青", "12"]]; | 1| string[][] data3x6ExpectedPermuteCompatProbs = | [["random_value", "field_a", "field_b", "field_c"], | ["0.96055546286515892", "yellow", "黄", "12"], | ["0.75710153928957880", "black", "黒", "0.983"], | ["0.52525980887003243", "blue", "青", "12"], | ["0.49287854949943721", "white", "白", "1.65"], | ["0.15929344086907804", "green", "緑", "0.0072"], | ["0.010968807619065046", "red", "赤", "23.8"]]; | | /* Note: data3x6ExpectedSampleAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because | * both are effectively the same algorithm given that --num is data length. Both read | * in the full data in order then call randomShuffle. 
| */ 1| string[][] data3x6ExpectedSampleAlgoRNum6 = | [["field_a", "field_b", "field_c"], | ["black", "黒", "0.983"], | ["green", "緑", "0.0072"], | ["red", "赤", "23.8"], | ["yellow", "黄", "12"], | ["white", "白", "1.65"], | ["blue", "青", "12"]]; | 1| string[][] data3x6ExpectedSampleAlgoRNum5 = | [["field_a", "field_b", "field_c"], | ["red", "赤", "23.8"], | ["black", "黒", "0.983"], | ["white", "白", "1.65"], | ["green", "緑", "0.0072"], | ["yellow", "黄", "12"]]; | 1| string[][] data3x6ExpectedSampleAlgoRNum4 = | [["field_a", "field_b", "field_c"], | ["blue", "青", "12"], | ["green", "緑", "0.0072"], | ["black", "黒", "0.983"], | ["white", "白", "1.65"]]; | 1| string[][] data3x6ExpectedSampleAlgoRNum3 = | [["field_a", "field_b", "field_c"], | ["red", "赤", "23.8"], | ["black", "黒", "0.983"], | ["green", "緑", "0.0072"]]; | 1| string[][] data3x6ExpectedSampleAlgoRNum2 = | [["field_a", "field_b", "field_c"], | ["black", "黒", "0.983"], | ["red", "赤", "23.8"]]; | 1| string[][] data3x6ExpectedSampleAlgoRNum1 = | [["field_a", "field_b", "field_c"], | ["green", "緑", "0.0072"]]; | | /* Inorder versions. 
*/ 1| string[][] data3x6ExpectedSampleAlgoRNum6Inorder = | [["field_a", "field_b", "field_c"], | ["red", "赤", "23.8"], | ["green", "緑", "0.0072"], | ["white", "白", "1.65"], | ["yellow", "黄", "12"], | ["blue", "青", "12"], | ["black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedSampleAlgoRNum5Inorder = | [["field_a", "field_b", "field_c"], | ["red", "赤", "23.8"], | ["green", "緑", "0.0072"], | ["white", "白", "1.65"], | ["yellow", "黄", "12"], | ["black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedSampleAlgoRNum4Inorder = | [["field_a", "field_b", "field_c"], | ["green", "緑", "0.0072"], | ["white", "白", "1.65"], | ["blue", "青", "12"], | ["black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedSampleAlgoRNum3Inorder = | [["field_a", "field_b", "field_c"], | ["red", "赤", "23.8"], | ["green", "緑", "0.0072"], | ["black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedSampleAlgoRNum2Inorder = | [["field_a", "field_b", "field_c"], | ["red", "赤", "23.8"], | ["black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedSampleAlgoRNum1Inorder = | [["field_a", "field_b", "field_c"], | ["green", "緑", "0.0072"]]; | | /* Reservoir inorder */ 1| string[][] data3x6ExpectedSampleCompatNum6Inorder = | [["field_a", "field_b", "field_c"], | ["red", "赤", "23.8"], | ["green", "緑", "0.0072"], | ["white", "白", "1.65"], | ["yellow", "黄", "12"], | ["blue", "青", "12"], | ["black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedSampleCompatNum5Inorder = | [["field_a", "field_b", "field_c"], | ["green", "緑", "0.0072"], | ["white", "白", "1.65"], | ["yellow", "黄", "12"], | ["blue", "青", "12"], | ["black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedSampleCompatNum4Inorder = | [["field_a", "field_b", "field_c"], | ["white", "白", "1.65"], | ["yellow", "黄", "12"], | ["blue", "青", "12"], | ["black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedSampleCompatNum3Inorder = | [["field_a", "field_b", "field_c"], | ["yellow", "黄", "12"], | ["blue", "青", "12"], | ["black", "黒", "0.983"]]; | 1| string[][] 
data3x6ExpectedSampleCompatNum2Inorder = | [["field_a", "field_b", "field_c"], | ["yellow", "黄", "12"], | ["black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedSampleCompatNum1Inorder = | [["field_a", "field_b", "field_c"], | ["yellow", "黄", "12"]]; | | | /* Reservoir inorder with probabilities. */ 1| string[][] data3x6ExpectedSampleCompatNum6ProbsInorder = | [["random_value", "field_a", "field_b", "field_c"], | ["0.010968807619065046", "red", "赤", "23.8"], | ["0.15929344086907804", "green", "緑", "0.0072"], | ["0.49287854949943721", "white", "白", "1.65"], | ["0.96055546286515892", "yellow", "黄", "12"], | ["0.52525980887003243", "blue", "青", "12"], | ["0.75710153928957880", "black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedSampleCompatNum5ProbsInorder = | [["random_value", "field_a", "field_b", "field_c"], | ["0.15929344086907804", "green", "緑", "0.0072"], | ["0.49287854949943721", "white", "白", "1.65"], | ["0.96055546286515892", "yellow", "黄", "12"], | ["0.52525980887003243", "blue", "青", "12"], | ["0.75710153928957880", "black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedSampleCompatNum4ProbsInorder = | [["random_value", "field_a", "field_b", "field_c"], | ["0.49287854949943721", "white", "白", "1.65"], | ["0.96055546286515892", "yellow", "黄", "12"], | ["0.52525980887003243", "blue", "青", "12"], | ["0.75710153928957880", "black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedSampleCompatNum3ProbsInorder = | [["random_value", "field_a", "field_b", "field_c"], | ["0.96055546286515892", "yellow", "黄", "12"], | ["0.52525980887003243", "blue", "青", "12"], | ["0.75710153928957880", "black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedSampleCompatNum2ProbsInorder = | [["random_value", "field_a", "field_b", "field_c"], | ["0.96055546286515892", "yellow", "黄", "12"], | ["0.75710153928957880", "black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedSampleCompatNum1ProbsInorder = | [["random_value", "field_a", "field_b", "field_c"], | ["0.96055546286515892", 
"yellow", "黄", "12"]]; | 1| string[][] data3x6ExpectedWt3Num6Inorder = | [["field_a", "field_b", "field_c"], | ["red", "赤", "23.8"], | ["green", "緑", "0.0072"], | ["white", "白", "1.65"], | ["yellow", "黄", "12"], | ["blue", "青", "12"], | ["black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedWt3Num5Inorder = | [["field_a", "field_b", "field_c"], | ["green", "緑", "0.0072"], | ["white", "白", "1.65"], | ["yellow", "黄", "12"], | ["blue", "青", "12"], | ["black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedWt3Num4Inorder = | [["field_a", "field_b", "field_c"], | ["white", "白", "1.65"], | ["yellow", "黄", "12"], | ["blue", "青", "12"], | ["black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedWt3Num3Inorder = | [["field_a", "field_b", "field_c"], | ["yellow", "黄", "12"], | ["blue", "青", "12"], | ["black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedWt3Num2Inorder = | [["field_a", "field_b", "field_c"], | ["yellow", "黄", "12"], | ["black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedWt3Num1Inorder = | [["field_a", "field_b", "field_c"], | ["yellow", "黄", "12"]]; | | 1| string[][] data3x6ExpectedBernoulliProbsP100 = | [["random_value", "field_a", "field_b", "field_c"], | ["0.010968807619065046", "red", "赤", "23.8"], | ["0.15929344086907804", "green", "緑", "0.0072"], | ["0.49287854949943721", "white", "白", "1.65"], | ["0.96055546286515892", "yellow", "黄", "12"], | ["0.52525980887003243", "blue", "青", "12"], | ["0.75710153928957880", "black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedBernoulliCompatProbsP60 = | [["random_value", "field_a", "field_b", "field_c"], | ["0.010968807619065046", "red", "赤", "23.8"], | ["0.15929344086907804", "green", "緑", "0.0072"], | ["0.49287854949943721", "white", "白", "1.65"], | ["0.52525980887003243", "blue", "青", "12"]]; | 1| string[][] data3x6ExpectedBernoulliSkipP40 = | [["field_a", "field_b", "field_c"], | ["red", "赤", "23.8"], | ["green", "緑", "0.0072"], | ["yellow", "黄", "12"]]; | 1| string[][] 
data3x6ExpectedBernoulliCompatP60 = | [["field_a", "field_b", "field_c"], | ["red", "赤", "23.8"], | ["green", "緑", "0.0072"], | ["white", "白", "1.65"], | ["blue", "青", "12"]]; | 1| string[][] data3x6ExpectedDistinctK1K3P60 = | [["field_a", "field_b", "field_c"], | ["green", "緑", "0.0072"], | ["white", "白", "1.65"], | ["blue", "青", "12"]]; | 1| string[][] data3x6ExpectedDistinctK1K3P60Probs = | [["random_value", "field_a", "field_b", "field_c"], | ["0", "green", "緑", "0.0072"], | ["0", "white", "白", "1.65"], | ["0", "blue", "青", "12"]]; | 1| string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom = | [["custom_random_value_header", "field_a", "field_b", "field_c"], | ["0", "green", "緑", "0.0072"], | ["0", "white", "白", "1.65"], | ["0", "blue", "青", "12"]]; | 1| string[][] data3x6ExpectedDistinctK2P2ProbsInorder = | [["random_value", "field_a", "field_b", "field_c"], | ["1", "red", "赤", "23.8"], | ["0", "green", "緑", "0.0072"], | ["0", "white", "白", "1.65"], | ["1", "yellow", "黄", "12"], | ["3", "blue", "青", "12"], | ["2", "black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedPermuteWt3Probs = | [["random_value", "field_a", "field_b", "field_c"], | ["0.99665198757645390", "yellow", "黄", "12"], | ["0.94775884809836686", "blue", "青", "12"], | ["0.82728234682286661", "red", "赤", "23.8"], | ["0.75346697377181959", "black", "黒", "0.983"], | ["0.65130103496422487", "white", "白", "1.65"], | ["1.5636943712879866e-111", "green", "緑", "0.0072"]]; | 1| string[][] data3x6ExpectedWt3ProbsInorder = | [["random_value", "field_a", "field_b", "field_c"], | ["0.82728234682286661", "red", "赤", "23.8"], | ["1.5636943712879866e-111", "green", "緑", "0.0072"], | ["0.65130103496422487", "white", "白", "1.65"], | ["0.99665198757645390", "yellow", "黄", "12"], | ["0.94775884809836686", "blue", "青", "12"], | ["0.75346697377181959", "black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedPermuteWt3 = | [["field_a", "field_b", "field_c"], | ["yellow", "黄", "12"], | ["blue", "青", "12"], | ["red", 
"赤", "23.8"], | ["black", "黒", "0.983"], | ["white", "白", "1.65"], | ["green", "緑", "0.0072"]]; | | 1| string[][] data3x6ExpectedReplaceNum10 = | [["field_a", "field_b", "field_c"], | ["black", "黒", "0.983"], | ["green", "緑", "0.0072"], | ["green", "緑", "0.0072"], | ["red", "赤", "23.8"], | ["yellow", "黄", "12"], | ["red", "赤", "23.8"], | ["white", "白", "1.65"], | ["yellow", "黄", "12"], | ["yellow", "黄", "12"], | ["white", "白", "1.65"], | ]; | 1| string[][] data3x6ExpectedReplaceNum10V77 = | [["field_a", "field_b", "field_c"], | ["black", "黒", "0.983"], | ["red", "赤", "23.8"], | ["black", "黒", "0.983"], | ["yellow", "黄", "12"], | ["green", "緑", "0.0072"], | ["green", "緑", "0.0072"], | ["green", "緑", "0.0072"], | ["yellow", "黄", "12"], | ["blue", "青", "12"], | ["white", "白", "1.65"], | ]; | | /* Using a different static seed. */ 1| string[][] data3x6ExpectedPermuteCompatV41Probs = | [["random_value", "field_a", "field_b", "field_c"], | ["0.68057272653095424", "green", "緑", "0.0072"], | ["0.67681624367833138", "blue", "青", "12"], | ["0.32097338931635022", "yellow", "黄", "12"], | ["0.25092361867427826", "red", "赤", "23.8"], | ["0.15535934292711318", "black", "黒", "0.983"], | ["0.046095821075141430", "white", "白", "1.65"]]; | 1| string[][] data3x6ExpectedBernoulliCompatP60V41Probs = | [["random_value", "field_a", "field_b", "field_c"], | ["0.25092361867427826", "red", "赤", "23.8"], | ["0.046095821075141430", "white", "白", "1.65"], | ["0.32097338931635022", "yellow", "黄", "12"], | ["0.15535934292711318", "black", "黒", "0.983"]]; | 1| string[][] data3x6ExpectedPermuteWt3V41Probs = | [["random_value", "field_a", "field_b", "field_c"], | ["0.96799377498910666", "blue", "青", "12"], | ["0.94356245792573568", "red", "赤", "23.8"], | ["0.90964601024271996", "yellow", "黄", "12"], | ["0.15491658409260103", "white", "白", "1.65"], | ["0.15043620392537033", "black", "黒", "0.983"], | ["6.1394674830701461e-24", "green", "緑", "0.0072"]]; | 1| string[][] data3x6ExpectedWt3V41ProbsInorder 
= | [["random_value", "field_a", "field_b", "field_c"], | ["0.94356245792573568", "red", "赤", "23.8"], | ["6.1394674830701461e-24", "green", "緑", "0.0072"], | ["0.15491658409260103", "white", "白", "1.65"], | ["0.90964601024271996", "yellow", "黄", "12"], | ["0.96799377498910666", "blue", "青", "12"], | ["0.15043620392537033", "black", "黒", "0.983"]]; | | | /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */ 1| string[][] combo1ExpectedPermuteCompat = | [["field_a", "field_b", "field_c"], | ["yellow", "黄", "12"], | ["tan", "タン", "8.5"], | ["brown", "褐色", "29.2"], | ["green", "緑", "0.0072"], | ["red", "赤", "23.8"], | ["purple", "紫の", "42"], | ["black", "黒", "0.983"], | ["white", "白", "1.65"], | ["gray", "グレー", "6.2"], | ["blue", "青", "12"], | ["pink", "ピンク", "1.1"], | ["orange", "オレンジ", "2.5"]]; | 1| string[][] combo1ExpectedPermuteCompatProbs = | [["random_value", "field_a", "field_b", "field_c"], | ["0.97088520275428891", "yellow", "黄", "12"], | ["0.96055546286515892", "tan", "タン", "8.5"], | ["0.81756894313730299", "brown", "褐色", "29.2"], | ["0.75710153928957880", "green", "緑", "0.0072"], | ["0.52525980887003243", "red", "赤", "23.8"], | ["0.49287854949943721", "purple", "紫の", "42"], | ["0.47081507067196071", "black", "黒", "0.983"], | ["0.38388182921335101", "white", "白", "1.65"], | ["0.29215990612283349", "gray", "グレー", "6.2"], | ["0.24033216014504433", "blue", "青", "12"], | ["0.15929344086907804", "pink", "ピンク", "1.1"], | ["0.010968807619065046", "orange", "オレンジ", "2.5"]]; | | /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. 
*/ 1| string[][] combo1ExpectedProbsInorder = | [["random_value", "field_a", "field_b", "field_c"], | ["0.010968807619065046", "orange", "オレンジ", "2.5"], | ["0.15929344086907804", "pink", "ピンク", "1.1"], | ["0.49287854949943721", "purple", "紫の", "42"], | ["0.96055546286515892", "tan", "タン", "8.5"], | ["0.52525980887003243", "red", "赤", "23.8"], | ["0.75710153928957880", "green", "緑", "0.0072"], | ["0.38388182921335101", "white", "白", "1.65"], | ["0.97088520275428891", "yellow", "黄", "12"], | ["0.24033216014504433", "blue", "青", "12"], | ["0.47081507067196071", "black", "黒", "0.983"], | ["0.81756894313730299", "brown", "褐色", "29.2"], | ["0.29215990612283349", "gray", "グレー", "6.2"]]; | 1| string[][] combo1ExpectedBernoulliCompatP50Probs = | [["random_value", "field_a", "field_b", "field_c"], | ["0.010968807619065046", "orange", "オレンジ", "2.5"], | ["0.15929344086907804", "pink", "ピンク", "1.1"], | ["0.49287854949943721", "purple", "紫の", "42"], | ["0.38388182921335101", "white", "白", "1.65"], | ["0.24033216014504433", "blue", "青", "12"], | ["0.47081507067196071", "black", "黒", "0.983"], | ["0.29215990612283349", "gray", "グレー", "6.2"]]; | 1| string[][] combo1ExpectedBernoulliCompatP40 = | [["field_a", "field_b", "field_c"], | ["orange", "オレンジ", "2.5"], | ["pink", "ピンク", "1.1"], | ["white", "白", "1.65"], | ["blue", "青", "12"], | ["gray", "グレー", "6.2"]]; | 1| string[][] combo1ExpectedDistinctK1P40 = | [["field_a", "field_b", "field_c"], | ["orange", "オレンジ", "2.5"], | ["red", "赤", "23.8"], | ["green", "緑", "0.0072"], | ["blue", "青", "12"], | ["black", "黒", "0.983"]]; | 1| string[][] combo1ExpectedPermuteWt3Probs = | [["random_value", "field_a", "field_b", "field_c"], | ["0.99754077523718754", "yellow", "黄", "12"], | ["0.99527665440088786", "tan", "タン", "8.5"], | ["0.99312578945741659", "brown", "褐色", "29.2"], | ["0.98329602553389361", "purple", "紫の", "42"], | ["0.97330961938083660", "red", "赤", "23.8"], | ["0.88797551521739648", "blue", "青", "12"], | ["0.81999230489041786", 
"gray", "グレー", "6.2"], | ["0.55975569204250941", "white", "白", "1.65"], | ["0.46472135609205739", "black", "黒", "0.983"], | ["0.18824582704191337", "pink", "ピンク", "1.1"], | ["0.16446131853299920", "orange", "オレンジ", "2.5"], | ["1.6438086931020549e-17", "green", "緑", "0.0072"]]; | 1| string[][] combo1ExpectedPermuteWt3 = | [["field_a", "field_b", "field_c"], | ["yellow", "黄", "12"], | ["tan", "タン", "8.5"], | ["brown", "褐色", "29.2"], | ["purple", "紫の", "42"], | ["red", "赤", "23.8"], | ["blue", "青", "12"], | ["gray", "グレー", "6.2"], | ["white", "白", "1.65"], | ["black", "黒", "0.983"], | ["pink", "ピンク", "1.1"], | ["orange", "オレンジ", "2.5"], | ["green", "緑", "0.0072"]]; | 1| string[][] combo1ExpectedSampleAlgoRNum4 = | [["field_a", "field_b", "field_c"], | ["blue", "青", "12"], | ["gray", "グレー", "6.2"], | ["brown", "褐色", "29.2"], | ["white", "白", "1.65"]]; | 1| string[][] combo1ExpectedSampleAlgoRNum4Inorder = | [["field_a", "field_b", "field_c"], | ["white", "白", "1.65"], | ["blue", "青", "12"], | ["brown", "褐色", "29.2"], | ["gray", "グレー", "6.2"]]; | 1| string[][] combo1ExpectedReplaceNum10 = | [["field_a", "field_b", "field_c"], | ["gray", "グレー", "6.2"], | ["yellow", "黄", "12"], | ["yellow", "黄", "12"], | ["white", "白", "1.65"], | ["tan", "タン", "8.5"], | ["white", "白", "1.65"], | ["blue", "青", "12"], | ["black", "黒", "0.983"], | ["tan", "タン", "8.5"], | ["purple", "紫の", "42"]]; | | /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. 
*/ 1| string[][] data1x200 = | [["field_a"], | ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"], | ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"], | ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"], | ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"], | ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"], | ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"], | ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"], | ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"], | ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"], | ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"], | ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"], | ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"], | ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"], | ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"], | ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"], | ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"], | ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"], | ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"], | ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"], | ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"], | ]; | 1| string fpath_data1x200 = buildPath(testDir, "data1x200.tsv"); 1| string fpath_data1x200_noheader = 
buildPath(testDir, "data1x200_noheader.tsv"); 1| writeUnittestTsvFile(fpath_data1x200, data1x200); 1| writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1 .. $]); | 1| string[][] data1x200ExpectedBernoulliSkipV333P01 = | [["field_a"], | ["077"], | ["119"]]; | 1| string[][] data1x200ExpectedBernoulliSkipV333P02 = | [["field_a"], | ["038"], | ["059"], | ["124"], | ["161"], | ["162"], | ["183"]]; | 1| string[][] data1x200ExpectedBernoulliSkipV333P03 = | [["field_a"], | ["025"], | ["039"], | ["082"], | ["107"], | ["108"], | ["122"], | ["136"], | ["166"], | ["182"]]; | 1| string[][] data1x200ExpectedBernoulliCompatV333P01 = | [["field_a"], | ["072"]]; | 1| string[][] data1x200ExpectedBernoulliCompatV333P02 = | [["field_a"], | ["004"], | ["072"]]; | 1| string[][] data1x200ExpectedBernoulliCompatV333P03 = | [["field_a"], | ["004"], | ["072"], | ["181"]]; | | /* Combo 2, for bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files, | * only expected results. The header is from 3x0, the results are offset 1-position | * from data1x200ExpectedBernoulliSkipV333P03 due to insertion of a single preceding line. | */ 1| string[][] combo2ExpectedBernoulliSkipV333P03 = | [["field_a", "field_b", "field_c"], | ["024"], | ["038"], | ["081"], | ["106"], | ["107"], | ["121"], | ["135"], | ["165"], | ["181"]]; | | | /* 1x10 - Simple 1-column file. */ 1| string[][] data1x10 = | [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]]; 1| string fpath_data1x10 = buildPath(testDir, "data1x10.tsv"); 1| string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv"); 1| writeUnittestTsvFile(fpath_data1x10, data1x10); 1| writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1 .. 
$]); | 1| string[][] data1x10ExpectedPermuteCompat = | [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]]; | 1| string[][] data1x10ExpectedPermuteWt1 = | [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]]; | | /* 2x10a - Uniform distribution [0,1]. */ 1| string[][] data2x10a = | [["line", "weight"], | ["1", "0.26788837"], | ["2", "0.06601298"], | ["3", "0.38627527"], | ["4", "0.47379424"], | ["5", "0.02966641"], | ["6", "0.05636231"], | ["7", "0.70529242"], | ["8", "0.91836862"], | ["9", "0.99103720"], | ["10", "0.31401740"]]; | 1| string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv"); 1| writeUnittestTsvFile(fpath_data2x10a, data2x10a); | 1| string[][] data2x10aExpectedPermuteWt2Probs = | [["random_value", "line", "weight"], | ["0.96833865494543658", "8", "0.91836862"], | ["0.91856842054413923", "4", "0.47379424"], | ["0.25730832087795091", "7", "0.70529242"], | ["0.23725317907018120", "9", "0.99103720"], | ["0.16016096701872204", "3", "0.38627527"], | ["0.090819662667243381", "10", "0.31401740"], | ["0.0071764539244361172", "6", "0.05636231"], | ["0.000000048318642951630057", "1", "0.26788837"], | ["0.00000000037525692966535517", "5", "0.02966641"], | ["8.2123247880095796e-13", "2", "0.06601298"]]; | | /* 2x10b - Uniform distribution [0,1000]. 
*/ 1| string[][] data2x10b = | [["line", "weight"], | ["1", "761"], | ["2", "432"], | ["3", "103"], | ["4", "448"], | ["5", "750"], | ["6", "711"], | ["7", "867"], | ["8", "841"], | ["9", "963"], | ["10", "784"]]; | 1| string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv"); 1| writeUnittestTsvFile(fpath_data2x10b, data2x10b); | 1| string[][] data2x10bExpectedPermuteWt2Probs = | [["random_value", "line", "weight"], | ["0.99996486739067969", "8", "841"], | ["0.99991017467137211", "4", "448"], | ["0.99960871524873662", "6", "711"], | ["0.99914188537143800", "5", "750"], | ["0.99903963250274785", "10", "784"], | ["0.99889631825931946", "7", "867"], | ["0.99852058315191139", "9", "963"], | ["0.99575669679158918", "2", "432"], | ["0.99408758732050595", "1", "761"], | ["0.99315467761212362", "3", "103"]]; | | /* 2x10c - Logarithmic distribution in random order. */ 1| string[][] data2x10c = | [["line", "weight"], | ["1", "31.85"], | ["2", "17403.31"], | ["3", "653.84"], | ["4", "8.23"], | ["5", "2671.04"], | ["6", "26226.08"], | ["7", "1.79"], | ["8", "354.56"], | ["9", "35213.81"], | ["10", "679.29"]]; | 1| string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv"); 1| writeUnittestTsvFile(fpath_data2x10c, data2x10c); | 1| string[][] data2x10cExpectedPermuteWt2Probs = | [["random_value", "line", "weight"], | ["0.99998939008709697", "6", "26226.08"], | ["0.99995951291695517", "9", "35213.81"], | ["0.99991666907613541", "8", "354.56"], | ["0.99989445052186410", "2", "17403.31"], | ["0.99975897602861630", "5", "2671.04"], | ["0.99891852769877643", "3", "653.84"], | ["0.99889167752782515", "10", "679.29"], | ["0.99512207506850148", "4", "8.23"], | ["0.86789371584259023", "1", "31.85"], | ["0.58574438162915610", "7", "1.79"]]; | | /* 2x10d. Logarithmic distribution in ascending order. 
*/ 1| string[][] data2x10d = | [["line", "weight"], | ["1", "1.79"], | ["2", "8.23"], | ["3", "31.85"], | ["4", "354.56"], | ["5", "653.84"], | ["6", "679.29"], | ["7", "2671.04"], | ["8", "17403.31"], | ["9", "26226.08"], | ["10", "35213.81"]]; | 1| string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv"); 1| writeUnittestTsvFile(fpath_data2x10d, data2x10d); | 1| string[][] data2x10dExpectedPermuteWt2Probs = | [["random_value", "line", "weight"], | ["0.99999830221846353", "8", "17403.31"], | ["0.99997860834041397", "10", "35213.81"], | ["0.99994563828986716", "9", "26226.08"], | ["0.99988650363575737", "4", "354.56"], | ["0.99964161939190088", "7", "2671.04"], | ["0.99959045338948649", "6", "679.29"], | ["0.99901574490639788", "5", "653.84"], | ["0.97803163304747431", "3", "31.85"], | ["0.79994791806910948", "2", "8.23"], | ["0.080374261239949119", "1", "1.79"]]; | | /* 2x10e. Logarithmic distribution in descending order. */ 1| string[][] data2x10e = | [["line", "weight"], | ["1", "35213.81"], | ["2", "26226.08"], | ["3", "17403.31"], | ["4", "2671.04"], | ["5", "679.29"], | ["6", "653.84"], | ["7", "354.56"], | ["8", "31.85"], | ["9", "8.23"], | ["10", "1.79"]]; 1| string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv"); 1| writeUnittestTsvFile(fpath_data2x10e, data2x10e); | 1| string[][] data2x10eExpectedPermuteWt2Probs = | [["random_value", "line", "weight"], | ["0.99998493348975237", "4", "2671.04"], | ["0.99995934807202624", "3", "17403.31"], | ["0.99992995739727453", "2", "26226.08"], | ["0.99987185679245649", "1", "35213.81"], | ["0.99957451563173938", "6", "653.84"], | ["0.99907273650209583", "8", "31.85"], | ["0.99905260312968946", "5", "679.29"], | ["0.99730333650516401", "7", "354.56"], | ["0.84093902435227808", "9", "8.23"], | ["0.65650015926290028", "10", "1.79"]]; | | /* Data sets for distinct sampling. 
*/ 1| string[][] data5x25 = | [["ID", "Shape", "Color", "Size", "Weight"], | ["01", "circle", "red", "S", "10"], | ["02", "circle", "black", "L", "20"], | ["03", "square", "black", "L", "20"], | ["04", "circle", "green", "L", "30"], | ["05", "ellipse", "red", "S", "20"], | ["06", "triangle", "red", "S", "10"], | ["07", "triangle", "red", "L", "20"], | ["08", "square", "black", "S", "10"], | ["09", "circle", "black", "S", "20"], | ["10", "square", "green", "L", "20"], | ["11", "triangle", "red", "L", "20"], | ["12", "circle", "green", "L", "30"], | ["13", "ellipse", "red", "S", "20"], | ["14", "circle", "green", "L", "30"], | ["15", "ellipse", "red", "L", "30"], | ["16", "square", "red", "S", "10"], | ["17", "circle", "black", "L", "20"], | ["18", "square", "red", "S", "20"], | ["19", "square", "black", "L", "20"], | ["20", "circle", "red", "S", "10"], | ["21", "ellipse", "black", "L", "30"], | ["22", "triangle", "red", "L", "30"], | ["23", "circle", "green", "S", "20"], | ["24", "square", "green", "L", "20"], | ["25", "circle", "red", "S", "10"], | ]; | 1| string fpath_data5x25 = buildPath(testDir, "data5x25.tsv"); 1| string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv"); 1| writeUnittestTsvFile(fpath_data5x25, data5x25); 1| writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1 .. 
$]); | 1| string[][] data5x25ExpectedDistinctK2P40 = | [["ID", "Shape", "Color", "Size", "Weight"], | ["03", "square", "black", "L", "20"], | ["05", "ellipse", "red", "S", "20"], | ["08", "square", "black", "S", "10"], | ["10", "square", "green", "L", "20"], | ["13", "ellipse", "red", "S", "20"], | ["15", "ellipse", "red", "L", "30"], | ["16", "square", "red", "S", "10"], | ["18", "square", "red", "S", "20"], | ["19", "square", "black", "L", "20"], | ["21", "ellipse", "black", "L", "30"], | ["24", "square", "green", "L", "20"], | ]; | 1| string[][] data5x25ExpectedDistinctK2K4P20 = | [["ID", "Shape", "Color", "Size", "Weight"], | ["03", "square", "black", "L", "20"], | ["07", "triangle", "red", "L", "20"], | ["08", "square", "black", "S", "10"], | ["10", "square", "green", "L", "20"], | ["11", "triangle", "red", "L", "20"], | ["16", "square", "red", "S", "10"], | ["18", "square", "red", "S", "20"], | ["19", "square", "black", "L", "20"], | ["22", "triangle", "red", "L", "30"], | ["24", "square", "green", "L", "20"], | ]; | 1| string[][] data5x25ExpectedDistinctK2K3K4P20 = | [["ID", "Shape", "Color", "Size", "Weight"], | ["04", "circle", "green", "L", "30"], | ["07", "triangle", "red", "L", "20"], | ["09", "circle", "black", "S", "20"], | ["11", "triangle", "red", "L", "20"], | ["12", "circle", "green", "L", "30"], | ["14", "circle", "green", "L", "30"], | ["16", "square", "red", "S", "10"], | ["18", "square", "red", "S", "20"], | ["22", "triangle", "red", "L", "30"], | ]; | | /* Fields 2 and 4 from data5x25. Distinct rows should be the same for equiv keys. 
*/ 1| string[][] data2x25 = | [["Shape", "Size"], | ["circle", "S"], | ["circle", "L"], | ["square", "L"], | ["circle", "L"], | ["ellipse", "S"], | ["triangle", "S"], | ["triangle", "L"], | ["square", "S"], | ["circle", "S"], | ["square", "L"], | ["triangle", "L"], | ["circle", "L"], | ["ellipse", "S"], | ["circle", "L"], | ["ellipse", "L"], | ["square", "S"], | ["circle", "L"], | ["square", "S"], | ["square", "L"], | ["circle", "S"], | ["ellipse", "L"], | ["triangle", "L"], | ["circle", "S"], | ["square", "L"], | ["circle", "S"], | ]; | 1| string fpath_data2x25 = buildPath(testDir, "data2x25.tsv"); 1| string fpath_data2x25_noheader = buildPath(testDir, "data2x25_noheader.tsv"); 1| writeUnittestTsvFile(fpath_data2x25, data2x25); 1| writeUnittestTsvFile(fpath_data2x25_noheader, data2x25[1 .. $]); | 1| string[][] data2x25ExpectedDistinctK1K2P20 = | [["Shape", "Size"], | ["square", "L"], | ["triangle", "L"], | ["square", "S"], | ["square", "L"], | ["triangle", "L"], | ["square", "S"], | ["square", "S"], | ["square", "L"], | ["triangle", "L"], | ["square", "L"], | ]; | 1| string[][] data1x25 = | [["Shape-Size"], | ["circle-S"], | ["circle-L"], | ["square-L"], | ["circle-L"], | ["ellipse-S"], | ["triangle-S"], | ["triangle-L"], | ["square-S"], | ["circle-S"], | ["square-L"], | ["triangle-L"], | ["circle-L"], | ["ellipse-S"], | ["circle-L"], | ["ellipse-L"], | ["square-S"], | ["circle-L"], | ["square-S"], | ["square-L"], | ["circle-S"], | ["ellipse-L"], | ["triangle-L"], | ["circle-S"], | ["square-L"], | ["circle-S"], | ]; | 1| string fpath_data1x25 = buildPath(testDir, "data1x25.tsv"); 1| string fpath_data1x25_noheader = buildPath(testDir, "data1x25_noheader.tsv"); 1| writeUnittestTsvFile(fpath_data1x25, data1x25); 1| writeUnittestTsvFile(fpath_data1x25_noheader, data1x25[1 .. 
$]); | 1| string[][] data1x25ExpectedDistinctK1P20 = | [["Shape-Size"], | ["triangle-L"], | ["square-S"], | ["triangle-L"], | ["ellipse-L"], | ["square-S"], | ["square-S"], | ["ellipse-L"], | ["triangle-L"], | ]; | 1| string[][] data1x25ExpectedDistinctK1P20Probs = | [["random_value", "Shape-Size"], | ["0", "triangle-L"], | ["0", "square-S"], | ["0", "triangle-L"], | ["0", "ellipse-L"], | ["0", "square-S"], | ["0", "square-S"], | ["0", "ellipse-L"], | ["0", "triangle-L"], | ]; | 1| string[][] data1x25ExpectedDistinctK1P20ProbsInorder = | [["random_value", "Shape-Size"], | ["1", "circle-S"], | ["4", "circle-L"], | ["2", "square-L"], | ["4", "circle-L"], | ["2", "ellipse-S"], | ["1", "triangle-S"], | ["0", "triangle-L"], | ["0", "square-S"], | ["1", "circle-S"], | ["2", "square-L"], | ["0", "triangle-L"], | ["4", "circle-L"], | ["2", "ellipse-S"], | ["4", "circle-L"], | ["0", "ellipse-L"], | ["0", "square-S"], | ["4", "circle-L"], | ["0", "square-S"], | ["2", "square-L"], | ["1", "circle-S"], | ["0", "ellipse-L"], | ["0", "triangle-L"], | ["1", "circle-S"], | ["2", "square-L"], | ["1", "circle-S"], | ]; | | /* | * Enough setup! Actually run some tests! | */ | | /* Shuffling tests. Headers, static seed, compatibility mode. With weights and without. 
*/ 1| testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0); 1| testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1); 1| testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat); 1| testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat); 1| testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat); 1| testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 1| testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 1| testTsvSample(["test-a8b", "-H", "-s", "--weight-field", "field_c", fpath_data3x6], data3x6ExpectedPermuteWt3); 1| testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 1| testTsvSample(["test-a9b", "-H", "-s", "--print-random", "-w", "field_c", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 1| testTsvSample(["test-a9c", "-H", "-s", "--print-random", "-w", "f*c", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 1| testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 1| testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 1| testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 1| testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs); 1| testTsvSample(["test-a13b", "-H", "-v", "41", "-w", "field_c", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs); | | /* 
Shuffling, without compatibility mode, or with both compatibility and printing. */ 1| testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0); 1| testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1); 1| testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle); 1| testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap); 1| testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap); 1| testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 1| testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 1| testTsvSample(["test-aa8b", "-H", "-s", "--print-random", "-w", "field_c", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 1| testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); | | /* Reservoir sampling using Algorithm R. | * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.) 
| */ 1| testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0); 1| testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0); 1| testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1); 1| testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1); 1| testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6); 1| testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6); 1| testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5); 1| testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4); 1| testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3); 1| testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2); 1| testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1); | | /* Inorder versions of Algorithm R tests. 
*/ 1| testTsvSample(["test-ai10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-ai11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-ai12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0); 1| testTsvSample(["test-ai13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0); 1| testTsvSample(["test-ai14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1); 1| testTsvSample(["test-ai15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1); 1| testTsvSample(["test-ai16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder); 1| testTsvSample(["test-ai17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder); 1| testTsvSample(["test-ai18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5Inorder); 1| testTsvSample(["test-ai19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4Inorder); 1| testTsvSample(["test-ai20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3Inorder); 1| testTsvSample(["test-ai21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2Inorder); 1| testTsvSample(["test-ai22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1Inorder); | | /* Bernoulli sampling cases. 
*/ 1| testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0); 1| testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1); 1| testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6); 1| testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6); 1| testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 1| testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60); 1| testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60); 1| testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs); | | /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. 
*/ 1| testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01); 1| testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02); 1| testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03); 1| testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01); 1| testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02); 1| testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03); 1| testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40); | | /* Distinct sampling cases. 
*/ 1| testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0); 1| testTsvSample(["test-a24b", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "field_a", fpath_data3x0], data3x0); 1| testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1); 1| testTsvSample(["test-a25b", "-H", "-s", "-p", "1.0", "-k", "field_b", fpath_data3x1], data3x1); 1| testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6); 1| testTsvSample(["test-a26b", "-H", "-s", "-p", "1.0", "-k", "field_b", fpath_data3x6], data3x6); 1| testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60); 1| testTsvSample(["test-a27b", "-H", "-s", "-p", "0.6", "-k", "field_a,field_c", fpath_data3x6], data3x6ExpectedDistinctK1K3P60); | | /* Generating random weights. Use Bernoulli sampling test set at prob 100% for uniform sampling. | * For weighted sampling, use the weighted cases, but with expected using the original ordering. 
| */ 1| testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 1| testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 1| testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], | data3x6ExpectedWt3ProbsInorder); 1| testTsvSample(["test-a30b", "-H", "-s", "--gen-random-inorder", "--weight-field", "field_c", fpath_data3x6], | data3x6ExpectedWt3ProbsInorder); 1| testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], | data3x6ExpectedWt3V41ProbsInorder); 1| testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6], | data3x6ExpectedDistinctK1K3P60Probs); 1| testTsvSample(["test-a32b", "-H", "-s", "-p", "0.6", "-k", "field_a,field_c", "--print-random", fpath_data3x6], | data3x6ExpectedDistinctK1K3P60Probs); 1| testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header", | "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom); 1| testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6], | data3x6ExpectedDistinctK2P2ProbsInorder); | | /* Simple random sampling with replacement. 
*/ 1| testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0); 1| testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0); 1| testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3); 1| testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10); 1| testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77); | | /* Shuffling, compatibility mode, without headers. */ 1| testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1 .. $]); 1| testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1 .. $]); 1| testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1 .. $]); 1| testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1 .. $]); 1| testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1 .. $]); 1| testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]); 1| testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]); 1| testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]); 1| testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1 .. 
$]); | | /* Shuffling, no headers, without compatibility mode, or with printing and compatibility mode. */ 1| testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1 .. $]); 1| testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1 .. $]); 1| testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1 .. $]); 1| testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1 .. $]); 1| testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]); 1| testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]); 1| testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]); | | /* Reservoir sampling using Algorithm R, no headers. */ 1| testTsvSample(["test-ac10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-ac11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-ac14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1 .. $]); 1| testTsvSample(["test-ac15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1 .. $]); 1| testTsvSample(["test-ac16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]); 1| testTsvSample(["test-ac17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]); 1| testTsvSample(["test-ac18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5[1 .. $]); 1| testTsvSample(["test-ac19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4[1 .. 
$]); 1| testTsvSample(["test-ac20", "--prefer-algorithm-r", "-s", "--num", "3", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3[1 .. $]); 1| testTsvSample(["test-ac21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2[1 .. $]); 1| testTsvSample(["test-ac22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1[1 .. $]); | | /* Reservoir sampling using Algorithm R, no headers, inorder output. */ 1| testTsvSample(["test-aj10", "--prefer-algorithm-r", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-aj11", "--prefer-algorithm-r", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-aj14", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 1| testTsvSample(["test-aj15", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 1| testTsvSample(["test-aj16", "--prefer-algorithm-r", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]); 1| testTsvSample(["test-aj17", "--prefer-algorithm-r", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]); 1| testTsvSample(["test-aj18", "--prefer-algorithm-r", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5Inorder[1 .. $]); 1| testTsvSample(["test-aj19", "--prefer-algorithm-r", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4Inorder[1 .. $]); 1| testTsvSample(["test-aj20", "--prefer-algorithm-r", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3Inorder[1 .. $]); 1| testTsvSample(["test-aj21", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2Inorder[1 .. 
$]); 1| testTsvSample(["test-aj22", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1Inorder[1 .. $]); | | /* Bernoulli sampling cases. */ 1| testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1 .. $]); 1| testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]); 1| testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]); 1| testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]); 1| testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1 .. $]); 1| testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1 .. $]); | | /* Bernoulli sampling with probabilities in skip sampling range. */ 1| testTsvSample(["test-bb1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1 .. $]); 1| testTsvSample(["test-bb2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1 .. $]); 1| testTsvSample(["test-bb3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1 .. $]); 1| testTsvSample(["test-bb4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1 .. $]); 1| testTsvSample(["test-bb5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1 .. $]); 1| testTsvSample(["test-bb6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1 .. $]); 1| testTsvSample(["test-bb7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1 .. 
$]); | | /* Distinct sampling cases. */ 1| testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1 .. $]); 1| testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]); 1| testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]); 1| testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]); | | /* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. */ 1| testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]); 1| testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1 .. $]); 1| testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader], | data3x6ExpectedDistinctK1K3P60Probs[1 .. $]); 1| testTsvSample(["test-b24", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader], | data3x6ExpectedDistinctK2P2ProbsInorder[1 .. $]); | | /* Simple random sampling with replacement. */ 1| testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1 .. $]); 1| testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1 .. $]); 1| testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1 .. $]); | | /* Multi-file tests. 
*/ 1| testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode", | fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], | combo1ExpectedPermuteCompat); 1| testTsvSample(["test-c2", "--header", "--static-seed", "--print-random", | fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], | combo1ExpectedPermuteCompatProbs); 1| testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3", | fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], | combo1ExpectedPermuteWt3Probs); 1| testTsvSample(["test-c3b", "--header", "--static-seed", "--print-random", "--weight-field", "field_c", | fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], | combo1ExpectedPermuteWt3Probs); 1| testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode", | fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], | combo1ExpectedPermuteWt3); 1| testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", | fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], | combo1ExpectedSampleAlgoRNum4); 1| testTsvSample(["test-c5b", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder", | fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], | combo1ExpectedSampleAlgoRNum4Inorder); | | /* Multi-file, no headers. */ 1| testTsvSample(["test-c6", "--static-seed", "--compatibility-mode", | fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, | fpath_data3x6_noheader, fpath_data3x2_noheader], | combo1ExpectedPermuteCompat[1 .. 
$]); 1| testTsvSample(["test-c7", "--static-seed", "--print-random", | fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, | fpath_data3x6_noheader, fpath_data3x2_noheader], | combo1ExpectedPermuteCompatProbs[1 .. $]); 1| testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3", | fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, | fpath_data3x6_noheader, fpath_data3x2_noheader], | combo1ExpectedPermuteWt3Probs[1 .. $]); 1| testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode", | fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, | fpath_data3x6_noheader, fpath_data3x2_noheader], | combo1ExpectedPermuteWt3[1 .. $]); 1| testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4", | fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, | fpath_data3x6_noheader, fpath_data3x2_noheader], | combo1ExpectedSampleAlgoRNum4[1 .. $]); 1| testTsvSample(["test-c10b", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder", | fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, | fpath_data3x6_noheader, fpath_data3x2_noheader], | combo1ExpectedSampleAlgoRNum4Inorder[1 .. $]); | | /* Bernoulli sampling cases. */ 1| testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5", | fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], | combo1ExpectedBernoulliCompatP50Probs); 1| testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4", | fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], | combo1ExpectedBernoulliCompatP40); 1| testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5", | fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, | fpath_data3x6_noheader, fpath_data3x2_noheader], | combo1ExpectedBernoulliCompatP50Probs[1 .. 
$]); 1| testTsvSample(["test-c14", "--static-seed", "--prob", ".4", | fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, | fpath_data3x6_noheader, fpath_data3x2_noheader], | combo1ExpectedBernoulliCompatP40[1 .. $]); | | /* Bernoulli sampling with probabilities in skip sampling range. */ 1| testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03", | fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10], | combo2ExpectedBernoulliSkipV333P03); 1| testTsvSample(["test-cc1", "-v", "333", "-p", "0.03", | fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader], | combo2ExpectedBernoulliSkipV333P03[1 .. $]); | | /* Distinct sampling cases. */ 1| testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4", | fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], | combo1ExpectedDistinctK1P40); 1| testTsvSample(["test-c13b", "--header", "--static-seed", "--key-fields", "field_a", "--prob", ".4", | fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], | combo1ExpectedDistinctK1P40); 1| testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--prob", ".4", | fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, | fpath_data3x6_noheader, fpath_data3x2_noheader], | combo1ExpectedDistinctK1P40[1 .. $]); | | /* Generating random weights. */ 1| testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder", | fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], | combo1ExpectedProbsInorder); 1| testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder", | fpath_data3x3_noheader, fpath_data3x1_noheader, | fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader], | combo1ExpectedProbsInorder[1 .. $]); | | /* Simple random sampling with replacement. 
*/ 1| testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10", | fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], | combo1ExpectedReplaceNum10); | 1| testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10", | fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, | fpath_data3x6_noheader, fpath_data3x2_noheader], | combo1ExpectedReplaceNum10[1 .. $]); | | /* Single column file. */ 1| testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat); 1| testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat); | | /* Distributions. */ 1| testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs); 1| testTsvSample(["test-e1b", "-H", "-s", "-w", "weight", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs); 1| testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs); 1| testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs); 1| testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs); 1| testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs); | | /* Tests of subset sample (--n|num) field. Random sampling, Bernoulli sampling, distinct sampling. | * | * Note: The way these tests are done ensures that subset length does not affect | * output order. | */ | import std.algorithm : min; 20| for (size_t n = data3x6.length + 2; n >= 1; n--) | { | /* reservoirSamplingViaHeap. 
| */ 9| size_t expectedLength = min(data3x6.length, n + 1); 9| testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string, | "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]); | 9| testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string, | "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]); | 9| testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string, | "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]); | 9| testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string, | "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]); | 9| testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string, | "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]); | 9| testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string, | fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]); | 9| testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string, | "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]); | 9| testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string, | "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]); | 9| testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string, | "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]); | | /* Bernoulli sampling. 
| */ | import std.algorithm : min; 9| size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length); | 9| testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string, | "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]); | 9| testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string, | "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]); | 9| testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string, | "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]); | 9| testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string, | fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]); | | /* Distinct Sampling. | */ 9| size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length); | 9| testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string, | "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]); | 9| testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string, | fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]); | 9| testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string, | "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]); | 9| testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string, | fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]); | } | | /* Similar tests with the 1x10 data set. 
*/ 28| for (size_t n = data1x10.length + 2; n >= 1; n--) | { 13| size_t expectedLength = min(data1x10.length, n + 1); 13| testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string, | "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]); | 13| testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string, | "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]); | 13| testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string, | fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]); | 13| testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string, | "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]); | } | | /* Simple random sampling with replacement: ensure sample size doesn't change order. */ 22| for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--) | { 10| testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6], | data3x6ExpectedReplaceNum10[0 .. n + 1]); | 10| testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader], | data3x6ExpectedReplaceNum10[1 .. n + 1]); | } | | /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */ 26| for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--) | { 12| size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1); | 12| testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string, | "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]); | 12| testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string, | fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]); | } | | /* Inorder sampling tests using reservoir sampling via heap (compatibility mode). 
*/ 1| testTsvSample(["test-ar10", "--compatibility-mode", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-ar11", "--compatibility-mode", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-ar12", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0); 1| testTsvSample(["test-ar13", "--compatibility-mode", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0); 1| testTsvSample(["test-ar14", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1); 1| testTsvSample(["test-ar15", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1); 1| testTsvSample(["test-ar16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder); 1| testTsvSample(["test-ar17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder); 1| testTsvSample(["test-ar18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum5Inorder); 1| testTsvSample(["test-ar19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum4Inorder); 1| testTsvSample(["test-ar20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum3Inorder); 1| testTsvSample(["test-ar21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum2Inorder); 1| testTsvSample(["test-ar22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum1Inorder); | 1| testTsvSample(["test-as10", "--compatibility-mode", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty); 1| testTsvSample(["test-as11", "--compatibility-mode", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], 
dataEmpty); 1| testTsvSample(["test-as14", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 1| testTsvSample(["test-as15", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 1| testTsvSample(["test-as16", "--compatibility-mode", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]); 1| testTsvSample(["test-as17", "--compatibility-mode", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]); 1| testTsvSample(["test-as18", "--compatibility-mode", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5Inorder[1 .. $]); 1| testTsvSample(["test-as19", "--compatibility-mode", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4Inorder[1 .. $]); 1| testTsvSample(["test-as20", "--compatibility-mode", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3Inorder[1 .. $]); 1| testTsvSample(["test-as21", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2Inorder[1 .. $]); 1| testTsvSample(["test-as22", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1Inorder[1 .. $]); | | /* Inorder sampling tests with random number printing. --compatibility-mode not needed. 
*/ 1| testTsvSample(["test-at16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder); 1| testTsvSample(["test-at17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder); 1| testTsvSample(["test-at18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum5ProbsInorder); 1| testTsvSample(["test-at19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder); 1| testTsvSample(["test-at19", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder); 1| testTsvSample(["test-at20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder); 1| testTsvSample(["test-at20", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder); 1| testTsvSample(["test-at21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum2ProbsInorder); 1| testTsvSample(["test-at22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum1ProbsInorder); | 1| testTsvSample(["test-au16", "--compatibility-mode", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]); 1| testTsvSample(["test-au17", "--compatibility-mode", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]); 1| testTsvSample(["test-au18", "--compatibility-mode", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5ProbsInorder[1 .. 
$]); 1| testTsvSample(["test-au19", "--compatibility-mode", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]); 1| testTsvSample(["test-au19", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]); 1| testTsvSample(["test-au20", "--compatibility-mode", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3ProbsInorder[1 .. $]); 1| testTsvSample(["test-au21", "--compatibility-mode", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2ProbsInorder[1 .. $]); 1| testTsvSample(["test-au22", "--compatibility-mode", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1ProbsInorder[1 .. $]); | | /* Inorder weighted sampling tests. */ 1| testTsvSample(["test-ax16", "-H", "-s", "-n", "7", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder); 1| testTsvSample(["test-ax17", "-H", "-s", "-n", "6", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder); 1| testTsvSample(["test-ax18", "-H", "-s", "-n", "5", "-i", fpath_data3x6], data3x6ExpectedWt3Num5Inorder); 1| testTsvSample(["test-ax19", "-H", "-s", "-n", "4", "-i", fpath_data3x6], data3x6ExpectedWt3Num4Inorder); 1| testTsvSample(["test-ax20", "-H", "-s", "-n", "3", "-i", fpath_data3x6], data3x6ExpectedWt3Num3Inorder); 1| testTsvSample(["test-ax21", "-H", "-s", "-n", "2", "-i", fpath_data3x6], data3x6ExpectedWt3Num2Inorder); 1| testTsvSample(["test-ax22", "-H", "-s", "-n", "1", "-i", fpath_data3x6], data3x6ExpectedWt3Num1Inorder); | 1| testTsvSample(["test-ay16", "-s", "-n", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]); 1| testTsvSample(["test-ay17", "-s", "-n", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. 
$]); 1| testTsvSample(["test-ay18", "-s", "-n", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num5Inorder[1 .. $]); 1| testTsvSample(["test-ay19", "-s", "-n", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num4Inorder[1 .. $]); 1| testTsvSample(["test-ay20", "-s", "-n", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num3Inorder[1 .. $]); 1| testTsvSample(["test-ay21", "-s", "-n", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num2Inorder[1 .. $]); 1| testTsvSample(["test-ay22", "-s", "-n", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num1Inorder[1 .. $]); | | /* | * Distinct sampling tests. | */ 1| testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25], | data5x25ExpectedDistinctK2P40); | 1| testTsvSample(["test-j1b", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "Shape", fpath_data5x25], | data5x25ExpectedDistinctK2P40); | 1| testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25], | data5x25ExpectedDistinctK2K4P20); | 1| testTsvSample(["test-j2b", "-H", "-s", "-p", "0.20", "-k", "Shape,Size", fpath_data5x25], | data5x25ExpectedDistinctK2K4P20); | 1| testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25], | data5x25ExpectedDistinctK2K3K4P20); | 1| testTsvSample(["test-j3b", "-H", "-s", "-p", "0.20", "-k", "Shape-Size", fpath_data5x25], | data5x25ExpectedDistinctK2K3K4P20); | 1| testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader], | data5x25ExpectedDistinctK2P40[1 .. $]); | 1| testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader], | data5x25ExpectedDistinctK2K4P20[1 .. $]); | 1| testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader], | data5x25ExpectedDistinctK2K3K4P20[1 .. 
$]); | | | /* These distinct tests check that the whole line as '-k 0' and specifying all fields | * in order have the same result. Also that field numbers don't matter, as '-k 1,2' | * in data2x25 are the same keys as '-k 2,4' in data5x25. | */ 1| testTsvSample(["test-j7", "-H", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25], | data2x25ExpectedDistinctK1K2P20); | 1| testTsvSample(["test-j8", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data2x25], | data2x25ExpectedDistinctK1K2P20); | 1| testTsvSample(["test-j8b", "-H", "-s", "-p", "0.20", "-k", "*", fpath_data2x25], | data2x25ExpectedDistinctK1K2P20); | 1| testTsvSample(["test-j9", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25_noheader], | data2x25ExpectedDistinctK1K2P20[1 .. $]); | 1| testTsvSample(["test-j10", "-s", "-p", "0.20", "-k", "0", fpath_data2x25_noheader], | data2x25ExpectedDistinctK1K2P20[1 .. $]); | | /* Similar to the last set, but for a 1-column file. Also with random value printing. */ 1| testTsvSample(["test-j11", "-H", "-s", "-p", "0.20", "-k", "1", fpath_data1x25], | data1x25ExpectedDistinctK1P20); | 1| testTsvSample(["test-j12", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data1x25], | data1x25ExpectedDistinctK1P20); | 1| testTsvSample(["test-j12b", "-H", "-s", "-p", "0.20", "-k", "*", fpath_data1x25], | data1x25ExpectedDistinctK1P20); | 1| testTsvSample(["test-j13", "-s", "-p", "0.20", "-k", "1", fpath_data1x25_noheader], | data1x25ExpectedDistinctK1P20[1 .. $]); | 1| testTsvSample(["test-j14", "-s", "-p", "0.20", "-k", "0", fpath_data1x25_noheader], | data1x25ExpectedDistinctK1P20[1 .. 
$]); | 1| testTsvSample(["test-j15", "-H", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25], | data1x25ExpectedDistinctK1P20Probs); | 1| testTsvSample(["test-j15b", "-H", "-s", "-p", "0.20", "-k", `Shape\-Size`, "--print-random", fpath_data1x25], | data1x25ExpectedDistinctK1P20Probs); | 1| testTsvSample(["test-j16", "-H", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25], | data1x25ExpectedDistinctK1P20Probs); | 1| testTsvSample(["test-j16b", "-H", "-s", "-p", "0.20", "-k", "*", "--print-random", fpath_data1x25], | data1x25ExpectedDistinctK1P20Probs); | 1| testTsvSample(["test-j17", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25_noheader], | data1x25ExpectedDistinctK1P20Probs[1 .. $]); | 1| testTsvSample(["test-j18", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25_noheader], | data1x25ExpectedDistinctK1P20Probs[1 .. $]); | 1| testTsvSample(["test-j19", "-H", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25], | data1x25ExpectedDistinctK1P20ProbsInorder); | 1| testTsvSample(["test-j19b", "-H", "-s", "-p", "0.20", "-k", `Shape\-Size`, "--gen-random-inorder", fpath_data1x25], | data1x25ExpectedDistinctK1P20ProbsInorder); | 1| testTsvSample(["test-j20", "-H", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25], | data1x25ExpectedDistinctK1P20ProbsInorder); | 1| testTsvSample(["test-j20b", "-H", "-s", "-p", "0.20", "-k", "*", "--gen-random-inorder", fpath_data1x25], | data1x25ExpectedDistinctK1P20ProbsInorder); | 1| testTsvSample(["test-j21", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25_noheader], | data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]); | 1| testTsvSample(["test-j22", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25_noheader], | data1x25ExpectedDistinctK1P20ProbsInorder[1 .. 
$]); | |} tsv-sample/src/tsv_utils/tsv-sample.d is 99% covered <<<<<< EOF # path=./tsv-summarize-src-tsv_utils-tsv-summarize.lst |/** |Command line tool that reads TSV files and summarizes field values associated with |equivalent keys. | |Copyright (c) 2016-2020, eBay Inc. |Initially written by Jon Degenhardt | |License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) |*/ |module tsv_utils.tsv_summarize; | |import std.algorithm : all, any, canFind, each, find, findSplit, map, joiner, splitter; |import std.array : join; |import std.conv : to; |import std.exception : enforce; |import std.format : format; |import std.range; |import std.stdio; |import std.typecons : tuple; |import std.container : DList; | |static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; | |version(unittest) |{ | // When running unit tests, use main from -main compiler switch. |} |else |{ | int main(string[] cmdArgs) | { | /* When running in DMD code coverage mode, turn on report merging. */ | version(D_Coverage) version(DigitalMars) | { | import core.runtime : dmd_coverSetMerge; 99| dmd_coverSetMerge(true); | } | 99| TsvSummarizeOptions cmdopt; 99| auto r = cmdopt.processArgs(cmdArgs); 148| if (!r[0]) return r[1]; | version(LDC_Profile) | { | import ldc.profile : resetAll; | resetAll(); | } 50| try tsvSummarize(cmdopt); | catch (Exception exc) | { 4| stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 4| return 1; | } 46| return 0; | } |} | |auto helpTextVerbose = q"EOS |Synopsis: tsv-summarize [options] file [file...] | |tsv-summarize reads tabular data files (tab-separated by default), tracks |field values for each unique key, and runs summarization algorithms. 
Consider |the file data.tsv: | | Make Color Time | ford blue 131 | chevy green 124 | ford red 128 | bmw black 118 | bmw black 126 | ford blue 122 | |The min and average times for each make is generated by the command: | | $ tsv-summarize --header --group-by Make --min Time --mean Time data.tsv | |This produces: | | Make Time_min Time_mean | ford 122 127 | chevy 124 124 | bmw 118 122 | |Using '--group-by Make,Color' will group by both 'Make' and 'Color'. |Omitting the '--group-by' entirely summarizes fields for the full file. | |The previous example uses field names to identify fields. Field numbers |can be used as well. The next two commands are equivalent: | | $ tsv-summarize -H --group-by Make,Color --min Time --mean Time data.tsv | $ tsv-summarize -H --group-by 1,2 --min 3 --mean 3 data.tsv | |The program tries to generate useful headers, but custom headers can be |specified. Example (using -g and -H shortcuts for --header and --group-by): | | $ tsv-summarize -H -g 1 --min 3:Fastest --mean 3:Average data.tsv | |Most operators take custom headers in a similarly way, generally following: | | -- FIELD[:header] | |Operators can be specified multiple times. They can also take multiple |fields (though not when a custom header is specified). Examples: | | --median 2,3,4 | --median 2-5,7-11 | --median elapsed_time,system_time,user_time | --median '*_time' # Wildcard. All fields ending in '_time'. | |The quantile operator requires one or more probabilities after the fields: | | --quantile run_time:0.25 # Quantile 1 of the 'run_time' field | --quantile 2:0.25 # Quantile 1 of field 2 | --quantile 2-4:0.25,0.5,0.75 # Q1, Median, Q3 of fields 2, 3, 4 | |Summarization operators available are: | count range mad values | retain sum var unique-values | first mean stddev unique-count | last median mode missing-count | min quantile mode-count not-missing-count | max | |Calculated numeric values are printed to 12 significant digits by default. 
|This can be changed using the '--p|float-precision' option. If six or less |it sets the number of significant digits after the decimal point. If |greater than six it sets the total number of significant digits. | |Calculations hold onto the minimum data needed while reading data. A few |operations like median keep all data values in memory. These operations will |start to encounter performance issues as available memory becomes scarce. The |size that can be handled effectively is machine dependent, but often quite |large files can be handled. | |Operations requiring numeric entries will signal an error and terminate |processing if a non-numeric entry is found. | |Missing values are not treated specially by default, this can be changed |using the '--x|exclude-missing' or '--r|replace-missing' option. The former |turns off processing for missing values, the latter uses a replacement value. | |Options: |EOS"; | |auto helpText = q"EOS |Synopsis: tsv-summarize [options] file [file...] | |tsv-summarize runs aggregation operations on fields in tab-separated value |files. Operations can be run against the full input data or grouped by key |fields. Fields can be specified either by field number or field name. Use |'--help-verbose' for more detailed help. | |Options: |EOS"; | |/** Command line options - Container and processing. The processArgs method is used to | * process the command line. | */ |struct TsvSummarizeOptions { | import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange; | | string programName; /// Program name | ByLineSourceRange!() inputSources; /// Input Files | size_t[] keyFields; /// -g, --group-by | bool hasHeader = false; /// --header | bool writeHeader = false; /// -w, --write-header | char inputFieldDelimiter = '\t'; /// --d|delimiter | char valuesDelimiter = '|'; /// --v|values-delimiter | size_t floatPrecision = 12; /// --p|float-precision | DList!Operator operators; /// Operators, in the order specified. 
| size_t endFieldIndex = 0; /// Derived value. Max field index used plus one. | MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy; /// Derived value. | | /* tsv-summarize operators require access to the header line when the operator is | * created. This is because named fields may be used to describe fields names. To | * enable this, a CmdOptionHandler delegate is added to the cmdLinOperatorOptions | * array during during initial processing by std.getopt. The group-by operation is | * similar, but is added to the cmdLineOtherFieldOptions instead. At least one | * cmdLineOperatorOptions entry is required. | * | * The different handlers are defined after processArgs. | */ | | /* CmdOptionHandler delegate signature - This is the call made to process the command | * line option arguments after the header line has been read. | */ | alias CmdOptionHandler = void delegate(bool hasHeader, string[] headerFields); | | private CmdOptionHandler[] cmdLineOperatorOptions; | private CmdOptionHandler[] cmdLineOtherFieldOptions; | | /* Returns a tuple. First value is true if command line arguments were successfully | * processed and execution should continue, or false if an error occurred or the user | * asked for help. If false, the second value is the appropriate exit code (0 or 1). | * | * Returning true (execution continues) means args have been validated and derived | * values calculated. In addition, field indices have been converted to zero-based. 
| */ | auto processArgs (ref string[] cmdArgs) { | import std.algorithm : any, each; | import std.getopt; | import std.path : baseName, stripExtension; | import std.typecons : Yes, No; | import tsv_utils.common.fieldlist : fieldListHelpText; | import tsv_utils.common.getopt_inorder; | import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix; | 191| bool helpVerbose = false; // --help-verbose 191| bool helpFields = false; // --help-fields 191| bool versionWanted = false; // --V|version 191| bool excludeMissing = false; // --x|exclude-missing 191| string missingValueReplacement; // --r|replace-missing | | 382| programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; | | try | { 191| arraySep = ","; // Use comma to separate values in command line options 191| auto r = getoptInorder( | cmdArgs, | "help-verbose", " Print full help.", &helpVerbose, | "help-fields", " Print help on specifying fields.", &helpFields, | | std.getopt.config.caseSensitive, | "V|version", " Print version information and exit.", &versionWanted, | std.getopt.config.caseInsensitive, | | "g|group-by", " Fields to use as key.", &addGroupByOptionHandler, | | std.getopt.config.caseSensitive, | "H|header", " Treat the first line of each file as a header.", &hasHeader, | std.getopt.config.caseInsensitive, | | "w|write-header", " Write an output header even if there is no input header.", &writeHeader, | "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &inputFieldDelimiter, | "v|values-delimiter", "CHR Values delimiter. Default: vertical bar (|). (Single byte UTF-8 characters only.)", &valuesDelimiter, | "p|float-precision", "NUM 'Precision' to use printing floating point numbers. Affects the number of digits printed and exponent use. 
Default: 12", &floatPrecision, | "x|exclude-missing", " Exclude missing (empty) fields from calculations.", &excludeMissing, | "r|replace-missing", "STR Replace missing (empty) fields with STR in calculations.", &missingValueReplacement, | "count", " Count occurrences of each unique key ('--g|group-by'), or the total number of records if no key field is specified.", &addCountOptionHandler, | "count-header", "STR Count occurrences of each unique key, like '--count', but use STR as the header.", &addCountHeaderOptionHandler, | "retain", " Retain one copy of the field.", &addOperatorOptionHandler!RetainOperator, | "first", "[:STR] First value seen.", &addOperatorOptionHandler!FirstOperator, | "last", "[:STR] Last value seen.", &addOperatorOptionHandler!LastOperator, | "min", "[:STR] Min value. (Fields with numeric values only.)", &addOperatorOptionHandler!MinOperator, | "max", "[:STR] Max value. (Fields with numeric values only.)", &addOperatorOptionHandler!MaxOperator, | "range", "[:STR] Difference between min and max values. (Fields with numeric values only.)", &addOperatorOptionHandler!RangeOperator, | "sum", "[:STR] Sum of the values. (Fields with numeric values only.)", &addOperatorOptionHandler!SumOperator, | "mean", "[:STR] Mean (average). (Fields with numeric values only.)", &addOperatorOptionHandler!MeanOperator, | "median", "[:STR] Median value. (Fields with numeric values only. Reads all values into memory.)", &addOperatorOptionHandler!MedianOperator, | "quantile", ":p[,p...][:STR] Quantiles. One or more fields, then one or more 0.0-1.0 probabilities. (Fields with numeric values only. Reads all values into memory.)", &addQuantileOperatorOptionHandler, | "mad", "[:STR] Median absolute deviation from the median. Raw value, not scaled. (Fields with numeric values only. Reads all values into memory.)", &addOperatorOptionHandler!MadOperator, | "var", "[:STR] Variance. 
(Sample variance, numeric fields only).", &addOperatorOptionHandler!VarianceOperator, | "stdev", "[:STR] Standard deviation. (Sample st.dev, numeric fields only).", &addOperatorOptionHandler!StDevOperator, | "mode", "[:STR] Mode. The most frequent value. (Reads all unique values into memory.)", &addOperatorOptionHandler!ModeOperator, | "mode-count", "[:STR] Count of the most frequent value. (Reads all unique values into memory.)", &addOperatorOptionHandler!ModeCountOperator, | "unique-count", "[:STR] Number of unique values. (Reads all unique values into memory.)", &addOperatorOptionHandler!UniqueCountOperator, | "missing-count", "[:STR] Number of missing (empty) fields. Not affected by '--x|exclude-missing' or '--r|replace-missing'.", &addOperatorOptionHandler!MissingCountOperator, | "not-missing-count", "[:STR] Number of filled (non-empty) fields. Not affected by '--r|replace-missing'.", &addOperatorOptionHandler!NotMissingCountOperator, | "values", "[:STR] All the values, separated by --v|values-delimiter. (Reads all values into memory.)", &addOperatorOptionHandler!ValuesOperator, | "unique-values", "[:STR] All the unique values, separated by --v|values-delimiter. (Reads all unique values into memory.)", &addOperatorOptionHandler!UniqueValuesOperator, | ); | 186| if (r.helpWanted) | { 1| defaultGetoptPrinter(helpText, r.options); 1| return tuple(false, 0); | } 185| else if (helpVerbose) | { 1| defaultGetoptPrinter(helpTextVerbose, r.options); 1| return tuple(false, 0); | } 184| else if (helpFields) | { 1| writeln(fieldListHelpText); 1| return tuple(false, 0); | } 183| else if (versionWanted) | { | import tsv_utils.common.tsvutils_version; 2| writeln(tsvutilsVersionNotice("tsv-summarize")); 2| return tuple(false, 0); | } | | /* Remaining command line args are files. Use standard input if files | * were not provided. Truncate cmdArgs to consume the arguments. | */ 362| string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. 
$] : ["-"]; 181| cmdArgs.length = 1; | | /* Validation and derivations - Do as much validation prior to header line | * processing as possible (avoids waiting on stdin). | */ | 182| enforce(!cmdLineOperatorOptions.empty, "At least one summary operator is required."); | 180| enforce(inputFieldDelimiter != valuesDelimiter, 1| "Cannot use the same character for both --d|field-delimiter and --v|values-delimiter."); | 190| enforce(!(excludeMissing && missingValueReplacement.length != 0), 1| "Cannot use both '--x|exclude-missing' and '--r|replace-missing'."); | | /* Missing field policy. */ 178| globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement); | 178| string[] headerFields; | | /* fieldListArgProcessing encapsulates the field list processing. It is | * called prior to reading the header line if headers are not being used, | * and after if headers are being used. | */ | void fieldListArgProcessing() | { | /* Run all the operator handlers. */ 398| cmdLineOtherFieldOptions.each!(dg => dg(hasHeader, headerFields)); 643| cmdLineOperatorOptions.each!(dg => dg(hasHeader, headerFields)); | | /* keyFields need to be part of the endFieldIndex, which is one past | * the last field index. */ 143| keyFields.each!(delegate (size_t x) | { 166| if (x >= endFieldIndex) endFieldIndex = x + 1; | } ); | } | 252| if (!hasHeader) fieldListArgProcessing(); | | /* | * Create the byLineSourceRange and perform header line processing. 
| */ 146| inputSources = byLineSourceRange(filepaths); | | 145| if (hasHeader) | { 104| if (!inputSources.front.byLine.empty) | { 102| throwIfWindowsNewlineOnUnix(inputSources.front.byLine.front, inputSources.front.name, 1); 101| headerFields = inputSources.front.byLine.front.split(inputFieldDelimiter).to!(string[]); | } | 103| fieldListArgProcessing(); | } | } | catch (Exception exc) | { 44| stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 44| return tuple(false, 1); | } 142| return tuple(true, 0); | } | | private void addGroupByOptionHandler(string option, string optionVal) | { 115| cmdLineOtherFieldOptions ~= | (bool hasHeader, string[] headerFields) 221| => groupByOptionHandler(hasHeader, headerFields, option, optionVal); | } | | private void groupByOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal) | { | import tsv_utils.common.fieldlist; | | try | { 114| keyFields = | optionVal | .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, headerFields) | .array; | } | catch (Exception e) | { 7| e.msg = format("[--%s %s]. %s", option, optionVal, e.msg); 7| throw e; | } | } | | private void addOperatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal) | { 191| cmdLineOperatorOptions ~= | (bool hasHeader, string[] headerFields) 371| => operatorOptionHandler!OperatorClass(hasHeader, headerFields, option, optionVal); | } | | /* operationOptionHandler functions are callbacks that process command line options | * specifying summarization operations. eg. '--max 5', '--last 3:LastEntry'. Handlers | * check syntactic correctness and instantiate Operator objects that do the work. This | * is also where 1-upped field numbers are converted to 0-based indices. 
| */ | private void operatorOptionHandler(OperatorClass : SingleFieldOperator) | (bool hasHeader, string[] headerFields, string option, string optionVal) | { | import std.range : enumerate; | import std.typecons : Yes, No; | import tsv_utils.common.fieldlist; | | try | { 190| auto optionValParse = | optionVal | .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | (hasHeader, headerFields); | 187| auto fieldIndices = optionValParse.array; 185| bool hasOptionalHeader = optionVal.length > optionValParse.consumed; 185| string optionalHeader; | 185| if (hasOptionalHeader) | { 20| enforce(fieldIndices.length <= 1, "Cannot specify a custom header when using multiple fields."); 16| enforce(optionVal.length - optionValParse.consumed > 1, 1| format("No value after field list.\n Expected: '--%s ' or '--%s :
'.", | option, option)); 15| optionalHeader = optionVal[optionValParse.consumed + 1 .. $].idup; | } | 1396| foreach (fieldIndex; fieldIndices) | { 284| auto op = new OperatorClass(fieldIndex, globalMissingPolicy); | 284| if (hasOptionalHeader) | { 16| enforce(op.allowCustomHeader, "Operator does not support custom headers."); 14| op.setCustomHeader(optionalHeader); | } | 283| operators.insertBack(op); 478| if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1; | } | } | catch (Exception exc) | { | import std.format : format; 9| exc.msg = format("[--%s %s] %s", option, optionVal, exc.msg); 9| throw exc; | } | } | | private void addQuantileOperatorOptionHandler(string option, string optionVal) | { 20| cmdLineOperatorOptions ~= | (bool hasHeader, string[] headerFields) 22| => quantileOperatorOptionHandler(hasHeader, headerFields, option, optionVal); | } | | /* QuantileOperator has a different syntax and needs a custom command option handler. */ | private void quantileOperatorOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal) | { | import std.typecons : Yes, No; | import tsv_utils.common.fieldlist; | | try | { 20| auto optionValParse = | optionVal | .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | (hasHeader, headerFields); | 11| auto fieldIndices = optionValParse.array; 12| enforce(optionVal.length - optionValParse.consumed > 1, "No probabilities entered."); | 8| auto splitRemaining = | optionVal[optionValParse.consumed + 1 .. 
$] | .findSplit(":"); | 11| enforce(splitRemaining[1].empty || !splitRemaining[2].empty, 1| "Empty custom header."); | 7| auto probStr = splitRemaining[0]; 7| auto header = splitRemaining[2]; | 7| double[] probs; | 35| foreach (str; probStr.splitter(',')) | { 9| double p = str.to!double; 16| enforce(p >= 0.0 && p <= 1.0, 2| format("Probability '%g' is not in the interval [0.0,1.0].", p)); 6| probs ~= p; | } | 7| enforce(header.empty || (fieldIndices.length <= 1 && probs.length <= 1), 2| format("Cannot specify a custom header when using multiple fields or multiple probabilities.")); | 2| assert (fieldIndices.length > 0); 2| assert (probs.length > 0); 2| assert (header.empty || (fieldIndices.length == 1 && probs.length == 1)); | 12| foreach (fieldIndex; fieldIndices) | { 12| foreach (p; probs) | { 2| auto op = new QuantileOperator(fieldIndex, globalMissingPolicy, p); 2| if (!header.empty) op.setCustomHeader(header); 2| operators.insertBack(op); | } 2| if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1; | } | } | catch (Exception e) | { 18| e.msg = format( | "[--%s %s]. %s\n Expected: '--%s :[,]' or '--%s ::
' where is a number between 0.0 and 1.0.", | option, optionVal, e.msg, option, option); 18| throw e; | } | | } | | private void addCountOptionHandler() | { 48| cmdLineOperatorOptions ~= | (bool hasHeader, string[] headerFields) 78| => countOptionHandler(hasHeader, headerFields); | } | | private void countOptionHandler(bool hasHeader, string[] headerFields) | { 39| operators.insertBack(new CountOperator()); | } | | private void addCountHeaderOptionHandler(string option, string optionVal) | { 1| cmdLineOperatorOptions ~= | (bool hasHeader, string[] headerFields) 2| => countHeaderOptionHandler(hasHeader, headerFields, option, optionVal); | } | | private void countHeaderOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal) | { 1| auto op = new CountOperator(); 1| op.setCustomHeader(optionVal); 1| operators.insertBack(op); | } |} | |/** tsvSummarize does the primary work of the tsv-summarize program. | */ |void tsvSummarize(ref TsvSummarizeOptions cmdopt) |{ | import tsv_utils.common.utils : BufferedOutputRange, ByLineSourceRange, | bufferedByLine, throwIfWindowsNewlineOnUnix; | | /* Check that the input files were setup as expected. Should at least have one | * input, stdin if nothing else, and newlines removed from the byLine range. | */ 50| assert(!cmdopt.inputSources.empty); | static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator))); | | /* BufferedOutputRange is faster than writing directly to stdout if many lines are | * being written. This will happen mostly when group-by is used. | */ 100| auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); | | /* Pick the Summarizer based on the number of key-fields entered. */ 50| auto summarizer = | (cmdopt.keyFields.length == 0) 30| ? new NoKeySummarizer!(typeof(bufferedOutput))( | cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy) | 20| : (cmdopt.keyFields.length == 1) 14| ? 
new OneKeySummarizer!(typeof(bufferedOutput))( | cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy) | 6| : new MultiKeySummarizer!(typeof(bufferedOutput))( | cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy); | | /* Add the operators to the Summarizer. */ 50| summarizer.setOperators(inputRangeObject(cmdopt.operators[])); | | /* If there's no input header line, but writing an output header anyway, then | * write it now. This helps tasks further on in a unix pipeline detect errors | * quickly, without waiting for all the data to flow through the pipeline. | */ 50| auto printOptions = SummarizerPrintOptions( | cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision); | 72| if (!cmdopt.hasHeader && cmdopt.writeHeader) | { 3| summarizer.writeSummaryHeader(bufferedOutput, printOptions); 3| bufferedOutput.flush; | } | | /* Process each input file, one line at a time. */ 50| auto lineFields = new char[][](cmdopt.endFieldIndex); 50| bool headerFound = false; 410| foreach (inputStream; cmdopt.inputSources) | { 2409| foreach (lineNum, line; inputStream.byLine.enumerate(1)) | { 527| if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum); | | /* Copy the needed number of fields to the fields array. | * Note: The number is zero if no operator needs fields. Notably, the count | * operator. Used by itself, it counts the number input lines (ala 'wc -l'). | */ 440| if (cmdopt.endFieldIndex > 0) | { 390| size_t fieldIndex = 0; 4934| foreach (fieldValue; line.splitter(cmdopt.inputFieldDelimiter)) | { 1438| if (fieldIndex == cmdopt.endFieldIndex) break; 1374| lineFields[fieldIndex] = fieldValue; 1374| fieldIndex++; | } | 390| if (fieldIndex == 0) | { 12| assert(cmdopt.endFieldIndex > 0); 12| assert(line.length == 0); | | /* Bug work-around. Empty lines are not handled properly by splitter. 
| * - Bug: https://issues.dlang.org/show_bug.cgi?id=15735 | * - Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030 | * This can arise for: '$ tsv-summarize -g 1 --count'. This counts the | * unique values in field 1. If there's only one column, then an empty | * line becomes an empty string for field 1. Work-around: Point to the | * line. It's an empty string. | */ 12| lineFields[fieldIndex] = line; 12| fieldIndex++; | } | 390| enforce(fieldIndex >= cmdopt.endFieldIndex, 2| format("Not enough fields in line. File: %s, Line: %s", | inputStream.name, lineNum)); | } | 694| if (cmdopt.hasHeader && lineNum == 1) | { 47| if (!headerFound) | { 26| summarizer.processHeaderLine(lineFields); 26| headerFound = true; | | /* Write the header now. This helps tasks further on in a unix | * pipeline detect errors quickly, without waiting for all the | * data to flow through the pipeline. Note that an upstream task | * may have flushed its header line, so the header may arrive | * long before the main block of data. | */ 26| summarizer.writeSummaryHeader(bufferedOutput, printOptions); 26| bufferedOutput.flush; | } | } | else | { | /* Process the line. Processing will fail (throw) if a field cannot be | * converted to the expected type. | */ 391| try summarizer.processNextLine(lineFields); | catch (Exception exc) | { 1| throw new Exception( | format("Could not process line or field: %s\n File: %s Line: %s%s", | exc.msg, inputStream.name, lineNum, 1| (lineNum == 1) ? "\n Is this a header line? Use --header to skip." : "")); | } | } | } | } | | debug writeln("[tsvSummarize] After reading all data."); | | /* Whew! We're done processing input data. Run the calculations and print. */ | 46| summarizer.writeSummaryBody(bufferedOutput, printOptions); |} | |/** The default field header. This is used when the input doesn't have field headers, | * but field headers are used in the output. The default is "fieldN", where N is the | * 1-upped field number. 
| */ |string fieldHeaderFromIndex(size_t fieldIndex) |{ | enum prefix = "field"; 1587| return prefix ~ (fieldIndex + 1).to!string; |} | |unittest |{ 1| assert(fieldHeaderFromIndex(0) == "field1"); 1| assert(fieldHeaderFromIndex(10) == "field11"); |} | |/** Produce a summary header from a field header. | * | * The result has the form `_`. e.g. If the field header is | * "length" and the operation is "max", the summary header is "length_max". The field | * header typically comes a header line in the input data or was constructed by | * fieldHeaderFromIndex(). | * | * If operationName is the empty string, then fieldHeader is used unchanged. This supports | * the Retain operator. | */ |string summaryHeaderFromFieldHeader(string fieldHeader, string operationName) |{ 4098| return (operationName.length > 0) ? fieldHeader ~ "_" ~ operationName : fieldHeader; |} | |unittest |{ 1| assert(summaryHeaderFromFieldHeader("originalfield", "mycalc") == "originalfield_mycalc"); 1| assert(summaryHeaderFromFieldHeader("originalfield", "") == "originalfield"); |} | |/** SummarizerPrintOptions holds printing options for Summarizers and Calculators. Typically | * specified with command line options, it is separated out for modularity. | */ |struct SummarizerPrintOptions |{ | char fieldDelimiter; | char valuesDelimiter; | size_t floatPrecision = 12; | | import std.traits : isFloatingPoint, isIntegral; | | auto formatNumber(T)(T n) const | if (isFloatingPoint!T || isIntegral!T) | { | import tsv_utils.common.numerics : formatNumber; 3280| return formatNumber!T(n, floatPrecision); | } |} | |/** A Summarizer object maintains the state of the summarization and performs basic | * processing. Handling of files and input lines is left to the caller. | * | * Classes supporting the Summarizer must implement the methods: | * - setOperators - Called after initializing the object for each operator to be processed. | * - processHeaderLine - Called to process the header line of each file. 
Returns true if | * it was the first header line processed (used when reading multiple files). | * - processNextLine - Called to process non-header lines. | * - writeSummaryHeader - Called to write the header line. | * - writeSummaryBody - Called to write the result lines. | * | */ |interface Summarizer(OutputRange) |{ | /** Called after initializing the object for each operator to be processed. */ | void setOperators(InputRange!Operator op); | | /** Called to process the header line of each file. Returns true if it was the | * first header line processed (used when reading multiple files). | */ | bool processHeaderLine(const char[][] lineFields); | | /** Called to process non-header lines. */ | void processNextLine(const char[][] lineFields); | | /** Called to write the header line. */ | void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions); | | /** Called to write the result lines. */ | void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions); |} | |/** SummarizerBase performs work shared by all sumarizers, most everything except for | * handling of unique keys. | * | * The base class handles creation, allocates storage for Operators and SharedFieldValues, | * and similar. Derived classes deal primarily with unique keys and the associated Calculators | * and UniqueKeyValuesLists. | */ |class SummarizerBase(OutputRange) : Summarizer!OutputRange |{ | private char _inputFieldDelimiter; | private bool _hasProcessedFirstHeaderLine = false; | private SharedFieldValues _sharedFieldValues = null; // Null if no shared field value lists. 
| protected MissingFieldPolicy _missingPolicy; | protected DList!Operator _operators; | protected size_t _numOperators = 0; | 1101| this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy) | { 1101| _inputFieldDelimiter = inputFieldDelimiter; 1101| _missingPolicy = missingPolicy; | } | | char inputFieldDelimiter() const @property | { 230| return _inputFieldDelimiter; | } | | /** Sets the Operators used by the Summarizer. Called after construction. */ | void setOperators(InputRange!Operator operators) | { 1101| foreach (op; operators) | { 1283| _operators.insertBack(op); 1283| _numOperators++; 1283| auto numericFieldsToSave = op.numericFieldsToSave(); 1283| auto textFieldsToSave = op.textFieldsToSave(); | 2380| if (numericFieldsToSave.length > 0 || textFieldsToSave.length > 0) | { 362| if (_sharedFieldValues is null) | { 316| _sharedFieldValues = new SharedFieldValues(); | } 734| numericFieldsToSave.each!(x => _sharedFieldValues.addNumericIndex(x)); 714| textFieldsToSave.each!(x => _sharedFieldValues.addTextIndex(x)); | } | } | } | | /** Called to process the header line of each file. Returns true if it was the | * first header line processed (used when reading multiple files). | */ | bool processHeaderLine(const char[][] lineFields) | { 481| if (!_hasProcessedFirstHeaderLine) | { 1747| _operators.each!(x => x.processHeaderLine(lineFields)); 481| _hasProcessedFirstHeaderLine = true; 481| return true; | } | else | { 0000000| return false; | } | } | | protected final UniqueKeyValuesLists makeUniqueKeyValuesLists() | { 1340| return (_sharedFieldValues is null) 872| ? 
null 468| : _sharedFieldValues.makeUniqueKeyValuesLists; | } | | abstract void processNextLine(const char[][] lineFields); | abstract void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions); | abstract void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions); |} | |/** The NoKeySummarizer is used when summarizing values across the entire input. | * | * Note: NoKeySummarizer is used in Operator unit tests and gets extensive testing | * through that mechanism. | */ |final class NoKeySummarizer(OutputRange) : SummarizerBase!OutputRange |{ | private Calculator[] _calculators; | private UniqueKeyValuesLists _valueLists; | 996| this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy) | { 996| super(inputFieldDelimiter, missingPolicy); | } | | /** Called after initializing the object for each operator to be processed. */ | override void setOperators(InputRange!Operator operators) | { 996| super.setOperators(operators); | | /* Only one Calculator per Operation, so create them as Operators are added. */ 2057| foreach (op; operators) _calculators ~= op.makeCalculator; 996| _valueLists = super.makeUniqueKeyValuesLists(); | } | | /** Called to process non-header lines. */ | override void processNextLine(const char[][] lineFields) | { 12111| _calculators.each!(x => x.processNextLine(lineFields)); 4511| if (_valueLists !is null) _valueLists.processNextLine(lineFields, _missingPolicy); | } | | /** Called to write the header line. */ | override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions) | { 1623| put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter)); 785| put(outputStream, '\n'); | } | | /** Called to write the result lines. 
*/ | override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions) | { 4415| put(outputStream, | _calculators[] 4480| .map!(x => x.calculate(_valueLists, printOptions)) | .join(printOptions.fieldDelimiter)); 4415| put(outputStream, '\n'); | } |} | |/** KeySummarizerBase does work shared by the single key and multi-key summarizers. | * | * The primary difference between those two is the formation of the key. The primary | * reason for separating those into two separate classes is to simplify (speed-up) | * handling of single field keys, which are the most common use case. | */ |class KeySummarizerBase(OutputRange) : SummarizerBase!OutputRange |{ | protected struct UniqueKeyData | { | Calculator[] calculators; | UniqueKeyValuesLists valuesLists; | } | | private DList!string _uniqueKeys; | private UniqueKeyData[string] _uniqueKeyData; | 105| this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy) | { 105| super(inputFieldDelimiter, missingPolicy); | } | | protected void processNextLineWithKey(T : const char[])(T key, const char[][] lineFields) | { | debug writefln("[%s]: %s", __FUNCTION__, lineFields.to!string); | 561| auto dataPtr = (key in _uniqueKeyData); 1122| auto data = (dataPtr is null) ? 
addUniqueKey(key.to!string) : *dataPtr; | 3667| data.calculators.each!(x => x.processNextLine(lineFields)); 919| if (data.valuesLists !is null) data.valuesLists.processNextLine(lineFields, _missingPolicy); | } | | protected UniqueKeyData addUniqueKey(string key) | { 344| assert(key !in _uniqueKeyData); | 344| _uniqueKeys.insertBack(key); | 344| auto calculators = new Calculator[_numOperators]; 344| size_t i = 0; 3325| foreach (op; _operators) | { 879| calculators[i] = op.makeCalculator; 879| i++; | } | 344| return _uniqueKeyData[key] = UniqueKeyData(calculators, super.makeUniqueKeyValuesLists()); | } | | override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions) | { 91| put(outputStream, keyFieldHeader()); 91| put(outputStream, printOptions.fieldDelimiter); 290| put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter)); 91| put(outputStream, '\n'); | } | | override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions) | { 1231| foreach(key; _uniqueKeys) | { 341| auto data = _uniqueKeyData[key]; 341| put(outputStream, key); 341| put(outputStream, printOptions.fieldDelimiter); 341| put(outputStream, | data.calculators[] 876| .map!(x => x.calculate(data.valuesLists, printOptions)) | .join(printOptions.fieldDelimiter)); 341| put(outputStream, '\n'); | } | } | | abstract string keyFieldHeader() const @property; |} | |/** This Summarizer is for the case where the unique key is based on exactly one field. 
| */ |final class OneKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange |{ | private size_t _keyFieldIndex = 0; | private string _keyFieldHeader; | private DList!string _uniqueKeys; | 68| this(size_t keyFieldIndex, char inputFieldDelimiter, MissingFieldPolicy missingPolicy) | { 68| super(inputFieldDelimiter, missingPolicy); 68| _keyFieldIndex = keyFieldIndex; 68| _keyFieldHeader = fieldHeaderFromIndex(keyFieldIndex); | } | | override string keyFieldHeader() const @property | { 59| return _keyFieldHeader; | } | | override bool processHeaderLine(const char[][] lineFields) | { 53| assert(_keyFieldIndex <= lineFields.length); | 53| bool isFirstHeaderLine = super.processHeaderLine(lineFields); 53| if (isFirstHeaderLine) | { 53| _keyFieldHeader = lineFields[_keyFieldIndex].to!string; | } 53| return isFirstHeaderLine; | } | | override void processNextLine(const char[][] lineFields) | { 358| assert(_keyFieldIndex < lineFields.length); 358| processNextLineWithKey(lineFields[_keyFieldIndex], lineFields); | } |} | |/** This Summarizer is for the case where the unique key is based on multiple fields. 
| */ |final class MultiKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange |{ | private size_t[] _keyFieldIndices; | private string _keyFieldHeader; | private DList!string _uniqueKeys; | 37| this(const size_t[] keyFieldIndices, char inputFieldDelimiter, MissingFieldPolicy missingPolicy) | { 37| super(inputFieldDelimiter, missingPolicy); 37| _keyFieldIndices = keyFieldIndices.dup; 37| _keyFieldHeader = 79| _keyFieldIndices.map!(i => fieldHeaderFromIndex(i)) | .join(inputFieldDelimiter); | } | | override string keyFieldHeader() const @property | { 32| return _keyFieldHeader; | } | | override bool processHeaderLine(const char[][] lineFields) | { 84| assert(_keyFieldIndices.all!(x => x < lineFields.length)); 27| assert(_keyFieldIndices.length >= 2); | 27| bool isFirstHeaderLine = super.processHeaderLine(lineFields); 27| if (isFirstHeaderLine) | { 84| _keyFieldHeader = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string; | } 27| return isFirstHeaderLine; | } | | override void processNextLine(const char[][] lineFields) | { 639| assert(_keyFieldIndices.all!(x => x < lineFields.length)); 203| assert(_keyFieldIndices.length >= 2); | 639| string key = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string; 203| processNextLineWithKey(key, lineFields); | } |} | |version(unittest) |{ | /* testSummarizer is a helper that can run many types of unit tests against | * Summarizers. It can also test operators, but there are separate helper functions | * better suited for that purpose. | * | * Arguments are a command line args, an input file, and expected output. The | * input file and expected output are already split into lines and fields, the helper | * manages re-assembly. The program name from the command line args is printed if an | * an error occurs, it is useful to identify the test that failed. | * | * Note: Much of this is a duplication tsvSummarize logic. 
Better abstraction of | * file input/output would enable running unit tests directly on top of tsvSummarize. | * | * Update (April 2020): With the introduction of InputSourceRange and ByLineSource, | * there needs to be a physical file when call processArgs. Its hard to get around, | * as the intent is to read the header line of the first input file during command | * line argument processing. Eventually this unit test process will need to be | * rewritten. For now, a file with the equivalent data is being added to the command | * line. | */ | void testSummarizer(string[] cmdArgs, string[][] file, string[][] expected) | { | import std.array : appender; | 92| assert(cmdArgs.length > 0, "[testSummarizer] cmdArgs must not be empty."); | | auto formatAssertMessage(T...)(string msg, T formatArgs) | { 0000000| auto formatString = "[testSummarizer] %s: " ~ msg; 0000000| return format(formatString, cmdArgs[0], formatArgs); | } | 92| TsvSummarizeOptions cmdopt; 92| auto savedCmdArgs = cmdArgs.to!string; 92| auto r = cmdopt.processArgs(cmdArgs); 92| assert(r[0], formatAssertMessage("Invalid command line args: '%s'.", savedCmdArgs)); | 565| assert(file.all!(line => line.length >= cmdopt.endFieldIndex), | formatAssertMessage("group-by or operator field number greater than number of fields a line of the input file.")); | | /* Pick the Summarizer based on the number of key-fields entered. */ 92| auto summarizer = | (cmdopt.keyFields.length == 0) 7| ? new NoKeySummarizer!(typeof(appender!(char[])()))( | cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy) | 85| : (cmdopt.keyFields.length == 1) 54| ? new OneKeySummarizer!(typeof(appender!(char[])()))( | cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy) | 31| : new MultiKeySummarizer!(typeof(appender!(char[])()))( | cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy); | | /* Add the operators to the Summarizer. 
*/ 92| summarizer.setOperators(inputRangeObject(cmdopt.operators[])); | | /* Process the file one line at a time. */ 92| auto lineFields = new char[][](cmdopt.endFieldIndex); 92| bool headerFound = false; 2549| foreach (lineNum, line; file.enumerate(1)) | { | /* Copy the needed fields to the fields array. */ 7349| foreach (i, val; line[0..cmdopt.endFieldIndex]) lineFields[i] = val.dup; | 888| if (cmdopt.hasHeader && lineNum == 1) | { 73| if (!headerFound) | { 73| summarizer.processHeaderLine(lineFields); 73| headerFound = true; | } | } | else | { 400| try summarizer.processNextLine(lineFields); | catch (Exception exc) | { 0000000| assert(false, formatAssertMessage(exc.msg)); | } | } | } 92| auto printOptions = SummarizerPrintOptions( | cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision); | 92| auto summarizerOutput = appender!(char[])(); | 111| if (cmdopt.hasHeader || cmdopt.writeHeader) | { 83| summarizer.writeSummaryHeader(summarizerOutput, printOptions); | } | 92| summarizer.writeSummaryBody(summarizerOutput, printOptions); 657| auto expectedOutput = expected.map!(x => x.joiner(cmdopt.inputFieldDelimiter.to!string)).joiner("\n").to!string; 270| if (expectedOutput.length > 0 && expectedOutput[$-1] != '\n') expectedOutput ~= "\n"; | 92| assert(summarizerOutput.data == expectedOutput, | formatAssertMessage( | "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================", | expectedOutput.to!string, summarizerOutput.data.to!string)); | } | | void writeDataFile(string filepath, string[][] fileData, string delimiter = "\t") | { | import std.algorithm; | import std.stdio; | 48| auto f = filepath.File("w"); 300| foreach (record; fileData) f.writeln(record.joiner(delimiter)); 24| f.close; | } |} | |unittest |{ | import tsv_utils.common.unittest_utils; // tsv unit test helpers, from common/src/. 
| import std.file : mkdir, rmdirRecurse; | import std.path : buildPath; | 1| auto testDir = makeUnittestTempDir("tsv_summarizer"); 1| scope(exit) testDir.rmdirRecurse; | | /* Summarizer unit tests. Primarily single-key and multi-key summarizers. To a limited | * extent, command line option handling (TsvSummarizeOptions). Individual operators | * have separate tests, those tests test the no-key summarizer. The Values operator is | * used in these tests. It engages a number of behaviors, and the results have limited | * ambiguity. Using only one operator limits dependence on individual operators. | * | * Update (April 2020): There now needs to be a real file passed to testSummarizer. | * See the comments with testSummarizer for details. | */ | 1| auto file1 = [["fld1", "fld2", "fld3"], | ["a", "a", "3"], | ["c", "a", "2b"], | ["c", "bc", ""], | ["a", "c", "2b"], | ["", "bc", ""], | ["c", "bc", "3"]]; | 1| auto file1Path = buildPath(testDir, "file1.tsv"); 1| auto file1NoHeaderPath = buildPath(testDir, "file1_noheader.tsv"); 1| writeDataFile(file1Path, file1); 1| writeDataFile(file1NoHeaderPath, file1[1 .. $]); | | /* Single-key summarizer tests. 
| */ 1| testSummarizer(["unittest-sk-1", "--header", "--group-by", "1", "--values", "1", file1Path], | file1, | [["fld1", "fld1_values"], | ["a", "a|a"], | ["c", "c|c|c"], | ["", ""]] | ); 1| testSummarizer(["unittest-sk-1-named", "--header", "--group-by", "fld1", "--values", "fld1", file1Path], | file1, | [["fld1", "fld1_values"], | ["a", "a|a"], | ["c", "c|c|c"], | ["", ""]] | ); 1| testSummarizer(["unittest-sk-2", "-H", "--group-by", "1", "--values", "2", file1Path], | file1, | [["fld1", "fld2_values"], | ["a", "a|c"], | ["c", "a|bc|bc"], | ["", "bc"]] | ); 1| testSummarizer(["unittest-sk-2-named", "-H", "--group-by", "fld1", "--values", "fld2", file1Path], | file1, | [["fld1", "fld2_values"], | ["a", "a|c"], | ["c", "a|bc|bc"], | ["", "bc"]] | ); 1| testSummarizer(["unittest-sk-3", "-H", "-g", "1", "--values", "3", file1Path], | file1, | [["fld1", "fld3_values"], | ["a", "3|2b"], | ["c", "2b||3"], | ["", ""]] | ); 1| testSummarizer(["unittest-sk-4", "-H", "--group-by", "1", "--values", "1,2,3", file1Path], | file1, | [["fld1", "fld1_values", "fld2_values", "fld3_values"], | ["a", "a|a", "a|c", "3|2b"], | ["c", "c|c|c", "a|bc|bc", "2b||3"], | ["", "", "bc", ""]] | ); 1| testSummarizer(["unittest-sk-4-named-a", "-H", "--group-by", "fld1", "--values", "fld1,fld2,fld3", file1Path], | file1, | [["fld1", "fld1_values", "fld2_values", "fld3_values"], | ["a", "a|a", "a|c", "3|2b"], | ["c", "c|c|c", "a|bc|bc", "2b||3"], | ["", "", "bc", ""]] | ); 1| testSummarizer(["unittest-sk-4-named-b", "-H", "--group-by", "fld1", "--values", "fld*", file1Path], | file1, | [["fld1", "fld1_values", "fld2_values", "fld3_values"], | ["a", "a|a", "a|c", "3|2b"], | ["c", "c|c|c", "a|bc|bc", "2b||3"], | ["", "", "bc", ""]] | ); 1| testSummarizer(["unittest-sk-5", "-H", "--group-by", "1", "--values", "1-3", file1Path], | file1, | [["fld1", "fld1_values", "fld2_values", "fld3_values"], | ["a", "a|a", "a|c", "3|2b"], | ["c", "c|c|c", "a|bc|bc", "2b||3"], | ["", "", "bc", ""]] | ); 1| 
testSummarizer(["unittest-sk-6", "-H", "--group-by", "1", "--values", "3,2,1", file1Path], | file1, | [["fld1", "fld3_values", "fld2_values", "fld1_values"], | ["a", "3|2b", "a|c", "a|a"], | ["c", "2b||3", "a|bc|bc", "c|c|c"], | ["", "", "bc", ""]] | ); 1| testSummarizer(["unittest-sk-7", "-H", "--group-by", "1", "--values", "3-1", file1Path], | file1, | [["fld1", "fld3_values", "fld2_values", "fld1_values"], | ["a", "3|2b", "a|c", "a|a"], | ["c", "2b||3", "a|bc|bc", "c|c|c"], | ["", "", "bc", ""]] | ); 1| testSummarizer(["unittest-sk-8", "-H", "--group-by", "2", "--values", "1", file1Path], | file1, | [["fld2", "fld1_values"], | ["a", "a|c"], | ["bc", "c||c"], | ["c", "a"]] | ); 1| testSummarizer(["unittest-sk-9", "-H", "--group-by", "2", "--values", "2", file1Path], | file1, | [["fld2", "fld2_values"], | ["a", "a|a"], | ["bc", "bc|bc|bc"], | ["c", "c"]] | ); 1| testSummarizer(["unittest-sk-10", "-H", "--group-by", "2", "--values", "3", file1Path], | file1, | [["fld2", "fld3_values"], | ["a", "3|2b"], | ["bc", "||3"], | ["c", "2b"]] | ); 1| testSummarizer(["unittest-sk-11", "-H", "--group-by", "2", "--values", "1,3", file1Path], | file1, | [["fld2", "fld1_values", "fld3_values"], | ["a", "a|c", "3|2b"], | ["bc", "c||c", "||3"], | ["c", "a", "2b"]] | ); 1| testSummarizer(["unittest-sk-12", "-H", "--group-by", "2", "--values", "3,1", file1Path], | file1, | [["fld2", "fld3_values", "fld1_values"], | ["a", "3|2b", "a|c"], | ["bc", "||3", "c||c"], | ["c", "2b", "a"]] | ); 1| testSummarizer(["unittest-sk-13", "-H", "--group-by", "3", "--values", "1", file1Path], | file1, | [["fld3", "fld1_values"], | ["3", "a|c"], | ["2b", "c|a"], | ["", "c|"]] | ); 1| testSummarizer(["unittest-sk-14", "-H", "--group-by", "3", "--values", "2", file1Path], | file1, | [["fld3", "fld2_values"], | ["3", "a|bc"], | ["2b", "a|c"], | ["", "bc|bc"]] | ); 1| testSummarizer(["unittest-sk-15", "-H", "--group-by", "3", "--values", "1,2", file1Path], | file1, | [["fld3", "fld1_values", 
"fld2_values"], | ["3", "a|c", "a|bc"], | ["2b", "c|a", "a|c"], | ["", "c|", "bc|bc"]] | ); 1| testSummarizer(["unittest-sk-15-named", "-H", "--group-by", "fld3", "--values", "fld1,fld2", file1Path], | file1, | [["fld3", "fld1_values", "fld2_values"], | ["3", "a|c", "a|bc"], | ["2b", "c|a", "a|c"], | ["", "c|", "bc|bc"]] | ); | | /* Multi-key summarizer tests. | */ 1| testSummarizer(["unittest-mk-1", "--header", "--group-by", "1,2", "--values", "1", file1Path], | file1, | [["fld1", "fld2", "fld1_values"], | ["a", "a", "a"], | ["c", "a", "c"], | ["c", "bc", "c|c"], | ["a", "c", "a"], | ["", "bc", ""]] | ); 1| testSummarizer(["unittest-mk-2", "-H", "--group-by", "1,2", "--values", "2", file1Path], | file1, | [["fld1", "fld2", "fld2_values"], | ["a", "a", "a"], | ["c", "a", "a"], | ["c", "bc", "bc|bc"], | ["a", "c", "c"], | ["", "bc", "bc"]] | ); 1| testSummarizer(["unittest-mk-3", "-H", "--group-by", "1,2", "--values", "3", file1Path], | file1, | [["fld1", "fld2", "fld3_values"], | ["a", "a", "3"], | ["c", "a", "2b"], | ["c", "bc", "|3"], | ["a", "c", "2b"], | ["", "bc", ""]] | ); 1| testSummarizer(["unittest-mk-4", "-H", "--group-by", "1,2", "--values", "3,1", file1Path], | file1, | [["fld1", "fld2", "fld3_values", "fld1_values"], | ["a", "a", "3", "a"], | ["c", "a", "2b", "c"], | ["c", "bc", "|3", "c|c"], | ["a", "c", "2b", "a"], | ["", "bc", "", ""]] | ); 1| testSummarizer(["unittest-mk-4-named", "-H", "--group-by", "fld1,fld2", "--values", "fld3,fld1", file1Path], | file1, | [["fld1", "fld2", "fld3_values", "fld1_values"], | ["a", "a", "3", "a"], | ["c", "a", "2b", "c"], | ["c", "bc", "|3", "c|c"], | ["a", "c", "2b", "a"], | ["", "bc", "", ""]] | ); 1| testSummarizer(["unittest-mk-5", "-H", "--group-by", "3,2", "--values", "1", file1Path], | file1, | [["fld3", "fld2", "fld1_values"], | ["3", "a", "a"], | ["2b", "a", "c"], | ["", "bc", "c|"], | ["2b", "c", "a"], | ["3", "bc", "c"]] | ); 1| testSummarizer(["unittest-mk-6", "-H", "--group-by", "3-2", "--values", 
"1", file1Path], | file1, | [["fld3", "fld2", "fld1_values"], | ["3", "a", "a"], | ["2b", "a", "c"], | ["", "bc", "c|"], | ["2b", "c", "a"], | ["3", "bc", "c"]] | ); 1| testSummarizer(["unittest-mk-7", "-H", "--group-by", "2,1,3", "--values", "2", file1Path], | file1, | [["fld2", "fld1", "fld3", "fld2_values"], | ["a", "a", "3", "a"], | ["a", "c", "2b", "a"], | ["bc", "c", "", "bc"], | ["c", "a", "2b", "c"], | ["bc", "", "", "bc"], | ["bc", "c", "3", "bc"]] | ); | | /* Missing policies. */ 1| testSummarizer(["unittest-mis-1", "--header", "--group-by", "1", "--values", "1", "--exclude-missing", file1Path], | file1, | [["fld1", "fld1_values"], | ["a", "a|a"], | ["c", "c|c|c"], | ["", ""]] | ); 1| testSummarizer(["unittest-mis-2", "-H", "--group-by", "1", "--values", "2", "-x", file1Path], | file1, | [["fld1", "fld2_values"], | ["a", "a|c"], | ["c", "a|bc|bc"], | ["", "bc"]] | ); 1| testSummarizer(["unittest-mis-3", "-H", "-g", "1", "--values", "3", "-x", file1Path], | file1, | [["fld1", "fld3_values"], | ["a", "3|2b"], | ["c", "2b|3"], | ["", ""]] | ); 1| testSummarizer(["unittest-mis-4", "-H", "--group-by", "1", "--values", "1,2,3", "-x", file1Path], | file1, | [["fld1", "fld1_values", "fld2_values", "fld3_values"], | ["a", "a|a", "a|c", "3|2b"], | ["c", "c|c|c", "a|bc|bc", "2b|3"], | ["", "", "bc", ""]] | ); 1| testSummarizer(["unittest-mis-5", "--header", "--group-by", "1", "--values", "1", "--replace-missing", "NA", file1Path], | file1, | [["fld1", "fld1_values"], | ["a", "a|a"], | ["c", "c|c|c"], | ["", "NA"]] | ); 1| testSummarizer(["unittest-mis-6", "-H", "--group-by", "1", "--values", "2", "-r", "NA", file1Path], | file1, | [["fld1", "fld2_values"], | ["a", "a|c"], | ["c", "a|bc|bc"], | ["", "bc"]] | ); 1| testSummarizer(["unittest-mis-7", "-H", "-g", "1", "--values", "3", "-r", "NA", file1Path], | file1, | [["fld1", "fld3_values"], | ["a", "3|2b"], | ["c", "2b|NA|3"], | ["", "NA"]] | ); 1| testSummarizer(["unittest-mis-7-named", "-H", "-g", "fld1", 
"--values", "fld3", "-r", "NA", file1Path], | file1, | [["fld1", "fld3_values"], | ["a", "3|2b"], | ["c", "2b|NA|3"], | ["", "NA"]] | ); 1| testSummarizer(["unittest-mis-8", "-H", "--group-by", "1", "--values", "1,2,3", "-r", "NA", file1Path], | file1, | [["fld1", "fld1_values", "fld2_values", "fld3_values"], | ["a", "a|a", "a|c", "3|2b"], | ["c", "c|c|c", "a|bc|bc", "2b|NA|3"], | ["", "NA", "bc", "NA"]] | ); 1| testSummarizer(["unittest-mis-9", "-H", "--group-by", "1,2", "--values", "3,1", "-x", file1Path], | file1, | [["fld1", "fld2", "fld3_values", "fld1_values"], | ["a", "a", "3", "a"], | ["c", "a", "2b", "c"], | ["c", "bc", "3", "c|c"], | ["a", "c", "2b", "a"], | ["", "bc", "", ""]] | ); 1| testSummarizer(["unittest-mis-10", "-H", "--group-by", "3,2", "--values", "1", "-x", file1Path], | file1, | [["fld3", "fld2", "fld1_values"], | ["3", "a", "a"], | ["2b", "a", "c"], | ["", "bc", "c"], | ["2b", "c", "a"], | ["3", "bc", "c"]] | ); 1| testSummarizer(["unittest-mis-11", "-H", "--group-by", "2,1,3", "--values", "2", "-x", file1Path], | file1, | [["fld2", "fld1", "fld3", "fld2_values"], | ["a", "a", "3", "a"], | ["a", "c", "2b", "a"], | ["bc", "c", "", "bc"], | ["c", "a", "2b", "c"], | ["bc", "", "", "bc"], | ["bc", "c", "3", "bc"]] | ); 1| testSummarizer(["unittest-mis-12", "-H", "--group-by", "1,2", "--values", "3,1", "-r", "NA", file1Path], | file1, | [["fld1", "fld2", "fld3_values", "fld1_values"], | ["a", "a", "3", "a"], | ["c", "a", "2b", "c"], | ["c", "bc", "NA|3", "c|c"], | ["a", "c", "2b", "a"], | ["", "bc", "NA", "NA"]] | ); 1| testSummarizer(["unittest-mis-13", "-H", "--group-by", "3,2", "--values", "1", "-r", "NA", file1Path], | file1, | [["fld3", "fld2", "fld1_values"], | ["3", "a", "a"], | ["2b", "a", "c"], | ["", "bc", "c|NA"], | ["2b", "c", "a"], | ["3", "bc", "c"]] | ); 1| testSummarizer(["unittest-mis-14", "-H", "--group-by", "2,1,3", "--values", "2", "-r", "NA", file1Path], | file1, | [["fld2", "fld1", "fld3", "fld2_values"], | ["a", "a", "3", 
"a"], | ["a", "c", "2b", "a"], | ["bc", "c", "", "bc"], | ["c", "a", "2b", "c"], | ["bc", "", "", "bc"], | ["bc", "c", "3", "bc"]] | ); | | /* Validate that the no-key summarizer works with testSummarizer helper function. | */ 1| testSummarizer(["unittest-nk-1", "-H", "--values", "1,2", file1Path], | file1, | [["fld1_values", "fld2_values"], | ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] | ); 1| testSummarizer(["unittest-nk-1-named", "-H", "--values", "fld1,fld2", file1Path], | file1, | [["fld1_values", "fld2_values"], | ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] | ); | | /* Header variations: no header line; auto-generated header line; custom headers. | */ 1| testSummarizer(["unittest-hdr-1", "--group-by", "1", "--values", "1", file1NoHeaderPath], | file1[1..$], | [["a", "a|a"], | ["c", "c|c|c"], | ["", ""]] | ); 1| testSummarizer(["unittest-hdr-2", "--group-by", "1,2", "--values", "2", file1NoHeaderPath], | file1[1..$], | [["a", "a", "a"], | ["c", "a", "a"], | ["c", "bc", "bc|bc"], | ["a", "c", "c"], | ["", "bc", "bc"]] | ); 1| testSummarizer(["unittest-hdr-3", "--write-header", "--group-by", "2", "--values", "1", file1NoHeaderPath], | file1[1..$], | [["field2", "field1_values"], | ["a", "a|c"], | ["bc", "c||c"], | ["c", "a"]] | ); 1| testSummarizer(["unittest-hdr-4", "-w", "--group-by", "3,2", "--values", "1", file1NoHeaderPath], | file1[1..$], | [["field3", "field2", "field1_values"], | ["3", "a", "a"], | ["2b", "a", "c"], | ["", "bc", "c|"], | ["2b", "c", "a"], | ["3", "bc", "c"]] | ); 1| testSummarizer(["unittest-hdr-5", "-H", "--group-by", "2", "--values", "3:Field3Values", file1Path], | file1, | [["fld2", "Field3Values"], | ["a", "3|2b"], | ["bc", "||3"], | ["c", "2b"]] | ); 1| testSummarizer(["unittest-hdr-6", "-H", "--group-by", "1,2", "--values", "3:FieldThreeValues", "--values", "1:FieldOneValues", file1Path], | file1, | [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"], | ["a", "a", "3", "a"], | ["c", "a", "2b", "c"], | ["c", "bc", "|3", "c|c"], | ["a", "c", "2b", 
"a"], | ["", "bc", "", ""]] | ); 1| testSummarizer(["unittest-hdr-6-named-a", "-H", "--group-by", "fld1,fld2", "--values", "fld3:FieldThreeValues", "--values", "fld1:FieldOneValues", file1Path], | file1, | [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"], | ["a", "a", "3", "a"], | ["c", "a", "2b", "c"], | ["c", "bc", "|3", "c|c"], | ["a", "c", "2b", "a"], | ["", "bc", "", ""]] | ); 1| testSummarizer(["unittest-hdr-6-named-b", "-H", "--group-by", "fld1,fld2", "--values", "fld3 FieldThreeValues", "--values", "fld1 FieldOneValues", file1Path], | file1, | [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"], | ["a", "a", "3", "a"], | ["c", "a", "2b", "c"], | ["c", "bc", "|3", "c|c"], | ["a", "c", "2b", "a"], | ["", "bc", "", ""]] | ); 1| testSummarizer(["unittest-hdr-7", "--write-header", "--group-by", "1", "--values", "3:f3_vals","--values", "2:f2_vals", "--values", "1:f1_vals", file1NoHeaderPath], | file1[1..$], | [["field1", "f3_vals", "f2_vals", "f1_vals"], | ["a", "3|2b", "a|c", "a|a"], | ["c", "2b||3", "a|bc|bc", "c|c|c"], | ["", "", "bc", ""]] | ); 1| testSummarizer(["unittest-hdr-8", "--write-header", "--group-by", "1,3,2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2", file1NoHeaderPath], | file1[1..$], | [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"], | ["a", "3", "a", "3", "a", "a"], | ["c", "2b", "a", "2b", "c", "a"], | ["c", "", "bc", "", "c", "bc"], | ["a", "2b", "c", "2b", "a", "c"], | ["", "", "bc", "", "", "bc"], | ["c", "3", "bc", "3", "c", "bc"]] | ); 1| testSummarizer(["unittest-hdr-9", "--write-header", "--group-by", "1,3-2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2", file1NoHeaderPath], | file1[1..$], | [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"], | ["a", "3", "a", "3", "a", "a"], | ["c", "2b", "a", "2b", "c", "a"], | ["c", "", "bc", "", "c", "bc"], | ["a", "2b", "c", "2b", "a", "c"], | ["", "", "bc", "", "", "bc"], | 
["c", "3", "bc", "3", "c", "bc"]] | ); | | /* Alternate file widths and lengths. | */ | 1| auto file3x2 = [["fld1", "fld2", "fld3"], | ["a", "b", "c"], | ["c", "b", "a"]]; | 1| auto file3x2Path = buildPath(testDir, "file3x2.tsv"); 1| auto file3x2NoHeaderPath = buildPath(testDir, "file3x2_noheader.tsv"); 1| writeDataFile(file3x2Path, file3x2); 1| writeDataFile(file3x2NoHeaderPath, file3x2[1 .. $]); | 1| testSummarizer(["unittest-3x2-1", "-H", "--group-by", "1", "--values", "3", file3x2Path], | file3x2, | [["fld1", "fld3_values"], | ["a", "c"], | ["c", "a"]] | ); 1| testSummarizer(["unittest-3x2-2", "-H", "--group-by", "2", "--values", "3", file3x2Path], | file3x2, | [["fld2", "fld3_values"], | ["b", "c|a"]] | ); 1| testSummarizer(["unittest-3x2-3", "-H", "--group-by", "2,1", "--values", "3", file3x2Path], | file3x2, | [["fld2", "fld1", "fld3_values"], | ["b", "a", "c"], | ["b", "c", "a"]] | ); | 1| auto file3x1 = [["fld1", "fld2", "fld3"], | ["a", "b", "c"]]; | 1| auto file3x1Path = buildPath(testDir, "file3x1.tsv"); 1| auto file3x1NoHeaderPath = buildPath(testDir, "file3x1_noheader.tsv"); 1| writeDataFile(file3x1Path, file3x1); 1| writeDataFile(file3x1NoHeaderPath, file3x1[1 .. 
$]); | 1| testSummarizer(["unittest-3x1-1", "-H", "--group-by", "1", "--values", "3", file3x1Path], | file3x1, | [["fld1", "fld3_values"], | ["a", "c"]] | ); 1| testSummarizer(["unittest-3x1-2", "--group-by", "1", "--values", "3", file3x1NoHeaderPath], | file3x1[1..$], | [["a", "c"]] | ); 1| testSummarizer(["unittest-3x1-3", "-H", "--group-by", "2,1", "--values", "3", file3x1Path], | file3x1, | [["fld2", "fld1", "fld3_values"], | ["b", "a", "c"]] | ); 1| testSummarizer(["unittest-3x1-3-named", "-H", "--group-by", "fld2,fld1", "--values", "fld3", file3x1Path], | file3x1, | [["fld2", "fld1", "fld3_values"], | ["b", "a", "c"]] | ); 1| testSummarizer(["unittest-3x1-4", "--group-by", "2,1", "--values", "3", file3x1NoHeaderPath], | file3x1[1..$], | [["b", "a", "c"]] | ); | 1| auto file3x0 = [["fld1", "fld2", "fld3"]]; | 1| auto file3x0Path = buildPath(testDir, "file3x0.tsv"); 1| auto file3x0NoHeaderPath = buildPath(testDir, "file3x0_noheader.tsv"); 1| writeDataFile(file3x0Path, file3x0); 1| writeDataFile(file3x0NoHeaderPath, file3x0[1 .. 
$]); | | 1| testSummarizer(["unittest-3x0-1", "-H", "--group-by", "1", "--values", "3", file3x0Path], | file3x0, | [["fld1", "fld3_values"]] | ); 1| testSummarizer(["unittest-3x0-1-named", "-H", "--group-by", "fld1", "--values", "fld3", file3x0Path], | file3x0, | [["fld1", "fld3_values"]] | ); 1| testSummarizer(["unittest-3x0-2", "--group-by", "1", "--values", "3", file3x0NoHeaderPath], | file3x0[1..$], | [] | ); 1| testSummarizer(["unittest-3x0-3", "--write-header", "--group-by", "1", "--values", "3", file3x0NoHeaderPath], | file3x0[1..$], | [["field1", "field3_values"]] | ); | | 1| testSummarizer(["unittest-3x0-4", "-H", "--group-by", "2,1", "--values", "3", file3x0Path], | file3x0, | [["fld2", "fld1", "fld3_values"]] | ); | 1| testSummarizer(["unittest-3x0-5", "--group-by", "2,1", "--values", "3", file3x0NoHeaderPath], | file3x0[1..$], | [] | ); | 1| testSummarizer(["unittest-3x0-6", "--write-header", "--group-by", "2,1", "--values", "3", file3x0NoHeaderPath], | file3x0[1..$], | [["field2", "field1", "field3_values"]] | ); | 1| auto file2x1 = [["fld1", "fld2"], | ["a", "b"]]; | 1| auto file2x1Path = buildPath(testDir, "file2x1.tsv"); 1| auto file2x1NoHeaderPath = buildPath(testDir, "file2x1_noheader.tsv"); 1| writeDataFile(file2x1Path, file2x1); 1| writeDataFile(file2x1NoHeaderPath, file2x1[1 .. $]); | 1| testSummarizer(["unittest-2x1-1", "-H", "--group-by", "1", "--values", "2", file2x1Path], | file2x1, | [["fld1", "fld2_values"], | ["a", "b"]] | ); 1| testSummarizer(["unittest-2x1-2", "-H", "--group-by", "2,1", "--values", "1", file2x1Path], | file2x1, | [["fld2", "fld1", "fld1_values"], | ["b", "a", "a"]] | ); | 1| auto file2x0 = [["fld1", "fld2"]]; | 1| auto file2x0Path = buildPath(testDir, "file2x0.tsv"); 1| auto file2x0NoHeaderPath = buildPath(testDir, "file2x0_noheader.tsv"); 1| writeDataFile(file2x0Path, file2x0); 1| writeDataFile(file2x0NoHeaderPath, file2x0[1 .. 
$]); | 1| testSummarizer(["unittest-2x0-1", "-H", "--group-by", "1", "--values", "2", file2x0Path], | file2x0, | [["fld1", "fld2_values"]] | ); 1| testSummarizer(["unittest-2x0-2", "-H", "--group-by", "2,1", "--values", "1", file2x0Path], | file2x0, | [["fld2", "fld1", "fld1_values"]] | ); | 1| auto file1x2 = [["fld1"], | ["a"], | [""]]; | 1| auto file1x2Path = buildPath(testDir, "file1x2.tsv"); 1| auto file1x2NoHeaderPath = buildPath(testDir, "file1x2_noheader.tsv"); 1| writeDataFile(file1x2Path, file1x2); 1| writeDataFile(file1x2NoHeaderPath, file1x2[1 .. $]); | 1| testSummarizer(["unittest-1x2-1", "-H", "--group-by", "1", "--values", "1", file1x2Path], | file1x2, | [["fld1", "fld1_values"], | ["a", "a"], | ["", ""]] | ); | 1| auto file1x2b = [["fld1"], | [""], | [""]]; | 1| auto file1x2bPath = buildPath(testDir, "file1x2b.tsv"); 1| auto file1x2bNoHeaderPath = buildPath(testDir, "file1x2b_noheader.tsv"); 1| writeDataFile(file1x2bPath, file1x2b); 1| writeDataFile(file1x2bNoHeaderPath, file1x2b[1 .. $]); | 1| testSummarizer(["unittest-1x2b-2", "-H", "--group-by", "1", "--values", "1", file1x2bPath], | file1x2b, | [["fld1", "fld1_values"], | ["", "|"]] | ); | 1| auto file1x1 = [["fld1"], | ["x"]]; | 1| auto file1x1Path = buildPath(testDir, "file1x1.tsv"); 1| auto file1x1NoHeaderPath = buildPath(testDir, "file1x1_noheader.tsv"); 1| writeDataFile(file1x1Path, file1x1); 1| writeDataFile(file1x1NoHeaderPath, file1x1[1 .. 
$]); | 1| testSummarizer(["unittest-1x1-1", "-H", "--group-by", "1", "--values", "1", file1x1Path], | file1x1, | [["fld1", "fld1_values"], | ["x", "x"]] | ); 1| testSummarizer(["unittest-1x1-1-named", "-H", "--group-by", "fld1", "--values", "fld1", file1x1Path], | file1x1, | [["fld1", "fld1_values"], | ["x", "x"]] | ); | 1| testSummarizer(["unittest-1x1-2", "--group-by", "1", "--values", "1", file1x1NoHeaderPath], | file1x1[1..$], | [["x", "x"]] | ); | 1| testSummarizer(["unittest-1x1-3", "--write-header", "--group-by", "1", "--values", "1", file1x1NoHeaderPath], | file1x1[1..$], | [["field1", "field1_values"], | ["x", "x"]] | ); | 1| auto file1x1b = [["fld1"], | [""]]; | 1| auto file1x1bPath = buildPath(testDir, "file1x1b.tsv"); 1| auto file1x1bNoHeaderPath = buildPath(testDir, "file1x1b_noheader.tsv"); 1| writeDataFile(file1x1bPath, file1x1b); 1| writeDataFile(file1x1bNoHeaderPath, file1x1b[1 .. $]); | 1| testSummarizer(["unittest-1x1b-1", "-H", "--group-by", "1", "--values", "1", file1x1bPath], | file1x1b, | [["fld1", "fld1_values"], | ["", ""]] | ); | 1| auto file1x0 = [["fld1"]]; | 1| auto file1x0Path = buildPath(testDir, "file1x0.tsv"); 1| auto file1x0NoHeaderPath = buildPath(testDir, "file1x0_noheader.tsv"); 1| writeDataFile(file1x0Path, file1x0); 1| writeDataFile(file1x0NoHeaderPath, file1x0[1 .. $]); | 1| testSummarizer(["unittest-1x0-1", "-H", "--group-by", "1", "--values", "1", file1x0Path], | file1x0, | [["fld1", "fld1_values"]] | ); | 1| testSummarizer(["unittest-1x0-2", "--group-by", "1", "--values", "1", file1x0NoHeaderPath], | file1x0[1..$], | [] | ); | 1| testSummarizer(["unittest-1x0-3", "--write-header", "--group-by", "1", "--values", "1", file1x0NoHeaderPath], | file1x0[1..$], | [["field1", "field1_values"]] | ); | | /* Alternate delimiters. | * | * Note: In current unit test setup the data is already in memory (file1). | * 'file1Path' points to a file with equivalent data, but not read, except if | * processing the header line. 
A data file is created for the '%' and '#' | * delimiter cases (these read the header), but we don't bother for the others. | */ 1| auto file1PctDelimPath = buildPath(testDir, "file1PctDelim.tsv"); 1| auto file1HashDelimPath = buildPath(testDir, "file1HashDelim.tsv"); 1| writeDataFile(file1PctDelimPath, file1, "%"); 1| writeDataFile(file1HashDelimPath, file1, "#"); | 1| testSummarizer(["unittest-delim-1", "-H", "--values", "1,2", "--delimiter", "%", file1PctDelimPath], | file1, | [["fld1_values", "fld2_values"], | ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] | ); 1| testSummarizer(["unittest-delim-1-named", "-H", "--values", "fld1,fld2", "--delimiter", "%", file1PctDelimPath], | file1, | [["fld1_values", "fld2_values"], | ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] | ); 1| testSummarizer(["unittest-delim-2", "-H", "--values", "1-2", "--values-delimiter", "$", file1Path], | file1, | [["fld1_values", "fld2_values"], | ["a$c$c$a$$c", "a$a$bc$c$bc$bc"]] | ); 1| testSummarizer(["unittest-delim-3", "-H", "--values", "1,2", "--delimiter", "#", "--values-delimiter", ",", file1HashDelimPath], | file1, | [["fld1_values", "fld2_values"], | ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]] | ); 1| testSummarizer(["unittest-delim-3-named", "-H", "--values", "fld1,fld2", "--delimiter", "#", "--values-delimiter", ",", file1HashDelimPath], | file1, | [["fld1_values", "fld2_values"], | ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]] | ); 1| testSummarizer(["unittest-delim-4", "--write-header", "--group-by", "2", "--values", "1", | "--delimiter", "^", "--values-delimiter", ":", file1NoHeaderPath], | file1[1..$], | [["field2", "field1_values"], | ["a", "a:c"], | ["bc", "c::c"], | ["c", "a"]] | ); 1| testSummarizer(["unittest-delim-5", "--group-by", "1,2", "--values", "2", "--delimiter", "/", | "--values-delimiter", "\\", file1NoHeaderPath], | file1[1..$], | [["a", "a", "a"], | ["c", "a", "a"], | ["c", "bc", "bc\\bc"], | ["a", "c", "c"], | ["", "bc", "bc"]] | ); |} | |/* Summary Operators and Calculators | * | * Two types of objects 
are used in implementation: Operators and Calculators. An Operator | * represents a summary calculation specified on the command line, e.g. '--mean 5'. A | * Calculator is used to manage the summary calculation for each unique key in the input. | * | * As an example, consider the command: | * | * $tsv-summarize --group-by 1 --mean 3 --mean 5 | * | * This command will create two instances of a MeanOperator, one each for fields 3 and 5. | * They produce the output field headers (e.g. "field3_mean", "field5_mean"). They also | * create MeanCalculator objects for each unique value in field 1. For 'mean', a | * calculator needs to track occurrence count and sum. Calculators produce the final | * value when all processing is finished. | * | * Summary field headers | * | * There are several options for specifying summary field headers. The defaults combine the | * operator name and the header of the field summarized. The defaults can be overridden on | * on the command line. These scenarios are supported via the operator constructor and the | * processHeaderLine() method. | * | * Missing field policy | * | * At present, tsv-summarize has a single policy for handling missing values that applies | * to all operators. However, it is logically operator specific and is implemented that | * way. The MissingFieldPolicy struct describes the policy, each operator contains one. | * Calculators access thier operator's policy struct. | */ | |/** An Operator represents a summary calculation specified on the command line. | * e.g. '--mean 5'. | */ |interface Operator |{ | @property string header(); | @property string name(); | void processHeaderLine(const char[][] fields); | size_t[] numericFieldsToSave(); // Numeric fields this Operator needs saved | size_t[] textFieldsToSave(); // Text fields this Operator needs saved | Calculator makeCalculator(); |} | |/** Calculators are responsible for the calculation of a single computation. 
They | * process each line and produce the final value when all processing is finished. | */ |interface Calculator |{ | void processNextLine(const char[][] fields); | string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions); |} | |/** This class describes processing behavior when a missing value is encountered. | */ |final class MissingFieldPolicy |{ | private bool _useMissing = true; // True if missing values are processed unchanged. | private bool _replaceMissing = false; // True if missing values are replaced. | private string _missingReplacement; // Replacement string if replaceMissing is true. | 342| this (const bool excludeMissing = false, string missingReplacement = "") | { 342| updatePolicy(excludeMissing, missingReplacement); | } | | void updatePolicy(const bool excludeMissing, string missingReplacement) | { 520| _missingReplacement = missingReplacement; 520| _replaceMissing = missingReplacement.length != 0; 1010| _useMissing = !excludeMissing && !replaceMissing; | } | | final bool isMissingField(const char[] field) const | { 2022| return field.length == 0; | } | | final bool useMissing() const @property | { 6889| return _useMissing; | } | | final bool excludeMissing() const @property | { 0000000| return !_useMissing && !_replaceMissing; | } | | final bool replaceMissing() const @property | { 904| return _replaceMissing; | } | | final string missingReplacement() const @property | { 209| return _missingReplacement; | } |} | |/* The SharedFieldValues and UniqueKeyValuesLists classes manage lists of values collected | * while reading data. Operations like median collect all values and operate on them when | * running the final calculation. Value lists are needed for each unique key. A command | * using multiple Operators may save multiple fields. And, different Operators may be run | * against the same field. | * | * The last part motivates these classes. 
Handling large data sets necessitates minimizing | * in-memory storage, making it desirable to share identical lists between Calculators. | * Otherwise, each Calculator could implement its own storage, which would be simpler. | * | * The setup works as follows: | * - Operators advertise fields they need saved ([text|numeric]FieldsToSave methods). | * - The SummarizerBase object keeps a SharedFieldValues object, which in turn keeps list | * of the fields advertised by Operators as needing sharing. This list gets created | * during command initialization (SummarizerBase.setOperators). | * - The SharedFieldValues object is used to create a UniqueKeyValuesLists object every | * time a new unique key is found, in parellel to the Calculator objects created for the | * key. The UniqueKeyValuesLists objects are managed by derived Summarizer classes. | * - A unique key's UniqueKeyValuesLists object is passed each input line, same as | * Calculators, saving the values. | * - Calculators retrieve the saved values during the calculation phase. The calculator's | * ProcessNextField method is typically a no-op. | * - Calculators cannot make assumptions about the order of the saved values. This is | * pragmatic concession to median and quantile calculations, which need to sort the data, | * at least partially. Rather than generate sorted copies, the current algorithms | * sort the data in place. | * | * One concession to duplicate storage is that text and numeric versions of the same | * field might be stored. The reason is because it's important to convert text to numbers | * as they are read so that useful error messages can be generated. And, storing both | * forms of the same field should be less common. | * | * The current implementation uses the same missing values policy for all fields. If | * multiple policies become supported this will need to change. | * | * Built-in calculations - UniqueKeyValueLists have a built-in median operation. 
This is | * to avoid repeated calculations of the median by different calculations. | */ | |final class SharedFieldValues |{ | // Arrays with field indices that need to be saved. | private size_t[] _numericFieldIndices; | private size_t[] _textFieldIndices; | | /* Called during summarizer setup to add a shared field value for a specific field index. | * eg. '--median 7' will add end up calling addNumericIdex(6), 6 being the zero-based index. | * A specific index is only added once. | */ | final void addNumericIndex (size_t index) | { 368| if (!canFind(_numericFieldIndices, index)) _numericFieldIndices ~= index; | } | | /* Similar to addNumericIndex, except adds a text index. */ | final void addTextIndex (size_t index) | { 352| if (!canFind(_textFieldIndices, index)) _textFieldIndices ~= index; | } | | /* Called every time a new key is found, or once at the beginning of the program if no keys | * are being used (entire column summarized). | */ | final UniqueKeyValuesLists makeUniqueKeyValuesLists() | { 468| return new UniqueKeyValuesLists(_numericFieldIndices, _textFieldIndices); | } |} | |final class UniqueKeyValuesLists |{ | /* A FieldValues object holds is a list of values collect for a specific field. A | * unique key may hold several. For example, the command: | * $ tsv-summarize --k 1 --median 4 -- median 5 | * requires keeping lists for both fields 4 and 5. This in turn will result in a | * _numericFieldValues being a 2 element array, one with a list of field 4 values, | * the second of field 5 values. Linear search is used to find a specific field. | */ | private FieldValues!double[] _numericFieldValues; | private FieldValues!string[] _textFieldValues; | private double[] _numericFieldMedians; | | /* The UniqueKeyValuesLists constructor takes arrays of field indices to be saved. 
*/ 468| this(const size_t[] numericFieldIndices, const size_t[] textFieldIndices) | { 468| if (numericFieldIndices.length > 0) | { 182| _numericFieldValues = new FieldValues!double[](numericFieldIndices.length); 1274| foreach (i, fieldIndex; numericFieldIndices) 182| _numericFieldValues[i] = new FieldValues!double(fieldIndex); | } | 468| if (textFieldIndices.length > 0) | { 288| _textFieldValues = new FieldValues!string[](textFieldIndices.length); 2544| foreach (i, fieldIndex; textFieldIndices) 420| _textFieldValues[i] = new FieldValues!string(fieldIndex); | } | } | | void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy) | { 2466| _numericFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy)); 2914| _textFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy)); | } | | private FieldValues!double findNumericFieldValues(size_t index) | { 988| alias pred = (FieldValues!double a, size_t b) => (a.fieldIndex == b); 988| auto r = find!pred(_numericFieldValues, index); 988| assert(!r.empty); 988| return r.front; | } | | private FieldValues!string findTextFieldValues(size_t index) | { 771| alias pred = (FieldValues!string a, size_t b) => (a.fieldIndex == b); 600| auto r = find!pred(_textFieldValues, index); 600| assert(!r.empty); 600| return r.front; | } | | final double[] numericValues(size_t index) | { 192| return findNumericFieldValues(index).getArray; | } | | final double[] numericValuesSorted(size_t index) | { 422| return findNumericFieldValues(index).getSortedArray; | } | | final string[] textValues(size_t index) | { 600| return findTextFieldValues(index).getArray; | } | | final string[] textValuesSorted(size_t index) | { 0000000| return findTextFieldValues(index).getSortedArray; | } | | final double numericValuesMedian(size_t index) | { 374| return findNumericFieldValues(index).median; | } | | private final class FieldValues(ValueType) | { | import std.array : appender; | private size_t _fieldIndex; | private 
Appender!(ValueType[]) _values; | private bool _haveMedian = false; | private bool _isSorted = false; | private ValueType _medianValue; | 602| this(size_t fieldIndex) | { 602| _fieldIndex = fieldIndex; | } | | final size_t length() const @property | { 0000000| return _values.data.length; | } | | final size_t fieldIndex() const @property | { 1759| return _fieldIndex; | } | | final void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy) | { | debug writefln("[%s]: %s", __FUNCTION__, fields.to!string); | 1472| const char[] field = fields[_fieldIndex]; 1832| if (missingPolicy.useMissing || !missingPolicy.isMissingField(field)) | { 1368| _values.put(field.to!ValueType); 1368| _haveMedian = false; 1368| _isSorted = false; | } 104| else if (missingPolicy.replaceMissing) | { 53| _values.put(missingPolicy.missingReplacement.to!ValueType); 53| _haveMedian = false; 53| _isSorted = false; | } | } | | /* Return an input range of the values. */ | final auto values() | { 0000000| return _values.data; | } | | final ValueType[] getArray() | { 792| return _values.data; | } | | final ValueType[] getSortedArray() | { 422| if (!_isSorted) | { | import std.algorithm : sort; 412| sort(_values.data); 412| _isSorted = true; | } 422| return _values.data; | } | | final ValueType median() | { 374| if (!_haveMedian) | { | import tsv_utils.common.numerics : rangeMedian; 352| _medianValue = _values.data.rangeMedian(); 352| _haveMedian = true; | } | 374| return _medianValue; | } | } |} | |/** SingleFieldOperator is a base class for single field operators, the most common | * Operator. Derived classes implement makeCalculator and the Calculator class it returns. 
| */ |class SingleFieldOperator : Operator |{ | import std.typecons : Flag; | | private string _name; | private string _header; | private size_t _fieldIndex; | private bool _useHeaderSuffix; | private bool _allowCustomHeader; | private bool _hasCustomHeader = false; | private size_t[] _numericFieldsToSave; | private size_t[] _textFieldsToSave; | private MissingFieldPolicy _missingPolicy; | 1246| this(string operatorName, size_t fieldIndex, MissingFieldPolicy missingPolicy, | Flag!"useHeaderSuffix" useHeaderSuffix = Yes.useHeaderSuffix, | Flag!"allowCustomHeader" allowCustomHeader = Yes.allowCustomHeader) | { 1246| _name = operatorName; 1246| _fieldIndex = fieldIndex; 1246| _missingPolicy = missingPolicy; 1246| _useHeaderSuffix = useHeaderSuffix; 1246| _allowCustomHeader = allowCustomHeader; | // Default header. May be overrridden by custom header or header line. 1246| _header = | fieldHeaderFromIndex(fieldIndex) 1246| .summaryHeaderFromFieldHeader(_useHeaderSuffix ? operatorName : ""); | } | | void setCustomHeader (string customHeader) | { 382| assert(_allowCustomHeader); 382| _header = customHeader; 382| _hasCustomHeader = true; | } | | final string name() const @property | { 0000000| return _name; | } | | final bool allowCustomHeader() const @property | { 399| return _allowCustomHeader; | } | | /* saveFieldValues[Numeric|Text] are called by derived classes to indicate that field | * that the field values should be saved. These should called during construction. 
| */ | final void setSaveFieldValuesNumeric() | { 186| _numericFieldsToSave ~= _fieldIndex; | } | | final void setSaveFieldValuesText() | { 176| _textFieldsToSave ~= _fieldIndex; | } | | final MissingFieldPolicy missingPolicy() @property | { 5417| return _missingPolicy; | } | | final size_t fieldIndex() const @property | { 1805| return _fieldIndex; | } | | final string header() const @property | { 1003| return _header; | } | | final bool useHeaderSuffix() const @property | { 0000000| return _useHeaderSuffix; | } | | void processHeaderLine(const char[][] fields) | { 608| if (!_hasCustomHeader) { | debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string); 417| _header = summaryHeaderFromFieldHeader(fields[_fieldIndex].to!string, 417| _useHeaderSuffix ? _name : ""); | } | } | | final size_t[] numericFieldsToSave() | { 1229| return _numericFieldsToSave; | } | | final size_t[] textFieldsToSave() | { 1229| return _textFieldsToSave; | } | | abstract SingleFieldCalculator makeCalculator(); |} | |/** SingleFieldCalculator is a base class for the common case of calculators using a single | * field. Derived classes implement processNextField() rather than processNextLine(). 
| */ |class SingleFieldCalculator : Calculator |{ | private size_t _fieldIndex; | 1805| this(size_t fieldIndex) | { 1805| _fieldIndex = fieldIndex; | } | | final size_t fieldIndex() const @property | { 1588| return _fieldIndex; | } | | final void processNextLine(const char[][] fields) | { | debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string); | 5417| auto missingPolicy = getOperator.missingPolicy; 5417| const char[] field = fields[_fieldIndex]; | 6455| if (missingPolicy.useMissing || !missingPolicy.isMissingField(field)) | { 5107| processNextField(field); | } 310| else if (missingPolicy.replaceMissing) | { 156| processNextField(missingPolicy.missingReplacement); | } | } | | abstract SingleFieldOperator getOperator(); | | abstract void processNextField(const char[] field); |} | |/* Unittest helper functions. Only compiled when -unittest is in effect. */ |version(unittest) |{ | /** A helper for SingleFieldOperator unit tests. | * | * testSingleFieldOperator takes a set of split file values, a field index, a header | * suffix, and a set of expected values. The expected values array contains the | * initial value (zero entries) and the expected values after each line. (One more | * expected value than input lines.) The zero entry case is what is generated for an | * empty file. An example testing the 'min' operator against a file with 2 columns, | * 3 rows, using field index 1: | * | * testSingleFieldOperator!MinOperator( | * [["10", "100"], // The split file. 3 lines by 2 rows. | * ["5", "50"], | * ["20", "200"]], | * 1, // Field index (zero-based, so "100", "50", "200") | * "min", // The header suffix, normally the operator name. | * ["nan", "100", "50", "50"]); // Min value after processing each line. | * | * A typical operator unit test uses three "files", one each of 1x3, 2x3, and 3x3. | * Then run the operator is tested against each column, a total of six calls. Headers | * are automatically checked. 
Additional entries can be used to extend coverage. | * | * A non-default MissingFieldPolicy can be provide as an optional last argument. | * Operator tests should include exclusion and replacement variations. See operator | * unit tests for details. | * | * The testSingleFieldOperatorBase adds an additional capability - Custom operator | * init arguments. Currently this is used only by the quantile operator. | * | * These tests do not check unique key behavior (group-by). Operators don't have info | * about unique keys, and interact with them only indirectly, via Calculators. | */ | void testSingleFieldOperator(OperatorClass : SingleFieldOperator) | (const char[][][] splitFile, size_t fieldIndex, string headerSuffix, | const char[][] expectedValues, | MissingFieldPolicy missingPolicy = new MissingFieldPolicy) | { 172| testSingleFieldOperatorBase!OperatorClass(splitFile, fieldIndex, headerSuffix, expectedValues, missingPolicy); | } | | void testSingleFieldOperatorBase(OperatorClass : SingleFieldOperator, T...) | (const char[][][] splitFile, size_t fieldIndex, string headerSuffix, | const char[][] expectedValues, | MissingFieldPolicy missingPolicy, | T extraOpInitArgs) | { | import std.format : format; | import std.array : appender; | import std.string : chomp; | import std.traits : EnumMembers; | 192| auto numFields = (splitFile[0]).length; | 192| assert(fieldIndex < numFields, | format("[testSingleFieldOperator] Invalid field index. headerSuffix: %s", | headerSuffix)); 192| assert(splitFile.length + 1 == expectedValues.length, | format("[testSingleFieldOperator] Need one more expected value than number of rows. headerSuffix: %s", | headerSuffix)); | | /* printOptions - Only the 'values-delimiter' (2nd arg) is used these tests. */ 192| auto printOptions = SummarizerPrintOptions('#', '|'); | | /* An input header line. */ 192| string[] inputHeaderLine = new string[numFields]; 2176| foreach (i; 0 .. 
numFields) inputHeaderLine[i] = "header" ~ i.to!string; | | /* The different expected output field headers. */ 192| auto outputFieldHeaderWithNoHeaderLine = | fieldHeaderFromIndex(fieldIndex) | .summaryHeaderFromFieldHeader(headerSuffix); 192| auto outputFieldHeaderFromHeaderLine = | inputHeaderLine[fieldIndex] | .summaryHeaderFromFieldHeader(headerSuffix); 192| auto customOutputFieldHeader = "custom"; | | enum HeaderUsecase { | HeaderLine_DefaultHeader, | HeaderLine_CustomHeader, | NoHeaderLine_DefaultHeader, | NoHeaderLine_CustomHeader, | NoHeaderLine_NoOutputHeader, | } | | string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected) | { 0000000| return format("[testSingleFieldOperator] Unexpected header. Operator: %s; Usecase: %s; Actual: '%s'; Expected: '%s'", | op.name, hc, actual, expected); | } | | string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, size_t fieldIndex, | const char[] actual, const char[] expected) | { 0000000| return format("[testSingleFieldOperator] Unexpected value. Operator: %s; Usecase: %s; RowIndex: %d, FieldIndex: %d\n Actual: '%s'; Expected: '%s'", | op.name, hc, rowIndex, fieldIndex, actual, expected); | } | | /* Run the logic for each header use case. 
*/ | foreach (hc; EnumMembers!HeaderUsecase) | { 960| bool hasInputHeader = ( | hc == HeaderUsecase.HeaderLine_DefaultHeader || | hc == HeaderUsecase.HeaderLine_CustomHeader | ); 960| bool hasOutputHeader = ( | hc == HeaderUsecase.HeaderLine_DefaultHeader || | hc == HeaderUsecase.HeaderLine_CustomHeader || | hc == HeaderUsecase.NoHeaderLine_DefaultHeader || | hc == HeaderUsecase.NoHeaderLine_CustomHeader | ); 960| bool hasCustomHeader = ( | hc == HeaderUsecase.HeaderLine_CustomHeader || | hc == HeaderUsecase.NoHeaderLine_CustomHeader | ); | 1344| if (hasCustomHeader) assert(hasOutputHeader); | 960| auto op = new OperatorClass(fieldIndex, missingPolicy, extraOpInitArgs); | 960| if (hasCustomHeader) | { 400| if (!op.allowCustomHeader) continue; // Custom header not support by this operator 368| op.setCustomHeader(customOutputFieldHeader); | } | 944| Operator[] operatorArray; 944| operatorArray ~= op; | 944| auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy); 944| summarizer.setOperators(inputRangeObject(operatorArray)); | 1320| if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine); | 944| if (hasOutputHeader) | { | /* Write the header line. Note that this is a one-field header, */ 752| auto headerLineOutput = appender!(char[])(); 752| summarizer.writeSummaryHeader(headerLineOutput, printOptions); | | /* Test that the header was generated correctly. | * | * Note: Because the output is generated by a Summarizer, it will have a | * trailing newline. Use chomp to trim it. 
| */ 752| final switch (hc) | { 192| case HeaderUsecase.HeaderLine_DefaultHeader: 192| assert(headerLineOutput.data.chomp == outputFieldHeaderFromHeaderLine, | headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, | outputFieldHeaderFromHeaderLine)); 192| break; 192| case HeaderUsecase.NoHeaderLine_DefaultHeader: 192| assert(headerLineOutput.data.chomp == outputFieldHeaderWithNoHeaderLine, | headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, | outputFieldHeaderWithNoHeaderLine)); 192| break; 184| case HeaderUsecase.HeaderLine_CustomHeader: 368| case HeaderUsecase.NoHeaderLine_CustomHeader: 368| assert(headerLineOutput.data.chomp == customOutputFieldHeader, | headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, | customOutputFieldHeader)); 368| break; 0000000| case HeaderUsecase.NoHeaderLine_NoOutputHeader: 0000000| break; | } | | } | | /* For each line, process the line, generate the output, and test that the | * value is correct. Start with the empty file case. | */ 20116| foreach (i, const char[] expected; expectedValues) | { 7698| if (i > 0) summarizer.processNextLine(splitFile[i - 1]); 4321| auto summaryLineOutput = appender!(char[])(); 4321| summarizer.writeSummaryBody(summaryLineOutput, printOptions); 4321| assert(summaryLineOutput.data.chomp == expected, | valueAssertMessage(operatorArray[0], hc, i, fieldIndex, | summaryLineOutput.data.chomp, expectedValues[i])); | } | } | } |} | |/** ZeroFieldOperator is a base class for operators that take no input. The main use | * case is the CountOperator, which counts the occurrences of each unique key. Other | * uses are possible, for example, weighted random number assignment. | * | * The primary rationale for ZeroFieldOperator and ZeroFieldCalculator is to clarify | * the information available to such a routine. 
In particular, the split fields passed | * to processHeaderLine and processNextLine don't include all fields in the input, | * something that might not be obvious when implementing an operator. (Only fields | * required by operators acting on specific fields are included.) | */ |class ZeroFieldOperator : Operator |{ | import std.typecons : Flag; | | private string _name; | private string _header; | 55| this(string operatorName) | { 55| _name = operatorName; 55| _header = operatorName; | } | | void setCustomHeader (string customHeader) | { 7| _header = customHeader; | } | | bool allowCustomHeader() const @property | { 6| return true; | } | | final string name() const @property | { 0000000| return _name; | } | | final string header() const @property | { 34| return _header; | } | | /* A no-op. ZeroFieldOperators have no access to the header line. */ | final void processHeaderLine(const char[][] fields) { } | | /* A no-op. ZeroFieldOperators have no access to fields. */ | final size_t[] numericFieldsToSave() | { 54| size_t[] emptyArray; 54| return emptyArray; | } | | /* A no-op. ZeroFieldOperators have no access to fields. */ | final size_t[] textFieldsToSave() | { 54| size_t[] emptyArray; 54| return emptyArray; | } | | abstract ZeroFieldCalculator makeCalculator(); |} | |/** ZeroFieldCalculator is a base class for operators that don't use fields as input. | * In particular, the Count operator. It is a companion to the ZeroFieldOperator class. | * | * Derived classes implement processNextEntry() rather than processNextLine(), and the | * single argument form of calculate() given as an abstract function. 
| */ |class ZeroFieldCalculator : Calculator |{ 135| this() { } | | final void processNextLine(const char[][] fields) | { | debug writefln("[%s]", __FUNCTION__,); 366| processNextEntry(); | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 179| return calculate(printOptions); | } | | abstract void processNextEntry(); | abstract string calculate(const ref SummarizerPrintOptions printOptions); |} | |version(unittest) |{ | /* A helper for ZeroFieldOperator unit tests. | * | * testZeroFieldOperator takes a set of split file values, a default header, and a | * set of expected values. The expected values array contains the expected values | * after each line. | * | * testZeroFieldOperator is very similar to testSingleFieldOperator, except that | * there is no use of field indices and fewer types of headers. See the latter's | * documentation and the CountOperator unit tests for examples. | */ | void testZeroFieldOperator(OperatorClass : ZeroFieldOperator) | (const char[][][] splitFile, string defaultHeader, const char[][] expectedValues) | { | import std.format : format; | import std.array : appender; | import std.string : chomp; | import std.traits : EnumMembers; | 3| auto numFields = (splitFile[0]).length; | 3| assert(splitFile.length + 1 == expectedValues.length, | format("[testZeroFieldOperator] Need one more expected value than number of rows. headerSuffix: %s", | defaultHeader)); | | /* printOptions - Not used these tests, but needed for API calls. */ 3| auto printOptions = SummarizerPrintOptions('#', '|'); | | /* Missing policy doesn't apply to zero field operators, but need the object for the summarizer. */ 3| auto missingPolicy = new MissingFieldPolicy; | | /* An input header line. */ 3| string[] inputHeaderLine = new string[numFields]; 33| foreach (i; 0 .. 
numFields) inputHeaderLine[i] = "header" ~ i.to!string; | 3| auto customOutputFieldHeader = "custom"; | | enum HeaderUsecase { | HeaderLine_DefaultHeader, | HeaderLine_CustomHeader, | NoHeaderLine_DefaultHeader, | NoHeaderLine_CustomHeader, | NoHeaderLine_NoOutputHeader, | } | | string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected) | { 0000000| return format("[testZeroFieldOperator] Unexpected header. Operator: %s; Usecase: %s; Actual: '%s'; Expected: '%s'", | op.name, hc, actual, expected); | } | | string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, | const char[] actual, const char[] expected) | { 0000000| return format("[testZeroFieldOperator] Unexpected value. Operator: %s; Usecase: %s; RowIndex: %d\n Actual: '%s'; Expected: '%s'", | op.name, hc, rowIndex, actual, expected); | } | | /* Run the logic for each header use case. */ | foreach (hc; EnumMembers!HeaderUsecase) | { 15| bool hasInputHeader = ( | hc == HeaderUsecase.HeaderLine_DefaultHeader || | hc == HeaderUsecase.HeaderLine_CustomHeader | ); 15| bool hasOutputHeader = ( | hc == HeaderUsecase.HeaderLine_DefaultHeader || | hc == HeaderUsecase.HeaderLine_CustomHeader || | hc == HeaderUsecase.NoHeaderLine_DefaultHeader || | hc == HeaderUsecase.NoHeaderLine_CustomHeader | ); 15| bool hasCustomHeader = ( | hc == HeaderUsecase.HeaderLine_CustomHeader || | hc == HeaderUsecase.NoHeaderLine_CustomHeader | ); | 21| if (hasCustomHeader) assert(hasOutputHeader); | 15| auto op = new OperatorClass(); | 15| if (hasCustomHeader) | { 6| if (!op.allowCustomHeader) continue; // Custom header not support by this operator 6| op.setCustomHeader(customOutputFieldHeader); | } | 15| Operator[] operatorArray; 15| operatorArray ~= op; | 15| auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy); 15| summarizer.setOperators(inputRangeObject(operatorArray)); 21| if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine); | 
15| if (hasOutputHeader) | { | /* Write the header line. Note that this is a one-field header, */ 12| auto headerLineOutput = appender!(char[])(); 12| summarizer.writeSummaryHeader(headerLineOutput, printOptions); | | /* Test that the header was generated correctly. | * | * Note: Because the output is generated by a Summarizer, it will have a | * trailing newline. Use chomp to trim it. | */ 12| final switch (hc) | { 3| case HeaderUsecase.HeaderLine_DefaultHeader: 6| case HeaderUsecase.NoHeaderLine_DefaultHeader: 6| assert(headerLineOutput.data.chomp == defaultHeader, | headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, | defaultHeader)); 6| break; 3| case HeaderUsecase.HeaderLine_CustomHeader: 6| case HeaderUsecase.NoHeaderLine_CustomHeader: 6| assert(headerLineOutput.data.chomp == customOutputFieldHeader, | headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, | customOutputFieldHeader)); 6| break; 0000000| case HeaderUsecase.NoHeaderLine_NoOutputHeader: 0000000| break; | } | | } | | /* For each line, process the line, generate the output, and test that the | * value is correct. Start with the empty file case. | */ 285| foreach (i, const char[] expected; expectedValues) | { 105| if (i > 0) summarizer.processNextLine(splitFile[i - 1]); 60| auto summaryLineOutput = appender!(char[])(); 60| summarizer.writeSummaryBody(summaryLineOutput, printOptions); 60| assert(summaryLineOutput.data.chomp == expected, | valueAssertMessage(operatorArray[0], hc, i, | summaryLineOutput.data.chomp, expectedValues[i])); | } | } | } |} | |/* Specific operators. | * | * Notes: | * - The 'Calculator' inner classes are 'static'. This means inner class instances do not | * keep a reference to the context of the outer class. In exchange, Calculator instances | * need to hold all needed state, typically the field index they are summarizing. 
| */ | |/** CountOperator counts the number of occurrences of each unique key, or the number of | * input lines if there is no unique key. | * | * CountOperator differs from most other operators in that it doesn't summarize a specific | * field on the line. Instead it is summarizing a property of the unique key itself. For | * this reason it doesn't derive from SingleFieldOperator. | */ |final class CountOperator : ZeroFieldOperator |{ 55| this() | { 55| super("count"); | } | | final override ZeroFieldCalculator makeCalculator() | { 135| return new CountCalculator(); | } | 135| static final class CountCalculator : ZeroFieldCalculator | { | private size_t _count = 0; | | final override void processNextEntry() | { 366| _count++; | } | | final override string calculate(const ref SummarizerPrintOptions printOptions) | { 179| return printOptions.formatNumber(_count); | } | } |} | |unittest // CountOperator |{ 1| auto col1File = [["10"], ["9.5"], ["11"]]; 1| auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 1| auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; | 1| testZeroFieldOperator!CountOperator(col1File, "count", ["0", "1", "2", "3"]); 1| testZeroFieldOperator!CountOperator(col2File, "count", ["0", "1", "2", "3"]); 1| testZeroFieldOperator!CountOperator(col3File, "count", ["0", "1", "2", "3"]); |} | |/** RetainOperator retains the first occurrence of a field, without changing the header. | * | * RetainOperator is intended for fields where the value is expected to be the same for | * all occurrences of the unique key, and the goal is to pass the value through unchanged. | * It is like FirstOperator, except that the original header is preserved. The original | * header preservation is setup in the call to the SingleFieldOperation constructor. | * | * Notes: | * - An option to signal an error if multiple values are encountered might be useful. 
| */ |final class RetainOperator : SingleFieldOperator |{ 43| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 43| super("retain", fieldIndex, missingPolicy, No.useHeaderSuffix, No.allowCustomHeader); | } | | final override SingleFieldCalculator makeCalculator() | { 26| return new RetainCalculator(fieldIndex); | } | | final class RetainCalculator : SingleFieldCalculator | { | private bool _done = false; | private string _value = ""; | 26| this(size_t fieldIndex) | { 26| super(fieldIndex); | } | | final override RetainOperator getOperator() | { 86| return this.outer; | } | | final override void processNextField(const char[] nextField) | { 83| if (!_done) | { 26| _value = nextField.to!string; 26| _done = true; | } | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 98| return _value; | } | } |} | |unittest // RetainOperator |{ 1| auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]]; 1| auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]]; 1| auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]]; | 1| testSingleFieldOperator!RetainOperator(col1File, 0, "", ["", "r1c1", "r1c1", "r1c1"]); 1| testSingleFieldOperator!RetainOperator(col2File, 0, "", ["", "r1c1", "r1c1", "r1c1"]); 1| testSingleFieldOperator!RetainOperator(col2File, 1, "", ["", "r1c2", "r1c2", "r1c2"]); 1| testSingleFieldOperator!RetainOperator(col3File, 0, "", ["", "r1c1", "r1c1", "r1c1"]); 1| testSingleFieldOperator!RetainOperator(col3File, 1, "", ["", "r1c2", "r1c2", "r1c2"]); 1| testSingleFieldOperator!RetainOperator(col3File, 2, "", ["", "r1c3", "r1c3", "r1c3"]); | 1| auto col1misFile = [[""], ["r2c1"], ["r3c1"]]; 1| testSingleFieldOperator!RetainOperator(col1misFile, 0, "", ["", "", "r2c1", "r2c1"], | new MissingFieldPolicy(true, "")); // Exclude missing 1| testSingleFieldOperator!RetainOperator(col1misFile, 0, "", ["", "NA", "NA", "NA"], | new MissingFieldPolicy(false, 
"NA")); // Replace missing |} | |/** FirstOperator outputs the first value found for the field. | */ |final class FirstOperator : SingleFieldOperator |{ 42| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 42| super("first", fieldIndex, missingPolicy); | } | | final override SingleFieldCalculator makeCalculator() | { 42| return new FirstCalculator(fieldIndex); | } | | final class FirstCalculator : SingleFieldCalculator | { | private bool _done = false; | private string _value = ""; | 42| this(size_t fieldIndex) | { 42| super(fieldIndex); | } | | final override FirstOperator getOperator() | { 134| return this.outer; | } | | final override void processNextField(const char[] nextField) | { 129| if (!_done) | { 42| _value = nextField.to!string; 42| _done = true; | } | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 162| return _value; | } | } |} | |unittest // FirstOperator |{ 1| auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]]; 1| auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]]; 1| auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]]; | 1| testSingleFieldOperator!FirstOperator(col1File, 0, "first", ["", "r1c1", "r1c1", "r1c1"]); 1| testSingleFieldOperator!FirstOperator(col2File, 0, "first", ["", "r1c1", "r1c1", "r1c1"]); 1| testSingleFieldOperator!FirstOperator(col2File, 1, "first", ["", "r1c2", "r1c2", "r1c2"]); 1| testSingleFieldOperator!FirstOperator(col3File, 0, "first", ["", "r1c1", "r1c1", "r1c1"]); 1| testSingleFieldOperator!FirstOperator(col3File, 1, "first", ["", "r1c2", "r1c2", "r1c2"]); 1| testSingleFieldOperator!FirstOperator(col3File, 2, "first", ["", "r1c3", "r1c3", "r1c3"]); | 1| auto col1misFile = [[""], ["r2c1"], ["r3c1"]]; 1| testSingleFieldOperator!FirstOperator(col1misFile, 0, "first", ["", "", "r2c1", "r2c1"], | new MissingFieldPolicy(true, "")); // Exclude missing 1| 
testSingleFieldOperator!FirstOperator(col1misFile, 0, "first", ["", "NA", "NA", "NA"], | new MissingFieldPolicy(false, "NA")); // Replace missing |} | |/** LastOperator outputs the last value found for the field. | */ |final class LastOperator : SingleFieldOperator |{ 42| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 42| super("last", fieldIndex, missingPolicy); | } | | final override SingleFieldCalculator makeCalculator() | { 42| return new LastCalculator(fieldIndex); | } | | final class LastCalculator : SingleFieldCalculator | { | private string _value = ""; | 42| this(size_t fieldIndex) | { 42| super(fieldIndex); | } | | final override LastOperator getOperator() | { 134| return this.outer; | } | | final override void processNextField(const char[] nextField) | { 129| _value = nextField.to!string; | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 162| return _value; | } | } |} | |unittest // LastOperator |{ 1| auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]]; 1| auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]]; 1| auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]]; | 1| testSingleFieldOperator!LastOperator(col1File, 0, "last", ["", "r1c1", "r2c1", "r3c1"]); 1| testSingleFieldOperator!LastOperator(col2File, 0, "last", ["", "r1c1", "r2c1", "r3c1"]); 1| testSingleFieldOperator!LastOperator(col2File, 1, "last", ["", "r1c2", "r2c2", "r3c2"]); 1| testSingleFieldOperator!LastOperator(col3File, 0, "last", ["", "r1c1", "r2c1", "r3c1"]); 1| testSingleFieldOperator!LastOperator(col3File, 1, "last", ["", "r1c2", "r2c2", "r3c2"]); 1| testSingleFieldOperator!LastOperator(col3File, 2, "last", ["", "r1c3", "r2c3", "r3c3"]); | 1| auto col1misFile = [[""], ["r2c1"], ["r3c1"]]; 1| testSingleFieldOperator!LastOperator(col1misFile, 0, "last", ["", "", "r2c1", "r3c1"], | new MissingFieldPolicy(true, "")); // Exclude missing 1| 
testSingleFieldOperator!LastOperator(col1misFile, 0, "last", ["", "NA", "r2c1", "r3c1"], | new MissingFieldPolicy(false, "NA")); // Replace missing |} | |/** MinOperator output the minimum value for the field. This is a numeric operator. | * | * This operator returns the original string without additional numeric formatting. | * This can be useful when joining back to the original data. This is different than | * numeric operators that perform calculations. | */ |final class MinOperator : SingleFieldOperator |{ 78| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 78| super("min", fieldIndex, missingPolicy); | } | | final override SingleFieldCalculator makeCalculator() | { 180| return new MinCalculator(fieldIndex); | } | | final class MinCalculator : SingleFieldCalculator | { | private bool _isFirst = true; | private double _value = double.nan; | private string _originalString = "nan"; // Note: Cannot format floats at compile time (version 2.087) | 180| this(size_t fieldIndex) | { 180| super(fieldIndex); | } | | final override MinOperator getOperator() | { 440| return this.outer; | } | | final override void processNextField(const char[] nextField) | { 435| double fieldValue = nextField.to!double; 435| if (_isFirst) | { 180| _value = fieldValue; 180| _originalString = nextField.to!string; 180| _isFirst = false; | } 255| else if (fieldValue < _value) | { 104| _value = fieldValue; 104| _originalString = nextField.to!string; | } | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 300| return _originalString; | } | } |} | |unittest // MinOperator |{ 1| auto col1File = [["10"], ["9.5"], ["11"]]; 1| auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 1| auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; | 1| testSingleFieldOperator!MinOperator(col1File, 0, "min", ["nan", "10", "9.5", "9.5"]); 1| testSingleFieldOperator!MinOperator(col2File, 0, "min", 
["nan", "20", "20", "20"]); 1| testSingleFieldOperator!MinOperator(col2File, 1, "min", ["nan", "-30", "-30", "-31"]); 1| testSingleFieldOperator!MinOperator(col3File, 0, "min", ["nan", "9009", "199", "199"]); 1| testSingleFieldOperator!MinOperator(col3File, 1, "min", ["nan", "9", "0", "0"]); 1| testSingleFieldOperator!MinOperator(col3File, 2, "min", ["nan", "-4.5", "-4.5", "-4.5"]); | 1| auto col1misFile = [[""], ["10"], ["-10"]]; 1| testSingleFieldOperator!MinOperator(col1misFile, 0, "min", ["nan", "nan", "10", "-10"], | new MissingFieldPolicy(true, "")); // Exclude missing 1| testSingleFieldOperator!MinOperator(col1misFile, 0, "min", ["nan", "5", "5", "-10"], | new MissingFieldPolicy(false, "5")); // Replace missing |} | |/** MaxOperator output the maximum value for the field. This is a numeric operator. | * | * This operator returns the original string without additional numeric formatting. | * This can be useful when joining back to the original data. This is different than | * numeric operators that perform calculations. 
| */ |final class MaxOperator : SingleFieldOperator |{ 79| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 79| super("max", fieldIndex, missingPolicy); | } | | final override SingleFieldCalculator makeCalculator() | { 181| return new MaxCalculator(fieldIndex); | } | | final class MaxCalculator : SingleFieldCalculator | { | private bool _isFirst = true; | private double _value = double.nan; | private string _originalString = "nan"; // Note: Cannot format floats at compile time (version 2.087) | 181| this(size_t fieldIndex) | { 181| super(fieldIndex); | } | | final override MaxOperator getOperator() | { 442| return this.outer; | } | | final override void processNextField(const char[] nextField) | { 437| double fieldValue = nextField.to!double; 436| if (_isFirst) | { 181| _value = fieldValue; 181| _originalString = nextField.to!string; 181| _isFirst = false; | } 255| else if (fieldValue > _value) | { 89| _value = fieldValue; 89| _originalString = nextField.to!string; | } | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 300| return _originalString; | } | } |} | |unittest // MaxOperator |{ 1| auto col1File = [["10"], ["9.5"], ["11"]]; 1| auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 1| auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; | 1| testSingleFieldOperator!MaxOperator(col1File, 0, "max", ["nan", "10", "10", "11"]); 1| testSingleFieldOperator!MaxOperator(col2File, 0, "max", ["nan", "20", "21", "22"]); 1| testSingleFieldOperator!MaxOperator(col2File, 1, "max", ["nan", "-30", "-29", "-29"]); 1| testSingleFieldOperator!MaxOperator(col3File, 0, "max", ["nan", "9009", "9009", "9009"]); 1| testSingleFieldOperator!MaxOperator(col3File, 1, "max", ["nan", "9", "9", "9"]); 1| testSingleFieldOperator!MaxOperator(col3File, 2, "max", ["nan", "-4.5", "-0.5", "12"]); | 1| auto col1misFile = [[""], ["-10"], ["10"]]; 1| 
testSingleFieldOperator!MaxOperator(col1misFile, 0, "max", ["nan", "nan", "-10", "10"], | new MissingFieldPolicy(true, "")); // Exclude missing 1| testSingleFieldOperator!MaxOperator(col1misFile, 0, "max", ["nan", "5", "5", "10"], | new MissingFieldPolicy(false, "5")); // Replace missing |} | |/** RangeOperator outputs the difference between the minimum and maximum values. | * | * If there is a single value, or all values are the same, the range is zero. This is | * a numeric operator. | */ |final class RangeOperator : SingleFieldOperator |{ 48| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 48| super("range", fieldIndex, missingPolicy); | } | | final override SingleFieldCalculator makeCalculator() | { 72| return new RangeCalculator(fieldIndex); | } | | final class RangeCalculator : SingleFieldCalculator | { | private bool _isFirst = true; | private double _minValue = 0.0; | private double _maxValue = 0.0; | 72| this(size_t fieldIndex) | { 72| super(fieldIndex); | } | | final override RangeOperator getOperator() | { 226| return this.outer; | } | | final override void processNextField(const char[] nextField) | { 216| double fieldValue = nextField.to!double; 216| if (_isFirst) | { 72| _minValue = _maxValue = fieldValue; 72| _isFirst = false; | } 144| else if (fieldValue > _maxValue) | { 61| _maxValue = fieldValue; | } 83| else if (fieldValue < _minValue) | { 47| _minValue = fieldValue; | } | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 212| return printOptions.formatNumber(_maxValue - _minValue); | } | } |} | |unittest // RangeOperator |{ 1| auto col1File = [["10"], ["9.5"], ["11"]]; 1| auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 1| auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; | 1| testSingleFieldOperator!RangeOperator(col1File, 0, "range", ["0", "0", "0.5", "1.5"]); 1| testSingleFieldOperator!RangeOperator(col2File, 0, 
"range", ["0", "0", "1", "2"]); 1| testSingleFieldOperator!RangeOperator(col2File, 1, "range", ["0", "0", "1", "2"]); 1| testSingleFieldOperator!RangeOperator(col3File, 0, "range", ["0", "0", "8810", "8810"]); 1| testSingleFieldOperator!RangeOperator(col3File, 1, "range", ["0", "0", "9", "9"]); 1| testSingleFieldOperator!RangeOperator(col3File, 2, "range", ["0", "0", "4", "16.5"]); | 1| auto col1misFile = [[""], ["10"], [""], ["9.5"], ["11"]]; 1| testSingleFieldOperator!RangeOperator(col1misFile, 0, "range", ["0", "0", "0", "0", "0.5", "1.5"], | new MissingFieldPolicy(true, "")); // Exclude missing 1| testSingleFieldOperator!RangeOperator(col1misFile, 0, "range", ["0", "0", "4.5", "4.5", "4.5", "5.5"], | new MissingFieldPolicy(false, "5.5")); // Replace missing |} | |/** SumOperator produces the sum of all the values. This is a numeric operator. | */ |final class SumOperator : SingleFieldOperator |{ 42| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 42| super("sum", fieldIndex, missingPolicy); | } | | final override SingleFieldCalculator makeCalculator() | { 42| return new SumCalculator(fieldIndex); | } | | final class SumCalculator : SingleFieldCalculator | { | private double _total = 0.0; | 42| this(size_t fieldIndex) | { 42| super(fieldIndex); | } | | final override SumOperator getOperator() | { 154| return this.outer; | } | | final override void processNextField(const char[] nextField) | { 144| _total += nextField.to!double; | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 182| return printOptions.formatNumber(_total); | } | } |} | |unittest // SumOperator |{ 1| auto col1File = [["10"], ["9.5"], ["11"]]; 1| auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 1| auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; | 1| testSingleFieldOperator!SumOperator(col1File, 0, "sum", ["0", "10", "19.5", "30.5"]); 1| 
testSingleFieldOperator!SumOperator(col2File, 0, "sum", ["0", "20", "41", "63"]); 1| testSingleFieldOperator!SumOperator(col2File, 1, "sum", ["0", "-30", "-59", "-90"]); 1| testSingleFieldOperator!SumOperator(col3File, 0, "sum", ["0", "9009", "9208", "12211"]); 1| testSingleFieldOperator!SumOperator(col3File, 1, "sum", ["0", "9", "9", "9.2"]); 1| testSingleFieldOperator!SumOperator(col3File, 2, "sum", ["0", "-4.5", "-5", "7"]); | 1| auto col1misFile = [[""], ["10"], [""], ["9.5"], ["11"]]; 1| testSingleFieldOperator!SumOperator(col1misFile, 0, "sum", ["0", "0", "10", "10", "19.5", "30.5"], | new MissingFieldPolicy(true, "")); // Exclude missing 1| testSingleFieldOperator!SumOperator(col1misFile, 0, "sum", ["0", "1.5", "11.5", "13", "22.5", "33.5"], | new MissingFieldPolicy(false, "1.5")); // Replace missing |} | |/** MeanOperator produces the mean (average) of all the values. This is a numeric operator. | */ |final class MeanOperator : SingleFieldOperator |{ 46| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 46| super("mean", fieldIndex, missingPolicy); | } | | final override SingleFieldCalculator makeCalculator() | { 58| return new MeanCalculator(fieldIndex); | } | | final class MeanCalculator : SingleFieldCalculator | { | private double _total = 0.0; | private size_t _count = 0; | 58| this(size_t fieldIndex) | { 58| super(fieldIndex); | } | | final override MeanOperator getOperator() | { 176| return this.outer; | } | | final override void processNextField(const char[] nextField) | { 166| _total += nextField.to!double; 166| _count++; | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 198| return printOptions.formatNumber( 198| (_count > 0) ? 
(_total / _count.to!double) : double.nan); | } | } |} | |unittest // MeanOperator |{ 1| auto col1File = [["10"], ["9.5"], ["7.5"]]; 1| auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 1| auto col3File = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]]; | 1| testSingleFieldOperator!MeanOperator(col1File, 0, "mean", ["nan", "10", "9.75", "9"]); 1| testSingleFieldOperator!MeanOperator(col2File, 0, "mean", ["nan", "20", "20.5", "21"]); 1| testSingleFieldOperator!MeanOperator(col2File, 1, "mean", ["nan", "-30", "-29.5", "-30"]); 1| testSingleFieldOperator!MeanOperator(col3File, 0, "mean", ["nan", "9009", "4509", "4509"]); 1| testSingleFieldOperator!MeanOperator(col3File, 1, "mean", ["nan", "9", "4.5", "2"]); 1| testSingleFieldOperator!MeanOperator(col3File, 2, "mean", ["nan", "-4.5", "-3", "2"]); | 1| auto col1misFile = [[""], ["6"], [""], ["14"], ["40"]]; 1| testSingleFieldOperator!MeanOperator(col1misFile, 0, "mean", ["nan", "nan", "6", "6", "10", "20"], | new MissingFieldPolicy(true, "")); // Exclude missing 1| testSingleFieldOperator!MeanOperator(col1misFile, 0, "mean", ["nan", "0", "3", "2", "5", "12"], | new MissingFieldPolicy(false, "0")); // Replace missing |} | |/** MedianOperator produces the median of all the values. This is a numeric operator. | * | * All the field values are stored in memory as part of this calculation. This is | * handled by unique key value lists. | */ |final class MedianOperator : SingleFieldOperator |{ 42| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 42| super("median", fieldIndex, missingPolicy); 42| setSaveFieldValuesNumeric(); | } | | final override SingleFieldCalculator makeCalculator() | { 42| return new MedianCalculator(fieldIndex); | } | | final class MedianCalculator : SingleFieldCalculator | { 42| this(size_t fieldIndex) | { 42| super(fieldIndex); | } | | final override MedianOperator getOperator() | { 154| return this.outer; | } | | /* Work is done by saving the field values. 
*/ | final override void processNextField(const char[] nextField) | { } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 182| return printOptions.formatNumber(valuesLists.numericValuesMedian(fieldIndex)); | } | } |} | |unittest // MedianOperator |{ 1| auto col1File = [["10"], ["9.5"], ["7.5"]]; 1| auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 1| auto col3File = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]]; | 1| testSingleFieldOperator!MedianOperator(col1File, 0, "median", ["nan", "10", "9.75", "9.5"]); 1| testSingleFieldOperator!MedianOperator(col2File, 0, "median", ["nan", "20", "20.5", "21"]); 1| testSingleFieldOperator!MedianOperator(col2File, 1, "median", ["nan", "-30", "-29.5", "-30"]); 1| testSingleFieldOperator!MedianOperator(col3File, 0, "median", ["nan", "9009", "4509", "4509"]); 1| testSingleFieldOperator!MedianOperator(col3File, 1, "median", ["nan", "9", "4.5", "0"]); 1| testSingleFieldOperator!MedianOperator(col3File, 2, "median", ["nan", "-4.5", "-3", "-1.5"]); | 1| auto col1misFile = [[""], ["10"], [""], ["9.5"], ["7.5"]]; 1| testSingleFieldOperator!MedianOperator(col1misFile, 0, "median", ["nan", "nan", "10", "10", "9.75", "9.5"], | new MissingFieldPolicy(true, "")); // Exclude missing 1| testSingleFieldOperator!MedianOperator(col1misFile, 0, "median", ["nan", "0", "5", "0", "4.75", "7.5"], | new MissingFieldPolicy(false, "0")); // Replace missing |} | |/** QuantileOperator produces the value representing the data at a cummulative probability. | * This is a numeric operation. | * | * As an example, quantiles might be produced for the 0.25, 0.5, and 0.75 probabilities | * (alternately, the 25th, 50th, and 75th percentile ranks, the 50th percentile being the | * median). Data is sorted is ascending order. This operator takes one percentile, but it | * is common to generate multiple quantile ranks for the same field when summarizing. 
| * | * All the field's values are stored in memory as part of this calculation. This is | * handled by unique key value lists. | */ |final class QuantileOperator : SingleFieldOperator |{ | private double _prob; | 102| this(size_t fieldIndex, MissingFieldPolicy missingPolicy, double probability) | { 204| assert(0.0 <= probability && probability <= 1.0); | import std.format : format; | 204| string header = (probability == 0.0) ? "pct0" : format("pct%02g", probability * 100.0); 102| super(header, fieldIndex, missingPolicy); 102| _prob = probability; 102| setSaveFieldValuesNumeric(); | } | | final override SingleFieldCalculator makeCalculator() | { 102| return new QuantileCalculator(fieldIndex); | } | | final class QuantileCalculator : SingleFieldCalculator | { 102| this(size_t fieldIndex) | { 102| super(fieldIndex); | } | | final override QuantileOperator getOperator() | { 334| return this.outer; | } | | /* Work is done by saving the field values. */ | final override void processNextField(const char[] nextField) | { } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { | import tsv_utils.common.numerics : quantile; 422| return printOptions.formatNumber( | quantile(this.outer._prob, valuesLists.numericValuesSorted(fieldIndex))); | } | } |} | |unittest // QuantileOperator |{ 1| auto col1File = [["10"], ["9.5"], ["7.5"]]; 1| auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 1| auto col3File = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]]; | 1| auto defaultMissing = new MissingFieldPolicy; | | /* Same as the median tests. 
*/ 1| testSingleFieldOperatorBase!QuantileOperator(col1File, 0, "pct50", ["nan", "10", "9.75", "9.5"], defaultMissing, 0.50); 1| testSingleFieldOperatorBase!QuantileOperator(col2File, 0, "pct50", ["nan", "20", "20.5", "21"], defaultMissing, 0.50); 1| testSingleFieldOperatorBase!QuantileOperator(col2File, 1, "pct50", ["nan", "-30", "-29.5", "-30"], defaultMissing, 0.50); 1| testSingleFieldOperatorBase!QuantileOperator(col3File, 0, "pct50", ["nan", "9009", "4509", "4509"], defaultMissing, 0.50); 1| testSingleFieldOperatorBase!QuantileOperator(col3File, 1, "pct50", ["nan", "9", "4.5", "0"], defaultMissing, 0.50); 1| testSingleFieldOperatorBase!QuantileOperator(col3File, 2, "pct50", ["nan", "-4.5", "-3", "-1.5"], defaultMissing, 0.50); | | /* The extremes (0, 1), are min and max. */ 1| testSingleFieldOperatorBase!QuantileOperator(col1File, 0, "pct0", ["nan", "10", "9.5", "7.5"], defaultMissing, 0.0); 1| testSingleFieldOperatorBase!QuantileOperator(col2File, 0, "pct0", ["nan", "20", "20", "20"], defaultMissing, 0.0); 1| testSingleFieldOperatorBase!QuantileOperator(col2File, 1, "pct0", ["nan", "-30", "-30", "-31"], defaultMissing, 0.0); 1| testSingleFieldOperatorBase!QuantileOperator(col3File, 0, "pct0", ["nan", "9009", "9", "9"], defaultMissing, 0.0); 1| testSingleFieldOperatorBase!QuantileOperator(col3File, 1, "pct0", ["nan", "9", "0", "-3"], defaultMissing, 0.0); 1| testSingleFieldOperatorBase!QuantileOperator(col3File, 2, "pct0", ["nan", "-4.5", "-4.5", "-4.5"], defaultMissing, 0.0); | 1| testSingleFieldOperatorBase!QuantileOperator(col1File, 0, "pct100", ["nan", "10", "10", "10"], defaultMissing, 1.0); 1| testSingleFieldOperatorBase!QuantileOperator(col2File, 0, "pct100", ["nan", "20", "21", "22"], defaultMissing, 1.0); 1| testSingleFieldOperatorBase!QuantileOperator(col2File, 1, "pct100", ["nan", "-30", "-29", "-29"], defaultMissing, 1.0); 1| testSingleFieldOperatorBase!QuantileOperator(col3File, 0, "pct100", ["nan", "9009", "9009", "9009"], defaultMissing, 1.0); 
1| testSingleFieldOperatorBase!QuantileOperator(col3File, 1, "pct100", ["nan", "9", "9", "9"], defaultMissing, 1.0); 1| testSingleFieldOperatorBase!QuantileOperator(col3File, 2, "pct100", ["nan", "-4.5", "-1.5", "12"], defaultMissing, 1.0); | | /* For missing policies, re-use the median tests. */ 1| auto col1misFile = [[""], ["10"], [""], ["9.5"], ["7.5"]]; 1| testSingleFieldOperatorBase!QuantileOperator(col1misFile, 0, "pct50", ["nan", "nan", "10", "10", "9.75", "9.5"], | new MissingFieldPolicy(true, ""), 0.5); // Exclude missing 1| testSingleFieldOperatorBase!QuantileOperator(col1misFile, 0, "pct50", ["nan", "0", "5", "0", "4.75", "7.5"], | new MissingFieldPolicy(false, "0"), 0.5); // Replace missing |} | |/** MadOperator produces the median absolute deviation from the median. This is a numeric | * operation. | * | * The result is the raw MAD value, without a normalization applied. | * | * All the field values are stored in memory as part of this calculation. This is | * handled by unique key value lists. | */ |final class MadOperator : SingleFieldOperator |{ 42| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 42| super("mad", fieldIndex, missingPolicy); 42| setSaveFieldValuesNumeric(); | } | | final override SingleFieldCalculator makeCalculator() | { 42| return new MadCalculator(fieldIndex); | } | | final class MadCalculator : SingleFieldCalculator | { 42| this(size_t fieldIndex) | { 42| super(fieldIndex); | } | | final override MadOperator getOperator() | { 164| return this.outer; | } | | /* Work is done by saving the field values. 
*/ | final override void processNextField(const char[] nextField) | { } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { | import std.math : abs; | import tsv_utils.common.numerics : rangeMedian; | 192| auto median = valuesLists.numericValuesMedian(fieldIndex); 192| auto values = valuesLists.numericValues(fieldIndex); 192| auto medianDevs = new double[values.length]; 1972| foreach (size_t i, double v; values) 349| medianDevs[i] = abs(v - median); | 192| return printOptions.formatNumber(medianDevs.rangeMedian); | } | } |} | |unittest // MadOperator |{ 1| auto col1File = [["10"], ["15"], ["20"], ["25"], ["30"]]; 1| auto col2File = [["2", "50"], ["2", "51"], ["2", "52"]]; 1| auto col3File = [["16", "8", "-4"], ["8", "8", "-2"], ["8", "16", "0"]]; | 1| testSingleFieldOperator!MadOperator(col1File, 0, "mad", ["nan", "0", "2.5", "5", "5", "5"]); 1| testSingleFieldOperator!MadOperator(col2File, 0, "mad", ["nan", "0", "0", "0"]); 1| testSingleFieldOperator!MadOperator(col2File, 1, "mad", ["nan", "0", "0.5", "1"]); 1| testSingleFieldOperator!MadOperator(col3File, 0, "mad", ["nan", "0", "4", "0"]); 1| testSingleFieldOperator!MadOperator(col3File, 1, "mad", ["nan", "0", "0", "0"]); 1| testSingleFieldOperator!MadOperator(col3File, 2, "mad", ["nan", "0", "1", "2"]); | 1| auto col1misFile = [[""], ["16"], [""], ["32"], ["-4"]]; 1| testSingleFieldOperator!MadOperator(col1misFile, 0, "mad", ["nan", "nan", "0", "0", "8", "16"], | new MissingFieldPolicy(true, "")); // Exclude missing 1| testSingleFieldOperator!MadOperator(col1misFile, 0, "mad", ["nan", "0", "8", "0", "8", "4"], | new MissingFieldPolicy(false, "0")); // Replace missing |} | |/** Generates the variance of the fields values. This is a numeric operator. 
| */ |final class VarianceOperator : SingleFieldOperator |{ 42| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 42| super("var", fieldIndex, missingPolicy); | } | | final override SingleFieldCalculator makeCalculator() | { 42| return new VarianceCalculator(fieldIndex); | } | | final class VarianceCalculator : SingleFieldCalculator | { | private double _count = 0.0; | private double _mean = 0.0; | private double _m2 = 0.0; // Sum of squares of differences from current mean | 42| this(size_t fieldIndex) | { 42| super(fieldIndex); | } | | final override VarianceOperator getOperator() | { 134| return this.outer; | } | | final override void processNextField(const char[] nextField) | { 129| _count += 1.0; 129| double fieldValue = nextField.to!double; 129| double delta = fieldValue - _mean; 129| _mean += delta / _count; 129| _m2 += delta * (fieldValue - _mean); | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 162| return printOptions.formatNumber( 162| (_count >= 2.0) ? 
(_m2 / (_count - 1.0)) : double.nan); | } | } |} | |unittest // VarianceOperator |{ 1| auto col1File = [["5"], ["10"], ["15"]]; 1| auto col2File = [["-5", "-5"], ["-10", "0"], ["-15", "5"]]; 1| auto col3File = [["1", "2", "100"], ["2", "3", "100"], ["3", "4", "103"]]; | 1| testSingleFieldOperator!VarianceOperator(col1File, 0, "var", ["nan", "nan", "12.5", "25"]); 1| testSingleFieldOperator!VarianceOperator(col2File, 0, "var", ["nan", "nan", "12.5", "25"]); 1| testSingleFieldOperator!VarianceOperator(col2File, 1, "var", ["nan", "nan", "12.5", "25"]); 1| testSingleFieldOperator!VarianceOperator(col3File, 0, "var", ["nan", "nan", "0.5", "1"]); 1| testSingleFieldOperator!VarianceOperator(col3File, 1, "var", ["nan", "nan", "0.5", "1"]); 1| testSingleFieldOperator!VarianceOperator(col3File, 2, "var", ["nan", "nan", "0", "3"]); | 1| auto col1misFile = [["5"], ["10"], [""]]; 1| testSingleFieldOperator!VarianceOperator(col1misFile, 0, "var", ["nan", "nan", "12.5", "12.5"], | new MissingFieldPolicy(true, "")); // Exclude missing 1| testSingleFieldOperator!VarianceOperator(col1misFile, 0, "var", ["nan", "nan", "12.5", "25"], | new MissingFieldPolicy(false, "15")); // Replace missing |} | |/** Generates the standard deviation of the fields values. This is a numeric operator. 
| */ |final class StDevOperator : SingleFieldOperator |{ 42| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 42| super("stdev", fieldIndex, missingPolicy); | } | | final override SingleFieldCalculator makeCalculator() | { 42| return new StDevCalculator(fieldIndex); | } | | final class StDevCalculator : SingleFieldCalculator | { | private double _count = 0.0; | private double _mean = 0.0; | private double _m2 = 0.0; // Sum of squares of differences from current mean | 42| this(size_t fieldIndex) | { 42| super(fieldIndex); | } | | final override StDevOperator getOperator() | { 134| return this.outer; | } | | final override void processNextField(const char[] nextField) | { 129| _count += 1.0; 129| double fieldValue = nextField.to!double; 129| double delta = fieldValue - _mean; 129| _mean += delta / _count; 129| _m2 += delta * (fieldValue - _mean); | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { | import std.math : sqrt; 162| return printOptions.formatNumber( 162| (_count >= 2.0) ? (_m2 / (_count - 1.0)).sqrt : double.nan); | } | } |} | |/* StDevOperator unit tests - These would be improved with a tolerance option. 
| */ |unittest |{ 1| auto col1File = [["1"], ["4"], ["7"]]; 1| auto col2File = [["3", "3"], ["3", "9"], ["7", "15"]]; 1| auto col3File = [["11", "10", "10"], ["24", "22", "25"], ["37", "34", "40"]]; | 1| testSingleFieldOperator!StDevOperator(col1File, 0, "stdev", ["nan", "nan", "2.12132034356", "3"]); 1| testSingleFieldOperator!StDevOperator(col2File, 0, "stdev", ["nan", "nan", "0", "2.30940107676"]); 1| testSingleFieldOperator!StDevOperator(col2File, 1, "stdev", ["nan", "nan", "4.24264068712", "6"]); 1| testSingleFieldOperator!StDevOperator(col3File, 0, "stdev", ["nan", "nan", "9.19238815543", "13"]); 1| testSingleFieldOperator!StDevOperator(col3File, 1, "stdev", ["nan", "nan", "8.48528137424", "12"]); 1| testSingleFieldOperator!StDevOperator(col3File, 2, "stdev", ["nan", "nan", "10.6066017178", "15"]); | 1| auto col1misFile = [["1"], ["4"], [""]]; 1| testSingleFieldOperator!StDevOperator(col1misFile, 0, "stdev", ["nan", "nan", "2.12132034356", "2.12132034356"], | new MissingFieldPolicy(true, "")); // Exclude missing 1| testSingleFieldOperator!StDevOperator(col1misFile, 0, "stdev", ["nan", "nan", "2.12132034356", "3"], | new MissingFieldPolicy(false, "7")); // Replace missing |} | |/** UniqueCountOperator generates the number of unique values. Unique values are | * based on exact text match calculation, not a numeric comparison. | * | * All the unique field values are stored in memory as part of this calculation. 
| */ |final class UniqueCountOperator : SingleFieldOperator |{ 70| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 70| super("unique_count", fieldIndex, missingPolicy); | } | | final override SingleFieldCalculator makeCalculator() | { 162| return new UniqueCountCalculator(fieldIndex); | } | | final class UniqueCountCalculator : SingleFieldCalculator | { | private bool[string] _values; | 162| this(size_t fieldIndex) | { 162| super(fieldIndex); | } | | final override UniqueCountOperator getOperator() | { 631| return this.outer; | } | | final override void processNextField(const char[] nextField) | { 1012| if (nextField !in _values) _values[nextField.to!string] = true; | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 373| return printOptions.formatNumber(_values.length); | } | } |} | |unittest // UniqueCount |{ 1| auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]]; 1| auto col2File = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]]; 1| auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]]; | 1| testSingleFieldOperator!UniqueCountOperator(col1File, 0, "unique_count", ["0", "1", "2", "3", "3", "3", "3", "3", "4"]); 1| testSingleFieldOperator!UniqueCountOperator(col2File, 0, "unique_count", ["0", "1", "2", "2"]); 1| testSingleFieldOperator!UniqueCountOperator(col2File, 1, "unique_count", ["0", "1", "1", "2"]); 1| testSingleFieldOperator!UniqueCountOperator(col3File, 0, "unique_count", ["0", "1", "2", "3"]); 1| testSingleFieldOperator!UniqueCountOperator(col3File, 1, "unique_count", ["0", "1", "2", "2"]); 1| testSingleFieldOperator!UniqueCountOperator(col3File, 2, "unique_count", ["0", "1", "2", "3"]); | 1| auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]]; 1| testSingleFieldOperator!UniqueCountOperator(col1misFile, 0, "unique_count", ["0", "0", "1", "1", "2", "3", "3", "3", "3", "3", "4"], | new 
MissingFieldPolicy(true, "")); // Exclude missing | | 1| testSingleFieldOperator!UniqueCountOperator(col1misFile, 0, "unique_count", ["0", "1", "2", "2", "3", "4", "4", "4", "4", "4", "5"], | new MissingFieldPolicy(false, "XYZ")); // Replace missing |} | |/** MissingCountOperator generates the number of missing values. This overrides | * the global missingFieldsPolicy. | */ |final class MissingCountOperator : SingleFieldOperator |{ | private MissingFieldPolicy _globalMissingPolicy; | 92| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 92| _globalMissingPolicy = missingPolicy; 92| super("missing_count", fieldIndex, new MissingFieldPolicy(false, "")); | } | | final override SingleFieldCalculator makeCalculator() | { 92| return new MissingCountCalculator(fieldIndex); | } | | final class MissingCountCalculator : SingleFieldCalculator | { | private size_t _missingCount = 0; | 92| this(size_t fieldIndex) | { 92| super(fieldIndex); | } | | final override MissingCountOperator getOperator() | { 312| return this.outer; | } | | final override void processNextField(const char[] nextField) | { 464| if (this.outer._globalMissingPolicy.isMissingField(nextField)) _missingCount++; | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 392| return printOptions.formatNumber(_missingCount); | } | } |} | |unittest // MissingCount |{ 1| auto col1File = [["a"], ["b"], [""], [" "], [""]]; 1| auto col2File = [["abc", ""], ["", ""], ["def", ""]]; 1| auto col3File = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]]; | 1| testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"]); 1| testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"]); 1| testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"]); 1| testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", 
"1"]); 1| testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"]); 1| testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"]); | 1| auto excludeMissing = new MissingFieldPolicy(true, ""); 1| auto replaceMissing = new MissingFieldPolicy(false, "X"); | 1| testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], excludeMissing); 1| testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"], excludeMissing); 1| testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"], excludeMissing); 1| testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"], excludeMissing); 1| testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"], excludeMissing); 1| testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"], excludeMissing); | 1| testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], replaceMissing); 1| testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"], replaceMissing); 1| testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"], replaceMissing); 1| testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"], replaceMissing); 1| testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"], replaceMissing); 1| testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"], replaceMissing); |} | |/** NotMissingCountOperator generates the number of not-missing values. This overrides | * the global missingFieldsPolicy. 
| */ |final class NotMissingCountOperator : SingleFieldOperator |{ | private MissingFieldPolicy _globalMissingPolicy; | 92| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 92| _globalMissingPolicy = missingPolicy; 92| super("not_missing_count", fieldIndex, new MissingFieldPolicy(false, "")); | } | | final override SingleFieldCalculator makeCalculator() | { 92| return new NotMissingCountCalculator(fieldIndex); | } | | final class NotMissingCountCalculator : SingleFieldCalculator | { | private size_t _notMissingCount = 0; | 92| this(size_t fieldIndex) | { 92| super(fieldIndex); | } | | final override NotMissingCountOperator getOperator() | { 312| return this.outer; | } | | final override void processNextField(const char[] nextField) | { 472| if (!this.outer._globalMissingPolicy.isMissingField(nextField)) _notMissingCount++; | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 392| return printOptions.formatNumber(_notMissingCount); | } | } |} | |unittest // NotMissingCount |{ 1| auto col1File = [["a"], ["b"], [""], [" "], [""]]; 1| auto col2File = [["abc", ""], ["", ""], ["def", ""]]; 1| auto col3File = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]]; | 1| testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"]); 1| testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"]); 1| testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"]); 1| testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"]); 1| testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"]); 1| testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"]); | 1| auto excludeMissing = new MissingFieldPolicy(true, ""); 1| auto replaceMissing = new 
MissingFieldPolicy(false, "X"); | 1| testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], excludeMissing); 1| testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"], excludeMissing); 1| testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"], excludeMissing); 1| testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"], excludeMissing); 1| testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"], excludeMissing); 1| testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"], excludeMissing); | 1| testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], replaceMissing); 1| testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"], replaceMissing); 1| testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"], replaceMissing); 1| testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"], replaceMissing); 1| testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"], replaceMissing); 1| testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"], replaceMissing); |} | |/** ModeOperator outputs the most frequent value seen. In the event of a tie, the | * first value seen is produced. | * | * All the field values are stored in memory as part of this calculation. 
| * | */ |final class ModeOperator : SingleFieldOperator |{ 42| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 42| super("mode", fieldIndex, missingPolicy); | } | | final override SingleFieldCalculator makeCalculator() | { 42| return new ModeCalculator(fieldIndex); | } | | final class ModeCalculator : SingleFieldCalculator | { | private size_t[string] _valueCounts; | private Appender!(string[]) _uniqueValues; | 42| this(size_t fieldIndex) | { 42| super(fieldIndex); | } | | final override ModeOperator getOperator() | { 204| return this.outer; | } | | final override void processNextField(const char[] nextField) | { 194| auto countPtr = (nextField in _valueCounts); | 194| if (countPtr is null) | { 116| string value = nextField.to!string; 116| _uniqueValues.put(value); 116| _valueCounts[value] = 1; | } | else | { 78| (*countPtr)++; | } | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 232| string modeValue = ""; 232| size_t modeCount = 0; | 1974| foreach (value; _uniqueValues.data) | { 426| assert(value in _valueCounts); | 426| auto count = _valueCounts[value]; | 426| if (count > modeCount) | { 239| modeValue = value; 239| modeCount = count; | } | } | 232| return modeValue; | } | } |} | |unittest // ModeOperator |{ 1| auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]]; 1| auto col2File = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]]; 1| auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]]; | 1| testSingleFieldOperator!ModeOperator(col1File, 0, "mode", ["", "a", "a", "a", "c", "b", "b", "b"]); 1| testSingleFieldOperator!ModeOperator(col2File, 0, "mode", ["", "abc", "abc", "def"]); 1| testSingleFieldOperator!ModeOperator(col2File, 1, "mode", ["", "pqr", "pqr", "pqr"]); 1| testSingleFieldOperator!ModeOperator(col3File, 0, "mode", ["", "1.0", "1.0", "1.0"]); 1| testSingleFieldOperator!ModeOperator(col3File, 1, "mode", ["", "1", "1", "a"]); 1| 
testSingleFieldOperator!ModeOperator(col3File, 2, "mode", ["", "a", "a", "a"]); | 1| auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]]; 1| testSingleFieldOperator!ModeOperator(col1misFile, 0, "mode", ["", "", "a", "a", "a", "a", "c", "b", "b"], | new MissingFieldPolicy(true, "")); // Exclude missing | | 1| testSingleFieldOperator!ModeOperator(col1misFile, 0, "mode", ["", "X", "X", "X", "X", "X", "X", "X", "b"], | new MissingFieldPolicy(false, "X")); // Replace missing |} | |/** ModeCountOperator outputs the count of the most frequent value seen. | * | * All the field values are stored in memory as part of this calculation. | * | */ |final class ModeCountOperator : SingleFieldOperator |{ 42| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 42| super("mode_count", fieldIndex, missingPolicy); | } | | final override SingleFieldCalculator makeCalculator() | { 42| return new ModeCountCalculator(fieldIndex); | } | | final class ModeCountCalculator : SingleFieldCalculator | { | private size_t[string] _valueCounts; | 42| this(size_t fieldIndex) | { 42| super(fieldIndex); | } | | final override ModeCountOperator getOperator() | { 204| return this.outer; | } | | final override void processNextField(const char[] nextField) | { 194| auto countPtr = (nextField in _valueCounts); | 194| if (countPtr is null) | { 116| string value = nextField.to!string; 116| _valueCounts[value] = 1; | } | else | { 78| (*countPtr)++; | } | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 232| size_t modeCount = 0; 2390| foreach (count; _valueCounts.byValue) if (count > modeCount) modeCount = count; 232| return printOptions.formatNumber(modeCount); | } | } |} | |unittest // ModeCountOperator |{ 1| auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]]; 1| auto col2File = [["abc", ""], ["def", ""], ["def", "xyz"]]; 1| auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", 
"1.0"]]; | 1| testSingleFieldOperator!ModeCountOperator(col1File, 0, "mode_count", ["0", "1", "1", "1", "2", "2", "3", "3"]); 1| testSingleFieldOperator!ModeCountOperator(col2File, 0, "mode_count", ["0", "1", "1", "2"]); 1| testSingleFieldOperator!ModeCountOperator(col2File, 1, "mode_count", ["0", "1", "2", "2"]); 1| testSingleFieldOperator!ModeCountOperator(col3File, 0, "mode_count", ["0", "1", "1", "1"]); 1| testSingleFieldOperator!ModeCountOperator(col3File, 1, "mode_count", ["0", "1", "1", "2"]); 1| testSingleFieldOperator!ModeCountOperator(col3File, 2, "mode_count", ["0", "1", "1", "1"]); | 1| auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]]; 1| testSingleFieldOperator!ModeCountOperator(col1misFile, 0, "mode_count", ["0", "0", "1", "1", "1", "1", "2", "2", "3"], | new MissingFieldPolicy(true, "")); // Exclude missing | | 1| testSingleFieldOperator!ModeCountOperator(col1misFile, 0, "mode_count", ["0", "1", "1", "2", "2", "2", "2", "2", "3"], | new MissingFieldPolicy(false, "X")); // Replace missing |} | |/** ValuesOperator outputs each value delimited by an alternate delimiter character. | * | * All the field values are stored in memory as part of this calculation. This is | * handled by unique key value lists. | */ | |final class ValuesOperator : SingleFieldOperator |{ 176| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 176| super("values", fieldIndex, missingPolicy); 176| setSaveFieldValuesText(); | } | | final override SingleFieldCalculator makeCalculator() | { 420| return new ValuesCalculator(fieldIndex); | } | | final class ValuesCalculator : SingleFieldCalculator | { 420| this(size_t fieldIndex) | { 420| super(fieldIndex); | } | | final override ValuesOperator getOperator() | { 848| return this.outer; | } | | /* Work is done by saving the field values. 
*/ | final override void processNextField(const char[] nextField) | { } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 600| return valuesLists.textValues(fieldIndex).join(printOptions.valuesDelimiter); | } | } |} | |unittest // ValuesOperator |{ 1| auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]]; 1| auto col2File = [["", "50"], ["", "51"], ["xyz", "52"]]; 1| auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "---"]]; | 1| testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e|", "a||b|cd|e||a"]); 1| testSingleFieldOperator!ValuesOperator(col2File, 0, "values", ["", "", "|", "||xyz"]); 1| testSingleFieldOperator!ValuesOperator(col2File, 1, "values", ["", "50", "50|51", "50|51|52"]); 1| testSingleFieldOperator!ValuesOperator(col3File, 0, "values", ["", "z", "z|y", "z|y|w"]); 1| testSingleFieldOperator!ValuesOperator(col3File, 1, "values", ["", "a", "a|ab", "a|ab|ba"]); 1| testSingleFieldOperator!ValuesOperator(col3File, 2, "values", ["", "-", "-|--", "-|--|---"]); | 1| testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e|a"], | new MissingFieldPolicy(true, "")); // Exclude missing | | 1| testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e|X", "a|X|b|cd|e|X|a"], | new MissingFieldPolicy(false, "X")); // Replace missing |} | |/** UniqueValuesOperator outputs each unique value delimited by an alternate delimiter | * character. Values are output in the order seen. | * | * All unique field values are stored in memory as part of this calculation. 
| * | */ |final class UniqueValuesOperator : SingleFieldOperator |{ 42| this(size_t fieldIndex, MissingFieldPolicy missingPolicy) | { 42| super("unique_values", fieldIndex, missingPolicy); | } | | final override SingleFieldCalculator makeCalculator() | { 42| return new UniqueValuesCalculator(fieldIndex); | } | | final class UniqueValuesCalculator : SingleFieldCalculator | { | private size_t[string] _valuesHash; | private Appender!(string[]) _uniqueValues; | 42| this(size_t fieldIndex) | { 42| super(fieldIndex); | } | | final override UniqueValuesOperator getOperator() | { 194| return this.outer; | } | | final override void processNextField(const char[] nextField) | { 184| auto ptr = (nextField in _valuesHash); | 184| if (ptr is null) | { 136| string value = nextField.to!string; 136| _uniqueValues.put(value); 136| _valuesHash[value] = 1; | } | } | | final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) | { 222| return _uniqueValues.data.join(printOptions.valuesDelimiter); | } | } |} | |unittest // UniqueValuesOperator |{ 1| auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]]; 1| auto col2File = [["", "50"], ["", "50"], ["xyz", "52"]]; 1| auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "-"]]; | 1| testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e", "a||b|cd|e"]); 1| testSingleFieldOperator!UniqueValuesOperator(col2File, 0, "unique_values", ["", "", "", "|xyz"]); 1| testSingleFieldOperator!UniqueValuesOperator(col2File, 1, "unique_values", ["", "50", "50", "50|52"]); 1| testSingleFieldOperator!UniqueValuesOperator(col3File, 0, "unique_values", ["", "z", "z|y", "z|y|w"]); 1| testSingleFieldOperator!UniqueValuesOperator(col3File, 1, "unique_values", ["", "a", "a|ab", "a|ab|ba"]); 1| testSingleFieldOperator!UniqueValuesOperator(col3File, 2, "unique_values", ["", "-", "-|--", "-|--"]); | 1| 
testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e"], | new MissingFieldPolicy(true, "")); // Exclude missing | | 1| testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e", "a|X|b|cd|e"], | new MissingFieldPolicy(false, "X")); // Replace missing |} tsv-summarize/src/tsv_utils/tsv-summarize.d is 98% covered <<<<<< EOF # path=./tsv-filter-src-tsv_utils-tsv-filter.lst |/** |Command line tool that filters TSV files. | |This tool filters tab-delimited files based on numeric or string comparisons |against specific fields. See the helpText string for details. | |Copyright (c) 2015-2020, eBay Inc. |Initially written by Jon Degenhardt | |License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) |*/ |module tsv_utils.tsv_filter; | |import std.algorithm : canFind, equal, findSplit, max, min; |import std.conv : to; |import std.exception : enforce; |import std.format : format; |import std.math : abs, isFinite, isInfinity, isNaN; |import std.range; |import std.regex; |import std.stdio; |import std.string : isNumeric; |import std.typecons; |import std.uni: asLowerCase, toLower, byGrapheme; | |/* The program has two main parts, command line arg processing and processing the input | * files. Much of the work is in command line arg processing. This sets up the tests run | * against each input line. The tests are an array of delegates (closures) run against the | * fields in the line. The tests are based on command line arguments, of which there is | * a lengthy set, one for each test. | */ | |static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; | |/** Main program. Invokes command line arg processing and tsv-filter to perform | * the real work. Any errors are caught and reported. 
| */ |int main(string[] cmdArgs) |{ | /* When running in DMD code coverage mode, turn on report merging. */ | version(D_Coverage) version(DigitalMars) | { | import core.runtime : dmd_coverSetMerge; 366| dmd_coverSetMerge(true); | } | 366| TsvFilterOptions cmdopt; 366| const r = cmdopt.processArgs(cmdArgs); 443| if (!r[0]) return r[1]; | version(LDC_Profile) | { | import ldc.profile : resetAll; | resetAll(); | } 289| try tsvFilter(cmdopt); | catch (Exception e) | { 6| stderr.writefln("Error [%s]: %s", cmdopt.programName, e.msg); 6| return 1; | } 283| return 0; |} | |immutable helpText = q"EOS |Synopsis: tsv-filter [options] [file...] | |Filter tab-delimited files for matching lines via comparison tests against |individual fields. Use '--help-verbose' for a more detailed description. | |Fields are specified using field number or field name. Field names require |that the input file has a header line. Use '--help-fields' for details. | |Global options: | --help-verbose Print full help. | --help-options Print the options list by itself. | --help-fields Print help on specifying fields. | --V|version Print version information and exit. | --H|header Treat the first line of each file as a header. | --or Evaluate tests as an OR rather than an AND clause. | --v|invert Invert the filter, printing lines that do not match. | --d|delimiter CHR Field delimiter. Default: TAB. | |Operators: |* Test if a field is empty (no characters) or blank (empty or whitespace only). | Syntax: --empty|not-empty|blank|not-blank FIELD | Example: --empty name # True if the 'name' field is empty | |* Test if a field is numeric, finite, NaN, or infinity | Syntax: --is-numeric|is-finite|is-nan|is-infinity FIELD | Example: --is-numeric 5 --gt 5:100 # Ensure field 5 is numeric before --gt test. 
| |* Compare a field to a number (integer or float) | Syntax: --eq|ne|lt|le|gt|ge FIELD:NUM | Example: --lt size:1000 --gt weight:0.5 # ('size' < 1000) and ('weight' > 0.5) | |* Compare a field to a string | Syntax: --str-eq|str-ne|istr-eq|istr-ne FIELD:STR | Example: --str-eq color:red # True if 'color' field is "red" | |* Test if a field contains a string (substring search) | Syntax: --str-in-fld|str-not-in-fld|istr-in-fld|istr-not-in-fld FIELD:STR | Example: --str-in-fld color:dark # True if 'color field contains "dark" | |* Test if a field matches a regular expression. | Syntax: --regex|iregex|not-regex|not-iregex FIELD:REGEX | Example: --regex '3:ab*c' # True if field 3 contains "ac", "abc", "abbc", etc. | |* Test a field's character or byte length | Syntax: --char-len-[le|lt|ge|gt|eq|ne] FIELD:NUM | --byte-len-[le|lt|ge|gt|eq|ne] FIELD:NUM | Example: --char-len-lt 2:10 # True if field 2 is less than 10 characters long. | --byte-len-gt 2:10 # True if field 2 is greater than 10 bytes long. | |* Field to field comparisons - Similar to field vs literal comparisons, but field vs field. | Syntax: --ff-eq|ff-ne|ff-lt|ff-le|ff-gt|ff-ge FIELD1:FIELD2 | --ff-str-eq|ff-str-ne|ff-istr-eq|ff-istr-ne FIELD1:FIELD2 | Example: --ff-eq 2:4 # True if fields 2 and 4 are numerically equivalent | --ff-str-eq 2:4 # True if fields 2 and 4 are the same strings | |* Field to field difference comparisons - Absolute and relative difference | Syntax: --ff-absdiff-le|ff-absdiff-gt FIELD1:FIELD2:NUM | --ff-reldiff-le|ff-reldiff-gt FIELD1:FIELD2:NUM | Example: --ff-absdiff-lt 1:3:0.25 # True if abs(field1 - field2) < 0.25 | |EOS"; | |immutable helpTextVerbose = q"EOS |Synopsis: tsv-filter [options] [file...] | |Filter lines of tab-delimited files via comparison tests against fields. |Multiple tests can be specified, by default they are evaluated as an AND |clause. Lines satisfying the tests are written to standard output. 
| |Typical test syntax is '--op field:value', where 'op' is an operator, |'field' is a either a field name and or field number, and 'value' is the |comparison basis. For example, '--lt length:500' tests if the 'length' |field is less than 500. A more complete example: | | tsv-filter --header --gt length:50 --lt length:100 --le width:200 data.tsv | |This outputs all lines from file data.tsv where the 'length' field is |greater than 50 and less than 100, and the 'width' field is less than or |equal to 200. The header line is also output. | |Field numbers can also be used to identify fields, and must be used when |the input file doesn't have a header line. For example: | | tsv-filter --gt 1:50 --lt 1:100 --le 2:200 data.tsv | |Field lists can be used to specify multiple fields at once. For example: | | tsv-filter --not-blank 1-10 --str-ne 1,2,5:'--' data.tsv | |tests that fields 1-10 are not blank and fields 1,2,5 are not "--". | |Wildcarded field names can also be used to specify multiple fields. The |following finds lines where any field name ending in '*_id' is empty: | | tsv-filter -H --or --empty '*_id' | |Use '--help-fields' for details on using field names. | |Tests available include: | * Test if a field is empty (no characters) or blank (empty or whitespace only). | * Test if a field is interpretable as a number, a finite number, NaN, or Infinity. | * Compare a field to a number - Numeric equality and relational tests. | * Compare a field to a string - String equality and relational tests. | * Test if a field matches a regular expression. Case sensitive or insensitive. | * Test if a field contains a string. Sub-string search, case sensitive or insensitive. | * Test a field's character or byte length. | * Field to field comparisons - Similar to the other tests, except comparing | one field to another in the same line. | |Details: | * The run is aborted if there are not enough fields in an input line. 
| * Numeric tests will fail and abort the run if a field cannot be interpreted as a | number. This includes fields with no text. To avoid this use '--is-numeric' or | '--is-finite' prior to the numeric test. For example, '--is-numeric 5 --gt 5:100' | ensures field 5 is numeric before running the --gt test. | * Regular expression syntax is defined by the D programming language. They follow | common conventions (perl, python, etc.). Most common forms work as expected. | |Options: |EOS"; | |immutable helpTextOptions = q"EOS |Synopsis: tsv-filter [options] [file...] | |Options: |EOS"; | |/* The next blocks of code define the structure of the boolean tests run against input lines. | * This includes function and delegate (closure) signatures, creation mechanisms, option | * handlers, etc. Command line arg processing to build the test structure. |*/ | |/* FieldsPredicate delegate signature - Each input line is run against a set of boolean | * tests. Each test is a 'FieldsPredicate'. A FieldsPredicate is a delegate (closure) | * containing all info about the test except the field values of the line being tested. | * These delegates are created as part of command line arg processing. The wrapped data | * includes operation, field indexes, literal values, etc. At run-time the delegate is | * passed one argument, the split input line. | */ |alias FieldsPredicate = bool delegate(const char[][] fields); | |/* FieldsPredicate function signatures - These aliases represent the different function | * signatures used in FieldsPredicate delegates. Each alias has a corresponding 'make' | * function. The 'make' function takes a real predicate function and closure args and | * returns a FieldsPredicate delegate. Predicates types are: | * | * - FieldUnaryPredicate - Test based on a single field. (e.g. --empty 4) | * - FieldVsNumberPredicate - Test based on a field index (used to get the field value) | * and a fixed numeric value. For example, field 2 less than 100 (--lt 2:100). 
| * - FieldVsStringPredicate - Test based on a field and a string. (e.g. --str-eq 2:abc) | * - FieldVsIStringPredicate - Case-insensitive test based on a field and a string. | * (e.g. --istr-eq 2:abc) | * - FieldVsRegexPredicate - Test based on a field and a regex. (e.g. --regex '2:ab*c') | * - FieldVsFieldPredicate - Test based on two fields. (e.g. --ff-le 2:4). | * | * An actual FieldsPredicate takes the fields from the line and the closure args and | * runs the test. For example, a function testing if a field is less than a specific | * value would pull the specified field from the fields array, convert the string to | * a number, then run the less-than test. | */ |alias FieldUnaryPredicate = bool function(const char[][] fields, size_t index); |alias FieldVsNumberPredicate = bool function(const char[][] fields, size_t index, double value); |alias FieldVsStringPredicate = bool function(const char[][] fields, size_t index, string value); |alias FieldVsIStringPredicate = bool function(const char[][] fields, size_t index, dstring value); |alias FieldVsRegexPredicate = bool function(const char[][] fields, size_t index, Regex!char value); |alias FieldVsFieldPredicate = bool function(const char[][] fields, size_t index1, size_t index2); |alias FieldFieldNumPredicate = bool function(const char[][] fields, size_t index1, size_t index2, double value); | |FieldsPredicate makeFieldUnaryDelegate(FieldUnaryPredicate fn, size_t index) |{ 578| return fields => fn(fields, index); |} | |FieldsPredicate makeFieldVsNumberDelegate(FieldVsNumberPredicate fn, size_t index, double value) |{ 902115| return fields => fn(fields, index, value); |} | |FieldsPredicate makeFieldVsStringDelegate(FieldVsStringPredicate fn, size_t index, string value) |{ 885| return fields => fn(fields, index, value); |} | |FieldsPredicate makeFieldVsIStringDelegate(FieldVsIStringPredicate fn, size_t index, dstring value) |{ 446| return fields => fn(fields, index, value); |} | |FieldsPredicate 
makeFieldVsRegexDelegate(FieldVsRegexPredicate fn, size_t index, Regex!char value) |{ 344| return fields => fn(fields, index, value); |} | |FieldsPredicate makeFieldVsFieldDelegate(FieldVsFieldPredicate fn, size_t index1, size_t index2) |{ 464| return fields => fn(fields, index1, index2); |} | |FieldsPredicate makeFieldFieldNumDelegate(FieldFieldNumPredicate fn, size_t index1, size_t index2, double value) |{ 330| return fields => fn(fields, index1, index2, value); |} | |/* Predicate functions - These are the actual functions used in a FieldsPredicate. They | * are a direct reflection of the operators available via command line args. Each matches | * one of the FieldsPredicate function aliases defined above. | */ 142|bool fldEmpty(const char[][] fields, size_t index) { return fields[index].length == 0; } 69|bool fldNotEmpty(const char[][] fields, size_t index) { return fields[index].length != 0; } 68|bool fldBlank(const char[][] fields, size_t index) { return cast(bool) fields[index].matchFirst(ctRegex!`^\s*$`); } 60|bool fldNotBlank(const char[][] fields, size_t index) { return !fields[index].matchFirst(ctRegex!`^\s*$`); } | 64|bool fldIsNumeric(const char[][] fields, size_t index) { return fields[index].isNumeric; } 120|bool fldIsFinite(const char[][] fields, size_t index) { return fields[index].isNumeric && fields[index].to!double.isFinite; } 60|bool fldIsNaN(const char[][] fields, size_t index) { return fields[index].isNumeric && fields[index].to!double.isNaN; } 60|bool fldIsInfinity(const char[][] fields, size_t index) { return fields[index].isNumeric && fields[index].to!double.isInfinity; } | 200111|bool numLE(const char[][] fields, size_t index, double val) { return fields[index].to!double <= val; } 71|bool numLT(const char[][] fields, size_t index, double val) { return fields[index].to!double < val; } 100082|bool numGE(const char[][] fields, size_t index, double val) { return fields[index].to!double >= val; } 90|bool numGT(const char[][] fields, size_t 
index, double val) { return fields[index].to!double > val; } 600546|bool numEQ(const char[][] fields, size_t index, double val) { return fields[index].to!double == val; } 102|bool numNE(const char[][] fields, size_t index, double val) { return fields[index].to!double != val; } | 30|bool strLE(const char[][] fields, size_t index, string val) { return fields[index] <= val; } 45|bool strLT(const char[][] fields, size_t index, string val) { return fields[index] < val; } 30|bool strGE(const char[][] fields, size_t index, string val) { return fields[index] >= val; } 30|bool strGT(const char[][] fields, size_t index, string val) { return fields[index] > val; } 276|bool strEQ(const char[][] fields, size_t index, string val) { return fields[index] == val; } 54|bool strNE(const char[][] fields, size_t index, string val) { return fields[index] != val; } 236|bool strInFld(const char[][] fields, size_t index, string val) { return fields[index].canFind(val); } 117|bool strNotInFld(const char[][] fields, size_t index, string val) { return !fields[index].canFind(val); } | |/* Note: For istr predicates, the command line value has been lower-cased by fieldVsIStringOptionHander. | */ 177|bool istrEQ(const char[][] fields, size_t index, dstring val) { return fields[index].asLowerCase.equal(val); } 45|bool istrNE(const char[][] fields, size_t index, dstring val) { return !fields[index].asLowerCase.equal(val); } 119|bool istrInFld(const char[][] fields, size_t index, dstring val) { return fields[index].asLowerCase.canFind(val); } 75|bool istrNotInFld(const char[][] fields, size_t index, dstring val) { return !fields[index].asLowerCase.canFind(val); } | |/* Note: Case-sensitivity is built into the regex value, so these regex predicates are | * used for both case-sensitive and case-insensitive regex operators. 
| */ 231|bool regexMatch(const char[][] fields, size_t index, Regex!char val) { return cast(bool) fields[index].matchFirst(val); } 88|bool regexNotMatch(const char[][] fields, size_t index, Regex!char val) { return !fields[index].matchFirst(val); } | 69|bool charLenLE(const char[][] fields, size_t index, double val) { return fields[index].byGrapheme.walkLength <= val; } 123|bool charLenLT(const char[][] fields, size_t index, double val) { return fields[index].byGrapheme.walkLength < val; } 185|bool charLenGE(const char[][] fields, size_t index, double val) { return fields[index].byGrapheme.walkLength >= val; } 42|bool charLenGT(const char[][] fields, size_t index, double val) { return fields[index].byGrapheme.walkLength > val; } 72|bool charLenEQ(const char[][] fields, size_t index, double val) { return fields[index].byGrapheme.walkLength == val; } 72|bool charLenNE(const char[][] fields, size_t index, double val) { return fields[index].byGrapheme.walkLength != val; } | 42|bool byteLenLE(const char[][] fields, size_t index, double val) { return fields[index].length <= val; } 42|bool byteLenLT(const char[][] fields, size_t index, double val) { return fields[index].length < val; } 121|bool byteLenGE(const char[][] fields, size_t index, double val) { return fields[index].length >= val; } 42|bool byteLenGT(const char[][] fields, size_t index, double val) { return fields[index].length > val; } 72|bool byteLenEQ(const char[][] fields, size_t index, double val) { return fields[index].length == val; } 72|bool byteLenNE(const char[][] fields, size_t index, double val) { return fields[index].length != val; } | 45|bool ffLE(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double <= fields[index2].to!double; } 45|bool ffLT(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double < fields[index2].to!double; } 30|bool ffGE(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double >= 
fields[index2].to!double; } 45|bool ffGT(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double > fields[index2].to!double; } 60|bool ffEQ(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double == fields[index2].to!double; } 60|bool ffNE(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double != fields[index2].to!double; } 45|bool ffStrEQ(const char[][] fields, size_t index1, size_t index2) { return fields[index1] == fields[index2]; } 45|bool ffStrNE(const char[][] fields, size_t index1, size_t index2) { return fields[index1] != fields[index2]; } |bool ffIStrEQ(const char[][] fields, size_t index1, size_t index2) |{ 30| return equal(fields[index1].asLowerCase, fields[index2].asLowerCase); |} |bool ffIStrNE(const char[][] fields, size_t index1, size_t index2) |{ 30| return !equal(fields[index1].asLowerCase, fields[index2].asLowerCase); |} | 140|auto AbsDiff(double v1, double v2) { return (v1 - v2).abs; } 168|auto RelDiff(double v1, double v2) { return (v1 - v2).abs / min(v1.abs, v2.abs); } | |bool ffAbsDiffLE(const char[][] fields, size_t index1, size_t index2, double value) |{ 84| return AbsDiff(fields[index1].to!double, fields[index2].to!double) <= value; |} |bool ffAbsDiffGT(const char[][] fields, size_t index1, size_t index2, double value) |{ 56| return AbsDiff(fields[index1].to!double, fields[index2].to!double) > value; |} |bool ffRelDiffLE(const char[][] fields, size_t index1, size_t index2, double value) |{ 84| return RelDiff(fields[index1].to!double, fields[index2].to!double) <= value; |} |bool ffRelDiffGT(const char[][] fields, size_t index1, size_t index2, double value) |{ 84| return RelDiff(fields[index1].to!double, fields[index2].to!double) > value; |} | |/* Command line option handlers - There is a command line option handler for each | * predicate type. That is, one each for FieldUnaryPredicate, FieldVsNumberPredicate, | * etc. 
Option handlers are passed the tests array, the predicate function, and the | * command line option arguments. A FieldsPredicate delegate is created and appended to | * the tests array. An exception is thrown if errors are detected while processing the | * option, the error text is intended for the end user. | * | * All the option handlers have similar functionality, differing in option processing and | * error message generation. fieldVsNumberOptionHandler is described as an example. It | * handles command options such as '--lt 3:1000', which tests field 3 for a values less | * than 1000. It is passed the tests array, the 'numLE' predicate function used for the | * test, and the string "3:1000" representing the option value. It is also passed the | * header line from the first input file and an indication of whether header processing | * is enabled (--H|header). parseFieldList (fieldlist module) is used to parse the | * field-list component of the option ("3" in the example). The comparison value ("1000") | * is converted to a double. These are wrapped in a FieldsPredicate delegate which is | * added to the tests array. An error is signaled if the option string is invalid. | * | * During processing, fields indexes are converted from one-based to zero-based. As an | * optimization, the maximum field index is also tracked. This allows early termination of | * line splitting. | * | * The header line from the input file is not available when std.getop processes the | * command line option. The processing described above must be deferred. This is done | * using a 'CmdOptionHandler' delegate. There is a 'make' function for every Command line | * option handler that creates these. These are created during std.getopt processing. | * They are run when the header line becomes available. 
| * | * The final setup for the '--lt' (numeric less-than) operator' is as follows: | * - Function 'handlerNumLE' (in TsvFilterOptions.processArgs) is associated with the | * command line option "--lt ". When called by std.getopt it creates an option | * hander delegate via 'makeFieldVsNumberOptionHandler'. This is appended to an | * array of delegates. | * - 'fieldVsNumberOptionHandler' is invoked via the delegate after the header line | * becomes available (in TsvFilterOptions.processArgs). If args are valid, | * 'makeFieldVsNumberDelegate' is used to create a delegate invoking the 'numLE' | * predicate function. This delegate is added to the set of run-time tests. | * | * Note that in the above setup the 'numLE' predicate is specified in 'handlerNumLE' | * and passed through all the steps. This is how the command line option gets | * associated with the predicate function. | */ | |/* CmdOptionHandler delegate signature - This is the call made to process the command | * line option arguments after the header line has been read. 
| */ |alias CmdOptionHandler = void delegate(ref FieldsPredicate[] tests, ref size_t maxFieldIndex, | bool hasHeader, string[] headerFields); | |CmdOptionHandler makeFieldUnaryOptionHandler(FieldUnaryPredicate predicateFn, string option, string optionVal) |{ 48| return | (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields) 92| => fieldUnaryOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal); |} | |void fieldUnaryOptionHandler( | ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields, | FieldUnaryPredicate fn, string option, string optionVal) |{ | import tsv_utils.common.fieldlist; | 327| try foreach (fieldNum, fieldIndex; | optionVal | .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, headerFields) | .enumerate(1)) | { 47| tests ~= makeFieldUnaryDelegate(fn, fieldIndex); 94| maxFieldIndex = (fieldIndex > maxFieldIndex) ? fieldIndex : maxFieldIndex; | } | catch (Exception e) | { 4| e.msg = format("Invalid option: [--%s %s]. %s\n Expected: '--%s ' or '--%s '.", | option, optionVal, e.msg, option, option); 4| throw e; | } |} | |CmdOptionHandler makeFieldVsNumberOptionHandler(FieldVsNumberPredicate predicateFn, string option, string optionVal) |{ 147| return | (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields) 278| => fieldVsNumberOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal); |} | |void fieldVsNumberOptionHandler( | ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields, | FieldVsNumberPredicate fn, string option, string optionVal) |{ | import tsv_utils.common.fieldlist; | | auto formatErrorMsg(string option, string optionVal, string errorMessage="") | { 24| string optionalSpace = (errorMessage.length == 0) ? 
"" : " "; 12| return format( | "Invalid option: [--%s %s].%s%s\n Expected: '--%s :' or '--%s : where is a number.", | option, optionVal, optionalSpace, errorMessage, option, option); | } | | try | { 145| auto optionValParse = | optionVal | .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | (hasHeader, headerFields); | 138| auto fieldIndices = optionValParse.array; 142| enforce(optionVal.length - optionValParse.consumed > 1, "No value after field list."); 134| double value = optionVal[optionValParse.consumed + 1 .. $].to!double; | 876| foreach (fieldIndex; fieldIndices) | { 159| tests ~= makeFieldVsNumberDelegate(fn, fieldIndex, value); 318| maxFieldIndex = (fieldIndex > maxFieldIndex) ? fieldIndex : maxFieldIndex; | } | } | catch (Exception e) | { 12| e.msg = formatErrorMsg(option, optionVal, e.msg); 12| throw e; | } |} | |CmdOptionHandler makeFieldVsStringOptionHandler(FieldVsStringPredicate predicateFn, string option, string optionVal) |{ 70| return | (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields) 131| => fieldVsStringOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal); |} | |void fieldVsStringOptionHandler( | ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields, | FieldVsStringPredicate fn, string option, string optionVal) |{ | import tsv_utils.common.fieldlist; | | try | { 70| auto optionValParse = | optionVal | .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | (hasHeader, headerFields); | 62| auto fieldIndices = optionValParse.array; 63| enforce(optionVal.length - optionValParse.consumed > 1, "No value after field list."); 61| string value = optionVal[optionValParse.consumed + 1 .. 
$].idup; | 384| foreach (fieldIndex; fieldIndices) | { 67| tests ~= makeFieldVsStringDelegate(fn, fieldIndex, value); 134| maxFieldIndex = (fieldIndex > maxFieldIndex) ? fieldIndex : maxFieldIndex; | } | | } | catch (Exception e) | { 9| e.msg = format( | "[--%s %s]. %s\n Expected: '--%s :' or '--%s :' where is a string.", | option, optionVal, e.msg, option, option); 9| throw e; | } |} | |CmdOptionHandler makeFieldVsIStringOptionHandler(FieldVsIStringPredicate predicateFn, string option, string optionVal) |{ 32| return | (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields) 59| => fieldVsIStringOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal); |} | |/* The fieldVsIStringOptionHandler lower-cases the command line argument, assuming the | * case-insensitive comparison will be done on lower-cased values. | */ |void fieldVsIStringOptionHandler( | ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields, | FieldVsIStringPredicate fn, string option, string optionVal) |{ | import tsv_utils.common.fieldlist; | | try | { 32| auto optionValParse = | optionVal | .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | (hasHeader, headerFields); | 28| auto fieldIndices = optionValParse.array; 29| enforce(optionVal.length - optionValParse.consumed > 1, "No value after field list."); 27| string value = optionVal[optionValParse.consumed + 1 .. $].idup; | 171| foreach (fieldIndex; fieldIndices) | { 30| tests ~= makeFieldVsIStringDelegate(fn, fieldIndex, value.to!dstring.toLower); 60| maxFieldIndex = (fieldIndex > maxFieldIndex) ? fieldIndex : maxFieldIndex; | } | } | catch (Exception e) | { 5| e.msg = format( | "[--%s %s]. 
%s\n Expected: '--%s :' or '--%s :' where is a string.", | option, optionVal, e.msg, option, option); 5| throw e; | } |} | |CmdOptionHandler makeFieldVsRegexOptionHandler(FieldVsRegexPredicate predicateFn, string option, string optionVal, bool caseSensitive) |{ 34| return | (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields) 53| => fieldVsRegexOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal, caseSensitive); |} | |void fieldVsRegexOptionHandler( | ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields, | FieldVsRegexPredicate fn, string option, string optionVal, bool caseSensitive) |{ | import tsv_utils.common.fieldlist; | | try | { 34| auto optionValParse = | optionVal | .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | (hasHeader, headerFields); | 25| auto fieldIndices = optionValParse.array; 27| enforce(optionVal.length - optionValParse.consumed > 1, "No value after field list."); | 46| immutable modifiers = caseSensitive ? "" : "i"; 23| Regex!char value = | optionVal[optionValParse.consumed + 1 .. $] | .regex(modifiers); | 132| foreach (fieldIndex; fieldIndices) | { 25| tests ~= makeFieldVsRegexDelegate(fn, fieldIndex, value); 50| maxFieldIndex = (fieldIndex > maxFieldIndex) ? fieldIndex : maxFieldIndex; | } | } | catch (RegexException e) | { 4| e.msg = format( | "[--%s %s]. Invalid regular expression: %s\n Expected: '--%s :' or '--%s :' where is a regular expression.", | option, optionVal, e.msg, option, option); 4| throw e; | } | catch (Exception e) | { 11| e.msg = format( | "[--%s %s]. 
%s\n Expected: '--%s :' or '--%s :' where is a regular expression.", | option, optionVal, e.msg, option, option); 11| throw e; | } |} | | |CmdOptionHandler makeFieldVsFieldOptionHandler(FieldVsFieldPredicate predicateFn, string option, string optionVal) |{ 41| return | (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields) 70| => fieldVsFieldOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal); |} | |void fieldVsFieldOptionHandler( | ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields, | FieldVsFieldPredicate fn, string option, string optionVal) |{ | import tsv_utils.common.fieldlist; | | try | { 41| auto optionValParse = | optionVal | .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | (hasHeader, headerFields); | 34| auto fieldIndices1 = optionValParse.array; | 34| enforce(fieldIndices1.length != 0, "First field argument is empty."); 34| enforce(fieldIndices1.length == 1, "First field argument references multiple fields."); 35| enforce(optionVal.length - optionValParse.consumed > 1, " Second field argument is empty."); | 33| auto fieldIndices2 = | optionVal[optionValParse.consumed + 1 .. $] | .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, Yes.consumeEntireFieldListString) | (hasHeader, headerFields) | .array; | 30| enforce(fieldIndices2.length != 0, "Second field argument is empty."); 30| enforce(fieldIndices2.length == 1, "Second field argument references multiple fields."); | 30| enforce(fieldIndices1[0] != fieldIndices2[0], 1| format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal)); | 29| tests ~= makeFieldVsFieldDelegate(fn, fieldIndices1[0], fieldIndices2[0]); 29| maxFieldIndex = max(maxFieldIndex, fieldIndices1[0], fieldIndices2[0]); | } | catch (Exception e) | { 12| e.msg = format( | "[--%s %s]. 
%s\n Expected: '--%s :' where and are individual fields.", | option, optionVal, e.msg, option); 12| throw e; | } |} | |CmdOptionHandler makeFieldFieldNumOptionHandler(FieldFieldNumPredicate predicateFn, string option, string optionVal) |{ 31| return | (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields) 53| => fieldFieldNumOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal); |} | |void fieldFieldNumOptionHandler( | ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields, | FieldFieldNumPredicate fn, string option, string optionVal) |{ | import tsv_utils.common.fieldlist; | | try | { 31| auto optionValParse1 = | optionVal | .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | (hasHeader, headerFields); | 28| auto fieldIndices1 = optionValParse1.array; | 28| enforce(fieldIndices1.length != 0, "First field argument is empty."); 28| enforce(fieldIndices1.length == 1, "First field argument references multiple fields."); 28| enforce(optionVal.length - optionValParse1.consumed > 1, " Second field argument is empty."); | 28| auto optionValSegment2 = optionVal[optionValParse1.consumed + 1 .. $]; 28| auto optionValParse2 = | optionValSegment2 | .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | (hasHeader, headerFields); | 25| auto fieldIndices2 = optionValParse2.array; | 25| enforce(fieldIndices2.length != 0, "Second field argument is empty."); 25| enforce(fieldIndices2.length == 1, "Second field argument references multiple fields."); 26| enforce(optionValSegment2.length - optionValParse2.consumed > 1, "Number argument is empty."); | 24| size_t field1 = fieldIndices1[0]; 24| size_t field2 = fieldIndices2[0]; 24| double value = optionValSegment2[optionValParse2.consumed + 1 .. 
$].to!double; | 23| enforce(field1 != field2, 1| format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal)); | 22| tests ~= makeFieldFieldNumDelegate(fn, field1, field2, value); 22| maxFieldIndex = max(maxFieldIndex, field1, field2); | } | catch (Exception e) | { 9| e.msg = format( | "[--%s %s]. %s\n Expected: '--%s ::' where and are individual fields.", | option, optionVal, e.msg, option); 9| throw e; | } |} | |/** Command line options - This struct holds the results of command line option processing. | * It also has a method, processArgs, that invokes command line arg processing. | */ |struct TsvFilterOptions |{ | import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader; | | string programName; | InputSourceRange inputSources; /// Input files | FieldsPredicate[] tests; /// Derived from tests | size_t maxFieldIndex; /// Derived from tests | bool hasHeader = false; /// --H|header | bool invert = false; /// --invert | bool disjunct = false; /// --or | char delim = '\t'; /// --delimiter | | /* Returns a tuple. First value is true if command line arguments were successfully | * processed and execution should continue, or false if an error occurred or the user | * asked for help. If false, the second value is the appropriate exit code (0 or 1). | * | * Returning true (execution continues) means args have been validated and the | * tests array has been established. | */ | auto processArgs (ref string[] cmdArgs) | { | import std.algorithm : each; | import std.array : split; | import std.conv : to; | import std.getopt; | import std.path : baseName, stripExtension; | import tsv_utils.common.getopt_inorder; | import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix; | 366| bool helpVerbose = false; // --help-verbose 366| bool helpOptions = false; // --help-options 366| bool helpFields = false; // --help-fields 366| bool versionWanted = false; // --V|version | 732| programName = (cmdArgs.length > 0) ? 
cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; | | /* Command option handlers - One handler for each option. These conform to the | * getopt required handler signature, and separate knowledge the specific command | * option text from the option processing. | */ | 366| CmdOptionHandler[] cmdLineTestOptions; | 17| void handlerFldEmpty(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldEmpty, option, value); } 7| void handlerFldNotEmpty(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldNotEmpty, option, value); } 6| void handlerFldBlank(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldBlank, option, value); } 6| void handlerFldNotBlank(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldNotBlank, option, value); } | 4| void handlerFldIsNumeric(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldIsNumeric, option, value); } 4| void handlerFldIsFinite(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldIsFinite, option, value); } 2| void handlerFldIsNaN(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldIsNaN, option, value); } 2| void handlerFldIsInfinity(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldIsInfinity, option, value); } | 20| void handlerNumLE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numLE, option, value); } 5| void handlerNumLT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numLT, option, value); } 12| void handlerNumGE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numGE, option, value); } 8| void handlerNumGT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numGT, option, value); } 54| void handlerNumEQ(string option, 
string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numEQ, option, value); } 8| void handlerNumNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numNE, option, value); } | 2| void handlerStrLE(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strLE, option, value); } 3| void handlerStrLT(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strLT, option, value); } 2| void handlerStrGE(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strGE, option, value); } 4| void handlerStrGT(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strGT, option, value); } 27| void handlerStrEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strEQ, option, value); } 7| void handlerStrNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strNE, option, value); } | 17| void handlerStrInFld(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strInFld, option, value); } 8| void handlerStrNotInFld(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strNotInFld, option, value); } | 16| void handlerIStrEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsIStringOptionHandler(&istrEQ, option, value); } 3| void handlerIStrNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsIStringOptionHandler(&istrNE, option, value); } 8| void handlerIStrInFld(string option, string value) { cmdLineTestOptions ~= makeFieldVsIStringOptionHandler(&istrInFld, option, value); } 5| void handlerIStrNotInFld(string option, string value) { cmdLineTestOptions ~= makeFieldVsIStringOptionHandler(&istrNotInFld, option, value); } | 17| void handlerRegexMatch(string option, string value) { cmdLineTestOptions ~= makeFieldVsRegexOptionHandler(®exMatch, option, value, true); 
} 4| void handlerRegexNotMatch(string option, string value) { cmdLineTestOptions ~= makeFieldVsRegexOptionHandler(®exNotMatch, option, value, true); } 10| void handlerIRegexMatch(string option, string value) { cmdLineTestOptions ~= makeFieldVsRegexOptionHandler(®exMatch, option, value, false); } 3| void handlerIRegexNotMatch(string option, string value) { cmdLineTestOptions ~= makeFieldVsRegexOptionHandler(®exNotMatch, option, value, false); } | 3| void handlerCharLenLE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenLE, option, value); } 5| void handlerCharLenLT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenLT, option, value); } 5| void handlerCharLenGE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenGE, option, value); } 2| void handlerCharLenGT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenGT, option, value); } 4| void handlerCharLenEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenEQ, option, value); } 4| void handlerCharLenNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenNE, option, value); } | 2| void handlerByteLenLE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenLE, option, value); } 2| void handlerByteLenLT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenLT, option, value); } 3| void handlerByteLenGE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenGE, option, value); } 2| void handlerByteLenGT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenGT, option, value); } 4| void handlerByteLenEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenEQ, option, value); } 4| void 
handlerByteLenNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenNE, option, value); } | 8| void handlerFFLE(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffLE, option, value); } 4| void handlerFFLT(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffLT, option, value); } 2| void handlerFFGE(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffGE, option, value); } 5| void handlerFFGT(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffGT, option, value); } 5| void handlerFFEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffEQ, option, value); } 5| void handlerFFNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffNE, option, value); } | 4| void handlerFFStrEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffStrEQ, option, value); } 4| void handlerFFStrNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffStrNE, option, value); } 2| void handlerFFIStrEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffIStrEQ, option, value); } 2| void handlerFFIStrNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffIStrNE, option, value); } | 15| void handlerFFAbsDiffLE(string option, string value) { cmdLineTestOptions ~= makeFieldFieldNumOptionHandler(&ffAbsDiffLE, option, value); } 4| void handlerFFAbsDiffGT(string option, string value) { cmdLineTestOptions ~= makeFieldFieldNumOptionHandler(&ffAbsDiffGT, option, value); } 6| void handlerFFRelDiffLE(string option, string value) { cmdLineTestOptions ~= makeFieldFieldNumOptionHandler(&ffRelDiffLE, option, value); } 6| void handlerFFRelDiffGT(string option, string value) { cmdLineTestOptions ~= makeFieldFieldNumOptionHandler(&ffRelDiffGT, 
option, value); } | | try | { 366| arraySep = ","; // Use comma to separate values in command line options 366| auto r = getoptInorder( | cmdArgs, | "help-verbose", " Print full help.", &helpVerbose, | "help-options", " Print the options list by itself.", &helpOptions, | "help-fields", " Print help on specifying fields.", &helpFields, | std.getopt.config.caseSensitive, | "V|version", " Print version information and exit.", &versionWanted, | "H|header", " Treat the first line of each file as a header.", &hasHeader, | std.getopt.config.caseInsensitive, | "or", " Evaluate tests as an OR rather than an AND.", &disjunct, | std.getopt.config.caseSensitive, | "v|invert", " Invert the filter, printing lines that do not match.", &invert, | std.getopt.config.caseInsensitive, | "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim, | | "empty", " True if FIELD is empty.", &handlerFldEmpty, | "not-empty", " True if FIELD is not empty.", &handlerFldNotEmpty, | "blank", " True if FIELD is empty or all whitespace.", &handlerFldBlank, | "not-blank", " True if FIELD contains a non-whitespace character.", &handlerFldNotBlank, | | "is-numeric", " True if FIELD is interpretable as a number.", &handlerFldIsNumeric, | "is-finite", " True if FIELD is interpretable as a number and is not NaN or infinity.", &handlerFldIsFinite, | "is-nan", " True if FIELD is NaN.", &handlerFldIsNaN, | "is-infinity", " True if FIELD is infinity.", &handlerFldIsInfinity, | | "le", ":NUM FIELD <= NUM (numeric).", &handlerNumLE, | "lt", ":NUM FIELD < NUM (numeric).", &handlerNumLT, | "ge", ":NUM FIELD >= NUM (numeric).", &handlerNumGE, | "gt", ":NUM FIELD > NUM (numeric).", &handlerNumGT, | "eq", ":NUM FIELD == NUM (numeric).", &handlerNumEQ, | "ne", ":NUM FIELD != NUM (numeric).", &handlerNumNE, | | "str-le", ":STR FIELD <= STR (string).", &handlerStrLE, | "str-lt", ":STR FIELD < STR (string).", &handlerStrLT, | "str-ge", ":STR FIELD >= STR (string).", &handlerStrGE, 
| "str-gt", ":STR FIELD > STR (string).", &handlerStrGT, | "str-eq", ":STR FIELD == STR (string).", &handlerStrEQ, | "istr-eq", ":STR FIELD == STR (string, case-insensitive).", &handlerIStrEQ, | "str-ne", ":STR FIELD != STR (string).", &handlerStrNE, | "istr-ne", ":STR FIELD != STR (string, case-insensitive).", &handlerIStrNE, | "str-in-fld", ":STR FIELD contains STR (substring search).", &handlerStrInFld, | "istr-in-fld", ":STR FIELD contains STR (substring search, case-insensitive).", &handlerIStrInFld, | "str-not-in-fld", ":STR FIELD does not contain STR (substring search).", &handlerStrNotInFld, | "istr-not-in-fld", ":STR FIELD does not contain STR (substring search, case-insensitive).", &handlerIStrNotInFld, | | "regex", ":REGEX FIELD matches regular expression.", &handlerRegexMatch, | "iregex", ":REGEX FIELD matches regular expression, case-insensitive.", &handlerIRegexMatch, | "not-regex", ":REGEX FIELD does not match regular expression.", &handlerRegexNotMatch, | "not-iregex", ":REGEX FIELD does not match regular expression, case-insensitive.", &handlerIRegexNotMatch, | | "char-len-le", ":NUM character-length(FIELD) <= NUM.", &handlerCharLenLE, | "char-len-lt", ":NUM character-length(FIELD) < NUM.", &handlerCharLenLT, | "char-len-ge", ":NUM character-length(FIELD) >= NUM.", &handlerCharLenGE, | "char-len-gt", ":NUM character-length(FIELD) > NUM.", &handlerCharLenGT, | "char-len-eq", ":NUM character-length(FIELD) == NUM.", &handlerCharLenEQ, | "char-len-ne", ":NUM character-length(FIELD) != NUM.", &handlerCharLenNE, | | "byte-len-le", ":NUM byte-length(FIELD) <= NUM.", &handlerByteLenLE, | "byte-len-lt", ":NUM byte-length(FIELD) < NUM.", &handlerByteLenLT, | "byte-len-ge", ":NUM byte-length(FIELD) >= NUM.", &handlerByteLenGE, | "byte-len-gt", ":NUM byte-length(FIELD) > NUM.", &handlerByteLenGT, | "byte-len-eq", ":NUM byte-length(FIELD) == NUM.", &handlerByteLenEQ, | "byte-len-ne", ":NUM byte-length(FIELD) != NUM.", &handlerByteLenNE, | | "ff-le", 
"FIELD1:FIELD2 FIELD1 <= FIELD2 (numeric).", &handlerFFLE, | "ff-lt", "FIELD1:FIELD2 FIELD1 < FIELD2 (numeric).", &handlerFFLT, | "ff-ge", "FIELD1:FIELD2 FIELD1 >= FIELD2 (numeric).", &handlerFFGE, | "ff-gt", "FIELD1:FIELD2 FIELD1 > FIELD2 (numeric).", &handlerFFGT, | "ff-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (numeric).", &handlerFFEQ, | "ff-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (numeric).", &handlerFFNE, | "ff-str-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (string).", &handlerFFStrEQ, | "ff-istr-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (string, case-insensitive).", &handlerFFIStrEQ, | "ff-str-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (string).", &handlerFFStrNE, | "ff-istr-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (string, case-insensitive).", &handlerFFIStrNE, | | "ff-absdiff-le", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) <= NUM", &handlerFFAbsDiffLE, | "ff-absdiff-gt", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) > NUM", &handlerFFAbsDiffGT, | "ff-reldiff-le", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) <= NUM", &handlerFFRelDiffLE, | "ff-reldiff-gt", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) > NUM", &handlerFFRelDiffGT, | ); | | /* Both help texts are a bit long. In this case, for "regular" help, don't | * print options, just the text. The text summarizes the options. | */ 363| if (r.helpWanted) | { 1| stdout.write(helpText); 1| return tuple(false, 0); | } 362| else if (helpVerbose) | { 1| defaultGetoptPrinter(helpTextVerbose, r.options); 1| return tuple(false, 0); | } 361| else if (helpOptions) | { 1| defaultGetoptPrinter(helpTextOptions, r.options); 1| return tuple(false, 0); | } 360| else if (helpFields) | { | import tsv_utils.common.fieldlist : fieldListHelpText ; 1| writeln(fieldListHelpText); 1| return tuple(false, 0); | } 359| else if (versionWanted) | { | import tsv_utils.common.tsvutils_version; 2| writeln(tsvutilsVersionNotice("tsv-filter")); 2| return tuple(false, 0); | } | | /* Input files. 
Remaining command line args are files. */ 714| string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"]; 357| cmdArgs.length = 1; | 357| string[] headerFields; | | /* FieldListArgProcessing encapsulates the field list processing. It is | * called prior to reading the header line if headers are not being used, | * and after if headers are being used. | */ | void fieldListArgProcessing() | { 1091| cmdLineTestOptions.each!(dg => dg(tests, maxFieldIndex, hasHeader, headerFields)); | } | 392| if (!hasHeader) fieldListArgProcessing(); | 692| ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader; 346| inputSources = inputSourceRange(filepaths, readHeader); | 345| if (hasHeader) | { 321| throwIfWindowsNewlineOnUnix(inputSources.front.header, inputSources.front.name, 1); 320| headerFields = inputSources.front.header.split(delim).to!(string[]); 320| fieldListArgProcessing(); | } | } | catch (Exception e) | { 71| stderr.writefln("[%s] Error processing command line arguments: %s", programName, e.msg); 71| return tuple(false, 1); | } 289| return tuple(true, 0); | } |} | |/** tsvFilter processes the input files and runs the tests. | */ |void tsvFilter(ref TsvFilterOptions cmdopt) |{ | import std.algorithm : all, any, splitter; | import std.range; | import tsv_utils.common.utils : BufferedOutputRange, bufferedByLine, InputSourceRange, | throwIfWindowsNewlineOnUnix; | | /* inputSources must be an InputSourceRange and include at least stdin. */ 289| assert(!cmdopt.inputSources.empty); | static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); | | /* BufferedOutputRange improves performance on narrow files with high percentages of | * writes. Want responsive output if output is rare, so ensure the first matched | * line is written, and that writes separated by long stretches of non-matched lines | * are written. 
| */ | enum maxInputLinesWithoutBufferFlush = 1024; 289| size_t inputLinesWithoutBufferFlush = maxInputLinesWithoutBufferFlush + 1; | 578| auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); | | /* First header is read during command line argument processing. Immediately | * flush it so subsequent processes in a unix command pipeline see it early. | * This helps provide timely error messages. | */ 554| if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) | { 264| auto inputStream = cmdopt.inputSources.front; 264| bufferedOutput.appendln(inputStream.header); 264| bufferedOutput.flush; | } | | /* Process each input file, one line at a time. */ 578| immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 289| auto lineFields = new char[][](cmdopt.maxFieldIndex + 1); | 1457| foreach (inputStream; cmdopt.inputSources) | { 569| if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); | 1521912| foreach (lineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine)) | { 304290| if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum); | | /* Copy the needed number of fields to the fields array. */ 304264| int fieldIndex = -1; 1549624| foreach (fieldValue; line.splitter(cmdopt.delim)) | { 318622| if (fieldIndex == cast(long) cmdopt.maxFieldIndex) break; 312714| fieldIndex++; 312714| lineFields[fieldIndex] = fieldValue; | } | 304264| if (fieldIndex == -1) | { 16| assert(line.length == 0); | /* Bug work-around. Currently empty lines are not handled properly by splitter. | * Bug: https://issues.dlang.org/show_bug.cgi?id=15735 | * Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030 | * Work-around: Point to the line. It's an empty string. | */ 16| fieldIndex++; 16| lineFields[fieldIndex] = line; | } | 304264| enforce(fieldIndex >= cast(long) cmdopt.maxFieldIndex, 1| format("Not enough fields in line. 
File: %s, Line: %s", | inputStream.name, lineNum)); | | /* Run the tests. Tests will fail (throw) if a field cannot be converted | * to the expected type. | */ | try | { 304263| inputLinesWithoutBufferFlush++; 304263| bool passed = cmdopt.disjunct ? 1000364| cmdopt.tests.any!(x => x(lineFields)) : 208682| cmdopt.tests.all!(x => x(lineFields)); 304331| if (cmdopt.invert) passed = !passed; 304261| if (passed) | { 31743| const bool wasFlushed = bufferedOutput.appendln(line); 31743| if (wasFlushed) inputLinesWithoutBufferFlush = 0; 31743| else if (inputLinesWithoutBufferFlush > maxInputLinesWithoutBufferFlush) | { 311| bufferedOutput.flush; 311| inputLinesWithoutBufferFlush = 0; | } | } | } | catch (Exception e) | { 2| throw new Exception( | format("Could not process line or field: %s\n File: %s Line: %s%s", | e.msg, inputStream.name, lineNum, 2| (lineNum == 1) ? "\n Is this a header line? Use --header to skip." : "")); | } | } | } |} tsv-filter/src/tsv_utils/tsv-filter.d is 100% covered <<<<<< EOF # path=./common-src-tsv_utils-common-fieldlist.lst |/** | Utilities for parsing "field-lists" entered on the command line. | | # Field-lists | | A "field-list" is entered on the command line to specify a set of fields for a | command option. A field-list is a comma separated list of individual fields and | "field-ranges". Fields are identified either by field number or by field names found | in the header line of the input data. A field-range is a pair of fields separated | by a hyphen and includes both the listed fields and all the fields in between. | | $(NOTE Note: Internally, the comma separated entries in a field-list are called a | field-group.) | | Fields-lists are parsed into an ordered set of one-based field numbers. Repeating | fields are allowed. 
Some examples of numeric fields with the `tsv-select` tool: | | $(CONSOLE | $ tsv-select -f 3 # Field 3 | $ tsv-select -f 3-5 # Fields 3,4,5 | $ tsv-select -f 7,3-5 # Fields 7,3,4,5 | $ tsv-select -f 3,5-3,5 # Fields 3,5,4,3,5 | ) | | Fields specified by name must match a name in the header line of the input data. | Glob-style wildcards are supported using the asterisk (`*`) character. When | wildcards are used with a single field, all matching fields in the header are used. | When used in a field range, both field names must match a single header field. | | Consider a file `data.tsv` containing timing information: | | $(CONSOLE | $ tsv-pretty data.tsv | run elapsed_time user_time system_time max_memory | 1 57.5 52.0 5.5 1420 | 2 52.0 49.0 3.0 1270 | 3 55.5 51.0 4.5 1410 | ) | | The header fields are: | | ``` | 1 run | 2 elapsed_time | 3 user_time | 4 system_time | 5 max_memory | ``` | | Some examples using named fields for this file. (Note: `-H` turns on header processing): | | $(CONSOLE | $ tsv-select data.tsv -H -f user_time # Field 3 | $ tsv-select data.tsv -H -f run,user_time # Fields 1,3 | $ tsv-select data.tsv -H -f run-user_time # Fields 1,2,3 | $ tsv-select data.tsv -H -f '*_memory' # Field 5 | $ tsv-select data.tsv -H -f '*_time' # Fields 2,3,4 | $ tsv-select data.tsv -H -f '*_time,*_memory' # Fields 2,3,4,5 | $ tsv-select data.tsv -H -f '*_memory,*_time' # Fields 5,2,3,4 | $ tsv-select data.tsv -H -f 'run-*_time' # Invalid range. '*_time' matches 3 fields | ) | | Both field numbers and fields names can both be used in the same field-list, except | when specifying a field range: | | $(CONSOLE | $ tsv-select data.tsv -H -f 1,user_time # Fields 1,3 | $ tsv-select data.tsv -H -f 1-user_time # Invalid range | ) | | A backslash is used to escape special characters occurring in field names. Characters | that must be escaped when specifying them field names are: asterisk (`*`), comma(`,`), | colon (`:`), space (` `), hyphen (`-`), and backslash (`\`). 
A backslash is also used | to escape numbers that should be treated as field names rather than field numbers. | Consider a file with the following header fields: | ``` | 1 test id | 2 run:id | 3 time-stamp | 4 001 | 5 100 | ``` | | These fields can be used in named field commands as follows: | | $(CONSOLE | $ tsv-select file.tsv -H -f 'test\ id' # Field 1 | $ tsv-select file.tsv -H -f 'run\:1' # Field 2 | $ tsv-select file.tsv -H -f 'time\-stamp' # Field 3 | $ tsv-select file.tsv -H -f '\001' # Field 4 | $ tsv-select file.tsv -H -f '\100' # Field 5 | $ tsv-select file.tsv -H -f '\001,\100' # Fields 4,5 | ) | | $(NOTE Note: The use of single quotes on the command line is necessary to avoid shell | interpretation of the backslash character.) | | Fields lists are combined with other content in some command line options. The colon | and space characters are both terminator characters for field-lists. Some examples: | | $(CONSOLE | $ tsv-filter -H --lt 3:100 # Field 3 < 100 | $ tsv-filter -H --lt elapsed_time:100 # 'elapsed_time' field < 100 | $ tsv-summarize -H --quantile '*_time:0.25,0.75' # 1st and 3rd quantiles for time fields | ) | | Field-list support routines identify the termination of the field-list. They do not | do any processing of content occurring after the field-list. | | # Numeric field-lists | | The original field-lists used in tsv-utils were numeric only. This is still the | format used when a header line is not available. They are a strict subset of the | field-list syntax described so above. Due to this history there are support routines | that only support numeric field-lists. They are used by tools supporting only numeric | field lists. They are also used by the more general field-list processing routines in | this file when a named field or field range can be reduced to a numeric field-group. 
| | # Field-list utilities | | The following functions provide the APIs for field-list processing: | | $(LIST | * [parseFieldList] - The main routine for parsing a field-list entered on the | command line. It returns a range iterating over the field numbers represented | by field-list. It handles both numeric and named field-lists and works with or | without header lines. The range has a special member function that tracks how | much of the original input range has been consumed. | | * [parseNumericFieldList] - This is a top-level routine for processing numeric | field-lists entered on the command line. It was the original routine used by | tsv-utils tools when only numeric field-lists where supported. It is still | used in cases where only numeric field-lists are supported. | | * [makeFieldListOptionHandler] - Returns a delegate that can be passed to | std.getopt for parsing numeric field-lists. It was part of the original code | supporting numeric field-lists. Note that delegates passed to std.getopt do | not have access to the header line of the input file, so the technique can | only be used for numeric field-lists. | | * [fieldListHelpText] - A global variable containing help text describing the | field list syntax that can be shown to end users. | ) | | The following private functions handle key parts of the implementation: | | $(LIST | * [findFieldGroups] - Range that iterates over the "field-groups" in a | "field-list". | | * [isNumericFieldGroup] - Determines if a field-group is a valid numeric | field-group. | | * [isNumericFieldGroupWithHyphenFirstOrLast] - Determines if a field-group is a | valid numeric field-group, except for having a leading or trailing hyphen. | This test is used to provide better error messages. A field-group that does not | pass either [isNumericFieldGroup] or [isNumericFieldGroupWithHyphenFirstOrLast] | is processed as a named field-group. 
| | * [isMixedNumericNamedFieldGroup] - determines if a field group is a range where | one element is a field number and the other element is a named field (not a | number). This is used for error handling. | | * [namedFieldGroupToRegex] - Generates regexes for matching field names in a | field group to field names in the header line. One regex is generated for a | single field, two are generated for a range. Wildcards and escape characters | are translated into the correct regex format. | | * [namedFieldRegexMatches] - Returns an input range iterating over all the | fields (strings) in a range matching a regular expression. It is used in | conjunction with [namedFieldGroupToRegex] to find the fields in a header line | matching a regular expression and map them to field numbers. | | * [parseNumericFieldGroup] - A helper function that parses a numeric field | group (a string) and returns a range that iterates over all the field numbers | in the field group. A numeric field-group is either a single number or a | range. E.g. `5` or `5-8`. This routine was part of the original code | supporting only numeric field-lists. | ) |*/ | |module tsv_utils.common.fieldlist; | |import std.exception : enforce; |import std.format : format; |import std.range; |import std.regex; |import std.stdio; |import std.traits : isIntegral, isNarrowString, isUnsigned, ReturnType, Unqual; |import std.typecons : tuple, Tuple; | |/** | fieldListHelpText is text intended display to end users to describe the field-list | syntax. |*/ |immutable fieldListHelpText = q"EOS |tsv-utils Field Syntax | |Most tsv-utils tools operate on fields specified on the command line. All |tools use the same syntax to identify fields. tsv-select is used in this |document for examples, but the syntax shown applies to all tools. | |Fields can be identified either by a one-upped field number or by field |name. Field names require the first line of input data to be a header with |field names. 
Header line processing is enabled by the '--H|header' option. | |Some command options only accept a single field, but many operate on lists |of fields. Here are some examples (using tsv-select): | | $ tsv-select -f 1,2 file.tsv # Selection using field numbers | $ tsv-select -f 5-9 file.txt # Selection using a range | $ tsv-select -H -f RecordID file.txt # Selection using a field name | $ tsv-select -H -f Date,Time,3,5-7,9 # Mix of names, numbers, ranges | |Wildcards: Named fields support a simple 'glob' style wildcarding scheme. |The asterisk character ('*') can be used to match any sequence of |characters, including no characters. This is similar to how '*' can be |used to match file names on the Unix command line. All fields with |matching names are selected, so wildcards are a convenient way to select |a set of related fields. Quotes should be placed around command line |arguments containing wildcards to avoid interpretation by the shell. | |Examples - Consider a file 'data.tsv' containing timing information: | | $ tsv-pretty data.tsv | run elapsed_time user_time system_time max_memory | 1 57.5 52.0 5.5 1420 | 2 52.0 49.0 3.0 1270 | 3 55.5 51.0 4.5 1410 | |Some examples selecting fields from this file: | | $ tsv-select data.tsv -H -f 3 # Field 3 (user_time) | $ tsv-select data.tsv -H -f user_time # Field 3 | $ tsv-select data.tsv -H -f run,user_time # Fields 1,3 | $ tsv-select data.tsv -H -f '*_memory' # Field 5 | $ tsv-select data.tsv -H -f '*_time' # Fields 2,3,4 | $ tsv-select data.tsv -H -f 1-3 # Fields 1,2,3 | $ tsv-select data.tsv -H -f run-user_time # Fields 1,2,3 (range with names) | |Special characters: There are several special characters that need to be |escaped when specifying field names. Escaping is done by preceeding the |special character with a backslash. Characters requiring escapes are: |asterisk (`*`), comma(`,`), colon (`:`), space (` `), hyphen (`-`), and |backslash (`\`). 
A field name that contains only digits also needs to be |backslash escaped, this indicates it should be treated as a field name |and not a field number. A backslash can be used to escape any character, |so it's not necessary to remember the list. Use an escape when not sure. | |Examples - Consider a file with five fields named as follows: | | 1 test id | 2 run:id | 3 time-stamp | 4 001 | 5 100 | |Some examples using specifying these fields by name: | | $ tsv-select file.tsv -H -f 'test\ id' # Field 1 | $ tsv-select file.tsv -H -f '\test\ id' # Field 1 | $ tsv-select file.tsv -H -f 'run\:1' # Field 2 | $ tsv-select file.tsv -H -f 'time\-stamp' # Field 3 | $ tsv-select file.tsv -H -f '\001' # Field 4 | $ tsv-select file.tsv -H -f '\100' # Field 5 | $ tsv-select file.tsv -H -f '\001,\100' # Fields 4,5 |EOS"; | |/** | The `convertToZeroBasedIndex` flag is used as a template parameter controlling | whether field numbers are converted to zero-based indices. It is used by | [parseFieldList], [parseNumericFieldList], and [makeFieldListOptionHandler]. |*/ |alias ConvertToZeroBasedIndex = Flag!"convertToZeroBasedIndex"; | |/** | The `allowFieldNumZero` flag is used as a template parameter controlling | whether zero is a valid field. It is used by [parseFieldList], | [parseNumericFieldList], and [makeFieldListOptionHandler]. |*/ |alias AllowFieldNumZero = Flag!"allowFieldNumZero"; | |/** | The `consumeEntireFieldListString` flag is used as a template parameter | indicating whether the entire field-list string should be consumed. It is | used by [parseNumericFieldList]. |*/ |alias ConsumeEntireFieldListString = Flag!"consumeEntireFieldListString"; | |/** | `parseFieldList` returns a range iterating over the field numbers in a field-list. | | `parseFieldList` is the main routine for parsing field-lists entered on the command | line. It handles both numeric and named field-lists. 
The elements of the returned | range are sequence of 1-up field numbers corresponding to the fields specified in | the field-list string. | | An error is thrown if the field-list string is malformed. The error text is | intended for display to the user invoking the tsv-utils tool from the command | line. | | Named field-lists require an array of field names from the header line. Named | fields are allowed only if a header line is available. Using a named field-list | without a header line generates an error message referencing the headerCmdArg | string as a hint to the end user. | | Several optional modes of operation are available: | | $(LIST | * Conversion to zero-based indexes (`convertToZero` template parameter) - Returns | the field numbers as zero-based array indices rather than 1-based field numbers. | | * Allow zero as a field number (`allowZero` template parameter) - This allows zero | to be used as a field number. This is typically used to allow the user to | specify the entire line rather than an individual field. Use a signed result | type if also using covertToZero, as this will be returned as (-1). | | * Consuming the entire field list string (`consumeEntire` template parameter) - By | default, an error is thrown if the entire field-list string is not consumed. | This is the most common behavior. Turning this off (the `No` option) will | terminate processing without error when a valid field-list termination character | is found. The `parseFieldList.consumed` member function can be used to see where | in the input string processing terminated. | ) | | The optional `cmdOptionString` and `headerCmdArg` arguments are used to generate better | error messages. `cmdOptionString` should be the command line arguments string passed to | `std.getopt`. e.g `"f|field"`. This is added to the error message. Callers already | adding the option name to the error message should pass the empty string. 
| | The `headerCmdArg` argument should be the option for turning on header line processing. | This is standard for tsv-utils tools (`--H|header`), so most tsv-utils tools will use | the default value. | | `parseFieldList` returns a reference range. This is so the `consumed` member function | remains valid when using the range with facilities that would copy a value-based | range. |*/ |auto parseFieldList(T = size_t, | ConvertToZeroBasedIndex convertToZero = No.convertToZeroBasedIndex, | AllowFieldNumZero allowZero = No.allowFieldNumZero, | ConsumeEntireFieldListString consumeEntire = Yes.consumeEntireFieldListString) |(string fieldList, bool hasHeader = false, string[] headerFields = [], | string cmdOptionString = "", string headerCmdArg = "H|header") |if (isIntegral!T && (!allowZero || !convertToZero || !isUnsigned!T)) |{ | final class Result | { | private string _fieldList; | private bool _hasHeader; | private string[] _headerFields; | private string _cmdOptionMsgPart; | private string _headerCmdArg; | private ReturnType!(findFieldGroups!string) _fieldGroupRange; | private bool _isFrontNumericRange; | private ReturnType!(parseNumericFieldGroup!(T, convertToZero, allowZero)) _numericFieldRange; | private ReturnType!(namedFieldRegexMatches!(T, convertToZero, string[])) _namedFieldMatches; | private size_t _consumed; | 2762| this(string fieldList, bool hasHeader, string[] headerFields, | string cmdOptionString, string headerCmdArg) | { 3183| _fieldList = fieldList; 3183| _hasHeader = hasHeader; 3183| _headerFields = headerFields.dup; 4369| if (!cmdOptionString.empty) _cmdOptionMsgPart = "[--" ~ cmdOptionString ~ "] "; 6366| if (!headerCmdArg.empty) _headerCmdArg = "--" ~ headerCmdArg; 3183| _fieldGroupRange = findFieldGroups(fieldList); | | /* _namedFieldMatches must be initialized in the constructor because it | * is a nested struct. 
| */ 3183| _namedFieldMatches = namedFieldRegexMatches!(T, convertToZero)(["X"], ctRegex!`^No Match$`); | | try | { 3183| consumeNextFieldGroup(); 3096| enforce(!empty, format("Empty field list: '%s'.", _fieldList)); | } | catch (Exception e) | { 421| throw new Exception(_cmdOptionMsgPart ~ e.msg); | } | 2762| assert(_consumed <= _fieldList.length); | } | | private void consumeNextFieldGroup() | { 7126| if (!_fieldGroupRange.empty) | { 4240| auto fieldGroup = _fieldGroupRange.front.value; 4240| _consumed = _fieldGroupRange.front.consumed; 4240| _fieldGroupRange.popFront; | 4240| enforce(!fieldGroup.isNumericFieldGroupWithHyphenFirstOrLast, 26| format("Incomplete ranges are not supported: '%s'.", | fieldGroup)); | 4214| if (fieldGroup.isNumericFieldGroup) | { 3148| _isFrontNumericRange = true; 3148| _numericFieldRange = | parseNumericFieldGroup!(T, convertToZero, allowZero)(fieldGroup); | } | else | { 1066| enforce(_hasHeader, 33| format("Non-numeric field group: '%s'. Use '%s' when using named field groups.", | fieldGroup, _headerCmdArg)); | 1033| enforce(!fieldGroup.isMixedNumericNamedFieldGroup, 20| format("Ranges with both numeric and named components are not supported: '%s'.", | fieldGroup)); | 1013| auto fieldGroupRegex = namedFieldGroupToRegex(fieldGroup); | 998| if (!fieldGroupRegex[1].empty) | { | /* A range formed by a pair of field names. Find the field | * numbers and generate the string form of the numeric | * field-group. Pass this to parseNumberFieldRange. | */ 97| auto f0 = namedFieldRegexMatches(_headerFields, fieldGroupRegex[0]).array; 97| auto f1 = namedFieldRegexMatches(_headerFields, fieldGroupRegex[1]).array; | 97| string hintMsg = "Not specifying a range? Backslash escape any hyphens in the field name."; | 97| enforce(f0.length > 0, 8| format("First field in range not found in file header. Range: '%s'.\n%s", | fieldGroup, hintMsg)); 89| enforce(f1.length > 0, 8| format("Second field in range not found in file header. 
Range: '%s'.\n%s", | fieldGroup, hintMsg)); 81| enforce(f0.length == 1, 8| format("First field in range matches multiple header fields. Range: '%s'.\n%s", | fieldGroup, hintMsg)); 73| enforce(f1.length == 1, 8| format("Second field in range matches multiple header fields. Range: '%s'.\n%s", | fieldGroup, hintMsg)); | 65| _isFrontNumericRange = true; 65| auto fieldGroupAsNumericRange = format("%d-%d", f0[0][0], f1[0][0]); 65| _numericFieldRange = | parseNumericFieldGroup!(T, convertToZero, allowZero)(fieldGroupAsNumericRange); | } | else | { 901| enforce (!fieldGroupRegex[0].empty, "Empty field list entry: '%s'.", fieldGroup); | 901| _isFrontNumericRange = false; 901| _namedFieldMatches = | namedFieldRegexMatches!(T, convertToZero)(_headerFields, fieldGroupRegex[0]); | 901| enforce(!_namedFieldMatches.empty, 48| format("Field not found in file header: '%s'.", fieldGroup)); | } | } | } | } | | bool empty() @safe | { 16810780| return _fieldGroupRange.empty && 33609236| (_isFrontNumericRange ? _numericFieldRange.empty : _namedFieldMatches.empty); | } | | @property T front() @safe | { 4201302| assert(!empty, "Attempting to fetch the front of an empty field list."); 8402604| return _isFrontNumericRange ? _numericFieldRange.front : _namedFieldMatches.front[0]; | } | | void popFront() @safe | { | | /* TODO: Move these definitions to a common location in the file. */ | enum char SPACE = ' '; | enum char COLON = ':'; | 4201302| assert(!empty, "Attempting to popFront an empty field-list."); | | try | { 8401461| if (_isFrontNumericRange) _numericFieldRange.popFront; 1143| else _namedFieldMatches.popFront; | 8402604| if (_isFrontNumericRange ? 
_numericFieldRange.empty : _namedFieldMatches.empty) | { 3943| consumeNextFieldGroup(); | } | 4201259| assert(_consumed <= _fieldList.length); | 4201259| if (empty) | { | static if (consumeEntire) | { 1909| enforce(_consumed == _fieldList.length, 69| format("Invalid field list: '%s'.", _fieldList)); | } | else | { 810| enforce((_consumed == _fieldList.length || 472| _fieldList[_consumed] == SPACE || 448| _fieldList[_consumed] == COLON), 58| format("Invalid field list: '%s'.", _fieldList)); | } | } | } | catch (Exception e) | { 170| throw new Exception(_cmdOptionMsgPart ~ e.msg); | } | } | | size_t consumed() const nothrow pure @safe | { 988| return _consumed; | } | } | 3183| return new Result(fieldList, hasHeader, headerFields, cmdOptionString, headerCmdArg); |} | |/// Basic cases showing how `parseFieldList` works |@safe unittest |{ | import std.algorithm : each, equal; | 7| string[] emptyHeader = []; | | // Numeric field-lists, with no header line. 7| assert(`5`.parseFieldList | .equal([5])); | 7| assert(`10`.parseFieldList(false, emptyHeader) | .equal([10])); | 7| assert(`1-3,17`.parseFieldList(false, emptyHeader) | .equal([1, 2, 3, 17])); | | // General field lists, when a header line is available 7| assert(`5,1-3`.parseFieldList(true, [`f1`, `f2`, `f3`, `f4`, `f5`]) | .equal([5, 1, 2, 3])); | 7| assert(`f1`.parseFieldList(true, [`f1`, `f2`, `f3`]) | .equal([1])); | 7| assert(`f3`.parseFieldList(true, [`f1`, `f2`, `f3`]) | .equal([3])); | 7| assert(`f1-f3`.parseFieldList(true, [`f1`, `f2`, `f3`]) | .equal([1, 2, 3])); | 7| assert(`f3-f1`.parseFieldList(true, [`f1`, `f2`, `f3`]) | .equal([3, 2, 1])); | 7| assert(`f*`.parseFieldList(true, [`f1`, `f2`, `f3`]) | .equal([1, 2, 3])); | 7| assert(`B*`.parseFieldList(true, [`A1`, `A2`, `B1`, `B2`]) | .equal([3, 4])); | 7| assert(`*2`.parseFieldList(true, [`A1`, `A2`, `B1`, `B2`]) | .equal([2, 4])); | 7| assert(`1-2,f4`.parseFieldList(true, [`f1`, `f2`, `f3`, `f4`, `f5`]) | .equal([1, 2, 4])); | | /* The next few 
examples are closer to the code that would really be | * used during in command line arg processing. | */ | { 7| string getoptOption = "f|fields"; 7| bool hasHeader = true; 7| auto headerFields = [`A1`, `A2`, `B1`, `B2`]; 7| auto fieldListCmdArg = `B*,A1`; 7| auto fieldNumbers = fieldListCmdArg.parseFieldList(hasHeader, headerFields, getoptOption); 7| assert(fieldNumbers.equal([3, 4, 1])); 7| assert(fieldNumbers.consumed == fieldListCmdArg.length); | } | { | /* Supplimentary options after the field-list. */ 7| string getoptOption = "f|fields"; 7| bool hasHeader = false; 7| string[] headerFields; 7| auto fieldListCmdArg = `3,4:option`; 7| auto fieldNumbers = | fieldListCmdArg.parseFieldList!(size_t, No.convertToZeroBasedIndex, | No.allowFieldNumZero, No.consumeEntireFieldListString) | (hasHeader, headerFields, getoptOption); 7| assert(fieldNumbers.equal([3, 4])); 7| assert(fieldNumbers.consumed == 3); 7| assert(fieldListCmdArg[fieldNumbers.consumed .. $] == `:option`); | } | { | /* Supplimentary options after the field-list. */ 7| string getoptOption = "f|fields"; 7| bool hasHeader = true; 7| auto headerFields = [`A1`, `A2`, `B1`, `B2`]; 7| auto fieldListCmdArg = `B*:option`; 7| auto fieldNumbers = | fieldListCmdArg.parseFieldList!(size_t, No.convertToZeroBasedIndex, | No.allowFieldNumZero, No.consumeEntireFieldListString) | (hasHeader, headerFields, getoptOption); 7| assert(fieldNumbers.equal([3, 4])); 7| assert(fieldNumbers.consumed == 2); 7| assert(fieldListCmdArg[fieldNumbers.consumed .. $] == `:option`); | } | { | /* Supplementary options after the field-list. 
*/ 7| string getoptOption = "f|fields"; 7| bool hasHeader = true; 7| auto headerFields = [`A1`, `A2`, `B1`, `B2`]; 7| auto fieldListCmdArg = `B* option`; 7| auto fieldNumbers = | fieldListCmdArg.parseFieldList!(size_t, No.convertToZeroBasedIndex, | No.allowFieldNumZero, No.consumeEntireFieldListString) | (hasHeader, headerFields, getoptOption); 7| assert(fieldNumbers.equal([3, 4])); 7| assert(fieldNumbers.consumed == 2); 7| assert(fieldListCmdArg[fieldNumbers.consumed .. $] == ` option`); | } | { | /* Mixed numeric and named fields. */ 7| string getoptOption = "f|fields"; 7| bool hasHeader = true; 7| auto headerFields = [`A1`, `A2`, `B1`, `B2`]; 7| auto fieldListCmdArg = `B2,1`; 7| auto fieldNumbers = | fieldListCmdArg.parseFieldList!(size_t, No.convertToZeroBasedIndex, | No.allowFieldNumZero, No.consumeEntireFieldListString) | (hasHeader, headerFields, getoptOption); 7| assert(fieldNumbers.equal([4, 1])); 7| assert(fieldNumbers.consumed == fieldListCmdArg.length); | } |} | |// parseFieldList - Empty and erroneous field list tests |@safe unittest |{ | import std.exception : assertThrown, assertNotThrown; | 14| assertThrown(``.parseFieldList); 14| assertThrown(`,`.parseFieldList); 14| assertThrown(`:`.parseFieldList); 14| assertThrown(` `.parseFieldList); 14| assertThrown(`\`.parseFieldList); 14| assertThrown(`,x`.parseFieldList); 14| assertThrown(`:option`.parseFieldList); 14| assertThrown(` option`.parseFieldList); 14| assertThrown(`:1-3`.parseFieldList); | | { 7| string getoptOption = "f|fields"; 7| string cmdHeaderOption = "header"; 7| bool hasHeader = true; 7| auto headerFields = [`A1`, `A2`, `B1`, `B2`]; 7| auto fieldListCmdArg = `XYZ`; 7| size_t[] fieldNumbers; 7| bool wasCaught = false; 7| try fieldNumbers = fieldListCmdArg.parseFieldList(hasHeader, headerFields, getoptOption).array; | catch (Exception e) | { 7| wasCaught = true; 7| assert(e.msg == "[--f|fields] Field not found in file header: 'XYZ'."); | } 7| finally assert(wasCaught); | } | { 7| string 
getoptOption = "f|fields"; 7| bool hasHeader = false; // hasHeader=false triggers this error. 7| auto headerFields = [`A1`, `A2`, `B1`, `B2`]; 7| auto fieldListCmdArg = `A1`; 7| size_t[] fieldNumbers; 7| bool wasCaught = false; | 7| try fieldNumbers = fieldListCmdArg.parseFieldList(hasHeader, headerFields, getoptOption).array; | catch (Exception e) | { 7| wasCaught = true; 7| assert(e.msg == "[--f|fields] Non-numeric field group: 'A1'. Use '--H|header' when using named field groups."); | } 7| finally assert(wasCaught); | 7| string cmdHeaderOption = "ZETA"; | 7| try fieldNumbers = fieldListCmdArg.parseFieldList(hasHeader, headerFields, getoptOption, cmdHeaderOption).array; | catch (Exception e) | { 7| wasCaught = true; 7| assert(e.msg == "[--f|fields] Non-numeric field group: 'A1'. Use '--ZETA' when using named field groups."); | } 7| finally assert(wasCaught); | } | { 7| bool hasHeader = true; 7| auto headerFields = [`A1`, `A2`, `B1`, `B2`]; | 14| assertThrown(`XYZ`.parseFieldList(hasHeader, headerFields)); 14| assertThrown(`XYZ-B1`.parseFieldList(hasHeader, headerFields)); 14| assertThrown(`B1-XYZ`.parseFieldList(hasHeader, headerFields)); 14| assertThrown(`A*-B1`.parseFieldList(hasHeader, headerFields)); 14| assertThrown(`B1-A*`.parseFieldList(hasHeader, headerFields)); 14| assertThrown(`B1-`.parseFieldList(hasHeader, headerFields)); 14| assertThrown(`-A1`.parseFieldList(hasHeader, headerFields)); 14| assertThrown(`A1-3`.parseFieldList(hasHeader, headerFields)); 14| assertThrown(`1-A3`.parseFieldList(hasHeader, headerFields)); | } | |} | |//parseFieldList - Named field groups |@safe unittest |{ | import std.algorithm : each, equal; | 7| bool hasHeader = true; 7| auto singleFieldHeader = [`a`]; | 7| assert(`a`.parseFieldList(hasHeader, singleFieldHeader) | .equal([1])); | 7| assert(`a*`.parseFieldList(hasHeader, singleFieldHeader) | .equal([1])); | 7| assert(`*a`.parseFieldList(hasHeader, singleFieldHeader) | .equal([1])); | 7| 
assert(`*a*`.parseFieldList(hasHeader, singleFieldHeader) | .equal([1])); | 7| assert(`*`.parseFieldList(hasHeader, singleFieldHeader) | .equal([1])); | 7| auto twoFieldHeader = [`f1`, `f2`]; | 7| assert(`f1`.parseFieldList(hasHeader, twoFieldHeader) | .equal([1])); | 7| assert(`f2`.parseFieldList(hasHeader, twoFieldHeader) | .equal([2])); | 7| assert(`f1,f2`.parseFieldList(hasHeader, twoFieldHeader) | .equal([1, 2])); | 7| assert(`f2,f1`.parseFieldList(hasHeader, twoFieldHeader) | .equal([2, 1])); | 7| assert(`f1-f2`.parseFieldList(hasHeader, twoFieldHeader) | .equal([1, 2])); | 7| assert(`f2-f1`.parseFieldList(hasHeader, twoFieldHeader) | .equal([2, 1])); | 7| assert(`*`.parseFieldList(hasHeader, twoFieldHeader) | .equal([1, 2])); | 7| auto multiFieldHeader = [`f1`, `f2`, `x`, `01`, `02`, `3`, `snow storm`, `雪风暴`, `Tempête de neige`, `x`]; | 7| assert(`*`.parseFieldList(hasHeader, multiFieldHeader) | .equal([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])); | 7| assert(`*2`.parseFieldList(hasHeader, multiFieldHeader) | .equal([2, 5])); | 7| assert(`snow*`.parseFieldList(hasHeader, multiFieldHeader) | .equal([7])); | 7| assert(`snow\ storm`.parseFieldList(hasHeader, multiFieldHeader) | .equal([7])); | 7| assert(`雪风暴`.parseFieldList(hasHeader, multiFieldHeader) | .equal([8])); | 7| assert(`雪风*`.parseFieldList(hasHeader, multiFieldHeader) | .equal([8])); | 7| assert(`*风*`.parseFieldList(hasHeader, multiFieldHeader) | .equal([8])); | 7| assert(`Tempête\ de\ neige`.parseFieldList(hasHeader, multiFieldHeader) | .equal([9])); | 7| assert(`x`.parseFieldList(hasHeader, multiFieldHeader) | .equal([3, 10])); | | /* Convert to zero - A subset of the above tests. 
*/ 7| assert(`a`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, singleFieldHeader) | .equal([0])); | 7| assert(`a*`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, singleFieldHeader) | .equal([0])); | 7| assert(`f1`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, twoFieldHeader) | .equal([0])); | 7| assert(`f2`.parseFieldList!(long, Yes.convertToZeroBasedIndex)(hasHeader, twoFieldHeader) | .equal([1])); | 7| assert(`f2,f1`.parseFieldList!(int, Yes.convertToZeroBasedIndex)(hasHeader, twoFieldHeader) | .equal([1, 0])); | 7| assert(`f2-f1`.parseFieldList!(uint, Yes.convertToZeroBasedIndex)(hasHeader, twoFieldHeader) | .equal([1, 0])); | 7| assert(`*`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, multiFieldHeader) | .equal([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])); | 7| assert(`*2`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, multiFieldHeader) | .equal([1, 4])); | 7| assert(`snow*`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, multiFieldHeader) | .equal([6])); | 7| assert(`snow\ storm`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, multiFieldHeader) | .equal([6])); | 7| assert(`雪风暴`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, multiFieldHeader) | .equal([7])); | 7| assert(`雪风*`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, multiFieldHeader) | .equal([7])); | 7| assert(`x`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, multiFieldHeader) | .equal([2, 9])); | | /* Allow zero tests. 
*/ 7| assert(`0,f1`.parseFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero) | (hasHeader, twoFieldHeader) | .equal([-1, 0])); | 7| assert(`f2,0`.parseFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero) | (hasHeader, twoFieldHeader) | .equal([1, -1])); | 7| assert(`f2,f1,0`.parseFieldList!(int, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) | (hasHeader, twoFieldHeader) | .equal([2, 1, 0])); | 7| assert(`0,f2-f1`.parseFieldList!(uint, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) | (hasHeader, twoFieldHeader) | .equal([0, 2, 1])); | 7| assert(`*,0`.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) | (hasHeader, multiFieldHeader) | .equal([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0])); | 7| assert(`0,snow\ storm`.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) | (hasHeader, multiFieldHeader) | .equal([0,7])); |} | |// parseFieldList - The same tests as used for parseNumericFieldGroup |@safe unittest |{ | import std.algorithm : each, equal; | import std.exception : assertThrown, assertNotThrown; | | /* Basic tests. 
*/ 7| assert(`1`.parseFieldList.equal([1])); 7| assert(`1,2`.parseFieldList.equal([1, 2])); 7| assert(`1,2,3`.parseFieldList.equal([1, 2, 3])); 7| assert(`1-2`.parseFieldList.equal([1, 2])); 7| assert(`1-2,6-4`.parseFieldList.equal([1, 2, 6, 5, 4])); 7| assert(`1-2,1,1-2,2,2-1`.parseFieldList.equal([1, 2, 1, 1, 2, 2, 2, 1])); 7| assert(`1-2,5`.parseFieldList!size_t.equal([1, 2, 5])); | | /* Signed Int tests */ 7| assert(`1`.parseFieldList!int.equal([1])); 7| assert(`1,2,3`.parseFieldList!int.equal([1, 2, 3])); 7| assert(`1-2`.parseFieldList!int.equal([1, 2])); 7| assert(`1-2,6-4`.parseFieldList!int.equal([1, 2, 6, 5, 4])); 7| assert(`1-2,5`.parseFieldList!int.equal([1, 2, 5])); | | /* Convert to zero tests */ 7| assert(`1`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex).equal([0])); 7| assert(`1,2,3`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex).equal([0, 1, 2])); 7| assert(`1-2`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex).equal([0, 1])); 7| assert(`1-2,6-4`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex).equal([0, 1, 5, 4, 3])); 7| assert(`1-2,5`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex).equal([0, 1, 4])); | 7| assert(`1`.parseFieldList!(long, Yes.convertToZeroBasedIndex).equal([0])); 7| assert(`1,2,3`.parseFieldList!(long, Yes.convertToZeroBasedIndex).equal([0, 1, 2])); 7| assert(`1-2`.parseFieldList!(long, Yes.convertToZeroBasedIndex).equal([0, 1])); 7| assert(`1-2,6-4`.parseFieldList!(long, Yes.convertToZeroBasedIndex).equal([0, 1, 5, 4, 3])); 7| assert(`1-2,5`.parseFieldList!(long, Yes.convertToZeroBasedIndex).equal([0, 1, 4])); | | /* Allow zero tests. 
*/ 7| assert(`0`.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0])); 7| assert(`1,0,3`.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([1, 0, 3])); 7| assert(`1-2,5`.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([1, 2, 5])); 7| assert(`0`.parseFieldList!(int, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0])); 7| assert(`1,0,3`.parseFieldList!(int, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([1, 0, 3])); 7| assert(`1-2,5`.parseFieldList!(int, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([1, 2, 5])); 7| assert(`0`.parseFieldList!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([-1])); 7| assert(`1,0,3`.parseFieldList!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0, -1, 2])); 7| assert(`1-2,5`.parseFieldList!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0, 1, 4])); | | /* Error cases. */ 14| assertThrown(``.parseFieldList.each); 14| assertThrown(` `.parseFieldList.each); 14| assertThrown(`,`.parseFieldList.each); 14| assertThrown(`5 6`.parseFieldList.each); 14| assertThrown(`,7`.parseFieldList.each); 14| assertThrown(`8,`.parseFieldList.each); 14| assertThrown(`8,9,`.parseFieldList.each); 14| assertThrown(`10,,11`.parseFieldList.each); 14| assertThrown(``.parseFieldList!(long, Yes.convertToZeroBasedIndex).each); 14| assertThrown(`1,2-3,`.parseFieldList!(long, Yes.convertToZeroBasedIndex).each); 14| assertThrown(`2-,4`.parseFieldList!(long, Yes.convertToZeroBasedIndex).each); 14| assertThrown(`1,2,3,,4`.parseFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).each); 14| assertThrown(`,7`.parseFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).each); 14| assertThrown(`8,`.parseFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).each); 14| assertThrown(`10,0,,11`.parseFieldList!(long, Yes.convertToZeroBasedIndex, 
Yes.allowFieldNumZero).each); 14| assertThrown(`8,9,`.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).each); 14| assertThrown(`0`.parseFieldList.each); 14| assertThrown(`1,0,3`.parseFieldList.each); 14| assertThrown(`0`.parseFieldList!(int, Yes.convertToZeroBasedIndex, No.allowFieldNumZero).each); 14| assertThrown(`1,0,3`.parseFieldList!(int, Yes.convertToZeroBasedIndex, No.allowFieldNumZero).each); 14| assertThrown(`0-2,6-0`.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).each); 14| assertThrown(`0-2,6-0`.parseFieldList!(int, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).each); 14| assertThrown(`0-2,6-0`.parseFieldList!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).each); |} | |// parseFieldList - Subset of tests used for parseNumericFieldGroup, but allowing non-consumed characters. |@safe unittest |{ | import std.algorithm : each, equal; | import std.exception : assertThrown, assertNotThrown; | | /* Basic tests. */ 7| assert(`1`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([1])); 7| assert(`1,2`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([1, 2])); 7| assert(`1,2,3`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([1, 2, 3])); 7| assert(`1-2`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([1, 2])); 7| assert(`1-2,6-4`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([1, 2, 6, 5, 4])); 7| assert(`1-2,1,1-2,2,2-1`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([1, 2, 1, 1, 2, 2, 2, 1])); 7| assert(`1-2,5`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, 
No.consumeEntireFieldListString) | .equal([1, 2, 5])); | | /* Signed Int tests. */ 7| assert(`1`.parseFieldList!(int, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([1])); 7| assert(`1,2,3`.parseFieldList!(int, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([1, 2, 3])); 7| assert(`1-2`.parseFieldList!(int, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([1, 2])); 7| assert(`1-2,6-4`.parseFieldList!(int, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([1, 2, 6, 5, 4])); 7| assert(`1-2,5`.parseFieldList!(int, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([1, 2, 5])); | | /* Convert to zero tests */ 7| assert(`1`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([0])); 7| assert(`1,2,3`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([0, 1, 2])); 7| assert(`1-2`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([0, 1])); 7| assert(`1-2,6-4`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([0, 1, 5, 4, 3])); 7| assert(`1-2,5`.parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([0, 1, 4])); | | /* Allow zero tests. 
*/ 7| assert(`0`.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([0])); 7| assert(`1,0,3`.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([1, 0, 3])); 7| assert(`1-2,5`.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([1, 2, 5])); 7| assert(`0`.parseFieldList!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([-1])); 7| assert(`1,0,3`.parseFieldList!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([0, -1, 2])); 7| assert(`1-2,5`.parseFieldList!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero, No.consumeEntireFieldListString) | .equal([0, 1, 4])); | | /* Error cases. */ 14| assertThrown(``.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString).each); 14| assertThrown(` `.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString).each); 14| assertThrown(`,`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString).each); 14| assertThrown(`,7`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString).each); | 14| assertThrown(``.parseFieldList!(long, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString).each); 14| assertThrown(`2-,4`.parseFieldList!(long, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString).each); 14| assertThrown(`,7`.parseFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero, No.consumeEntireFieldListString).each); | 14| assertThrown(`0`.parseFieldList!(int, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString).each); 14| 
assertThrown(`1,0,3`.parseFieldList!(int, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString).each); | 14| assertThrown(`0`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString).each); 14| assertThrown(`1,0,3`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString).each); | 14| assertThrown(`0-2,6-0`.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero, No.consumeEntireFieldListString).each); 14| assertThrown(`0-2,6-0`.parseFieldList!(int, No.convertToZeroBasedIndex, Yes.allowFieldNumZero, No.consumeEntireFieldListString).each); 14| assertThrown(`0-2,6-0`.parseFieldList!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero, No.consumeEntireFieldListString).each); | | /* Allowed termination without consuming entire string. */ | { 7| auto x = `5:abc`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString); 7| assert(x.equal([5])); 7| assert(x.consumed == 1); | } | | { 7| auto x = `1-3,6-10:abc`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString); 7| assert(x.equal([1, 2, 3, 6, 7, 8, 9, 10])); 7| assert(x.consumed == 8); | } | | { 7| auto x = `1-3,6-10 xyz`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString); 7| assert(x.equal([1, 2, 3, 6, 7, 8, 9, 10])); 7| assert(x.consumed == 8); | } | | { 7| auto x = `5 6`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString); 7| assert(x.equal([5])); 7| assert(x.consumed == 1); | } | | /* Invalid termination when not consuming the entire string. 
*/ 14| assertThrown(`8,`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString).each); 14| assertThrown(`8,9,`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString).each); 14| assertThrown(`10,,11`.parseFieldList!(size_t, No.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString).each); 14| assertThrown(`1,2-3,`.parseFieldList!(long, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString).each); 14| assertThrown(`1,2,3,,4`.parseFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero, No.consumeEntireFieldListString).each); 14| assertThrown(`8,`.parseFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero, No.consumeEntireFieldListString).each); 14| assertThrown(`10,0,,11`.parseFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero, No.consumeEntireFieldListString).each); 14| assertThrown(`8,9,`.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero, No.consumeEntireFieldListString).each); |} | |/** | `findFieldGroups` creates range that iterates over the 'field-groups' in a 'field-list'. | (Private function.) | | Input is typically a string or character array. The range becomes empty when the end | of input is reached or an unescaped field-list terminator character is found. | | A 'field-list' is a comma separated list of 'field-groups'. A 'field-group' is a | single numeric or named field, or a hyphen-separated pair of numeric or named fields. | For example: | | ``` | 1,3,4-7 # 3 numeric field-groups | field_a,field_b # 2 named fields | ``` | | Each element in the range is represented by a tuple of two values: | | $(LIST | * consumed - The total index positions consumed by the range so far | * value - A slice containing the text of the field-group. 
| ) | | The field-group slice does not contain the separator character, but this is included | in the total consumed. The field-group tuples from the previous examples: | | ``` | Input: 1,2,4-7 | tuple(1, "1") | tuple(3, "2") | tuple(7, "4-7") | | Input: field_a,field_b | tuple(7, "field_a") | tuple(8, "field_b") | ``` | | The details of field-groups are not material to this routine, it is only concerned | with finding the boundaries between field-groups and the termination boundary for the | field-list. This is relatively straightforward. The main parsing concern is the use | of escape character when delimiter characters are included in field names. | | Field-groups are separated by a single comma (','). A field-list is terminated by a | colon (':') or space (' ') character. Comma, colon, and space characters can be | included in a field-group by preceding them with a backslash. A backslash not | intended as an escape character must also be backslash escaped. | | A field-list is also terminated if an unescaped backslash is encountered or a pair | of consecutive commas. This is normally an error, but handling of these cases is left | to the caller. | | Additional characters need to be backslash escaped inside field-groups, the asterisk | ('*') and hyphen ('-') characters in particular. However, this routine needs only be | aware of characters that affect field-list and field-group boundaries, which are the | set listed above. | | Backslash escape sequences are recognized but not removed from field-groups. | | Field and record delimiter characters (usually TAB and newline) are not handled by | this routine. They cannot be used in field names as there is no way to represent them | in the header line. However, it is not necessary for this routine to check for them, | these checks occurs naturally when processing header lines. 
| | $(ALWAYS_DOCUMENT) |*/ |private auto findFieldGroups(Range)(Range r) |if (isInputRange!Range && | (is(Unqual!(ElementEncodingType!Range) == char) || is(Unqual!(ElementEncodingType!Range) == ubyte)) && | (isNarrowString!Range || (isRandomAccessRange!Range && | hasSlicing!Range && | hasLength!Range)) | ) |{ | static struct Result | { | private alias R = Unqual!Range; | private alias Char = ElementType!R; | private alias ResultType = Tuple!(size_t, "consumed", R, "value"); | | private R _input; | private R _front; | private size_t _consumed; | 3400| this(Range data) nothrow pure @safe | { 3400| auto fieldGroup = nextFieldGroup!true(data); 3400| assert(fieldGroup.start == 0); | 3400| _front = data[0 .. fieldGroup.end]; 3400| _consumed = fieldGroup.end; 3400| _input = data[fieldGroup.end .. $]; | | // writefln("[this] data: '%s', _front: '%s', _input: '%s', _frontEnd: %d", data, _front, _input, _frontEnd); | } | | bool empty() const nothrow pure @safe | { 16831788| return _front.empty; | } | | ResultType front() const nothrow pure @safe | { 8795| assert(!empty, "Attempt to take the front of an empty findFieldGroups."); | 8795| return ResultType(_consumed, _front); | } | | void popFront() nothrow pure @safe | { 4555| assert(!empty, "Attempt to popFront an empty findFieldGroups."); | 4555| auto fieldGroup = nextFieldGroup!false(_input); | | // writefln("[popFront] _input: '%s', next start: %d, next end: %d", _input, fieldGroup.start, fieldGroup.end); | 4555| _front = _input[fieldGroup.start .. fieldGroup.end]; 4555| _consumed += fieldGroup.end; 4555| _input = _input[fieldGroup.end .. $]; | } | | /* Finds the start and end indexes of the next field-group. | * | * The start and end indexes exclude delimiter characters (comma, space, colon). 
| */ | private auto nextFieldGroup(bool isFirst)(R r) const nothrow pure @safe | { | alias RetType = Tuple!(size_t, "start", size_t, "end"); | | enum Char COMMA = ','; | enum Char BACKSLASH = '\\'; | enum Char SPACE = ' '; | enum Char COLON = ':'; | 10463| if (r.empty) return RetType(0, 0); | 5447| size_t start = 0; | | static if (!isFirst) | { 3694| if (r[0] == COMMA) start = 1; | } | 5447| size_t end = start; | 15100| while (end < r.length) | { 12468| Char lookingAt = r[end]; | 36605| if (lookingAt == COMMA || lookingAt == SPACE || lookingAt == COLON) break; | 9681| if (lookingAt == BACKSLASH) | { 340| if (end + 1 == r.length) break; 284| end += 2; | } | else | { 9369| end += 1; | } | } | 5447| return RetType(start, end); | } | } | 3400| return Result(r); |} | |// findFieldGroups |@safe unittest |{ | import std.algorithm : equal; | | /* Note: backticks generate string literals without escapes. */ | | /* Immediate termination. */ 7| assert(``.findFieldGroups.empty); 7| assert(`,`.findFieldGroups.empty); 7| assert(`:`.findFieldGroups.empty); 7| assert(` `.findFieldGroups.empty); 7| assert(`\`.findFieldGroups.empty); | 7| assert(`,1`.findFieldGroups.empty); 7| assert(`:1`.findFieldGroups.empty); 7| assert(` 1`.findFieldGroups.empty); | | /* Common cases. 
*/ 7| assert(equal(`1`.findFieldGroups, | [tuple(1, `1`) | ])); | 7| assert(equal(`1,2`.findFieldGroups, | [tuple(1, `1`), | tuple(3, `2`) | ])); | 7| assert(equal(`1,2,3`.findFieldGroups, | [tuple(1, `1`), | tuple(3, `2`), | tuple(5, `3`) | ])); | 7| assert(equal(`1-3`.findFieldGroups, | [tuple(3, `1-3`) | ])); | 7| assert(equal(`1-3,5,7-2`.findFieldGroups, | [tuple(3, `1-3`), | tuple(5, `5`), | tuple(9, `7-2`) | ])); | 7| assert(equal(`field1`.findFieldGroups, | [tuple(6, `field1`) | ])); | 7| assert(equal(`field1,field2`.findFieldGroups, | [tuple(6, `field1`), | tuple(13, `field2`) | ])); | 7| assert(equal(`field1-field5`.findFieldGroups, | [tuple(13, `field1-field5`) | ])); | 7| assert(equal(`snow\ storm,雪风暴,Tempête\ de\ neige,Χιονοθύελλα,吹雪`.findFieldGroups, | [tuple(11, `snow\ storm`), | tuple(21, `雪风暴`), | tuple(41, `Tempête\ de\ neige`), | tuple(64, `Χιονοθύελλα`), | tuple(71, `吹雪`) | ])); | | /* Escape sequences. */ 7| assert(equal(`Field\ 1,Field\ 2,Field\ 5-Field\ 11`.findFieldGroups, | [tuple(8, `Field\ 1`), | tuple(17, `Field\ 2`), | tuple(36, `Field\ 5-Field\ 11`) | ])); | 7| assert(equal(`Jun\ 03\-08,Jul\ 14\-23`.findFieldGroups, | [tuple(11, `Jun\ 03\-08`), | tuple(23, `Jul\ 14\-23`) | ])); | 7| assert(equal(`field\:1`.findFieldGroups, | [tuple(8, `field\:1`) | ])); | 7| assert(equal(`\\,\,,\:,\ ,\a`.findFieldGroups, | [tuple(2, `\\`), | tuple(5, `\,`), | tuple(8, `\:`), | tuple(11, `\ `), | tuple(14, `\a`) | ])); | 7| assert(equal(`\001,\a\b\c\ \ \-\d,fld\*1`.findFieldGroups, | [tuple(4, `\001`), | tuple(19, `\a\b\c\ \ \-\d`), | tuple(26, `fld\*1`) | ])); | | /* field-list termination. 
*/ 7| assert(equal(`X:`.findFieldGroups, | [tuple(1, `X`) | ])); | 7| assert(equal(`X `.findFieldGroups, | [tuple(1, `X`) | ])); | 7| assert(equal(`X\`.findFieldGroups, | [tuple(1, `X`) | ])); | 7| assert(equal(`1-3:5-7`.findFieldGroups, | [tuple(3, `1-3`) | ])); | 7| assert(equal(`1-3,4:5-7`.findFieldGroups, | [tuple(3, `1-3`), | tuple(5, `4`) | ])); | 7| assert(equal(`abc,,def`.findFieldGroups, | [tuple(3, `abc`), | ])); | 7| assert(equal(`abc,,`.findFieldGroups, | [tuple(3, `abc`), | ])); | 7| assert(equal(`abc,`.findFieldGroups, | [tuple(3, `abc`), | ])); | | /* Leading, trailing, or solo hyphen. Captured for error handling. */ 7| assert(equal(`-1,1-,-`.findFieldGroups, | [tuple(2, `-1`), | tuple(5, `1-`), | tuple(7, `-`) | ])); |} | |/** | `isNumericFieldGroup` determines if a field-group is a valid numeric field-group. | (Private function.) | | A numeric field-group is single, non-negative integer or a pair of non-negative | integers separated by a hyphen. | | Note that zero is valid by this definition, even though it is usually disallowed as a | field number, except when representing the entire line. 
| | $(ALWAYS_DOCUMENT) |*/ |private bool isNumericFieldGroup(const char[] fieldGroup) @safe |{ 4326| return cast(bool) fieldGroup.matchFirst(ctRegex!`^[0-9]+(-[0-9]+)?$`); |} | |@safe unittest |{ | import std.conv : to; | 7| assert(!isNumericFieldGroup(``)); 7| assert(!isNumericFieldGroup(`-`)); 7| assert(!isNumericFieldGroup(`\1`)); 7| assert(!isNumericFieldGroup(`\01`)); 7| assert(!isNumericFieldGroup(`1-`)); 7| assert(!isNumericFieldGroup(`-1`)); 7| assert(!isNumericFieldGroup(`a`)); 7| assert(!isNumericFieldGroup(`a1`)); 7| assert(!isNumericFieldGroup(`1.1`)); | 7| assert(isNumericFieldGroup(`1`)); 7| assert(isNumericFieldGroup(`0123456789`)); 7| assert(isNumericFieldGroup(`0-0`)); 7| assert(isNumericFieldGroup(`3-5`)); 7| assert(isNumericFieldGroup(`30-5`)); 7| assert(isNumericFieldGroup(`0123456789-0123456789`)); | 7| assert(`0123456789-0123456789`.to!(char[]).isNumericFieldGroup); |} | |/** | `isNumericFieldGroupWithHyphenFirstOrLast` determines if a field-group is a field | number with a leading or trailing hyphen. (Private function.) | | This routine is used for better error handling. Currently, incomplete field ranges | are not supported. That is, field ranges leaving off the first or last field, | defaulting to the end of the line. This syntax is available in `cut`, e.g. | | $(CONSOLE | $ cut -f 2- | ) | | In `cut`, this represents field 2 to the end of the line. This routine identifies | these forms so an error message specific to this case can be generated. 
| | $(ALWAYS_DOCUMENT) |*/ |private bool isNumericFieldGroupWithHyphenFirstOrLast(const char[] fieldGroup) @safe |{ 4331| return cast(bool) fieldGroup.matchFirst(ctRegex!`^((\-[0-9]+)|([0-9]+\-))$`); |} | |@safe unittest |{ 7| assert(!isNumericFieldGroupWithHyphenFirstOrLast(``)); 7| assert(!isNumericFieldGroupWithHyphenFirstOrLast(`-`)); 7| assert(!isNumericFieldGroupWithHyphenFirstOrLast(`1-2`)); 7| assert(!isNumericFieldGroupWithHyphenFirstOrLast(`-a`)); 7| assert(isNumericFieldGroupWithHyphenFirstOrLast(`-1`)); 7| assert(isNumericFieldGroupWithHyphenFirstOrLast(`-12`)); 7| assert(isNumericFieldGroupWithHyphenFirstOrLast(`1-`)); 7| assert(isNumericFieldGroupWithHyphenFirstOrLast(`12-`)); 7| assert(!isNumericFieldGroupWithHyphenFirstOrLast(`-1333-`)); 7| assert(!isNumericFieldGroupWithHyphenFirstOrLast(`\-1`)); 7| assert(!isNumericFieldGroupWithHyphenFirstOrLast(`\-12`)); 7| assert(!isNumericFieldGroupWithHyphenFirstOrLast(`1\-`)); 7| assert(!isNumericFieldGroupWithHyphenFirstOrLast(`12\-`)); |} | |/** | `isMixedNumericNamedFieldGroup` determines if a field group is a range where one | element is a field number and the other element is a named field (not a number). | | This routine is used for better error handling. Currently, field ranges must be | either entirely numeric or entirely named. This is primarily to catch unintended | used of a mixed range on the command line. | | $(ALWAYS_DOCUMENT) | */ |private bool isMixedNumericNamedFieldGroup(const char[] fieldGroup) @safe |{ | /* Patterns cases: | * - Field group starts with a series of digits followed by a hyphen, followed | * sequence containing a non-digit character. | * ^([0-9]+\-.*[^0-9].*)$ | * - Field ends with an unescaped hyphen and a series of digits. 
Two start cases: | * - Non-digit, non-backslash immediately preceding the hyphen | * ^(.*[^0-9\\]\-[0-9]+)$ | * - Digit immediately preceding the hyphen, non-hyphen earlier | * ^(.*[^0-9].*[0-9]\-[0-9]+)$ | * These two combined: | * ^( ( (.*[^0-9\\]) | (.*[^0-9].*[0-9]) ) \-[0-9]+ )$ | * | * All cases combined: | * ^( ([0-9]+\-.*[^0-9].*) | ( (.*[^0-9\\]) | (.*[^0-9].*[0-9]) ) \-[0-9]+)$ | */ 1432| return cast(bool) fieldGroup.matchFirst(ctRegex!`^(([0-9]+\-.*[^0-9].*)|((.*[^0-9\\])|(.*[^0-9].*[0-9]))\-[0-9]+)$`); |} | |@safe unittest |{ 7| assert(isMixedNumericNamedFieldGroup(`1-g`)); 7| assert(isMixedNumericNamedFieldGroup(`y-2`)); 7| assert(isMixedNumericNamedFieldGroup(`23-zy`)); 7| assert(isMixedNumericNamedFieldGroup(`pB-37`)); | 7| assert(isMixedNumericNamedFieldGroup(`5x-0`)); 7| assert(isMixedNumericNamedFieldGroup(`x5-9`)); 7| assert(isMixedNumericNamedFieldGroup(`0-2m`)); 7| assert(isMixedNumericNamedFieldGroup(`9-m2`)); 7| assert(isMixedNumericNamedFieldGroup(`5x-37`)); 7| assert(isMixedNumericNamedFieldGroup(`x5-37`)); 7| assert(isMixedNumericNamedFieldGroup(`37-2m`)); 7| assert(isMixedNumericNamedFieldGroup(`37-m2`)); | 7| assert(isMixedNumericNamedFieldGroup(`18-23t`)); 7| assert(isMixedNumericNamedFieldGroup(`x12-632`)); 7| assert(isMixedNumericNamedFieldGroup(`15-15.5`)); | 7| assert(isMixedNumericNamedFieldGroup(`1-g\-h`)); 7| assert(isMixedNumericNamedFieldGroup(`z\-y-2`)); 7| assert(isMixedNumericNamedFieldGroup(`23-zy\-st`)); 7| assert(isMixedNumericNamedFieldGroup(`ts\-pB-37`)); | 7| assert(!isMixedNumericNamedFieldGroup(`a-c`)); 7| assert(!isMixedNumericNamedFieldGroup(`1-3`)); 7| assert(!isMixedNumericNamedFieldGroup(`\1-g`)); 7| assert(!isMixedNumericNamedFieldGroup(`-g`)); 7| assert(!isMixedNumericNamedFieldGroup(`h-`)); 7| assert(!isMixedNumericNamedFieldGroup(`-`)); 7| assert(!isMixedNumericNamedFieldGroup(``)); 7| assert(!isMixedNumericNamedFieldGroup(`\2-\3`)); 7| assert(!isMixedNumericNamedFieldGroup(`\10-\20`)); 7| 
assert(!isMixedNumericNamedFieldGroup(`x`)); 7| assert(!isMixedNumericNamedFieldGroup(`xyz`)); 7| assert(!isMixedNumericNamedFieldGroup(`0`)); 7| assert(!isMixedNumericNamedFieldGroup(`9`)); | 7| assert(!isMixedNumericNamedFieldGroup(`1\-g`)); 7| assert(!isMixedNumericNamedFieldGroup(`y\-2`)); 7| assert(!isMixedNumericNamedFieldGroup(`23\-zy`)); 7| assert(!isMixedNumericNamedFieldGroup(`pB\-37`)); 7| assert(!isMixedNumericNamedFieldGroup(`18\-23t`)); 7| assert(!isMixedNumericNamedFieldGroup(`x12\-632`)); | 7| assert(!isMixedNumericNamedFieldGroup(`5x\-0`)); 7| assert(!isMixedNumericNamedFieldGroup(`x5\-9`)); 7| assert(!isMixedNumericNamedFieldGroup(`0\-2m`)); 7| assert(!isMixedNumericNamedFieldGroup(`9\-m2`)); 7| assert(!isMixedNumericNamedFieldGroup(`5x\-37`)); 7| assert(!isMixedNumericNamedFieldGroup(`x5\-37`)); 7| assert(!isMixedNumericNamedFieldGroup(`37\-2m`)); 7| assert(!isMixedNumericNamedFieldGroup(`37\-m2`)); | 7| assert(!isMixedNumericNamedFieldGroup(`1\-g\-h`)); 7| assert(!isMixedNumericNamedFieldGroup(`z\-y\-2`)); 7| assert(!isMixedNumericNamedFieldGroup(`23\-zy\-st`)); 7| assert(!isMixedNumericNamedFieldGroup(`ts\-pB\-37`)); | 7| assert(!isMixedNumericNamedFieldGroup(`\-g`)); 7| assert(!isMixedNumericNamedFieldGroup(`h\-`)); 7| assert(!isMixedNumericNamedFieldGroup(`i\-j`)); 7| assert(!isMixedNumericNamedFieldGroup(`\-2`)); 7| assert(!isMixedNumericNamedFieldGroup(`2\-`)); 7| assert(!isMixedNumericNamedFieldGroup(`2\-3`)); 7| assert(!isMixedNumericNamedFieldGroup(`\2\-\3`)); |} | |/** | `namedFieldGroupToRegex` generates regular expressions for matching fields in named | field-group to field names in a header line. (Private function.) | | One regex is generated for a single field, two are generated for a range. These are | returned as a tuple with a pair of regex instances. The first regex is used for | single field entries and the first entry of range. The second regex is filled with | the second entry of a range and is empty otherwise. 
(Test with 'empty()'.) | | This routine converts all field-list escape and wildcard syntax into the necessary | regular expression syntax. Backslash escaped characters are converted to their plain | characters and asterisk wildcarding (glob style) is converted to regex syntax. | | Regular expressions include beginning and end of string markers. This is intended for | matching field names after they have been extracted from the header line. | | Most field-group syntax errors requiring end-user error messages should be detected | elsewhere in field-list processing. The exception is field-names with a non-escaped | leading or trailing hyphen. A user-appropriate error message is thrown for this case. | Other erroneous inputs result in both regex's set empty. | | There is no detection of numeric field-groups. If a numeric-field group is passed in | it will be treated as a named field-group and regular expressions generated. | | $(ALWAYS_DOCUMENT) |*/ |private auto namedFieldGroupToRegex(const char[] fieldGroup) |{ | import std.array : appender; | import std.conv : to; | import std.uni : byCodePoint, byGrapheme; | | import std.stdio; | | enum dchar BACKSLASH = '\\'; | enum dchar HYPHEN = '-'; | enum dchar ASTERISK = '*'; | | auto createRegex(const dchar[] basePattern) | { 1663| return ("^"d ~ basePattern ~ "$").to!string.regex; | } | 1482| Regex!char field1Regex; 1482| Regex!char field2Regex; | 1482| auto regexString = appender!(dchar[])(); | 1482| bool hyphenSeparatorFound = false; 1482| bool isEscaped = false; 25824| foreach (g; fieldGroup.byGrapheme) | { 5362| if (isEscaped) | { 269| put(regexString, [g].byCodePoint.escaper); 269| isEscaped = false; | } 5093| else if (g.length == 1) | { 4988| if (g[0] == HYPHEN) | { 546| enforce(!hyphenSeparatorFound && regexString.data.length != 0, 35| format("Hyphens in field names must be backslash escaped unless separating two field names: '%s'.", | fieldGroup)); | 245| assert(field1Regex.empty); | 245| field1Regex = 
createRegex(regexString.data); 245| hyphenSeparatorFound = true; 245| regexString.clear; | } 4708| else if (g[0] == BACKSLASH) | { 276| isEscaped = true; | } 4432| else if (g[0] == ASTERISK) | { 331| put(regexString, ".*"d); | } | else | { 4101| put(regexString, [g].byCodePoint.escaper); | } | } | else | { 105| put(regexString, [g].byCodePoint.escaper); | } | } 1678| enforce(!hyphenSeparatorFound || regexString.data.length != 0, 15| format("Hyphens in field names must be backslash escaped unless separating two field names: '%s'.", | fieldGroup)); | 1432| if (!hyphenSeparatorFound) | { 2418| if (regexString.data.length != 0) field1Regex = createRegex(regexString.data); | } 216| else field2Regex = createRegex(regexString.data); | 1432| return tuple(field1Regex, field2Regex); |} | |@safe unittest |{ | import std.algorithm : all, equal; | import std.exception : assertThrown; | | /* Use when both regexes should be empty. */ | void testBothRegexEmpty(string test, Tuple!(Regex!char, Regex!char) regexPair) | { 14| assert(regexPair[0].empty, format("[namedFieldGroupToRegex: %s]", test)); 14| assert(regexPair[1].empty, format("[namedFieldGroupToRegex: %s]", test)); | } | | /* Use when there should only be one regex. */ | void testFirstRegexMatches(string test, Tuple!(Regex!char, Regex!char) regexPair, | string[] regex1Matches) | { 140| assert(!regexPair[0].empty, format("[namedFieldGroupToRegex: %s]", test)); 140| assert(regexPair[1].empty, format("[namedFieldGroupToRegex: %s]", test)); | 329| assert(regex1Matches.all!(s => s.matchFirst(regexPair[0])), | format("[namedFieldGroupToRegex: %s] regex: %s; strings: %s", | test, regexPair[0], regex1Matches)); | } | | /* Use when there should be two regex with matches. 
*/ | void testBothRegexMatches(string test, Tuple!(Regex!char, Regex!char) regexPair, | const (char[])[] regex1Matches, const (char[])[] regex2Matches) | { 49| assert(!regexPair[0].empty, format("[namedFieldGroupToRegex: %s]", test)); 49| assert(!regexPair[1].empty, format("[namedFieldGroupToRegex: %s]", test)); | 112| assert(regex1Matches.all!(s => s.matchFirst(regexPair[0])), | format("[namedFieldGroupToRegex: %s] regex1: %s; strings: %s", | test, regexPair[0], regex1Matches)); | 112| assert(regex2Matches.all!(s => s.matchFirst(regexPair[1])), | format("[namedFieldGroupToRegex: %s] regex2: %s; strings: %s", | test, regexPair[1], regex2Matches)); | } | | /* Invalid hyphen use. These are the only error cases. */ 14| assertThrown(`-`.namedFieldGroupToRegex); 14| assertThrown(`a-`.namedFieldGroupToRegex); 14| assertThrown(`-a`.namedFieldGroupToRegex); 14| assertThrown(`a-b-`.namedFieldGroupToRegex); 14| assertThrown(`a-b-c`.namedFieldGroupToRegex); | | /* Some special cases. These cases are caught elsewhere and errors signaled to the | * user. nameFieldGroupToRegex should just send back empty. | */ 7| testBothRegexEmpty(`test-empty-1`, ``.namedFieldGroupToRegex); 7| testBothRegexEmpty(`test-empty-2`, `\`.namedFieldGroupToRegex); | | /* Single name cases. 
*/ 7| testFirstRegexMatches(`test-single-1`, `a`.namedFieldGroupToRegex, [`a`]); 7| testFirstRegexMatches(`test-single-2`, `\a`.namedFieldGroupToRegex, [`a`]); 7| testFirstRegexMatches(`test-single-3`, `abc`.namedFieldGroupToRegex, [`abc`]); 7| testFirstRegexMatches(`test-single-4`, `abc*`.namedFieldGroupToRegex, [`abc`, `abcd`, `abcde`]); 7| testFirstRegexMatches(`test-single-5`, `*`.namedFieldGroupToRegex, [`a`, `ab`, `abc`, `abcd`, `abcde`, `*`]); 7| testFirstRegexMatches(`test-single-6`, `abc\*`.namedFieldGroupToRegex, [`abc*`]); 7| testFirstRegexMatches(`test-single-7`, `abc{}`.namedFieldGroupToRegex, [`abc{}`]); 7| testFirstRegexMatches(`test-single-8`, `\002`.namedFieldGroupToRegex, [`002`]); 7| testFirstRegexMatches(`test-single-9`, `\\002`.namedFieldGroupToRegex, [`\002`]); 7| testFirstRegexMatches(`test-single-10`, `With A Space`.namedFieldGroupToRegex, [`With A Space`]); 7| testFirstRegexMatches(`test-single-11`, `With\-A\-Hyphen`.namedFieldGroupToRegex, [`With-A-Hyphen`]); 7| testFirstRegexMatches(`test-single-11`, `\a\b\c\d\e\f\g`.namedFieldGroupToRegex, [`abcdefg`]); 7| testFirstRegexMatches(`test-single-12`, `雪风暴`.namedFieldGroupToRegex, [`雪风暴`]); 7| testFirstRegexMatches(`test-single-13`, `\雪风暴`.namedFieldGroupToRegex, [`雪风暴`]); 7| testFirstRegexMatches(`test-single-14`, `\雪\风\暴`.namedFieldGroupToRegex, [`雪风暴`]); 7| testFirstRegexMatches(`test-single-15`, `雪*`.namedFieldGroupToRegex, [`雪`]); 7| testFirstRegexMatches(`test-single-16`, `雪*`.namedFieldGroupToRegex, [`雪风`]); 7| testFirstRegexMatches(`test-single-17`, `雪*`.namedFieldGroupToRegex, [`雪风暴`]); 7| testFirstRegexMatches(`test-single-18`, `g̈각நிกำषिkʷक्षि`.namedFieldGroupToRegex, [`g̈각நிกำषिkʷक्षि`]); 7| testFirstRegexMatches(`test-single-19`, `*g̈각நிกำषिkʷक्षि*`.namedFieldGroupToRegex, [`XYZg̈각நிกำषिkʷक्षिPQR`]); | 7| testBothRegexMatches(`test-pair-1`, `a-b`.namedFieldGroupToRegex, [`a`], [`b`]); 7| testBothRegexMatches(`test-pair-2`, `\a-\b`.namedFieldGroupToRegex, [`a`], [`b`]); 7| 
testBothRegexMatches(`test-pair-3`, `a*-b*`.namedFieldGroupToRegex, [`a`, `ab`, `abc`], [`b`, `bc`, `bcd`]); 7| testBothRegexMatches(`test-pair-4`, `abc-bcd`.namedFieldGroupToRegex, [`abc`], [`bcd`]); 7| testBothRegexMatches(`test-pair-5`, `a\-f-r\-t`.namedFieldGroupToRegex, [`a-f`], [`r-t`]); 7| testBothRegexMatches(`test-pair-6`, `雪风暴-吹雪`.namedFieldGroupToRegex, [`雪风暴`], [`吹雪`]); 7| testBothRegexMatches(`test-pair-7`, `நிกำ각-aिg̈क्षिkʷ`.namedFieldGroupToRegex, [`நிกำ각`], [`aिg̈क्षिkʷ`]); |} | |/** | `namedFieldRegexMatches` returns an input range iterating over all the fields (strings) | in an input range that match a regular expression. (Private function.) | | This routine is used in conjunction with `namedFieldGroupToRegex` to find the set of | header line fields that match a field in a field-group expression. The input is a | range where the individual elements are strings, e.g. an array of strings. | | The elements of the returned range are a tuple where the first element is the | one-based field number of the matching field and the second is the matched field | name. A zero-based index is returned if `convertToZero` is Yes. | | The regular expression must not be empty. | | $(ALWAYS_DOCUMENT) |*/ |private auto namedFieldRegexMatches(T = size_t, | ConvertToZeroBasedIndex convertToZero = No.convertToZeroBasedIndex, | Range) |(Range headerFields, Regex!char fieldRegex) |if (isInputRange!Range && is(ElementEncodingType!Range == string)) |{ | import std.algorithm : filter; | 4579| assert(!fieldRegex.empty); | | static if (convertToZero) enum T indexOffset = 0; | else enum T indexOffset = 1; | 4579| return headerFields | .enumerate!(T)(indexOffset) 7196| .filter!(x => x[1].matchFirst(fieldRegex)); |} | |/* namedFieldRegexMatches tests. Some additional testing of namedFieldGroupToRegex, | * though all the regex edge cases occur in the namedFieldGroupToRegex tests. 
| */ |@safe unittest |{ | import std.algorithm : equal; | import std.array : array; | | void testBothRegexMatches(T = size_t, | ConvertToZeroBasedIndex convertToZero = No.convertToZeroBasedIndex) | (string test, string[] headerFields, | Tuple!(Regex!char, Regex!char) regexPair, | Tuple!(T, string)[] regex0Matches, | Tuple!(T, string)[] regex1Matches) | { 231| if (regexPair[0].empty) | { 00000000| assert(regex1Matches.empty, | format("[namedFieldRegexMatches: %s] (empty regex[0], non-empty matches]", test)); | } | else | { 231| assert(equal(headerFields.namedFieldRegexMatches!(T, convertToZero)(regexPair[0]), | regex0Matches), | format("[namedFieldRegexMatches: %s] (regex[0] mismatch\nExpected: %s\nActual : %s", | test, regex0Matches, headerFields.namedFieldRegexMatches!(T, convertToZero)(regexPair[0]).array)); | } | 231| if (regexPair[1].empty) | { 161| assert(regex1Matches.empty, | format("[namedFieldRegexMatches: %s] (empty regex[1], non-empty matches]", test)); | } | else | { 70| assert(equal(headerFields.namedFieldRegexMatches!(T, convertToZero)(regexPair[1]), | regex1Matches), | format("[namedFieldRegexMatches: %s] (regex[1] mismatch\nExpected: %s\nActual : %s", | test, regex1Matches, headerFields.namedFieldRegexMatches!(T, convertToZero)(regexPair[1]).array)); | } | } | 7| Tuple!(size_t, string)[] emptyRegexMatch; | 7| testBothRegexMatches( | "test-1", | [`a`, `b`, `c`], // Header line | `a`.namedFieldGroupToRegex, // field-group | [ tuple(1UL, `a`) ], // regex-0 expected match | emptyRegexMatch); // regex-1 expected match | 7| testBothRegexMatches( | "test-2", | [`a`, `b`, `c`], | `b`.namedFieldGroupToRegex, | [ tuple(2UL, `b`) ], | emptyRegexMatch); | 7| testBothRegexMatches( | "test-3", | [`a`, `b`, `c`], | `c`.namedFieldGroupToRegex, | [ tuple(3UL, `c`) ], | emptyRegexMatch); | 7| testBothRegexMatches( | "test-4", | [`a`, `b`, `c`], | `x`.namedFieldGroupToRegex, | emptyRegexMatch, | emptyRegexMatch); | 7| testBothRegexMatches( | "test-5", | [`a`], | 
`a`.namedFieldGroupToRegex, | [ tuple(1UL, `a`) ], | emptyRegexMatch); | 7| testBothRegexMatches( | "test-6", | [`abc`, `def`, `ghi`], | `abc`.namedFieldGroupToRegex, | [ tuple(1UL, `abc`) ], | emptyRegexMatch); | 7| testBothRegexMatches( | "test-7", | [`x_abc`, `y_def`, `x_ghi`], | `x_*`.namedFieldGroupToRegex, | [ tuple(1UL, `x_abc`), tuple(3UL, `x_ghi`),], | emptyRegexMatch); | 7| testBothRegexMatches( | "test-8", | [`x_abc`, `y_def`, `x_ghi`], | `*`.namedFieldGroupToRegex, | [ tuple(1UL, `x_abc`), tuple(2UL, `y_def`), tuple(3UL, `x_ghi`),], | emptyRegexMatch); | 7| testBothRegexMatches( | "test-9", | [`a`, `b`, `c`], | `a-c`.namedFieldGroupToRegex, | [ tuple(1UL, `a`),], | [ tuple(3UL, `c`),]); | 7| testBothRegexMatches( | "test-10", | [`a`, `b`, `c`], | `c-a`.namedFieldGroupToRegex, | [ tuple(3UL, `c`),], | [ tuple(1UL, `a`),]); | 7| testBothRegexMatches( | "test-11", | [`a`, `b`, `c`], | `c*-a*`.namedFieldGroupToRegex, | [ tuple(3UL, `c`),], | [ tuple(1UL, `a`),]); | 7| testBothRegexMatches( | "test-12", | [`abc`, `abc-def`, `def`], | `abc-def`.namedFieldGroupToRegex, | [ tuple(1UL, `abc`) ], | [ tuple(3UL, `def`) ]); | 7| testBothRegexMatches( | "test-13", | [`abc`, `abc-def`, `def`], | `abc\-def`.namedFieldGroupToRegex, | [ tuple(2UL, `abc-def`) ], | emptyRegexMatch); | 7| testBothRegexMatches!(size_t, Yes.convertToZeroBasedIndex) | ("test-101", | [`a`, `b`, `c`], | `a`.namedFieldGroupToRegex, | [ tuple(0UL, `a`) ], | emptyRegexMatch); | 7| testBothRegexMatches!(size_t, Yes.convertToZeroBasedIndex) | ("test-102", | [`a`, `b`, `c`], | `b`.namedFieldGroupToRegex, | [ tuple(1UL, `b`) ], | emptyRegexMatch); | 7| testBothRegexMatches!(size_t, Yes.convertToZeroBasedIndex) | ("test-103", | [`a`, `b`, `c`], | `c`.namedFieldGroupToRegex, | [ tuple(2UL, `c`) ], | emptyRegexMatch); | 7| testBothRegexMatches!(size_t, Yes.convertToZeroBasedIndex) | ("test-104", | [`a`, `b`, `c`], | `x`.namedFieldGroupToRegex, | emptyRegexMatch, | emptyRegexMatch); | 7| 
testBothRegexMatches!(size_t, Yes.convertToZeroBasedIndex) | ("test-105", | [`a`], | `a`.namedFieldGroupToRegex, | [ tuple(0UL, `a`) ], | emptyRegexMatch); | 7| testBothRegexMatches!(size_t, Yes.convertToZeroBasedIndex) | ("test-106", | [`abc`, `def`, `ghi`], | `abc`.namedFieldGroupToRegex, | [ tuple(0UL, `abc`) ], | emptyRegexMatch); | 7| testBothRegexMatches!(size_t, Yes.convertToZeroBasedIndex) | ("test-107", | [`x_abc`, `y_def`, `x_ghi`], | `x_*`.namedFieldGroupToRegex, | [ tuple(0UL, `x_abc`), tuple(2UL, `x_ghi`),], | emptyRegexMatch); | 7| testBothRegexMatches!(size_t, Yes.convertToZeroBasedIndex) | ("test-108", | [`x_abc`, `y_def`, `x_ghi`], | `*`.namedFieldGroupToRegex, | [ tuple(0UL, `x_abc`), tuple(1UL, `y_def`), tuple(2UL, `x_ghi`),], | emptyRegexMatch); | 7| testBothRegexMatches!(size_t, Yes.convertToZeroBasedIndex) | ("test-109", | [`a`, `b`, `c`], | `a-c`.namedFieldGroupToRegex, | [ tuple(0UL, `a`),], | [ tuple(2UL, `c`),]); | 7| testBothRegexMatches!(size_t, Yes.convertToZeroBasedIndex) | ("test-110", | [`a`, `b`, `c`], | `c-a`.namedFieldGroupToRegex, | [ tuple(2UL, `c`),], | [ tuple(0UL, `a`),]); | 7| testBothRegexMatches!(size_t, Yes.convertToZeroBasedIndex) | ("test-111", | [`a`, `b`, `c`], | `c*-a*`.namedFieldGroupToRegex, | [ tuple(2UL, `c`),], | [ tuple(0UL, `a`),]); | 7| testBothRegexMatches!(size_t, Yes.convertToZeroBasedIndex) | ("test-112", | [`abc`, `abc-def`, `def`], | `abc-def`.namedFieldGroupToRegex, | [ tuple(0UL, `abc`) ], | [ tuple(2UL, `def`) ]); | 7| testBothRegexMatches!(size_t, Yes.convertToZeroBasedIndex) | ("test-113", | [`abc`, `abc-def`, `def`], | `abc\-def`.namedFieldGroupToRegex, | [ tuple(1UL, `abc-def`) ], | emptyRegexMatch); | 7| Tuple!(int, string)[] intEmptyRegexMatch; 7| Tuple!(uint, string)[] uintEmptyRegexMatch; 7| Tuple!(long, string)[] longEmptyRegexMatch; | 7| testBothRegexMatches!(int, Yes.convertToZeroBasedIndex) | ("test-201", | [`a`, `b`, `c`], | `a`.namedFieldGroupToRegex, | [ tuple(0, `a`) ], | 
intEmptyRegexMatch); | 7| testBothRegexMatches!(long, Yes.convertToZeroBasedIndex) | ("test-202", | [`a`, `b`, `c`], | `b`.namedFieldGroupToRegex, | [ tuple(1L, `b`) ], | longEmptyRegexMatch); | 7| testBothRegexMatches!(uint, Yes.convertToZeroBasedIndex) | ("test-203", | [`a`, `b`, `c`], | `c`.namedFieldGroupToRegex, | [ tuple(2U, `c`) ], | uintEmptyRegexMatch); | 7| testBothRegexMatches!(uint, Yes.convertToZeroBasedIndex)( | "test-204", | [`a`, `b`, `c`], | `x`.namedFieldGroupToRegex, | uintEmptyRegexMatch, | uintEmptyRegexMatch); | 7| testBothRegexMatches!(int) | ("test-211", | [`a`, `b`, `c`], | `c*-a*`.namedFieldGroupToRegex, | [ tuple(3, `c`),], | [ tuple(1, `a`),]); | 7| testBothRegexMatches!(long) | ("test-212", | [`abc`, `abc-def`, `def`], | `abc-def`.namedFieldGroupToRegex, | [ tuple(1L, `abc`) ], | [ tuple(3L, `def`) ]); | 7| testBothRegexMatches!(uint) | ("test-213", | [`abc`, `abc-def`, `def`], | `abc\-def`.namedFieldGroupToRegex, | [ tuple(2U, `abc-def`) ], | uintEmptyRegexMatch); |} | |/** | `parseNumericFieldGroup` parses a single number or number range. E.g. '5' or '5-8'. | (Private function.) | | `parseNumericFieldGroup` returns a range that iterates over all the values in the | field-group. It has options supporting conversion of field numbers to zero-based | indices and the use of '0' (zero) as a field number. | | This was part of the original code supporting numeric field list and is used by | both numeric and named field-list routines. | | $(ALWAYS_DOCUMENT) |*/ |private auto parseNumericFieldGroup(T = size_t, | ConvertToZeroBasedIndex convertToZero = No.convertToZeroBasedIndex, | AllowFieldNumZero allowZero = No.allowFieldNumZero) | (string fieldRange) |if (isIntegral!T && (!allowZero || !convertToZero || !isUnsigned!T)) |{ | import std.algorithm : findSplit; | import std.conv : to; | import std.range : iota; | import std.traits : Signed; | | /* Pick the largest compatible integral type for the IOTA range. 
This must be the | * signed type if convertToZero is true, as a reverse order range may end at -1. | */ | static if (convertToZero) alias S = Signed!T; | else alias S = T; | 5299| enforce(fieldRange.length != 0, "Empty field number."); | 5047| auto rangeSplit = findSplit(fieldRange, "-"); | | /* Make sure the range does not start or end with a dash. */ 7510| enforce(rangeSplit[1].empty || (!rangeSplit[0].empty && !rangeSplit[2].empty), 203| format("Incomplete ranges are not supported: '%s'.", fieldRange)); | 4844| S start = rangeSplit[0].to!S; 9450| S last = rangeSplit[1].empty ? start : rangeSplit[2].to!S; 9394| Signed!T increment = (start <= last) ? 1 : -1; | | static if (allowZero) | { 2136| enforce(rangeSplit[1].empty || (start != 0 && last != 0), 93| format("Zero cannot be used as part of a range: '%s'.", fieldRange)); | } | | static if (allowZero) | { 3324| enforce(start >= 0 && last >= 0, 7| format("Field numbers must be non-negative integers: '%d'.", 7| (start < 0) ? start : last)); | } | else | { 5743| enforce(start >= 1 && last >= 1, 163| format("Field numbers must be greater than zero: '%d'.", 163| (start < 1) ? start : last)); | } | | static if (convertToZero) | { 1974| start--; 1974| last--; | } | 4434| return iota(start, last + increment, increment); |} | |// parseNumericFieldGroup. 
|@safe unittest |{ | import std.algorithm : equal; | import std.exception : assertThrown, assertNotThrown; | | /* Basic cases */ 7| assert(parseNumericFieldGroup("1").equal([1])); 7| assert("2".parseNumericFieldGroup.equal([2])); 7| assert("3-4".parseNumericFieldGroup.equal([3, 4])); 7| assert("3-5".parseNumericFieldGroup.equal([3, 4, 5])); 7| assert("4-3".parseNumericFieldGroup.equal([4, 3])); 7| assert("10-1".parseNumericFieldGroup.equal([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])); | | /* Convert to zero-based indices */ 7| assert(parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex)("1").equal([0])); 7| assert("2".parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex).equal([1])); 7| assert("3-4".parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex).equal([2, 3])); 7| assert("3-5".parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex).equal([2, 3, 4])); 7| assert("4-3".parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex).equal([3, 2])); 7| assert("10-1".parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex).equal([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])); | | /* Allow zero. */ 7| assert("0".parseNumericFieldGroup!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0])); 7| assert(parseNumericFieldGroup!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)("1").equal([1])); 7| assert("3-4".parseNumericFieldGroup!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([3, 4])); 7| assert("10-1".parseNumericFieldGroup!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])); | | /* Allow zero, convert to zero-based index. 
*/ 7| assert("0".parseNumericFieldGroup!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([-1])); 7| assert(parseNumericFieldGroup!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)("1").equal([0])); 7| assert("3-4".parseNumericFieldGroup!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([2, 3])); 7| assert("10-1".parseNumericFieldGroup!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])); | | /* Alternate integer types. */ 7| assert("2".parseNumericFieldGroup!uint.equal([2])); 7| assert("3-5".parseNumericFieldGroup!uint.equal([3, 4, 5])); 7| assert("10-1".parseNumericFieldGroup!uint.equal([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])); 7| assert("2".parseNumericFieldGroup!int.equal([2])); 7| assert("3-5".parseNumericFieldGroup!int.equal([3, 4, 5])); 7| assert("10-1".parseNumericFieldGroup!int.equal([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])); 7| assert("2".parseNumericFieldGroup!ushort.equal([2])); 7| assert("3-5".parseNumericFieldGroup!ushort.equal([3, 4, 5])); 7| assert("10-1".parseNumericFieldGroup!ushort.equal([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])); 7| assert("2".parseNumericFieldGroup!short.equal([2])); 7| assert("3-5".parseNumericFieldGroup!short.equal([3, 4, 5])); 7| assert("10-1".parseNumericFieldGroup!short.equal([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])); | 7| assert("0".parseNumericFieldGroup!(long, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0])); 7| assert("0".parseNumericFieldGroup!(uint, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0])); 7| assert("0".parseNumericFieldGroup!(int, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0])); 7| assert("0".parseNumericFieldGroup!(ushort, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0])); 7| assert("0".parseNumericFieldGroup!(short, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0])); 7| assert("0".parseNumericFieldGroup!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([-1])); 7| 
assert("0".parseNumericFieldGroup!(short, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([-1])); | | /* Max field value cases. */ 7| assert("65535".parseNumericFieldGroup!ushort.equal([65535])); // ushort max 7| assert("65533-65535".parseNumericFieldGroup!ushort.equal([65533, 65534, 65535])); 7| assert("32767".parseNumericFieldGroup!short.equal([32767])); // short max 7| assert("32765-32767".parseNumericFieldGroup!short.equal([32765, 32766, 32767])); 7| assert("32767".parseNumericFieldGroup!(short, Yes.convertToZeroBasedIndex).equal([32766])); | | /* Error cases. */ 14| assertThrown("".parseNumericFieldGroup); 14| assertThrown(" ".parseNumericFieldGroup); 14| assertThrown("-".parseNumericFieldGroup); 14| assertThrown(" -".parseNumericFieldGroup); 14| assertThrown("- ".parseNumericFieldGroup); 14| assertThrown("1-".parseNumericFieldGroup); 14| assertThrown("-2".parseNumericFieldGroup); 14| assertThrown("-1".parseNumericFieldGroup); 14| assertThrown("1.0".parseNumericFieldGroup); 14| assertThrown("0".parseNumericFieldGroup); 14| assertThrown("0-3".parseNumericFieldGroup); 14| assertThrown("3-0".parseNumericFieldGroup); 14| assertThrown("-2-4".parseNumericFieldGroup); 14| assertThrown("2--4".parseNumericFieldGroup); 14| assertThrown("2-".parseNumericFieldGroup); 14| assertThrown("a".parseNumericFieldGroup); 14| assertThrown("0x3".parseNumericFieldGroup); 14| assertThrown("3U".parseNumericFieldGroup); 14| assertThrown("1_000".parseNumericFieldGroup); 14| assertThrown(".".parseNumericFieldGroup); | 14| assertThrown("".parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex)); 14| assertThrown(" ".parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex)); 14| assertThrown("-".parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex)); 14| assertThrown("1-".parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex)); 14| assertThrown("-2".parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex)); 14| 
assertThrown("-1".parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex)); 14| assertThrown("0".parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex)); 14| assertThrown("0-3".parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex)); 14| assertThrown("3-0".parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex)); 14| assertThrown("-2-4".parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex)); 14| assertThrown("2--4".parseNumericFieldGroup!(size_t, Yes.convertToZeroBasedIndex)); | 14| assertThrown("".parseNumericFieldGroup!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown(" ".parseNumericFieldGroup!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown("-".parseNumericFieldGroup!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown("1-".parseNumericFieldGroup!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown("-2".parseNumericFieldGroup!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown("-1".parseNumericFieldGroup!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown("0-3".parseNumericFieldGroup!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown("3-0".parseNumericFieldGroup!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown("-2-4".parseNumericFieldGroup!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown("2--4".parseNumericFieldGroup!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); | 14| assertThrown("".parseNumericFieldGroup!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown(" ".parseNumericFieldGroup!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown("-".parseNumericFieldGroup!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown("1-".parseNumericFieldGroup!(long, Yes.convertToZeroBasedIndex, 
Yes.allowFieldNumZero)); 14| assertThrown("-2".parseNumericFieldGroup!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown("-1".parseNumericFieldGroup!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown("0-3".parseNumericFieldGroup!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown("3-0".parseNumericFieldGroup!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown("-2-4".parseNumericFieldGroup!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 14| assertThrown("2--4".parseNumericFieldGroup!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); | | /* Value out of range cases. */ 14| assertThrown("65536".parseNumericFieldGroup!ushort); // One more than ushort max. 14| assertThrown("65535-65536".parseNumericFieldGroup!ushort); 14| assertThrown("32768".parseNumericFieldGroup!short); // One more than short max. 14| assertThrown("32765-32768".parseNumericFieldGroup!short); | // Convert to zero limits signed range. 14| assertThrown("32768".parseNumericFieldGroup!(ushort, Yes.convertToZeroBasedIndex)); 7| assert("32767".parseNumericFieldGroup!(ushort, Yes.convertToZeroBasedIndex).equal([32766])); |} | |/** | Numeric field-lists | | Numeric field-lists are the original form of field-list supported by tsv-utils tools. | They have largely been superseded by the more general field-list support provided by | [parseFieldList], but the basic facilities for processing numeric field-lists are | still available. | | A numeric field-list is a string entered on the command line identifying one or more | field numbers. They are used by the majority of the tsv-utils applications. There are | two helper functions, [makeFieldListOptionHandler] and [parseNumericFieldList]. Most | applications will use [makeFieldListOptionHandler], it creates a delegate that can be | passed to `std.getopt` to process the command option. 
Actual processing of the option | text is done by [parseNumericFieldList]. It can be called directly when the text of the | option value contains more than just the field number. | | Syntax and behavior: | | A 'numeric field-list' is a list of numeric field numbers entered on the command line. | Fields are 1-upped integers representing locations in an input line, in the traditional | meaning of Unix command line tools. Fields can be entered as single numbers or a range. | Multiple entries are separated by commas. Some examples (with 'fields' as the command | line option): | | ``` | --fields 3 # Single field | --fields 4,1 # Two fields | --fields 3-9 # A range, fields 3 to 9 inclusive | --fields 1,2,7-34,11 # A mix of ranges and fields | --fields 15-5,3-1 # Two ranges in reverse order. | ``` | | Incomplete ranges are not supported, for example, '6-'. Zero is disallowed as a field | value by default, but can be enabled to support the notion of zero as representing the | entire line. However, zero cannot be part of a range. Field numbers are one-based by | default, but can be converted to zero-based. If conversion to zero-based is enabled, | field number zero must be disallowed or a signed integer type specified for the | returned range. | | An error is thrown if an invalid field specification is encountered. Error text is | intended for display. Error conditions include: | | $(LIST | * Empty fields list | * Empty value, e.g. Two consecutive commas, a trailing comma, or a leading comma | * String that does not parse as a valid integer | * Negative integers, or zero if zero is disallowed. | * An incomplete range | * Zero used as part of a range. | ) | | No other behaviors are enforced. Repeated values are accepted. If zero is allowed, | other field numbers can be entered as well. Additional restrictions need to be | applied by the caller. | | Notes: | | $(LIST | * The data type determines the max field number that can be entered. 
Enabling
     conversion to zero restricts to the signed version of the data type.
   * Use 'import std.typecons : Yes, No' to use the convertToZeroBasedIndex and
     allowFieldNumZero template parameters.
   )
*/

/**
   `OptionHandlerDelegate` is the signature of the delegate returned by
   [makeFieldListOptionHandler].
 */
alias OptionHandlerDelegate = void delegate(string option, string value);

/**
   `makeFieldListOptionHandler` creates a std.getopt option handler for processing field-lists
   entered on the command line. A field-list is as defined by [parseNumericFieldList].
*/
OptionHandlerDelegate makeFieldListOptionHandler(
    T,
    ConvertToZeroBasedIndex convertToZero = No.convertToZeroBasedIndex,
    AllowFieldNumZero allowZero = No.allowFieldNumZero)
    (ref T[] fieldsArray)
if (isIntegral!T && (!allowZero || !convertToZero || !isUnsigned!T))
{
    /* Parses 'value' as a field-list and appends the resulting field numbers to
     * 'fieldArray'. Repeated uses of the option on the command line accumulate.
     */
    void fieldListOptionHandler(ref T[] fieldArray, string option, string value) pure @safe
    {
        import std.algorithm : each;

        /* Prefix parse errors with the option name (e.g. "[--fields] ...") so the
         * message identifies which command line option was malformed.
         */
        try value.parseNumericFieldList!(T, convertToZero, allowZero).each!(x => fieldArray ~= x);
        catch (Exception exc)
        {
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }
    }

    /* The returned closure captures 'fieldsArray' by reference, matching the
     * std.getopt custom-handler signature (option, value).
     */
    return (option, value) => fieldListOptionHandler(fieldsArray, option, value);
}

// makeFieldListOptionHandler.
unittest
{
    import std.exception : assertThrown, assertNotThrown;
    import std.getopt;

    {
        size_t[] fields;
        auto args = ["program", "--fields", "1", "--fields", "2,4,7-9,23-21"];
        getopt(args, "f|fields", fields.makeFieldListOptionHandler);
        assert(fields == [1, 2, 4, 7, 8, 9, 23, 22, 21]);
    }
    {
        size_t[] fields;
        auto args = ["program", "--fields", "1", "--fields", "2,4,7-9,23-21"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex));
        assert(fields == [0, 1, 3, 6, 7, 8, 22, 21, 20]);
    }
    {
        size_t[] fields;
        auto args = ["program", "-f", "0"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero));
        assert(fields == [0]);
    }
    {
        size_t[] fields;
        auto args = ["program", "-f", "0", "-f", "1,0", "-f", "0,1"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero));
        assert(fields == [0, 1, 0, 0, 1]);
    }
    {
        /* Mixing a built-in getopt array option with a field-list handler. */
        size_t[] ints;
        size_t[] fields;
        auto args = ["program", "--ints", "1,2,3", "--fields", "1", "--ints", "4,5,6", "--fields", "2,4,7-9,23-21"];
        std.getopt.arraySep = ",";
        getopt(args,
               "i|ints", "Built-in list of integers.", &ints,
               "f|fields", "Field-list style integers.", fields.makeFieldListOptionHandler);
        assert(ints == [1, 2, 3, 4, 5, 6]);
        assert(fields == [1, 2, 4, 7, 8, 9, 23, 22, 21]);
    }

    /* Basic cases involving unsigned types smaller than size_t. */
    {
        uint[] fields;
        auto args = ["program", "-f", "0", "-f", "1,0", "-f", "0,1", "-f", "55-58"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(uint, No.convertToZeroBasedIndex, Yes.allowFieldNumZero));
        assert(fields == [0, 1, 0, 0, 1, 55, 56, 57, 58]);
    }
    {
        ushort[] fields;
        auto args = ["program", "-f", "0", "-f", "1,0", "-f", "0,1", "-f", "55-58"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(ushort, No.convertToZeroBasedIndex, Yes.allowFieldNumZero));
        assert(fields == [0, 1, 0, 0, 1, 55, 56, 57, 58]);
    }

    /* Basic cases involving signed types (long, int, short). */
    {
        long[] fields;
        auto args = ["program", "--fields", "1", "--fields", "2,4,7-9,23-21"];
        getopt(args, "f|fields", fields.makeFieldListOptionHandler);
        assert(fields == [1, 2, 4, 7, 8, 9, 23, 22, 21]);
    }
    {
        long[] fields;
        auto args = ["program", "--fields", "1", "--fields", "2,4,7-9,23-21"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(long, Yes.convertToZeroBasedIndex));
        assert(fields == [0, 1, 3, 6, 7, 8, 22, 21, 20]);
    }
    {
        long[] fields;
        auto args = ["program", "-f", "0"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero));
        assert(fields == [-1]);
    }
    {
        int[] fields;
        auto args = ["program", "--fields", "1", "--fields", "2,4,7-9,23-21"];
        getopt(args, "f|fields", fields.makeFieldListOptionHandler);
        assert(fields == [1, 2, 4, 7, 8, 9, 23, 22, 21]);
    }
    {
        int[] fields;
        auto args = ["program", "--fields", "1", "--fields", "2,4,7-9,23-21"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(int, Yes.convertToZeroBasedIndex));
        assert(fields == [0, 1, 3, 6, 7, 8, 22, 21, 20]);
    }
    {
        int[] fields;
        auto args = ["program", "-f", "0"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero));
        assert(fields == [-1]);
    }
    {
        short[] fields;
        auto args = ["program", "--fields", "1", "--fields", "2,4,7-9,23-21"];
        getopt(args, "f|fields", fields.makeFieldListOptionHandler);
        assert(fields == [1, 2, 4, 7, 8, 9, 23, 22, 21]);
    }
    {
        short[] fields;
        auto args = ["program", "--fields", "1", "--fields", "2,4,7-9,23-21"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(short, Yes.convertToZeroBasedIndex));
        assert(fields == [0, 1, 3, 6, 7, 8, 22, 21, 20]);
    }
    {
        short[] fields;
        auto args = ["program", "-f", "0"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(short, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero));
        assert(fields == [-1]);
    }

    {
        /* Error cases. */
        size_t[] fields;
        auto args = ["program", "-f", "0"];
        assertThrown(getopt(args, "f|fields", fields.makeFieldListOptionHandler));

        args = ["program", "-f", "-1"];
        assertThrown(getopt(args, "f|fields", fields.makeFieldListOptionHandler));

        args = ["program", "-f", "--fields", "1"];
        assertThrown(getopt(args, "f|fields", fields.makeFieldListOptionHandler));

        args = ["program", "-f", "a"];
        assertThrown(getopt(args, "f|fields", fields.makeFieldListOptionHandler));

        args = ["program", "-f", "1.5"];
        assertThrown(getopt(args, "f|fields", fields.makeFieldListOptionHandler));

        args = ["program", "-f", "2-"];
        assertThrown(getopt(args, "f|fields", fields.makeFieldListOptionHandler));

        args = ["program", "-f", "3,5,-7"];
        assertThrown(getopt(args, "f|fields", fields.makeFieldListOptionHandler));

        args = ["program", "-f", "3,5,"];
        assertThrown(getopt(args, "f|fields", fields.makeFieldListOptionHandler));

        args = ["program", "-f", "-1"];
        assertThrown(getopt(args,
                            "f|fields", fields.makeFieldListOptionHandler!(
                                size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)));
    }
}

/**
   `parseNumericFieldList` lazily generates a range of fields numbers from a
   'numeric field-list' string.
*/
auto parseNumericFieldList(
    T = size_t,
    ConvertToZeroBasedIndex convertToZero = No.convertToZeroBasedIndex,
    AllowFieldNumZero allowZero = No.allowFieldNumZero)
(string fieldList, char delim = ',')
if (isIntegral!T && (!allowZero || !convertToZero || !isUnsigned!T))
{
    import std.algorithm : splitter;
    import std.conv : to;

    alias SplitFieldListRange = typeof(fieldList.splitter(delim));
    alias NumericFieldGroupParse
        = ReturnType!(parseNumericFieldGroup!(T, convertToZero, allowZero));

    /* Lazy input range over the field numbers. The field-list is split on 'delim';
     * each element (a single number or a range like "3-5") is expanded by
     * parseNumericFieldGroup. One group parse is held at a time ('_currFieldParse');
     * the next group is started when the current one is exhausted.
     */
    static struct Result
    {
        private SplitFieldListRange _splitFieldList;
        private NumericFieldGroupParse _currFieldParse;

        this(string fieldList, char delim)
        {
            _splitFieldList = fieldList.splitter(delim);

            /* An empty field-list is passed through as an empty string so
             * parseNumericFieldGroup raises the error lazily, on first use.
             */
            _currFieldParse =
                (_splitFieldList.empty ? "" : _splitFieldList.front)
                .parseNumericFieldGroup!(T, convertToZero, allowZero);

            if (!_splitFieldList.empty) _splitFieldList.popFront;
        }

        bool empty() pure nothrow @safe @nogc
        {
            return _currFieldParse.empty;
        }

        T front() pure @safe
        {
            import std.conv : to;

            assert(!empty, "Attempting to fetch the front of an empty numeric field-list.");
            assert(!_currFieldParse.empty, "Internal error. Call to front with an empty _currFieldParse.");

            return _currFieldParse.front.to!T;
        }

        void popFront() pure @safe
        {
            assert(!empty, "Attempting to popFront an empty field-list.");

            /* Advance within the current group; when it runs out, start parsing
             * the next comma-separated entry (if any).
             */
            _currFieldParse.popFront;
            if (_currFieldParse.empty && !_splitFieldList.empty)
            {
                _currFieldParse = _splitFieldList.front.parseNumericFieldGroup!(
                    T, convertToZero, allowZero);
                _splitFieldList.popFront;
            }
        }
    }

    return Result(fieldList, delim);
}

// parseNumericFieldList.
@safe unittest
{
    import std.algorithm : each, equal;
    import std.exception : assertThrown, assertNotThrown;

    /* Basic tests. */
    assert("1".parseNumericFieldList.equal([1]));
    assert("1,2".parseNumericFieldList.equal([1, 2]));
    assert("1,2,3".parseNumericFieldList.equal([1, 2, 3]));
    assert("1-2".parseNumericFieldList.equal([1, 2]));
    assert("1-2,6-4".parseNumericFieldList.equal([1, 2, 6, 5, 4]));
    assert("1-2,1,1-2,2,2-1".parseNumericFieldList.equal([1, 2, 1, 1, 2, 2, 2, 1]));
    assert("1-2,5".parseNumericFieldList!size_t.equal([1, 2, 5]));

    /* Signed Int tests */
    assert("1".parseNumericFieldList!int.equal([1]));
    assert("1,2,3".parseNumericFieldList!int.equal([1, 2, 3]));
    assert("1-2".parseNumericFieldList!int.equal([1, 2]));
    assert("1-2,6-4".parseNumericFieldList!int.equal([1, 2, 6, 5, 4]));
    assert("1-2,5".parseNumericFieldList!int.equal([1, 2, 5]));

    /* Convert to zero tests */
    assert("1".parseNumericFieldList!(size_t, Yes.convertToZeroBasedIndex).equal([0]));
    assert("1,2,3".parseNumericFieldList!(size_t, Yes.convertToZeroBasedIndex).equal([0, 1, 2]));
    assert("1-2".parseNumericFieldList!(size_t, Yes.convertToZeroBasedIndex).equal([0, 1]));
    assert("1-2,6-4".parseNumericFieldList!(size_t, Yes.convertToZeroBasedIndex).equal([0, 1, 5, 4, 3]));
    assert("1-2,5".parseNumericFieldList!(size_t, Yes.convertToZeroBasedIndex).equal([0, 1, 4]));

    assert("1".parseNumericFieldList!(long, Yes.convertToZeroBasedIndex).equal([0]));
    assert("1,2,3".parseNumericFieldList!(long, Yes.convertToZeroBasedIndex).equal([0, 1, 2]));
    assert("1-2".parseNumericFieldList!(long, Yes.convertToZeroBasedIndex).equal([0, 1]));
    assert("1-2,6-4".parseNumericFieldList!(long, Yes.convertToZeroBasedIndex).equal([0, 1, 5, 4, 3]));
    assert("1-2,5".parseNumericFieldList!(long, Yes.convertToZeroBasedIndex).equal([0, 1, 4]));

    /* Allow zero tests. */
    assert("0".parseNumericFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0]));
    assert("1,0,3".parseNumericFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([1, 0, 3]));
    assert("1-2,5".parseNumericFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([1, 2, 5]));
    assert("0".parseNumericFieldList!(int, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0]));
    assert("1,0,3".parseNumericFieldList!(int, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([1, 0, 3]));
    assert("1-2,5".parseNumericFieldList!(int, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([1, 2, 5]));
    assert("0".parseNumericFieldList!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([-1]));
    assert("1,0,3".parseNumericFieldList!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0, -1, 2]));
    assert("1-2,5".parseNumericFieldList!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0, 1, 4]));

    /* Error cases. Note: '.each' is needed to force evaluation of the lazy range. */
    assertThrown("".parseNumericFieldList.each);
    assertThrown(" ".parseNumericFieldList.each);
    assertThrown(",".parseNumericFieldList.each);
    assertThrown("5 6".parseNumericFieldList.each);
    assertThrown(",7".parseNumericFieldList.each);
    assertThrown("8,".parseNumericFieldList.each);
    assertThrown("8,9,".parseNumericFieldList.each);
    assertThrown("10,,11".parseNumericFieldList.each);
    assertThrown("".parseNumericFieldList!(long, Yes.convertToZeroBasedIndex).each);
    assertThrown("1,2-3,".parseNumericFieldList!(long, Yes.convertToZeroBasedIndex).each);
    assertThrown("2-,4".parseNumericFieldList!(long, Yes.convertToZeroBasedIndex).each);
    assertThrown("1,2,3,,4".parseNumericFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).each);
    assertThrown(",7".parseNumericFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).each);
    assertThrown("8,".parseNumericFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).each);
    assertThrown("10,0,,11".parseNumericFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).each);
    assertThrown("8,9,".parseNumericFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).each);

    assertThrown("0".parseNumericFieldList.each);
    assertThrown("1,0,3".parseNumericFieldList.each);
    assertThrown("0".parseNumericFieldList!(int, Yes.convertToZeroBasedIndex, No.allowFieldNumZero).each);
    assertThrown("1,0,3".parseNumericFieldList!(int, Yes.convertToZeroBasedIndex, No.allowFieldNumZero).each);
    assertThrown("0-2,6-0".parseNumericFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).each);
    assertThrown("0-2,6-0".parseNumericFieldList!(int, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).each);
    assertThrown("0-2,6-0".parseNumericFieldList!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).each);
}
common/src/tsv_utils/common/fieldlist.d is 99% covered
<<<<<< EOF
# 
path=./tsv-select-src-tsv_utils-tsv-select.lst
/**
A variant of the unix 'cut' program, with the ability to reorder fields.

tsv-select is a variation on the Unix 'cut' utility, with the added ability to reorder
fields. Lines are read from files or standard input and split on a delimiter character.
Fields are written to standard output in the order listed. Fields can be listed more
than once, and fields not listed can be written out as a group.

This program is intended both as a useful utility and a D programming language example.
Functionality and constructs used include command line argument processing, file I/O,
exception handling, ranges, tuples and strings, templates, universal function call syntax
(UFCS), lambdas and functional programming constructs. Comments are more verbose than
typical to shed light on D programming constructs, but not to the level of a tutorial.

Copyright (c) 2015-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/

module tsv_utils.tsv_select;   // Module name defaults to file name, but hyphens not allowed, so set it here.

// Imports used by multiple routines. Other imports are made in local context.
import std.exception : enforce;
import std.range;
import std.stdio;
import std.typecons : tuple, Tuple;

// 'Heredoc' style help text. When printed it is followed by a getopt formatted option list.
immutable helpText = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields can be specified by field number or, for files with header lines,
by field name. Use '--H|header' to enable selection by name. This also
manages header lines from multiple files, retaining only the first header.

Field numbers start with one. The field list is comma separated. Ranges
can be used, and wildcards can be used when specifying fields by name.

Fields can be dropped using '--e|exclude'. Fields not included in the
'--f|fields' option can be selected as a group using '--r|rest'.

Examples:

   # Selecting fields. Output is in the order listed
   tsv-select -H date,time file.tsv
   tsv-select -f 2,1 file.tsv
   tsv-select -f 5-7,2,9-11
   tsv-select -H -f '*_date' file.tsv

   # Dropping fields
   tsv-select --exclude 1 file.tsv
   tsv-select -H -e date,time file.tsv

   # Move fields to the front or the back
   tsv-select -f 1 --rest first file.tsv  # Move field 1 to the end
   tsv-select -H -f date --rest last      # Move 'date' field to the front

   # Read multiple files, keep the header from only the first
   tsv-select data*.tsv -H --fields 1,2,4-7,14

Use '--help-verbose' for detailed information. Use '--help-fields' for
details about field lists and field names.

Options:
EOS";

immutable helpTextVerbose = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields can be specified by field number or, for files with header lines,
by field name. Use '--H|header' to enable selection by name. This also
manages header lines from multiple files, retaining only the first header.

Field numbers start with one. The field list is comma separated. Fields
can be repeated and ranges can be used. Wildcards can be used when
specifying fields by name, and escapes can be used to specify fields names
containing special characters. Run '--help-fields' for details.

Fields can be excluded using '--e|exclude'. All fields not excluded are
output. Fields not included in the '--f|fields' option can be selected as
a group using '--r|rest'. '--f|fields' and '--r|rest' can be used with
'--e|exclude' to reorder non-excluded fields.

Examples:

   # Keep the first field from two files
   tsv-select -f 1 file1.tsv file2.tsv

   # Keep fields 1 and 2, retaining the header from only the first file
   tsv-select -H -f 1,2 file1.tsv file2.tsv

   # Keep the 'time' field
   tsv-select -H -f time file1.tsv

   # Keep all fields ending '_date' or '_time'
   tsv-select -H -f '*_date,*_time' file.tsv

   # Drop all the '*_time' fields
   tsv-select -H --exclude '*_time' file.tsv

   # Field reordering and field ranges
   tsv-select -f 3,2,1 file.tsv
   tsv-select -f 1,4-7,11 file.tsv
   tsv-select -f 1,7-4,11 file.tsv

   # Repeating fields
   tsv-select -f 1,2,1 file.tsv
   tsv-select -f 1-3,3-1 file.tsv

   # Move fields to the front
   tsv-select -f 5 --rest last file.tsv
   tsv-select -H -f Date,Time --rest last file.tsv

   # Move fields to the end
   tsv-select -f 4,5 --rest first file.tsv
   tsv-select -f '*_time' --rest first file.tsv

   # Move field 2 to the front and drop fields 10-15
   tsv-select -f 2 -e 10-15 file.tsv

   # Move field 2 to the end, dropping fields 10-15
   tsv-select -f 2 -rest first -e 10-15 file.tsv

Use '--help-fields' for detailed help on field lists.

Notes:
* One of '--f|fields' or '--e|exclude' is required.
* Fields specified by '--f|fields' and '--e|exclude' cannot overlap.
* When '--f|fields' and '--e|exclude' are used together, the effect is to
  specify '--rest last'. This can be overridden by using '--rest first'.
* Each input line must be long enough to contain all fields specified
  with '--f|fields'. This is not necessary for '--e|exclude' fields.
* Specifying names of fields containing special characters may require
  escaping the special characters. See '--help-fields' for details.

Options:
EOS";

/** Container for command line options.
 */
struct TsvSelectOptions
{
    import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange;

    // The allowed values for the --rest option.
    enum RestOption { none, first, last};

    string programName;                 /// Program name
    ByLineSourceRange!() inputSources;  /// Input Files
    bool hasHeader = false;             /// --H|header
    char delim = '\t';                  /// --d|delimiter
    RestOption restArg;                 /// --rest first|last (none is hidden default)
    size_t[] fields;                    /// Derived from --f|fields
    bool[] excludedFieldsTable;         /// Derived. Lookup table for excluded fields.

    /** Process command line arguments (getopt cover).
     *
     * processArgs calls getopt to process command line arguments. It does any additional
     * validation and parameter derivations needed. A tuple is returned. First value is
     * true if command line arguments were successfully processed and execution should
     * continue, or false if an error occurred or the user asked for help. If false, the
     * second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : any, each, maxElement;
        import std.array : split;
        import std.conv : to;
        import std.format : format;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist;
        import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

        bool helpVerbose = false;         // --help-verbose
        bool helpFields = false;          // --help-fields
        bool versionWanted = false;       // --V|version
        string fieldsArg;                 // --f|fields
        string excludedFieldsArg;         // --e|exclude

        string fieldsOptionString = "f|fields";
        string excludedFieldsOptionString = "e|exclude";

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",
                " Print more detailed help.",
                &helpVerbose,

                "help-fields",
                " Print help on specifying fields.",
                &helpFields,

                std.getopt.config.caseSensitive,
                "H|header",
                " Treat the first line of each file as a header.",
                &hasHeader,
                std.getopt.config.caseInsensitive,

                fieldsOptionString,
                " Fields to retain. Fields are output in the order listed.",
                &fieldsArg,

                excludedFieldsOptionString,
                " Fields to exclude.",
                &excludedFieldsArg,

                "r|rest",
                "first|last Output location for fields not included in '--f|fields'.",
                &restArg,

                "d|delimiter",
                "CHR Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.)",
                &delim,

                std.getopt.config.caseSensitive,
                "V|version",
                " Print version information and exit.",
                &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (helpFields)
            {
                writeln(fieldListHelpText);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-select"));
                return tuple(false, 0);
            }

            /* Remaining command line args are files. Use standard input if files
             * were not provided. Truncate cmdArgs to consume the arguments.
             */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;

            /* Validation and derivations - Do as much validation prior to header line
             * processing as possible (avoids waiting on stdin).
             *
             * Note: fields and excludedFields depend on header line processing, but
             * fieldsArg and excludedFieldsArg can be used to detect whether the
             * command line argument was specified.
             */

            enforce(!fieldsArg.empty || !excludedFieldsArg.empty,
                    "One of '--f|fields' or '--e|exclude' is required.");

            string[] headerFields;

            /* fieldListArgProcessing encapsulates the field list processing. It is
             * called prior to reading the header line if headers are not being used,
             * and after if headers are being used.
             */
            void fieldListArgProcessing()
            {
                if (!fieldsArg.empty)
                {
                    fields = fieldsArg
                        .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(
                            hasHeader, headerFields, fieldsOptionString)
                        .array;
                }

                size_t[] excludedFields;

                if (!excludedFieldsArg.empty)
                {
                    excludedFields = excludedFieldsArg
                        .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(
                            hasHeader, headerFields, excludedFieldsOptionString)
                        .array;
                }

                if (excludedFields.length > 0)
                {
                    /* Make sure selected and excluded fields do not overlap. */
                    foreach (e; excludedFields)
                    {
                        foreach (f; fields)
                        {
                            enforce(e != f, "'--f|fields' and '--e|exclude' have overlapping fields.");
                        }
                    }

                    /* '--exclude' changes '--rest' default to 'last'. */
                    if (restArg == RestOption.none) restArg = RestOption.last;

                    /* Build the excluded field lookup table.
                     *
                     * Note: Users won't have any reason to expect memory is allocated based
                     * on the max field number. However, users might pick arbitrarily large
                     * numbers when trimming fields. So, limit the max field number to something
                     * big but reasonable (more than 1 million). The limit can be raised if use
                     * cases arise.
                     */
                    size_t maxExcludedField = excludedFields.maxElement;
                    size_t maxAllowedExcludedField = 1024 * 1024;

                    enforce(maxExcludedField < maxAllowedExcludedField,
                            format("Maximum allowed '--e|exclude' field number is %d.",
                                   maxAllowedExcludedField));

                    excludedFieldsTable.length = maxExcludedField + 1;  // Initialized to false
                    foreach (e; excludedFields) excludedFieldsTable[e] = true;
                }
            }

            if (!hasHeader) fieldListArgProcessing();

            /*
             * Create the byLineSourceRange and perform header line processing.
             */
            inputSources = byLineSourceRange(filepaths);

            if (hasHeader)
            {
                if (!inputSources.front.byLine.empty)
                {
                    throwIfWindowsNewlineOnUnix(inputSources.front.byLine.front, inputSources.front.name, 1);
                    headerFields = inputSources.front.byLine.front.split(delim).to!(string[]);
                }

                fieldListArgProcessing();
            }

        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

/** Main program.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvSelectOptions cmdopt;
    const r = cmdopt.processArgs(cmdArgs);
    if (!r[0]) return r[1];
    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }
    try
    {
        /* Invoke the tsvSelect template matching the --rest option chosen. Option args
         * are removed by command line processing (getopt). The program name and any files
         * remain. Pass the files to tsvSelect.
         */
        final switch (cmdopt.restArg)
        {
        case TsvSelectOptions.RestOption.none:
            tsvSelect!(RestLocation.none)(cmdopt);
            break;
        case TsvSelectOptions.RestOption.first:
            tsvSelect!(RestLocation.first)(cmdopt);
            break;
        case TsvSelectOptions.RestOption.last:
            tsvSelect!(RestLocation.last)(cmdopt);
            break;
        }
    }
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        return 1;
    }

    return 0;
}

// tsvSelect

/** Enumeration of the different specializations of the tsvSelect template.
 *
 * RestLocation is logically equivalent to the TsvSelectOptions.RestOption enum. It
 * is used by main to choose the appropriate tsvSelect template instantiation to call. It
 * is distinct from the TsvSelectOptions enum to separate it from the end-user UI. The
 * TsvSelectOptions version specifies the text of allowed values in command line arguments.
 */
enum RestLocation { none, first, last };

/** tsvSelect does the primary work of the tsv-select program.
 *
 * Input is read line by line, extracting the listed fields and writing them out in the order
 * specified. An exception is thrown on error.
 *
 * This function is templatized with instantiations for the different --rest options. This
 * avoids repeatedly running the same if-tests inside the inner loop. The main function
 * instantiates this function three times, once for each of the --rest options. It results
 * in a larger program, but is faster. Run-time improvements of 25% were measured compared
 * to the non-templatized version. (Note: 'cte' stands for 'compile time evaluation'.)
| */ | |void tsvSelect(RestLocation rest)(ref TsvSelectOptions cmdopt) |{ | import tsv_utils.common.utils: BufferedOutputRange, ByLineSourceRange, | InputFieldReordering, throwIfWindowsNewlineOnUnix; | import std.algorithm: splitter; | import std.array : appender, Appender; | import std.format: format; | import std.range; | | // Ensure the correct template instantiation was called. | static if (rest == RestLocation.none) 54| assert(cmdopt.restArg == TsvSelectOptions.RestOption.none); | else static if (rest == RestLocation.first) 26| assert(cmdopt.restArg == TsvSelectOptions.RestOption.first); | else static if (rest == RestLocation.last) 62| assert(cmdopt.restArg == TsvSelectOptions.RestOption.last); | else | static assert(false, "rest template argument does not match cmdopt.restArg."); | | /* Check that the input files were setup as expected. Should at least have one | * input, stdin if nothing else, and newlines removed from the byLine range. | */ 142| assert(!cmdopt.inputSources.empty); | static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator))); | | /* The algorithm here assumes RestOption.none is not used with --exclude-fields. */ 192| assert(cmdopt.excludedFieldsTable.length == 0 || rest != RestLocation.none); | | /* InputFieldReordering copies select fields from an input line to a new buffer. | * The buffer is reordered in the process. | */ 142| auto fieldReordering = new InputFieldReordering!char(cmdopt.fields); | | /* Fields not on the --fields list are added to a separate buffer so they can be | * output as a group (the --rest option). This is done using an 'Appender', which | * is faster than the ~= operator. The Appender is passed a GC allocated buffer | * that grows as needed and is reused for each line. Typically it'll grow only | * on the first line. 
| */ | static if (rest != RestLocation.none) | { 88| auto leftOverFieldsAppender = appender!(char[][]); | } | | /* BufferedOutputRange (from common/utils.d) is a performance improvement over | * writing directly to stdout. | */ 284| auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); | | /* Read each input file (or stdin) and iterate over each line. | */ 1195| foreach (fileNum, inputStream; cmdopt.inputSources.enumerate) | { 17575| foreach (lineNum, line; inputStream.byLine.enumerate(1)) | { 3624| if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum); | 3657| if (lineNum == 1 && fileNum > 0 && cmdopt.hasHeader) | { 34| continue; // Drop the header line from all but the first file. | } | | static if (rest != RestLocation.none) | { 764| leftOverFieldsAppender.clear; | | /* Track the field location in the line. This enables bulk appending | * after the last specified field has been processed. | */ 764| size_t nextFieldStart = 0; | } | 3406| fieldReordering.initNewLine; | 78164| foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) | { | static if (rest == RestLocation.none) | { 13454| fieldReordering.processNextField(fieldIndex, fieldValue); 16095| if (fieldReordering.allFieldsFilled) break; | } | else | { | /* Processing with 'rest' fields. 
States: | * - Excluded fields and specified fields remain | * - Only specified fields remain | * - Only excluded fields remain | */ | 2144| nextFieldStart += fieldValue.length + 1; 2144| bool excludedFieldsRemain = fieldIndex < cmdopt.excludedFieldsTable.length; 3306| immutable isExcluded = excludedFieldsRemain && cmdopt.excludedFieldsTable[fieldIndex]; | 2144| if (!isExcluded) | { 1504| immutable numMatched = fieldReordering.processNextField(fieldIndex, fieldValue); | 2233| if (numMatched == 0) leftOverFieldsAppender.put(fieldValue); | } 640| else if (fieldIndex + 1 == cmdopt.excludedFieldsTable.length) | { 354| excludedFieldsRemain = false; | } | 3549| if (fieldReordering.allFieldsFilled && !excludedFieldsRemain) | { | /* Processed all specified fields. Bulk append any fields | * remaining on the line. Cases: | * - Current field is last field: | */ 678| if (nextFieldStart <= line.length) | { 452| leftOverFieldsAppender.put(line[nextFieldStart .. $]); | } | 678| break; | } | } | } | | // Finished with all fields in the line. 3406| enforce(fieldReordering.allFieldsFilled, 1| format("Not enough fields in line. File: %s, Line: %s", | inputStream.name, lineNum)); | | // Write the re-ordered line. | | static if (rest == RestLocation.first) | { 234| if (leftOverFieldsAppender.data.length > 0) | { 197| bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim); 385| if (cmdopt.fields.length > 0) bufferedOutput.append(cmdopt.delim); | } | } | 3405| bufferedOutput.joinAppend(fieldReordering.outputFields, cmdopt.delim); | | static if (rest == RestLocation.last) | { 530| if (leftOverFieldsAppender.data.length > 0) | { 614| if (cmdopt.fields.length > 0) bufferedOutput.append(cmdopt.delim); 448| bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim); | } | } | 3405| bufferedOutput.appendln; | | /* Send the first line of the first file immediately. This helps detect | * errors quickly in multi-stage unix pipelines. 
Note that tsv-select may | * have been sent one line from an upstream process, usually a header line. | */ 3688| if (lineNum == 1 && fileNum == 0) bufferedOutput.flush; | } | } |} tsv-select/src/tsv_utils/tsv-select.d is 100% covered <<<<<< EOF # path=./tsv-split-src-tsv_utils-tsv-split.lst |/** |Command line tool for splitting a files (or files) into multiple output files. |Several methods for splitting are available, including splitting by line count, |splitting by random assignment, and splitting by random assignment based on |key fields. | |Copyright (c) 2020, eBay Inc. |Initially written by Jon Degenhardt | |License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) |*/ |module tsv_utils.tsv_split; | |import std.exception : enforce; |import std.format : format; |import std.range; |import std.stdio; |import std.typecons : tuple, Flag; | |static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; | |version(unittest) |{ | // When running unit tests, use main from -main compiler switch. |} |else |{ | /** Main program. | * | * Invokes command line argument processing and calls tsvSplit to do the real | * work. Errors occurring during processing are caught and reported to the user. | */ | int main(string[] cmdArgs) | { | /* When running in DMD code coverage mode, turn on report merging. */ | version(D_Coverage) version(DigitalMars) | { | import core.runtime : dmd_coverSetMerge; 210| dmd_coverSetMerge(true); | } | 210| TsvSplitOptions cmdopt; 210| const r = cmdopt.processArgs(cmdArgs); 240| if (!r[0]) return r[1]; | version(LDC_Profile) | { | import ldc.profile : resetAll; | resetAll(); | } | try | { 180| tsvSplit(cmdopt); | } | catch (Exception exc) | { 7| stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 7| return 1; | } 173| return 0; | } |} | |immutable helpText = q"EOS |Synopsis: tsv-split [options] [file...] | |Split input lines into multiple output files. 
There are three modes of |operation: | |* Fixed number of lines per file (--l|lines-per-file NUM): Each input | block of NUM lines is written to a new file. Similar to Unix 'split'. | |* Random assignment (--n|num-files NUM): Each input line is written to a | randomly selected output file. Random selection is from NUM files. | |* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS): | Input lines are written to output files using fields as a key. Each | unique key is randomly assigned to one of NUM output files. All lines | with the same key are written to the same file. | |By default, files are written to the current directory and have names |of the form 'part_NNN', with 'NNN' being a number and |being the extension of the first input file. If the input file is |'file.txt', the names will take the form 'part_NNN.txt'. The output |directory and file names are customizable. | |Fields are specified using field number or field name. Field names |require that the input file has a header line. | |Use '--help-verbose' for more detailed information. | |Options: |EOS"; | |immutable helpTextVerbose = q"EOS |Synopsis: tsv-split [options] [file...] | |Split input lines into multiple output files. There are three modes of |operation: | |* Fixed number of lines per file (--l|lines-per-file NUM): Each input | block of NUM lines is written to a new file. Similar to Unix 'split'. | |* Random assignment (--n|num-files NUM): Each input line is written to a | randomly selected output file. Random selection is from NUM files. | |* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS): | Input lines are written to output files using fields as a key. Each | unique key is randomly assigned to one of NUM output files. All lines | with the same key are written to the same file. 
| |Output files: By default, files are written to the current directory and |have names of the form 'part_NNN', with 'NNN' being a number and | being the extension of the first input file. If the input file is |'file.txt', the names will take the form 'part_NNN.txt'. The suffix is |empty when reading from standard input. The numeric part defaults to 3 |digits for '--l|lines-per-files'. For '--n|num-files' enough digits are |used so all filenames are the same length. The output directory and file |names are customizable. | |Header lines: There are two ways to handle input with headers: write a |header to all output files (--H|header), or exclude headers from all |output files ('--I|header-in-only'). The best choice depends on the |follow-up processing. All tsv-utils tools support header lines in multiple |input files, but many other tools do not. For example, GNU parallel works |best on files without header lines. | |Random assignment (--n|num-files): Random distribution of records to a set |of files is a common task. When data fits in memory the preferred approach |is usually to shuffle the data and split it into fixed sized blocks. E.g. |'tsv-sample data.tsv | tsv-split -l NUM'. However, alternate approaches |are needed when data is too large for convenient shuffling. tsv-split's |random assignment feature is useful in this case. Each input line is |written to a randomly selected output file. Note that output files will |have similar but not identical numbers of records. | |Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS): This |splits a data set into multiple files sharded by key. All lines with the |same key are written to the same file. This partitioning enables parallel |computation based on the key. For example, statistical calculation |('tsv-summarize --group-by') or duplicate removal ('tsv-uniq --fields'). |These operations can be parallelized using tools like GNU parallel, which |simplifies concurrent operations on multiple files. 
Fields are specified |using field number or field name. Field names require that the input file |has a header line. Use '--help-fields' for details about field names. | |Random seed: By default, each tsv-split invocation using random assignment |or random assignment by key produces different assignments to the output |files. Using '--s|static-seed' changes this so multiple runs produce the |same assignments. This works by using the same random seed each run. The |seed can be specified using '--v|seed-value'. | |Appending to existing files: By default, an error is triggered if an |output file already exists. '--a|append' changes this so that lines are |appended to existing files. (Header lines are not appended to files with |data.) This is useful when adding new data to files created by a previous |tsv-split run. Random assignment should use the same '--n|num-files' value |each run, but different random seeds (avoid '--s|static-seed'). Random |assignment by key should use the same '--n|num-files', '--k|key-fields', |and seed ('--s|static-seed' or '--v|seed-value') each run. | |Max number of open files: Random assignment and random assignment by key |are dramatically faster when all output files are kept open. However, |keeping a large numbers of open files can bump into system limits or limit |resources available to other processes. By default, tsv-split uses up to |4096 open files or the system per-process limit, whichever is smaller. |This can be changed using '--max-open-files', though it cannot be set |larger than the system limit. The system limit varies considerably between |systems. On many systems it is unlimited. On MacOS it is often set to 256. |Use Unix 'ulimit' to display and modify the limits: |* 'ulimit -n' - Show the "soft limit". The per-process maximum. |* 'ulimit -Hn' - Show the "hard limit". The max allowed soft limit. |* 'ulimit -Sn NUM' - Change the "soft limit" to NUM. 
| |Examples: | | # Split a 10 million line file into 1000 files, 10,000 lines each. | # Output files are part_000.txt, part_001.txt, ... part_999.txt. | tsv-split data.txt --lines-per-file 10000 | | # Same as the previous example, but write files to a subdirectory. | tsv-split data.txt --dir split_files --lines-per-file 10000 | | # Split a file into 10,000 line files, writing a header line to each | tsv-split data.txt -H --lines-per-file 10000 | | # Same as the previous example, but dropping the header line. | tsv-split data.txt -I --lines-per-file 10000 | | # Randomly assign lines to 1000 files | tsv-split data.txt --num-files 1000 | | # Randomly assign lines to 1000 files while keeping unique entries | # from the 'url' field together. | tsv-split data.tsv -H -k url --num-files 1000 | | # Randomly assign lines to 1000 files. Later, randomly assign lines | # from a second data file to the same output files. | tsv-split data1.tsv -n 1000 | tsv-split data2.tsv -n 1000 --append | | # Randomly assign lines to 1000 files using field 3 as a key. | # Later, add a second file to the same output files. | tsv-split data1.tsv -n 1000 -k 3 --static-seed | tsv-split data2.tsv -n 1000 -k 3 --static-seed --append | | # Change the system per-process open file limit for one command. | # The parens create a sub-shell. The current shell is not changed. | ( ulimit -Sn 1000 && tsv-split --num-files 1000 data.txt ) | |Options: |EOS"; | |/** Container for command line options and derived data. | * | * TsvSplitOptions handles several aspects of command line options. On the input side, | * it defines the command line options available, performs validation, and sets up any | * derived state based on the options provided. These activities are handled by the | * processArgs() member. | * | * Once argument processing is complete, TsvSplitOptions is used as a container | * holding the specific processing options used by the splitting algorithms. 
| */ |struct TsvSplitOptions |{ | import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader; | | enum invalidFileSuffix = "///////"; | | string programName; /// Program name | InputSourceRange inputSources; /// Input files | bool headerInOut = false; /// --H|header | bool headerIn = false; /// --I|header-in-only | size_t linesPerFile = 0; /// --l|lines-per-file | uint numFiles = 0; /// --n|num-files | size_t[] keyFields; /// Derived: --k|key-fields | string dir; /// --dir | string prefix = "part_"; /// --prefix | string suffix = invalidFileSuffix; /// --suffix | uint digitWidth = 0; /// --w|digit-width | bool appendToExistingFiles = false; /// --a|append | bool staticSeed = false; /// --s|static-seed | uint seedValueOptionArg = 0; /// --v|seed-value | char delim = '\t'; /// --d|delimiter | uint maxOpenFilesArg = 0; /// --max-open-files | bool hasHeader = false; /// Derived. True if either '--H|header' or '--I|header-in-only' is set. | bool keyIsFullLine = false; /// Derived. True if '--f|fields 0' is specfied. | bool usingUnpredictableSeed = true; /// Derived from --static-seed, --seed-value | uint seed = 0; /// Derived from --static-seed, --seed-value | uint maxOpenOutputFiles; /// Derived. | | /** Process tsv-split command line arguments. | * | * Defines the command line options, performs validation, and derives additional | * state. std.getopt.getopt is called to do the main option processing followed | * additional validation and derivation. | * | * Help text is printed to standard output if help was requested. Error text is | * written to stderr if invalid input is encountered. | * | * A tuple is returned. First value is true if command line arguments were | * successfully processed and execution should continue, or false if an error | * occurred or the user asked for help. If false, the second value is the | * appropriate exit code (0 or 1). 
| * | * Returning true (execution continues) means args have been validated and derived | * values calculated. Field indices will have been converted to zero-based. | */ | auto processArgs(ref string[] cmdArgs) | { | import std.algorithm : all, canFind, each, min; | import std.conv : to; | import std.file : exists, isDir; | import std.getopt; | import std.math : isNaN; | import std.path : baseName, expandTilde, extension, stripExtension; | import std.typecons : Yes, No; | import tsv_utils.common.fieldlist; | 727| bool helpVerbose = false; // --help-verbose 727| bool helpFields = false; // --help-fields 727| bool versionWanted = false; // --V|version 727| string keyFieldsArg; // --k|key-fields | 727| string keyFieldsOptionString = "k|key-fields"; | 1454| programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; | | try | { 727| arraySep = ","; // Use comma to separate values in command line options 727| auto r = getopt( | cmdArgs, | "help-verbose", " Print more detailed help.", &helpVerbose, | "help-fields", " Print help on specifying fields.", &helpFields, | | std.getopt.config.caseSensitive, | "H|header", " Input files have a header line. Write the header to each output file.", &headerInOut, | "I|header-in-only", " Input files have a header line. Do not write the header to output files.", &headerIn, | std.getopt.config.caseInsensitive, | | "l|lines-per-file", "NUM Number of lines to write to each output file (excluding the header line).", &linesPerFile, | "n|num-files", "NUM Number of output files to generate.", &numFiles, | | keyFieldsOptionString, | " Fields to use as key. Lines with the same key are written to the same output file. Use '--k|key-fields 0' to use the entire line as the key.", | &keyFieldsArg, | | "dir", "STR Directory to write to. Default: Current working directory.", &dir, | "prefix", "STR Filename prefix. Default: 'part_'", &prefix, | "suffix", "STR Filename suffix. Default: First input file extension. 
None for standard input.", &suffix, | "w|digit-width", "NUM Number of digits in filename numeric portion. Default: '--l|lines-per-file': 3. '--n|num-files': Chosen so filenames have the same length. '--w|digit-width 0' uses the default.", &digitWidth, | "a|append", " Append to existing files.", &appendToExistingFiles, | | "s|static-seed", " Use the same random seed every run.", &staticSeed, | | std.getopt.config.caseSensitive, | "v|seed-value", "NUM Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg, | std.getopt.config.caseInsensitive, | | "d|delimiter", "CHR Field delimiter.", &delim, | "max-open-files", "NUM Maximum open file handles to use. Min of 5 required.", &maxOpenFilesArg, | | std.getopt.config.caseSensitive, | "V|version", " Print version information and exit.", &versionWanted, | std.getopt.config.caseInsensitive, | ); | 727| if (r.helpWanted) | { 1| defaultGetoptPrinter(helpText, r.options); 1| return tuple(false, 0); | } 726| else if (helpVerbose) | { 1| defaultGetoptPrinter(helpTextVerbose, r.options); 1| return tuple(false, 0); | } 725| else if (helpFields) | { 1| writeln(fieldListHelpText); 1| return tuple(false, 0); | } 724| else if (versionWanted) | { | import tsv_utils.common.tsvutils_version; 2| writeln(tsvutilsVersionNotice("tsv-split")); 2| return tuple(false, 0); | } | | /* Remaining command line args are files. | */ 1444| string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"]; 722| cmdArgs.length = 1; | | /* Validation and derivations - Do as much validation prior to header line | * processing as possible (avoids waiting on stdin). | * | * Note: keyFields depends on header line processing, but keyFieldsArg | * can be used to detect whether the command line argument was specified. 
| */ | 772| enforce(!(headerInOut && headerIn), 1| "Use only one of '--H|header' and '--I|header-in-only'."); | 1393| hasHeader = headerInOut || headerIn; | 884| enforce(linesPerFile != 0 || numFiles != 0, 3| "Either '--l|lines-per-file' or '--n|num-files' is required."); | 1276| enforce(linesPerFile == 0 || numFiles == 0, 2| "'--l|lines-per-file' and '--n|num-files' cannot be used together."); | 1272| enforce(linesPerFile == 0 || keyFieldsArg.length == 0, 1| "'--l|lines-per-file' and '--k|key-fields' cannot be used together."); | 717| enforce(numFiles != 1, "'--n|num-files must be two or more."); | 713| if (!dir.empty) | { 504| dir = dir.expandTilde; 507| enforce(dir.exists, format("Directory does not exist: --dir '%s'", dir)); 501| enforce(dir.isDir, format("Path is not a directory: --dir '%s'", dir)); | } | | /* Seed. */ | import std.random : unpredictableSeed; | 1360| usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0); | 1303| if (usingUnpredictableSeed) seed = unpredictableSeed; 174| else if (seedValueOptionArg != 0) seed = seedValueOptionArg; 120| else if (staticSeed) seed = 2438424139; 0000000| else assert(0, "Internal error, invalid seed option states."); | | /* Maximum number of open files. Mainly applies when --num-files is used. | * | * Derive maxOpenOutputFiles. Inputs: | * - Internal default limit: 4096. This is a somewhat conservative setting. | * - rlimit open files limit. Defined by '$ ulimit -n'. | * - '--max-open-files' (maxOpenFilesArg). This adjusts the internal limit, | * but only up to the rlimit value. | * - Four open files are reserved for stdin, stdout, stderr, and one input | * file. 
| */ | 710| immutable uint internalDefaultMaxOpenFiles = 4096; 710| immutable uint numReservedOpenFiles = 4; 710| immutable uint rlimitOpenFilesLimit = rlimitCurrOpenFilesLimit(); | 724| enforce(maxOpenFilesArg == 0 || maxOpenFilesArg > numReservedOpenFiles, 1| format("'--max-open-files' must be at least %d.", | numReservedOpenFiles + 1)); | 709| enforce(maxOpenFilesArg <= rlimitOpenFilesLimit, 1| format("'--max-open-files' value (%d) greater current system limit (%d)." ~ | "\nRun 'ulimit -n' to see the soft limit." ~ | "\nRun 'ulimit -Hn' to see the hard limit." ~ | "\nRun 'ulimit -Sn NUM' to change the soft limit.", | maxOpenFilesArg, rlimitOpenFilesLimit)); | 708| enforce(rlimitOpenFilesLimit > numReservedOpenFiles, 0000000| format("System open file limit too small. Current value: %d. Must be %d or more." ~ | "\nRun 'ulimit -n' to see the soft limit." ~ | "\nRun 'ulimit -Hn' to see the hard limit." ~ | "\nRun 'ulimit -Sn NUM' to change the soft limit.", | rlimitOpenFilesLimit, numReservedOpenFiles + 1)); | 708| immutable uint openFilesLimit = | (maxOpenFilesArg != 0) 12| ? maxOpenFilesArg 696| : min(internalDefaultMaxOpenFiles, rlimitOpenFilesLimit); | 708| assert(openFilesLimit > numReservedOpenFiles); | 708| maxOpenOutputFiles = openFilesLimit - numReservedOpenFiles; | | /* Suffix - If not provided, use the extension of the first input file. | * No suffix if reading from standard input. | */ 1398| if (suffix == invalidFileSuffix) suffix = filepaths[0].extension; | | /* Ensure forward slash is not included in the filename prefix and suffix. | * Forward slash is an invalid Unix filename character. However, open file | * calls could match a directory path, resulting in unintended file | * creation. | * | * The other invalid filename character on Unix is the NULL character. | * However, the NULL character cannot be entered via Unix command lines, | * so there is no need to test for it explicitly. 
| */ 708| enforce(!prefix.canFind('/'), 1| "'--prefix' cannot contain forward slash characters. Use '--dir' to specify an output directory."); | 707| enforce(!suffix.canFind('/'), 1| "'--suffix' cannot contain forward slash characters. Use '--dir' to specify an output directory."); | | /* Digit width - If not specified, or specified as zero, the width is | * determined by the number of files for --num-files, or defaulted to 3 | * for --lines-per-file. | */ 706| if (digitWidth == 0) | { 206| if (numFiles > 0) | { 146| digitWidth = 1; 146| uint n = numFiles - 1; 257| while (n >= 10) | { 111| n /= 10; 111| ++digitWidth; | } | } | else | { 60| digitWidth = 3; | } | } 706| assert(digitWidth != 0); | | /* | * Create the inputSourceRange and perform header line processing. | */ 1412| ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader; 706| inputSources = inputSourceRange(filepaths, readHeader); | 703| string[] headerFields; | 795| if (hasHeader) headerFields = inputSources.front.header.split(delim).to!(string[]); | 703| if (!keyFieldsArg.empty) | { 69| keyFields = | keyFieldsArg | .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) | (hasHeader, headerFields, keyFieldsOptionString) | .array; | } | 698| if (keyFields.length > 0) | { 114| if (keyFields.length == 1 && keyFields[0] == 0) | { 8| keyIsFullLine = true; | } | else | { 128| enforce(keyFields.all!(x => x != 0), 1| "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields."); | 126| keyFields.each!((ref x) => --x); // Convert to zero-based indexing. | } | } | | } | catch (Exception exc) | { 25| stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 25| return tuple(false, 1); | } 697| return tuple(true, 0); | } |} | |/* TsvSplitOptions unit tests (command-line argument processing). | * | * Basic tests. Many cases are covered in executable tests, including all error cases, | * as errors write to stderr. 
| */ |unittest |{ | import tsv_utils.common.unittest_utils; // tsv unit test helpers, from common/src/. | import std.conv : to; | import std.file : mkdir, rmdirRecurse; | import std.path : buildPath; | | /* A dummy file is used so we don't have to worry about the cases where command | * line processing might open a file. Don't want to use standard input for this, | * at least in cases where it might try to read to get the header line. | */ 1| auto testDir = makeUnittestTempDir("tsv_split_bylinecount"); 1| scope(exit) testDir.rmdirRecurse; | 1| string somefile_txt = buildPath(testDir, "somefile.txt"); 1| somefile_txt.File("w").writeln("Hello World!"); | | { 1| auto args = ["unittest", "--lines-per-file", "10", somefile_txt]; 1| TsvSplitOptions cmdopt; 1| const r = cmdopt.processArgs(args); | 1| assert(cmdopt.linesPerFile == 10); 1| assert(cmdopt.keyFields.empty); 1| assert(cmdopt.numFiles == 0); 1| assert(cmdopt.hasHeader == false); | } | { 1| auto args = ["unittest", "--num-files", "20", somefile_txt]; 1| TsvSplitOptions cmdopt; 1| const r = cmdopt.processArgs(args); | 1| assert(cmdopt.linesPerFile == 0); 1| assert(cmdopt.keyFields.empty); 1| assert(cmdopt.numFiles == 20); 1| assert(cmdopt.hasHeader == false); | } | { 1| auto args = ["unittest", "-n", "5", "--key-fields", "1-3", somefile_txt]; 1| TsvSplitOptions cmdopt; 1| const r = cmdopt.processArgs(args); | 1| assert(cmdopt.linesPerFile == 0); 1| assert(cmdopt.keyFields == [0, 1, 2]); 1| assert(cmdopt.numFiles == 5); 1| assert(cmdopt.hasHeader == false); 1| assert(cmdopt.keyIsFullLine == false); | } | { 1| auto args = ["unittest", "-n", "5", "-k", "0", somefile_txt]; 1| TsvSplitOptions cmdopt; 1| const r = cmdopt.processArgs(args); | 1| assert(cmdopt.linesPerFile == 0); 1| assert(cmdopt.numFiles == 5); 1| assert(cmdopt.hasHeader == false); 1| assert(cmdopt.keyIsFullLine == true); | } | { 1| auto args = ["unittest", "-n", "2", "--header", somefile_txt]; 1| TsvSplitOptions cmdopt; 1| const r = 
cmdopt.processArgs(args); | 1| assert(cmdopt.headerInOut == true); 1| assert(cmdopt.hasHeader == true); 1| assert(cmdopt.headerIn == false); | } | { 1| auto args = ["unittest", "-n", "2", "--header-in-only", somefile_txt]; 1| TsvSplitOptions cmdopt; 1| const r = cmdopt.processArgs(args); | 1| assert(cmdopt.headerInOut == false); 1| assert(cmdopt.hasHeader == true); 1| assert(cmdopt.headerIn == true); | } | | static void testSuffix(string[] args, string expectedSuffix) | { 9| TsvSplitOptions cmdopt; 9| auto savedArgs = args.to!string; 9| const r = cmdopt.processArgs(args); | 9| assert(r[0], format("[testSuffix] cmdopt.processArgs(%s) returned false.", savedArgs)); 9| assert(cmdopt.suffix == expectedSuffix, | format("[testSuffix] Incorrect cmdopt.suffix. Expected: '%s', Actual: '%s'\n cmdopt.processArgs(%s)", | expectedSuffix, cmdopt.suffix, savedArgs)); | } | | /* In these tests, don't use headers and when files are listed, use 'somefile_txt' first. | * This makes sure there is no attempt to read standard input and that there won't be an | * open failure trying to find a file. 
| */ 1| testSuffix(["unittest", "-n", "2"], ""); 1| testSuffix(["unittest", "-n", "2", "--", "-"], ""); 1| testSuffix(["unittest", "-n", "2", "--suffix", "_123"], "_123"); 1| testSuffix(["unittest", "-n", "2", somefile_txt], ".txt"); 1| testSuffix(["unittest", "-n", "2", somefile_txt, "anotherfile.pqr"], ".txt"); 1| testSuffix(["unittest", "-n", "2", "--suffix", ".X", somefile_txt, "anotherfile.pqr"], ".X"); 1| testSuffix(["unittest", "-n", "2", "--suffix", "", somefile_txt], ""); 1| testSuffix(["unittest", "-n", "2", "--", "-", somefile_txt], ""); 1| testSuffix(["unittest", "-n", "2", "--", somefile_txt, "-"], ".txt"); | | static void testDigitWidth(string[] args, uint expected) | { 12| TsvSplitOptions cmdopt; 12| auto savedArgs = args.to!string; 12| const r = cmdopt.processArgs(args); | 12| assert(r[0], format("[testDigitWidth] cmdopt.processArgs(%s) returned false.", savedArgs)); 12| assert(cmdopt.digitWidth == expected, | format("[testDigitWidth] Incorrect cmdopt.digitWidth. Expected: %d, Actual: %d\n cmdopt.processArgs(%s)", | expected, cmdopt.digitWidth, savedArgs)); | } | 1| testDigitWidth(["unittest", "-n", "2", somefile_txt], 1); 1| testDigitWidth(["unittest", "-n", "2", "--digit-width" , "0", somefile_txt], 1); 1| testDigitWidth(["unittest", "-n", "10", somefile_txt], 1); 1| testDigitWidth(["unittest", "-n", "11", somefile_txt], 2); 1| testDigitWidth(["unittest", "-n", "555", somefile_txt], 3); 1| testDigitWidth(["unittest", "-n", "555", "--digit-width" , "2", somefile_txt], 2); 1| testDigitWidth(["unittest", "-n", "555", "--digit-width" , "4", somefile_txt], 4); 1| testDigitWidth(["unittest", "-l", "10", somefile_txt], 3); 1| testDigitWidth(["unittest", "-l", "10000", somefile_txt], 3); 1| testDigitWidth(["unittest", "-l", "10000", "--digit-width", "0", somefile_txt], 3); 1| testDigitWidth(["unittest", "-l", "10000", "--digit-width", "1", somefile_txt], 1); 1| testDigitWidth(["unittest", "-l", "10000", "--digit-width", "5", somefile_txt], 5); |} | |/** 
Get the rlimit current number of open files the process is allowed. | * | * This routine returns the current soft limit on the number of open files the process | * is allowed. This is the number returned by the command: '$ ulimit -n'. | * | * This routine translates this value to a 'uint', as tsv-split uses 'uint' for | * tracking output files. The rlimit 'rlim_t' type is usually 'ulong' or 'long'. | * RLIM_INFINITY and any value larger than 'uint.max' is translated to 'uint.max'. | * | * An exception is thrown if call to 'getrlimit' fails. | */ |uint rlimitCurrOpenFilesLimit() |{ | import core.sys.posix.sys.resource : | rlim_t, rlimit, getrlimit, RLIMIT_NOFILE, RLIM_INFINITY, RLIM_SAVED_CUR; | import std.conv : to; | 710| uint currOpenFileLimit = uint.max; | 710| rlimit rlimitMaxOpenFiles; | 710| enforce(getrlimit(RLIMIT_NOFILE, &rlimitMaxOpenFiles) == 0, 0000000| "Internal error: getrlimit call failed"); | 710| if (rlimitMaxOpenFiles.rlim_cur != RLIM_INFINITY && 710| rlimitMaxOpenFiles.rlim_cur != RLIM_SAVED_CUR && 710| rlimitMaxOpenFiles.rlim_cur >= 0 && 710| rlimitMaxOpenFiles.rlim_cur <= uint.max) | { 710| currOpenFileLimit = rlimitMaxOpenFiles.rlim_cur.to!uint; | } | 710| return currOpenFileLimit; |} | |/** Invokes the proper split routine based on the command line arguments. | * | * This routine is the top-level control after command line argument processing is | * done. It's primary job is to set up data structures and invoke the correct | * processing routine based on the command line arguments. | */ |void tsvSplit(ref TsvSplitOptions cmdopt) |{ | /* Check that the input files were setup as expected. Should at least have one | * input, stdin if nothing else. */ 180| assert(!cmdopt.inputSources.empty); | 180| if (cmdopt.linesPerFile != 0) | { 57| splitByLineCount(cmdopt); | } | else | { | /* Randomly distribute input lines to a specified number of files. 
*/ | 246| auto outputFiles = | SplitOutputFiles(cmdopt.numFiles, cmdopt.dir, cmdopt.prefix, cmdopt.suffix, | cmdopt.digitWidth, cmdopt.headerInOut, cmdopt.maxOpenOutputFiles, | cmdopt.inputSources.front.header); | 123| if (!cmdopt.appendToExistingFiles) | { 111| string existingFile = outputFiles.checkIfFilesExist; 111| enforce(existingFile.length == 0, 2| format("One or more output files already exist. Use '--a|append' to append to existing files. File: '%s'.", | existingFile)); | } | 121| if (cmdopt.keyFields.length == 0) | { 61| splitLinesRandomly(cmdopt, outputFiles); | } | else | { 60| splitLinesByKey(cmdopt, outputFiles); | } | } |} | |/** A SplitOutputFiles struct holds a collection of output files. | * | * This struct manages a collection of output files used when writing to multiple | * files at once. This includes constructing filenames, opening and closing files, | * and writing data and header lines. | * | * Both random assignment (splitLinesRandomly) and random assignment by key | * (splitLinesByKey) use a SplitOutputFiles struct to manage output files. | * | * The main properties of the output file set are specified in the constuctor. The | * exception is the header line. This is not known until the first input file is | * read, so it is specified in a separate 'setHeader' call. | * | * Individual output files are written to based on their zero-based index in the | * output collection. The caller selects the output file number to write to and | * calls 'writeDataLine' to write a line. The header is written if needed. 
| */ |struct SplitOutputFiles |{ | import std.conv : to; | import std.file : exists; | import std.path : buildPath; | import std.stdio : File; | | static struct OutputFile | { | string filename; | File ofile; | bool hasData; | bool isOpen; // Track separately due to https://github.com/dlang/phobos/pull/7397 | } | | private uint _numFiles; | private bool _writeHeaders; | private uint _maxOpenFiles; | | private OutputFile[] _outputFiles; | private uint _numOpenFiles = 0; | private string _header; | 123| this(uint numFiles, string dir, string filePrefix, string fileSuffix, | uint fileDigitWidth, bool writeHeaders, uint maxOpenFiles, string header) | { 123| assert(numFiles >= 2); 123| assert(maxOpenFiles >= 1); | 123| _numFiles = numFiles; 123| _writeHeaders = writeHeaders; 123| _maxOpenFiles = maxOpenFiles; 123| _header = header; | 123| _outputFiles.length = numFiles; | | /* Filename assignment. */ 20705| foreach (i, ref f; _outputFiles) | { 5084| f.filename = | buildPath(dir, format("%s%.*d%s", filePrefix, fileDigitWidth, i, fileSuffix)); | } | } | | /* Destructor ensures all files are closed. | * | * Note: A dual check on whether the file is open is made. This is to avoid a | * Phobos bug where std.File doesn't properly maintain the state of open files | * if the File.open call fails. See: https://github.com/dlang/phobos/pull/7397. | */ | ~this() | { 15621| foreach (ref f; _outputFiles) | { 5982| if (f.isOpen && f.ofile.isOpen) | { 898| assert(_numOpenFiles >= 1); | 898| f.ofile.close; 898| f.isOpen = false; 898| _numOpenFiles--; | } | } | } | | /* Check if any of the files already exist. | * | * Returns the empty string if none of the files exist. Otherwise returns the | * filename of the first existing file found. This is to facilitate error | * message generation. | */ | string checkIfFilesExist() | { 25271| foreach (f; _outputFiles) if (f.filename.exists) return f.filename; 109| return ""; | } | | /* Picks a random file to close. 
Used when the open file handle limit has been | * reached. | */ | private void closeSomeFile() | { | import std.random : uniform; 281| assert(_numOpenFiles > 0); | 281| immutable uint start = uniform(0, _numFiles); | 37515| foreach (i; cycle(iota(_numFiles), start).take(_numFiles)) | { 12505| if (_outputFiles[i].isOpen) | { 281| _outputFiles[i].ofile.close; 281| _outputFiles[i].isOpen = false; 281| _numOpenFiles--; | 281| return; | } | } | 0000000| assert(false, "[SplitOutputFiles.closeSomeFile]: Could not find file to close."); | } | | /* Write a line to the specified file number. | * | * A header is written to the file if headers are being written and this is the | * first data written to the file. | */ | void writeDataLine(uint fileNum, const char[] data) | { 3505| assert(fileNum < _numFiles); 3505| assert(fileNum < _outputFiles.length); 3505| assert(_numOpenFiles <= _maxOpenFiles); | 3505| OutputFile* outputFile = &_outputFiles[fileNum]; | 3505| if (!outputFile.isOpen) | { 1460| if (_numOpenFiles == _maxOpenFiles) closeSomeFile(); 1179| assert(_numOpenFiles < _maxOpenFiles); | 1179| outputFile.ofile = outputFile.filename.File("a"); 1179| outputFile.isOpen = true; 1179| _numOpenFiles++; | 1179| if (!outputFile.hasData) | { 1021| ulong filesize = outputFile.ofile.size; 1063| outputFile.hasData = (filesize > 0 && filesize != ulong.max); | } | } | 4645| if (_writeHeaders && !outputFile.hasData) outputFile.ofile.writeln(_header); | 3505| outputFile.ofile.writeln(data); 3505| outputFile.hasData = true; | } |} | |/** Write input lines to multiple files, randomly selecting an output file for each line. | */ |void splitLinesRandomly(ref TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles) |{ | import std.random : Random = Mt19937, uniform; | import tsv_utils.common.utils : bufferedByLine, InputSourceRange; | | /* inputSources must be an InputSourceRange and include at least stdin. 
*/ 61| assert(!cmdopt.inputSources.empty); | static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); | 61| auto randomGenerator = Random(cmdopt.seed); | | /* Process each line. */ 400| foreach (inputStream; cmdopt.inputSources) | { 1272| foreach (line; inputStream.file.bufferedByLine) | { 362| immutable uint outputFileNum = uniform(0, cmdopt.numFiles, randomGenerator); 362| outputFiles.writeDataLine(outputFileNum, line); | } | } |} | |/** Write input lines to multiple output files using fields as a random selection key. | * | * Each input line is written to an output file. The output file is chosen using | * fields as a key. Each unique key is assigned to a file. All lines having the | * same key are written to the same file. | */ |void splitLinesByKey(ref TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles) |{ | import std.algorithm : splitter; | import std.conv : to; | import std.digest.murmurhash; | import tsv_utils.common.utils : bufferedByLine, InputFieldReordering, | InputSourceRange, throwIfWindowsNewlineOnUnix; | 60| assert(cmdopt.keyFields.length > 0); | | /* inputSources must be an InputSourceRange and include at least stdin. */ 60| assert(!cmdopt.inputSources.empty); | static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); | 60| immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys. | | /* Create a mapping for the key fields. */ 120| auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields); | | /* Process each line. */ 120| immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 
2 : 1; 318| foreach (inputStream; cmdopt.inputSources) | { 99| if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); | 15852| foreach (fileLineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine)) | { 3178| if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); | | /* Murmurhash works by successively adding individual keys, then finalizing. | * Adding individual keys is simpler if the full-line-as-key and individual | * fields as keys cases are separated. | */ 3144| auto hasher = MurmurHash3!32(cmdopt.seed); | 3144| if (cmdopt.keyIsFullLine) | { 89| hasher.put(cast(ubyte[]) line); | } | else | { 3055| assert(keyFieldsReordering !is null); | | /* Gather the key field values and assemble the key. */ 3055| keyFieldsReordering.initNewLine; 40117| foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) | { 8023| keyFieldsReordering.processNextField(fieldIndex, fieldValue); 11077| if (keyFieldsReordering.allFieldsFilled) break; | } | 3055| enforce(keyFieldsReordering.allFieldsFilled, 1| format("Not enough fields in line. File: %s, Line: %s", | inputStream.name, fileLineNum)); | 25923| foreach (count, key; keyFieldsReordering.outputFields.enumerate) | { 4872| if (count > 0) hasher.put(delimArray); 3963| hasher.put(cast(ubyte[]) key); | } | } | 3143| hasher.finish; 3143| immutable uint outputFileNum = hasher.get % cmdopt.numFiles; 3143| outputFiles.writeDataLine(outputFileNum, line); | } | } |} | |/** Write input lines to multiple files, splitting based on line count. | * | * Note: readBufferSize is an argument primarily for unit test purposes. Normal uses | * should use the default value. 
| */ |void splitByLineCount(ref TsvSplitOptions cmdopt, const size_t readBufferSize = 1024L * 128L) |{ | import std.file : exists; | import std.path : buildPath; | import std.stdio : File; | import tsv_utils.common.utils : InputSourceRange; | 547| assert (readBufferSize > 0); 547| ubyte[] readBuffer = new ubyte[readBufferSize]; | | /* inputSources must be an InputSourceRange and include at least stdin. */ 547| assert(!cmdopt.inputSources.empty); | static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); | 1073| string header = !cmdopt.headerInOut ? "" : 21| cmdopt.inputSources.front.header(Yes.keepTerminator); 547| size_t nextOutputFileNum = 0; 1094| File outputFile; 547| string outputFileName; 547| bool isOutputFileOpen = false; // Open file status tracked separately due to phobos bugs 547| size_t outputFileRemainingLines; | | /* nextNewlineIndex finds the index of the next newline character. It is an | * alternative to std.algorithm.countUntil. Invoking 'find' directly results | * 'memchr' being used (faster). The current 'countUntil' implementation does | * forward to find, but the way it is done avoids the memchr call optimization. | */ | static long nextNewlineIndex(const ubyte[] buffer) | { | import std.algorithm : find; 3587| immutable ubyte newlineChar = '\n'; 3587| immutable size_t buflen = buffer.length; 3587| immutable size_t findlen = buffer.find(newlineChar).length; | 7174| return findlen > 0 ? buflen - findlen : -1; | } | 2777| foreach (inputStream; cmdopt.inputSources) | { 9178| foreach (ref ubyte[] inputChunk; inputStream.file.byChunk(readBuffer)) | { 2498| size_t nextOutputChunkStart = 0; 2498| auto remainingInputChunk = inputChunk[nextOutputChunkStart .. $]; | 5557| while (!remainingInputChunk.empty) | { | /* See if the next output file needs to be opened. 
*/ 3060| if (!isOutputFileOpen) | { 1450| outputFileName = | buildPath(cmdopt.dir, | format("%s%.*d%s", cmdopt.prefix, | cmdopt.digitWidth, nextOutputFileNum, cmdopt.suffix)); | 2886| enforce(cmdopt.appendToExistingFiles || !outputFileName.exists, 1| format("Output file already exists. Use '--a|append' to append to existing files. File: '%s'.", | outputFileName)); | 1449| outputFile = outputFileName.File("ab"); 1449| outputFile.setvbuf(1024L * 64L, _IOFBF); 1449| isOutputFileOpen = true; 1449| ++nextOutputFileNum; 1449| outputFileRemainingLines = cmdopt.linesPerFile; | 1449| if (cmdopt.headerInOut) | { 46| ulong filesize = outputFile.size; 92| if (filesize == 0 || filesize == ulong.max) outputFile.rawWrite(header); | } | } | | /* Find more newlines for the current output file. */ | 3059| assert(outputFileRemainingLines > 0); | 3059| size_t nextOutputChunkEnd = nextOutputChunkStart; | 12117| while (outputFileRemainingLines != 0 && !remainingInputChunk.empty) | { | /* Note: newLineIndex is relative to 'remainingInputChunk', not | * 'inputChunk'. Updates to variables referring to 'inputChunk' | * need to reflect this. In particular, 'nextOutputChunkEnd'. | */ 3587| immutable newlineIndex = nextNewlineIndex(remainingInputChunk); | 3587| if (newlineIndex == -1) | { 1373| nextOutputChunkEnd = inputChunk.length; | } | else | { 2214| --outputFileRemainingLines; 2214| nextOutputChunkEnd += (newlineIndex + 1); | } | 3587| remainingInputChunk = inputChunk[nextOutputChunkEnd .. $]; | } | 3059| assert(nextOutputChunkStart < nextOutputChunkEnd); 3059| assert(nextOutputChunkEnd <= inputChunk.length); | 3059| outputFile.rawWrite(inputChunk[nextOutputChunkStart .. nextOutputChunkEnd]); | 3059| if (outputFileRemainingLines == 0) | { 1175| outputFile.close; 1175| isOutputFileOpen = false; | } | 3059| nextOutputChunkStart = nextOutputChunkEnd; | 3059| assert(remainingInputChunk.length == inputChunk.length - nextOutputChunkStart); | } | } | } |} | |/* splitByLineCount unit tests. 
| * | * These tests are primarily for buffer management. There are edge cases involving the | * interaction buffer size, input file size, lines-per-file, and newline placement | * that are difficult to test against the executable. | */ |unittest |{ | import tsv_utils.common.unittest_utils; // tsv unit test helpers, from common/src/. | import std.algorithm : min; | import std.array : appender; | import std.conv : to; | import std.file : exists, mkdir, rmdirRecurse; | import std.path : buildPath; | import std.process : escapeShellCommand, executeShell; | | /* Test setup | * | * A set of twenty file input files is created, with names: input_NxM.txt, where | * N is the number of characters in each row and M is the number of rows (lines). | * The resulting files are put in the "lc_input" directory ('inputDir' variable) | * and have names: | * input_0x2.txt, input_0x3.txt, ... input_5x5.txt. | * | * A standalone block of code produces the expected result files for splitting an | * input file into a set of output files. This duplicates the splitByLineCount | * output. This is done for lines-per-file counts 1 to 5. Each result set is place | * ina subdirectory under "lc_expected" ('expectedDir' variable). Subdirectories | * have names like: "0x2_by_1", "0x3_by_1", ..., "5x5_by_4". | * | * splitByLine is called for all the same input files and lines-per-file settings used | * to produce the expected output. This is done via testSplitByLineCount, which calls | * command line argument processing and splitByLine, similar to how the main program | * works. The results are written to a subdirectory. The subdirectory is compared to | * the expected output directory using the system 'diff' command. | * | * splitByLine is multiple times for each expected output case. The different calls | * iterate over a series of small ReadBufferSizes. This is how tests for edge cases | * in the readBufferSize vs line lengths, newline placement, etc., is accomplished. 
| * | * Note: One way to understand what is going on is to comment out the line: | * | * scope(exit) testDir.rmdirRecurse; | * | * Then run the test (e.g. 'make test') and look at the directory structure left | * behind. Print out the 'testDir' directory to see where it is located. | */ | | /* testSplitByLineCount acts as a surrogate for main() and tsvSplit(). It makes the | * call to splitByLineCount and calls 'diff' to compare the output directory to the | * expected directory. An assert is thrown if the directories do not match. | */ | static void testSplitByLineCount(string[] cmdArgs, string expectedDir, | size_t readBufferSize = 1024L * 512L) | { | import std.array : appender; | 490| assert(cmdArgs.length > 0, "[testSplitByLineCount] cmdArgs must not be empty."); | | auto formatAssertMessage(T...)(string msg, T formatArgs) | { 0000000| auto formatString = "[testSplitByLineCount] %s: " ~ msg; 0000000| return format(formatString, cmdArgs[0], formatArgs); | } | 490| TsvSplitOptions cmdopt; 490| auto savedCmdArgs = cmdArgs.to!string; 490| auto r = cmdopt.processArgs(cmdArgs); 490| assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs)); 490| assert(cmdopt.linesPerFile != 0, "[testSplitByLineCount] --lines-per-file is required."); 490| assert(!cmdopt.dir.empty, "[testSplitByLineCount] --dir is required."); | 490| splitByLineCount(cmdopt, readBufferSize); | | /* Diff command setup. 
*/ 490| auto diffCmdArgs = ["diff", expectedDir, cmdopt.dir]; 490| auto diffResult = executeShell(escapeShellCommand(diffCmdArgs)); 490| assert(diffResult.status == 0, | format("[testSplitByLineCount]\n cmd: %s\n readBufferSize: %d\n expectedDir: %s\n------ Diff ------%s\n-------", | savedCmdArgs, readBufferSize, expectedDir, diffResult.output)); | } | 1| auto testDir = makeUnittestTempDir("tsv_split_bylinecount"); 1| scope(exit) testDir.rmdirRecurse; | 1| auto inputDir = buildPath(testDir, "lc_input"); 1| auto outputDir = buildPath(testDir, "lc_output"); 1| auto expectedDir = buildPath(testDir, "lc_expected"); | 1| mkdir(inputDir); 1| mkdir(outputDir); 1| mkdir(expectedDir); | | static string buildInputFilePath(string dir, long inputLineLength, long inputFileNumLines) | { 25| return buildPath(dir, format("input_%dx%d.txt", inputLineLength, inputFileNumLines)); | } | 1| string[5] outputRowData = | [ | "abcde", | "fghij", | "klmno", | "pqrst", | "uvwxy" | ]; | | /* The main test loop. Iterates over input line lengths, numbers of rows, | * lines-per-file, and finally readBufferSize lengths. All combos are tested. | */ 21| foreach (inputLineLength; 0 .. 6) | { 90| foreach (inputFileNumLines; 2 .. 6) | { 24| auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines); | | { 48| auto ofile = inputFile.File("w"); 24| auto output = appender!(char[])(); 324| foreach (m; 0 .. inputFileNumLines) | { 84| put(output, outputRowData[m][0 .. inputLineLength]); 84| put(output, '\n'); | } 24| ofile.write(output.data); 24| ofile.close; | } | | /* Iterate over the different lines-per-file lengths. | * - Create an expected output directory and files for each. | * - Test with different readBufferSize values. | */ 252| foreach (outputFileNumLines; 1 .. 
min(5, inputFileNumLines)) | { 60| auto expectedSubDir = | buildPath(expectedDir, format("%dx%d_by_%d", inputLineLength, | inputFileNumLines, outputFileNumLines)); 60| mkdir(expectedSubDir); | 60| size_t filenum = 0; 60| size_t linesWritten = 0; 222| while (linesWritten < inputFileNumLines) | { 162| auto expectedFile = buildPath(expectedSubDir, format("part_%d.txt", filenum)); 324| auto f = expectedFile.File("w"); 162| auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten); 1206| foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite]) | { 240| f.writeln(line[0 .. inputLineLength]); | } 162| linesWritten += linesToWrite; 162| ++filenum; 162| f.close; | } | | /* Test the different readBufferSizes. | * - An output directory is created for the run and deleted afterward. | * - First test the default size. | * - Then iterate overs small readBufferSize values. | */ 60| auto outputSubDir = | buildPath(outputDir, format("%dx%d_by_%d", inputLineLength, | inputFileNumLines, outputFileNumLines)); 60| mkdir(outputSubDir); | 60| testSplitByLineCount( | ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir, | "--digit-width", "1", inputFile], | expectedSubDir); | 60| outputSubDir.rmdirRecurse; | 1440| foreach (readBufSize; 1 .. 8) | { 420| mkdir(outputSubDir); | 420| testSplitByLineCount( | ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir, | "--digit-width", "1", inputFile], | expectedSubDir, readBufSize); | 420| outputSubDir.rmdirRecurse; | } | } | } | } | | { | /* Tests for the special case where readBufferSize is smaller than the header | * line. We'll reuse the input_5x4.txt input file and write 1 line-per-file. 
| */ 1| immutable inputLineLength = 5; 1| immutable inputFileNumLines = 4; 1| immutable outputFileNumLines = 1; | 1| auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines); 1| assert(inputFile.exists); | 1| auto expectedSubDirHeader = | buildPath(expectedDir, format("%dx%d_by_%d_header", inputLineLength, | inputFileNumLines, outputFileNumLines)); | 1| auto expectedSubDirHeaderInOnly = | buildPath(expectedDir, format("%dx%d_by_%d_header_in_only", inputLineLength, | inputFileNumLines, outputFileNumLines)); | 1| mkdir(expectedSubDirHeader); 1| mkdir(expectedSubDirHeaderInOnly); | | /* Generate the expected results. Cheat by starting with linesWritten = 1. This | * automatically excludes the header line, but keeps the loop code consistent | * with the main test loop. | */ 1| size_t filenum = 0; 1| size_t linesWritten = 1; 4| while (linesWritten < inputFileNumLines) | { 3| auto expectedFileHeader = buildPath(expectedSubDirHeader, format("part_%d.txt", filenum)); 3| auto expectedFileHeaderInOnly = buildPath(expectedSubDirHeaderInOnly, | format("part_%d.txt", filenum)); 6| auto fHeader = expectedFileHeader.File("w"); 6| auto fHeaderInOnly = expectedFileHeaderInOnly.File("w"); 3| auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten); | 3| fHeader.writeln(outputRowData[0][0 .. inputLineLength]); 18| foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite]) | { 3| fHeader.writeln(line[0 .. inputLineLength]); 3| fHeaderInOnly.writeln(line[0 .. inputLineLength]); | } 3| linesWritten += linesToWrite; 3| ++filenum; 3| fHeader.close; 3| fHeaderInOnly.close; | } | | /* Now run the tests. */ 1| auto outputSubDirHeader = | buildPath(outputDir, format("%dx%d_by_%d_header", inputLineLength, | inputFileNumLines, outputFileNumLines)); 1| auto outputSubDirHeaderInOnly = | buildPath(outputDir, format("%dx%d_by_%d_header_in_only", inputLineLength, | inputFileNumLines, outputFileNumLines)); | 18| foreach (readBufSize; 1 .. 
6) | { 5| mkdir(outputSubDirHeader); 5| mkdir(outputSubDirHeaderInOnly); | 5| testSplitByLineCount( | ["test", "--header", "--lines-per-file", outputFileNumLines.to!string, | "--dir", outputSubDirHeader, "--digit-width", "1", inputFile], | expectedSubDirHeader, readBufSize); | 5| testSplitByLineCount( | ["test", "--header-in-only", "--lines-per-file", outputFileNumLines.to!string, | "--dir", outputSubDirHeaderInOnly, "--digit-width", "1", inputFile], | expectedSubDirHeaderInOnly, readBufSize); | 5| outputSubDirHeader.rmdirRecurse; 5| outputSubDirHeaderInOnly.rmdirRecurse; | } | } |} tsv-split/src/tsv_utils/tsv-split.d is 98% covered <<<<<< EOF # path=./tsv-append-src-tsv_utils-tsv-append.lst |/** |Command line tool that appends multiple TSV files. It is header aware and supports |tracking the original source file of each row. | |Copyright (c) 2017-2020, eBay Inc. |Initially written by Jon Degenhardt | |License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) |*/ |module tsv_utils.tsv_append; | |import std.conv : to; |import std.exception : enforce; |import std.range; |import std.stdio; |import std.typecons : tuple; | |static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; | |version(unittest) |{ | // When running unit tests, use main from -main compiler switch. |} |else |{ | /** Main program. Invokes command line arg processing and tsv-append to perform | * the real work. Any errors are caught and reported. | */ | int main(string[] cmdArgs) | { | import tsv_utils.common.utils : BufferedOutputRange; | /* When running in DMD code coverage mode, turn on report merging. 
*/ | version(D_Coverage) version(DigitalMars) | { | import core.runtime : dmd_coverSetMerge; 44| dmd_coverSetMerge(true); | } | 44| TsvAppendOptions cmdopt; 44| auto r = cmdopt.processArgs(cmdArgs); 54| if (!r[0]) return r[1]; 34| try tsvAppend(cmdopt, BufferedOutputRange!(typeof(stdout))(stdout)); | catch (Exception exc) | { 2| stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 2| return 1; | } 32| return 0; | } |} | |auto helpTextVerbose = q"EOS |Synopsis: tsv-append [options] [file...] | |tsv-append concatenates multiple TSV files, similar to the Unix 'cat' utility. |Unlike 'cat', it is header aware ('--H|header'), writing the header from only |the first file. It also supports source tracking, adding a column indicating |the original file to each row. Results are written to standard output. | |Concatenation with header support is useful when preparing data for traditional |Unix utilities like 'sort' and 'sed' or applications that read a single file. | |Source tracking is useful when creating long/narrow form tabular data, a format |used by many statistics and data mining packages. In this scenario, files have |been used to capture related data sets, the difference between data sets being a |condition represented by the file. For example, results from different variants |of an experiment might each be recorded in their own files. Retaining the source |file as an output column preserves the condition represented by the file. | |The file-name (without extension) is used as the source value. This can |customized using the --f|file option. | |Example: Header processing: | | $ tsv-append -H file1.tsv file2.tsv file3.tsv | |Example: Header processing and source tracking: | | $ tsv-append -H -t file1.tsv file2.tsv file3.tsv | |Example: Source tracking with custom values: | | $ tsv-append -H -s test_id -f test1=file1.tsv -f test2=file2.tsv | |Options: |EOS"; | |auto helpText = q"EOS |Synopsis: tsv-append [options] [file...] 
| |tsv-append concatenates multiple TSV files, reading from files or standard input |and writing to standard output. It is header aware ('--H|header'), writing the |header from only the first file. It also supports source tracking, adding an |indicator of original file to each row of input. | |Options: |EOS"; | |/** Container for command line options. |*/ |struct TsvAppendOptions |{ | string programName; | string[] files; /// Input files | string[string] fileSourceNames; /// Maps file path to the 'source' value | string sourceHeader; /// --s|source-header | bool trackSource = false; /// --t|track-source | bool hasHeader = false; /// --H|header | char delim = '\t'; /// --d|delimiter | | /* fileOptionHandler processes the '--f|file source=file' option. */ | private void fileOptionHandler(string option, string optionVal) pure @safe | { | import std.algorithm : findSplit; | import std.format : format; | 17| auto valSplit = findSplit(optionVal, "="); | 33| enforce(!valSplit[0].empty && !valSplit[2].empty, 3| format("Invalid option value: '--%s %s'. Expected: '--%s ='.", | option, optionVal, option)); | 14| auto source = valSplit[0]; 14| auto filepath = valSplit[2]; 14| files ~= filepath; 14| fileSourceNames[filepath] = source; | } | | /** Command line argument processing. | * | * Returns a tuple. First value is true if command line arguments were successfully | * processed and execution should continue, or false if an error occurred or the user | * asked for help. If false, the second value is the appropriate exit code (0 or 1). | * | * Returning true (execution continues) means args have been validated and derived | * values calculated. In addition, field indices have been converted to zero-based. | * If the whole line is the key, the individual fields list will be cleared. 
| */ | auto processArgs (ref string[] cmdArgs) | { | import std.algorithm : any, each; | import std.getopt; | import std.path : baseName, stripExtension; | 56| bool helpVerbose = false; // --help-verbose 56| bool versionWanted = false; // --V|version | 112| programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; | | try | { 56| arraySep = ","; // Use comma to separate values in command line options 56| auto r = getopt( | cmdArgs, | "help-verbose", " Print full help.", &helpVerbose, | std.getopt.config.caseSensitive, | "H|header", " Treat the first line of each file as a header.", &hasHeader, | std.getopt.config.caseInsensitive, | "t|track-source", " Track the source file. Adds an column with the source name.", &trackSource, | "s|source-header", "STR Use STR as the header for the source column. Implies --H|header and --t|track-source. Default: 'file'", &sourceHeader, | "f|file", "STR=FILE Read file FILE, using STR as the 'source' value. Implies --t|track-source.", &fileOptionHandler, | "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim, | std.getopt.config.caseSensitive, | "V|version", " Print version information and exit.", &versionWanted, | std.getopt.config.caseInsensitive, | ); | 50| if (r.helpWanted) | { 1| defaultGetoptPrinter(helpText, r.options); 1| return tuple(false, 0); | } 49| else if (helpVerbose) | { 1| defaultGetoptPrinter(helpTextVerbose, r.options); 1| return tuple(false, 0); | } 48| else if (versionWanted) | { | import tsv_utils.common.tsvutils_version; 2| writeln(tsvutilsVersionNotice("tsv-append")); 2| return tuple(false, 0); | } | | /* Derivations and consistency checks. */ 99| if (files.length > 0 || !sourceHeader.empty) trackSource = true; 56| if (!sourceHeader.empty) hasHeader = true; 92| if (hasHeader && sourceHeader.empty) sourceHeader = "file"; | | /* Assume the remaing arguments are filepaths. */ 399| foreach (fp; cmdArgs[1 .. 
$]) | { | import std.path : baseName, stripExtension; 87| files ~= fp; 87| fileSourceNames[fp] = fp.stripExtension.baseName; | } | | /* Add a name mapping for dash ('-') unless it was included in the --file option. */ 88| if ("-" !in fileSourceNames) fileSourceNames["-"] = "stdin"; | } | catch (Exception exc) | { 6| stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 6| return tuple(false, 1); | } 46| return tuple(true, 0); | } |} | |/** tsvAppend implements the basic functionality of the tsv-append program. | */ |void tsvAppend(OutputRange)(TsvAppendOptions cmdopt, auto ref OutputRange outputStream) |if (isOutputRange!(OutputRange, char)) |{ | import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange; | 46| bool headerWritten = false; 486| foreach (filename; (cmdopt.files.length > 0) ? cmdopt.files : ["-"]) | { 304| auto inputStream = (filename == "-") ? stdin : filename.File(); 100| auto sourceName = cmdopt.fileSourceNames[filename]; 1770| foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1)) | { 521| if (cmdopt.hasHeader && fileLineNum == 1) | { 65| if (!headerWritten) | { 27| if (cmdopt.trackSource) | { 17| outputStream.put(cmdopt.sourceHeader); 17| outputStream.put(cmdopt.delim); | } 27| outputStream.put(line); 27| outputStream.put('\n'); 27| headerWritten = true; | | /* Flush the header immediately. This helps tasks further on in a | * unix pipeline detect errors quickly, without waiting for all | * the data to flow through the pipeline. Note that an upstream | * task may have flushed its header line, so the header may | * arrive long before the main block of data. 
| */ 19| static if (isFlushableOutputRange!OutputRange) outputStream.flush; | } | } | else | { 249| if (cmdopt.trackSource) | { 138| outputStream.put(sourceName); 138| outputStream.put(cmdopt.delim); | } 249| outputStream.put(line); 249| outputStream.put('\n'); | } | } | } |} | |version(unittest) |{ | /* Unit test helper functions. */ | | import tsv_utils.common.unittest_utils; // tsv unit test helpers, from common/src/. | | void testTsvAppend(string[] cmdArgs, string[][] expected) | { | import std.array : appender; | import std.format : format; | 12| assert(cmdArgs.length > 0, "[testTsvAppend] cmdArgs must not be empty."); | | auto formatAssertMessage(T...)(string msg, T formatArgs) | { 0000000| auto formatString = "[testTsvAppend] %s: " ~ msg; 0000000| return format(formatString, cmdArgs[0], formatArgs); | } | 12| TsvAppendOptions cmdopt; 12| auto savedCmdArgs = cmdArgs.to!string; 12| auto r = cmdopt.processArgs(cmdArgs); 12| assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs)); | 12| auto output = appender!(char[])(); 12| tsvAppend(cmdopt, output); 12| auto expectedOutput = expected.tsvDataToString; | 12| assert(output.data == expectedOutput, | formatAssertMessage( | "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================", | expectedOutput.to!string, output.data.to!string)); | } | } | |unittest |{ | import std.path : buildPath; | import std.file : rmdirRecurse; | import std.format : format; | 1| auto testDir = makeUnittestTempDir("tsv_append"); 1| scope(exit) testDir.rmdirRecurse; | 1| string[][] data1 = | [["field_a", "field_b", "field_c"], | ["red", "17", "κόκκινος"], | ["blue", "12", "άσπρο"]]; | 1| string[][] data2 = | [["field_a", "field_b", "field_c"], | ["green", "13.5", "κόκκινος"], | ["blue", "15", "πράσινος"]]; | 1| string[][] data3 = | [["field_a", "field_b", "field_c"], | ["yellow", "9", "κίτρινος"]]; | 1| string[][] dataHeaderRowOnly = | [["field_a", "field_b", "field_c"]]; | 1| string[][] 
dataEmpty = [[]]; | 1| string filepath1 = buildPath(testDir, "file1.tsv"); 1| string filepath2 = buildPath(testDir, "file2.tsv"); 1| string filepath3 = buildPath(testDir, "file3.tsv"); 1| string filepathHeaderRowOnly = buildPath(testDir, "fileHeaderRowOnly.tsv"); 1| string filepathEmpty = buildPath(testDir, "fileEmpty.tsv"); | 1| writeUnittestTsvFile(filepath1, data1); 1| writeUnittestTsvFile(filepath2, data2); 1| writeUnittestTsvFile(filepath3, data3); 1| writeUnittestTsvFile(filepathHeaderRowOnly, dataHeaderRowOnly); 1| writeUnittestTsvFile(filepathEmpty, dataEmpty); | 1| testTsvAppend(["test-1", filepath1], data1); 1| testTsvAppend(["test-2", "--header", filepath1], data1); 1| testTsvAppend(["test-3", filepath1, filepath2], data1 ~ data2); | 1| testTsvAppend(["test-4", "--header", filepath1, filepath2], | [["field_a", "field_b", "field_c"], | ["red", "17", "κόκκινος"], | ["blue", "12", "άσπρο"], | ["green", "13.5", "κόκκινος"], | ["blue", "15", "πράσινος"]]); | 1| testTsvAppend(["test-5", "--header", filepath1, filepath2, filepath3], | [["field_a", "field_b", "field_c"], | ["red", "17", "κόκκινος"], | ["blue", "12", "άσπρο"], | ["green", "13.5", "κόκκινος"], | ["blue", "15", "πράσινος"], | ["yellow", "9", "κίτρινος"]]); | 1| testTsvAppend(["test-6", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3], | data1 ~ dataEmpty ~ data2 ~ dataHeaderRowOnly ~ data3); | 1| testTsvAppend(["test-7", "--header", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3], | [["field_a", "field_b", "field_c"], | ["red", "17", "κόκκινος"], | ["blue", "12", "άσπρο"], | ["green", "13.5", "κόκκινος"], | ["blue", "15", "πράσινος"], | ["yellow", "9", "κίτρινος"]]); | 1| testTsvAppend(["test-8", "--track-source", filepath1, filepath2], | [["file1", "field_a", "field_b", "field_c"], | ["file1", "red", "17", "κόκκινος"], | ["file1", "blue", "12", "άσπρο"], | ["file2", "field_a", "field_b", "field_c"], | ["file2", "green", "13.5", "κόκκινος"], | ["file2", 
"blue", "15", "πράσινος"]]); | 1| testTsvAppend(["test-9", "--header", "--track-source", filepath1, filepath2], | [["file", "field_a", "field_b", "field_c"], | ["file1", "red", "17", "κόκκινος"], | ["file1", "blue", "12", "άσπρο"], | ["file2", "green", "13.5", "κόκκινος"], | ["file2", "blue", "15", "πράσινος"]]); | 1| testTsvAppend(["test-10", "-H", "-t", "--source-header", "source", | filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3], | [["source", "field_a", "field_b", "field_c"], | ["file1", "red", "17", "κόκκινος"], | ["file1", "blue", "12", "άσπρο"], | ["file2", "green", "13.5", "κόκκινος"], | ["file2", "blue", "15", "πράσινος"], | ["file3", "yellow", "9", "κίτρινος"]]); | 1| testTsvAppend(["test-11", "-H", "-t", "-s", "id", "--file", format("1a=%s", filepath1), | "--file", format("1b=%s", filepath2), "--file", format("1c=%s", filepath3)], | [["id", "field_a", "field_b", "field_c"], | ["1a", "red", "17", "κόκκινος"], | ["1a", "blue", "12", "άσπρο"], | ["1b", "green", "13.5", "κόκκινος"], | ["1b", "blue", "15", "πράσινος"], | ["1c", "yellow", "9", "κίτρινος"]]); | 1| testTsvAppend(["test-12", "-s", "id", "-f", format("1a=%s", filepath1), | "-f", format("1b=%s", filepath2), filepath3], | [["id", "field_a", "field_b", "field_c"], | ["1a", "red", "17", "κόκκινος"], | ["1a", "blue", "12", "άσπρο"], | ["1b", "green", "13.5", "κόκκινος"], | ["1b", "blue", "15", "πράσινος"], | ["file3", "yellow", "9", "κίτρινος"]]); |} tsv-append/src/tsv_utils/tsv-append.d is 97% covered <<<<<< EOF # path=./number-lines-src-tsv_utils-number-lines.lst |/** |A simple version of the unix 'nl' program. | |This program is a simpler version of the unix 'nl' (number lines) program. It reads |text from files or standard input and adds a line number to each line. | |Copyright (c) 2015-2020, eBay Inc. 
|Initially written by Jon Degenhardt | |License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) |*/ |module tsv_utils.number_lines; | |import std.stdio; |import std.typecons : tuple; | |auto helpText = q"EOS |Synopsis: number-lines [options] [file...] | |number-lines reads from files or standard input and writes each line to standard |output preceded by a line number. It is a simplified version of the unix 'nl' |program. It supports one feature 'nl' does not: the ability to treat the first |line of files as a header. This is useful when working with tab-separated-value |files. If header processing used, a header line is written for the first file, |and the header lines are dropped from any subsequent files. | |Examples: | number-lines myfile.txt | cat myfile.txt | number-lines --header linenum | number-lines *.txt | |Options: |EOS"; | |/** Container for command line options. | */ |struct NumberLinesOptions |{ | enum defaultHeaderString = "line"; | | string programName; | bool hasHeader = false; // --H|header | string headerString = ""; // --s|header-string | long startNum = 1; // --n|start-num | char delim = '\t'; // --d|delimiter | bool versionWanted = false; // --V|version | | /* Returns a tuple. First value is true if command line arguments were successfully | * processed and execution should continue, or false if an error occurred or the user | * asked for help. If false, the second value is the appropriate exit code (0 or 1). | */ | auto processArgs (ref string[] cmdArgs) | { | import std.algorithm : any, each; | import std.getopt; | import std.path : baseName, stripExtension; | 64| programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; | | try | { 32| auto r = getopt( | cmdArgs, | std.getopt.config.caseSensitive, | "H|header", " Treat the first line of each file as a header. 
The first input file's header is output, subsequent file headers are discarded.", &hasHeader, | std.getopt.config.caseInsensitive, | "s|header-string", "STR String to use in the header row. Implies --header. Default: 'line'", &headerString, | "n|start-number", "NUM Number to use for the first line. Default: 1", &startNum, | "d|delimiter", "CHR Character appended to line number, preceding the rest of the line. Default: TAB (Single byte UTF-8 characters only.)", &delim, | std.getopt.config.caseSensitive, | "V|version", " Print version information and exit.", &versionWanted, | std.getopt.config.caseInsensitive, | ); | 30| if (r.helpWanted) | { 1| defaultGetoptPrinter(helpText, r.options); 1| return tuple(false, 0); | } 29| else if (versionWanted) | { | import tsv_utils.common.tsvutils_version; 2| writeln(tsvutilsVersionNotice("number-lines")); 2| return tuple(false, 0); | } | | /* Derivations. */ 31| if (headerString.length > 0) hasHeader = true; 23| else headerString = defaultHeaderString; | } | catch (Exception exc) | { 2| stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 2| return tuple(false, 1); | } 27| return tuple(true, 0); | } |} | |static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; | |/** Main program. */ |int main(string[] cmdArgs) |{ | /* When running in DMD code coverage mode, turn on report merging. */ | version(D_Coverage) version(DigitalMars) | { | import core.runtime : dmd_coverSetMerge; 32| dmd_coverSetMerge(true); | } | 32| NumberLinesOptions cmdopt; 32| auto r = cmdopt.processArgs(cmdArgs); 37| if (!r[0]) return r[1]; 27| try numberLines(cmdopt, cmdArgs[1..$]); | catch (Exception exc) | { 1| stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 1| return 1; | } | 26| return 0; |} | |/** Implements the primary logic behind number lines. | * | * Reads lines lines from each file, outputing each with a line number prepended. 
The | * header from the first file is written, the header from subsequent files is dropped. | */ |void numberLines(const NumberLinesOptions cmdopt, const string[] inputFiles) |{ | import std.conv : to; | import std.range; | import tsv_utils.common.utils : bufferedByLine, BufferedOutputRange; | 54| auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); | 27| long lineNum = cmdopt.startNum; 27| bool headerWritten = false; 271| foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"]) | { 164| auto inputStream = (filename == "-") ? stdin : filename.File(); 1358| foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1)) | { 377| if (cmdopt.hasHeader && fileLineNum == 1) | { 24| if (!headerWritten) | { 12| bufferedOutput.append(cmdopt.headerString); 12| bufferedOutput.append(cmdopt.delim); 12| bufferedOutput.appendln(line); 12| headerWritten = true; | | /* Flush the header immediately. This helps tasks further on in a | * unix pipeline detect errors quickly, without waiting for all | * the data to flow through the pipeline. Note that an upstream | * task may have flushed its header line, so the header may | * arrive long before the main block of data. | */ 12| bufferedOutput.flush; | } | } | else | { 226| bufferedOutput.append(lineNum.to!string); 226| bufferedOutput.append(cmdopt.delim); 226| bufferedOutput.appendln(line); 226| lineNum++; | } | } | } |} number-lines/src/tsv_utils/number-lines.d is 100% covered <<<<<< EOF # path=./common-src-tsv_utils-common-numerics.lst |/** |Numeric related utilities used by TSV Utilities. | |Utilities in this file: |$(LIST | * [formatNumber] - An alternate print format for numbers, especially useful when | doubles are being used to represent integer and float values. | | * [rangeMedian] - Finds the median value of a range. | | * [quantile] - Generates quantile values for a data set. |) | |Copyright (c) 2016-2020, eBay Inc. 
|Initially written by Jon Degenhardt | |License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) |*/ | |module tsv_utils.common.numerics; | |import std.traits : isFloatingPoint, isIntegral, Unqual; | |/** |formatNumber is an alternate way to print numbers. It is especially useful when |representing both integral and floating point values with float point data types. | |formatNumber was created for tsv-summarize, where all calculations are done as doubles, |but may be integers by nature. In addition, output may be either for human consumption |or for additional machine processing. Integers are printed normally. Floating point is |printed as follows: |$(LIST | * Values that are exact integral values are printed as integers, as long as they | are within the range of where all integers are represented exactly by the floating | point type. The practical effect is to avoid switching to exponential notion. | | * If the specified floatPrecision is between 0 and readablePrecisionMax, then | floatPrecision is used to set the significant digits following the decimal point. | Otherwise, it is used to set total significant digits. This does not apply to | really large numbers, for doubles, those larger than 2^53. Trailing zeros are | chopped in all cases. |) |*/ |auto formatNumber(T, size_t readablePrecisionMax = 6)(T num, const size_t floatPrecision = 12) |if (isFloatingPoint!T || isIntegral!T) |{ | alias UT = Unqual!T; | | import std.conv : to; | import std.format : format; | | static if (isIntegral!T) | { 1757| return format("%d", num); // The easy case. | } | else | { | static assert(isFloatingPoint!T); | | static if (!is(UT == float) && !is(UT == double)) | { | /* Not a double or float, but a floating point. Punt on refinements. 
*/ 14| return format("%.*g", floatPrecision, num); | } | else | { | static assert(is(UT == float) || is(UT == double)); | 3035| if (floatPrecision <= readablePrecisionMax) | { | /* Print with a fixed precision beyond the decimal point (%.*f), but | * remove trailing zeros. Notes: | * - This handles integer values stored in floating point types. | * - Values like NaN and infinity also handled. | */ 679| immutable str = format("%.*f", floatPrecision, num); 679| size_t trimToLength = str.length; | 1204| if (floatPrecision != 0 && str.length > floatPrecision + 1) | { | import std.ascii : isDigit; 525| assert(str.length - floatPrecision - 1 > 0); 525| size_t decimalIndex = str.length - floatPrecision - 1; | 1036| if (str[decimalIndex] == '.' && str[decimalIndex - 1].isDigit) | { 511| size_t lastNonZeroDigit = str.length - 1; 511| assert(decimalIndex < lastNonZeroDigit); 1149| while (str[lastNonZeroDigit] == '0') lastNonZeroDigit--; 511| trimToLength = (decimalIndex < lastNonZeroDigit) 421| ? lastNonZeroDigit + 1 90| : decimalIndex; | } | } | 679| return str[0 .. trimToLength]; | } | else | { | /* Determine if the number is subject to special integer value printing. | * Goal is to avoid exponential notion for integer values that '%.*g' | * generates. Numbers within the significant digit range of floatPrecision | * will print as desired with '%.*g', whether there is a fractional part | * or not. The '%.*g' format, with exponential notation, is also used for | * really large numbers. "Really large" being numbers outside the range | * of integers exactly representable by the floating point type. | */ | | enum UT maxConsecutiveUTInteger = 2.0^^UT.mant_dig; | enum bool maxUTIntFitsInLong = (maxConsecutiveUTInteger <= long.max); | | import std.math : fabs; 2356| immutable UT absNum = num.fabs; | 2356| if (!maxUTIntFitsInLong || | absNum < 10.0^^floatPrecision || 540| absNum > maxConsecutiveUTInteger) | { | /* Within signficant digits range or very large. 
*/ 1844| return format("%.*g", floatPrecision, num); | } | else | { | /* Check for integral values needing to be printed in decimal format. | * modf/modff are used to determine if the value has a non-zero | * fractional component. | */ | import core.stdc.math : modf, modff; | | static if (is(UT == float)) alias modfUT = modff; | else static if (is(UT == double)) alias modfUT = modf; | else static assert(0); | 512| UT integerPart; | 554| if (modfUT(num, &integerPart) == 0.0) return format("%d", num.to!long); 470| else return format("%.*g", floatPrecision, num); | } | } | } | } 0000000| assert(0); |} | |unittest // formatNumber unit tests |{ | import std.conv : to; | import std.format : format; | | /* Integers */ 7| assert(formatNumber(0) == "0"); 7| assert(formatNumber(1) == "1"); 7| assert(formatNumber(-1) == "-1"); 7| assert(formatNumber(999) == "999"); 7| assert(formatNumber(12345678912345) == "12345678912345"); 7| assert(formatNumber(-12345678912345) == "-12345678912345"); | 14| size_t a1 = 10; assert(a1.formatNumber == "10"); 14| const int a2 = -33234; assert(a2.formatNumber == "-33234"); 14| immutable long a3 = -12345678912345; assert(a3.formatNumber == "-12345678912345"); | | // Specifying precision should never matter for integer values. 
7| assert(formatNumber(0, 0) == "0"); 7| assert(formatNumber(1, 0) == "1"); 7| assert(formatNumber(-1, 0) == "-1"); 7| assert(formatNumber(999, 0) == "999"); 7| assert(formatNumber(12345678912345, 0) == "12345678912345"); 7| assert(formatNumber(-12345678912345, 0) == "-12345678912345"); | 7| assert(formatNumber(0, 3) == "0"); 7| assert(formatNumber(1, 3) == "1"); 7| assert(formatNumber(-1, 3 ) == "-1"); 7| assert(formatNumber(999, 3) == "999"); 7| assert(formatNumber(12345678912345, 3) == "12345678912345"); 7| assert(formatNumber(-12345678912345, 3) == "-12345678912345"); | 7| assert(formatNumber(0, 9) == "0"); 7| assert(formatNumber(1, 9) == "1"); 7| assert(formatNumber(-1, 9 ) == "-1"); 7| assert(formatNumber(999, 9) == "999"); 7| assert(formatNumber(12345678912345, 9) == "12345678912345"); 7| assert(formatNumber(-12345678912345, 9) == "-12345678912345"); | | /* Doubles */ 7| assert(formatNumber(0.0) == "0"); 7| assert(formatNumber(0.2) == "0.2"); 7| assert(formatNumber(0.123412, 0) == "0"); 7| assert(formatNumber(0.123412, 1) == "0.1"); 7| assert(formatNumber(0.123412, 2) == "0.12"); 7| assert(formatNumber(0.123412, 5) == "0.12341"); 7| assert(formatNumber(0.123412, 6) == "0.123412"); 7| assert(formatNumber(0.123412, 7) == "0.123412"); 7| assert(formatNumber(9.123412, 5) == "9.12341"); 7| assert(formatNumber(9.123412, 6) == "9.123412"); 7| assert(formatNumber(99.123412, 5) == "99.12341"); 7| assert(formatNumber(99.123412, 6) == "99.123412"); 7| assert(formatNumber(99.123412, 7) == "99.12341"); 7| assert(formatNumber(999.123412, 0) == "999"); 7| assert(formatNumber(999.123412, 1) == "999.1"); 7| assert(formatNumber(999.123412, 2) == "999.12"); 7| assert(formatNumber(999.123412, 3) == "999.123"); 7| assert(formatNumber(999.123412, 4) == "999.1234"); 7| assert(formatNumber(999.123412, 5) == "999.12341"); 7| assert(formatNumber(999.123412, 6) == "999.123412"); 7| assert(formatNumber(999.123412, 7) == "999.1234"); 7| assert(formatNumber!(double, 9)(999.12341234, 7) 
== "999.1234123"); 7| assert(formatNumber(9001.0) == "9001"); 7| assert(formatNumber(1234567891234.0) == "1234567891234"); 7| assert(formatNumber(1234567891234.0, 0) == "1234567891234"); 7| assert(formatNumber(1234567891234.0, 1) == "1234567891234"); | | // Test round off cases 7| assert(formatNumber(0.6, 0) == "1"); 7| assert(formatNumber(0.6, 1) == "0.6"); 7| assert(formatNumber(0.06, 0) == "0"); 7| assert(formatNumber(0.06, 1) == "0.1"); 7| assert(formatNumber(0.06, 2) == "0.06"); 7| assert(formatNumber(0.06, 3) == "0.06"); 7| assert(formatNumber(9.49999, 0) == "9"); 7| assert(formatNumber(9.49999, 1) == "9.5"); 7| assert(formatNumber(9.6, 0) == "10"); 7| assert(formatNumber(9.6, 1) == "9.6"); 7| assert(formatNumber(99.99, 0) == "100"); 7| assert(formatNumber(99.99, 1) == "100"); 7| assert(formatNumber(99.99, 2) == "99.99"); 7| assert(formatNumber(9999.9996, 3) == "10000"); 7| assert(formatNumber(9999.9996, 4) == "9999.9996"); 7| assert(formatNumber(99999.99996, 4) == "100000"); 7| assert(formatNumber(99999.99996, 5) == "99999.99996"); 7| assert(formatNumber(999999.999996, 5) == "1000000"); 7| assert(formatNumber(999999.999996, 6) == "999999.999996"); | | /* Turn off precision, the 'human readable' style. | * Note: Remains o if both are zero (first test). If it becomes desirable to support | * turning it off when for the precision equal zero case the simple extension is to | * allow the 'human readable' precision template parameter to be negative. 
| */ 7| assert(formatNumber!(double, 0)(999.123412, 0) == "999"); 7| assert(formatNumber!(double, 0)(999.123412, 1) == "1e+03"); 7| assert(formatNumber!(double, 0)(999.123412, 2) == "1e+03"); 7| assert(formatNumber!(double, 0)(999.123412, 3) == "999"); 7| assert(formatNumber!(double, 0)(999.123412, 4) == "999.1"); | | // Default number printing 7| assert(formatNumber(1.2) == "1.2"); 7| assert(formatNumber(12.3) == "12.3"); 7| assert(formatNumber(12.34) == "12.34"); 7| assert(formatNumber(123.45) == "123.45"); 7| assert(formatNumber(123.456) == "123.456"); 7| assert(formatNumber(1234.567) == "1234.567"); 7| assert(formatNumber(1234.5678) == "1234.5678"); 7| assert(formatNumber(12345.6789) == "12345.6789"); 7| assert(formatNumber(12345.67891) == "12345.67891"); 7| assert(formatNumber(123456.78912) == "123456.78912"); 7| assert(formatNumber(123456.789123) == "123456.789123"); 7| assert(formatNumber(1234567.891234) == "1234567.89123"); 7| assert(formatNumber(12345678.912345) == "12345678.9123"); 7| assert(formatNumber(123456789.12345) == "123456789.123"); 7| assert(formatNumber(1234567891.2345) == "1234567891.23"); 7| assert(formatNumber(12345678912.345) == "12345678912.3"); 7| assert(formatNumber(123456789123.45) == "123456789123"); 7| assert(formatNumber(1234567891234.5) == "1.23456789123e+12"); 7| assert(formatNumber(12345678912345.6) == "1.23456789123e+13"); 7| assert(formatNumber(123456789123456.0) == "123456789123456"); 7| assert(formatNumber(0.3) == "0.3"); 7| assert(formatNumber(0.03) == "0.03"); 7| assert(formatNumber(0.003) == "0.003"); 7| assert(formatNumber(0.0003) == "0.0003"); 7| assert(formatNumber(0.00003) == "3e-05" || formatNumber(0.00003) == "3e-5"); 7| assert(formatNumber(0.000003) == "3e-06" || formatNumber(0.000003) == "3e-6"); 7| assert(formatNumber(0.0000003) == "3e-07" || formatNumber(0.0000003) == "3e-7"); | | // Large number inside and outside the contiguous integer representation range 7| double dlarge = 2.0^^(double.mant_dig - 2) - 10.0; 7| 
double dhuge = 2.0^^(double.mant_dig + 1) + 1000.0; | 7| assert(dlarge.formatNumber == format("%d", dlarge.to!long)); 7| assert(dhuge.formatNumber!(double) == format("%.12g", dhuge)); | | // Negative values - Repeat most of above tests. 7| assert(formatNumber(-0.0) == "-0" || formatNumber(-0.0) == "0"); 7| assert(formatNumber(-0.2) == "-0.2"); 7| assert(formatNumber(-0.123412, 0) == "-0"); 7| assert(formatNumber(-0.123412, 1) == "-0.1"); 7| assert(formatNumber(-0.123412, 2) == "-0.12"); 7| assert(formatNumber(-0.123412, 5) == "-0.12341"); 7| assert(formatNumber(-0.123412, 6) == "-0.123412"); 7| assert(formatNumber(-0.123412, 7) == "-0.123412"); 7| assert(formatNumber(-9.123412, 5) == "-9.12341"); 7| assert(formatNumber(-9.123412, 6) == "-9.123412"); 7| assert(formatNumber(-99.123412, 5) == "-99.12341"); 7| assert(formatNumber(-99.123412, 6) == "-99.123412"); 7| assert(formatNumber(-99.123412, 7) == "-99.12341"); 7| assert(formatNumber(-999.123412, 0) == "-999"); 7| assert(formatNumber(-999.123412, 1) == "-999.1"); 7| assert(formatNumber(-999.123412, 2) == "-999.12"); 7| assert(formatNumber(-999.123412, 3) == "-999.123"); 7| assert(formatNumber(-999.123412, 4) == "-999.1234"); 7| assert(formatNumber(-999.123412, 5) == "-999.12341"); 7| assert(formatNumber(-999.123412, 6) == "-999.123412"); 7| assert(formatNumber(-999.123412, 7) == "-999.1234"); 7| assert(formatNumber!(double, 9)(-999.12341234, 7) == "-999.1234123"); 7| assert(formatNumber(-9001.0) == "-9001"); 7| assert(formatNumber(-1234567891234.0) == "-1234567891234"); 7| assert(formatNumber(-1234567891234.0, 0) == "-1234567891234"); 7| assert(formatNumber(-1234567891234.0, 1) == "-1234567891234"); | | // Test round off cases 7| assert(formatNumber(-0.6, 0) == "-1"); 7| assert(formatNumber(-0.6, 1) == "-0.6"); 7| assert(formatNumber(-0.06, 0) == "-0"); 7| assert(formatNumber(-0.06, 1) == "-0.1"); 7| assert(formatNumber(-0.06, 2) == "-0.06"); 7| assert(formatNumber(-0.06, 3) == "-0.06"); 7| 
assert(formatNumber(-9.49999, 0) == "-9"); 7| assert(formatNumber(-9.49999, 1) == "-9.5"); 7| assert(formatNumber(-9.6, 0) == "-10"); 7| assert(formatNumber(-9.6, 1) == "-9.6"); 7| assert(formatNumber(-99.99, 0) == "-100"); 7| assert(formatNumber(-99.99, 1) == "-100"); 7| assert(formatNumber(-99.99, 2) == "-99.99"); 7| assert(formatNumber(-9999.9996, 3) == "-10000"); 7| assert(formatNumber(-9999.9996, 4) == "-9999.9996"); 7| assert(formatNumber(-99999.99996, 4) == "-100000"); 7| assert(formatNumber(-99999.99996, 5) == "-99999.99996"); 7| assert(formatNumber(-999999.999996, 5) == "-1000000"); 7| assert(formatNumber(-999999.999996, 6) == "-999999.999996"); | 7| assert(formatNumber!(double, 0)(-999.123412, 0) == "-999"); 7| assert(formatNumber!(double, 0)(-999.123412, 1) == "-1e+03"); 7| assert(formatNumber!(double, 0)(-999.123412, 2) == "-1e+03"); 7| assert(formatNumber!(double, 0)(-999.123412, 3) == "-999"); 7| assert(formatNumber!(double, 0)(-999.123412, 4) == "-999.1"); | | // Default number printing 7| assert(formatNumber(-1.2) == "-1.2"); 7| assert(formatNumber(-12.3) == "-12.3"); 7| assert(formatNumber(-12.34) == "-12.34"); 7| assert(formatNumber(-123.45) == "-123.45"); 7| assert(formatNumber(-123.456) == "-123.456"); 7| assert(formatNumber(-1234.567) == "-1234.567"); 7| assert(formatNumber(-1234.5678) == "-1234.5678"); 7| assert(formatNumber(-12345.6789) == "-12345.6789"); 7| assert(formatNumber(-12345.67891) == "-12345.67891"); 7| assert(formatNumber(-123456.78912) == "-123456.78912"); 7| assert(formatNumber(-123456.789123) == "-123456.789123"); 7| assert(formatNumber(-1234567.891234) == "-1234567.89123"); | 7| assert(formatNumber(-12345678.912345) == "-12345678.9123"); 7| assert(formatNumber(-123456789.12345) == "-123456789.123"); 7| assert(formatNumber(-1234567891.2345) == "-1234567891.23"); 7| assert(formatNumber(-12345678912.345) == "-12345678912.3"); 7| assert(formatNumber(-123456789123.45) == "-123456789123"); 7| assert(formatNumber(-1234567891234.5) == 
"-1.23456789123e+12"); 7| assert(formatNumber(-12345678912345.6) == "-1.23456789123e+13"); 7| assert(formatNumber(-123456789123456.0) == "-123456789123456"); | 7| assert(formatNumber(-0.3) == "-0.3"); 7| assert(formatNumber(-0.03) == "-0.03"); 7| assert(formatNumber(-0.003) == "-0.003"); 7| assert(formatNumber(-0.0003) == "-0.0003"); 7| assert(formatNumber(-0.00003) == "-3e-05" || formatNumber(-0.00003) == "-3e-5"); 7| assert(formatNumber(-0.000003) == "-3e-06" || formatNumber(-0.000003) == "-3e-6"); 7| assert(formatNumber(-0.0000003) == "-3e-07" || formatNumber(-0.0000003) == "-3e-7"); | 7| const double dlargeNeg = -2.0^^(double.mant_dig - 2) + 10.0; 7| immutable double dhugeNeg = -2.0^^(double.mant_dig + 1) - 1000.0; | 7| assert(dlargeNeg.formatNumber == format("%d", dlargeNeg.to!long)); 7| assert(dhugeNeg.formatNumber!(double) == format("%.12g", dhugeNeg)); | | // Type qualifiers 14| const double b1 = 0.0; assert(formatNumber(b1) == "0"); 14| const double b2 = 0.2; assert(formatNumber(b2) == "0.2"); 14| const double b3 = 0.123412; assert(formatNumber(b3, 2) == "0.12"); 14| immutable double b4 = 99.123412; assert(formatNumber(b4, 5) == "99.12341"); 14| immutable double b5 = 99.123412; assert(formatNumber(b5, 7) == "99.12341"); | | // Special values 7| assert(formatNumber(double.nan) == "nan"); 7| assert(formatNumber(double.nan, 0) == "nan"); 7| assert(formatNumber(double.nan, 1) == "nan"); 7| assert(formatNumber(double.nan, 9) == "nan"); 7| assert(formatNumber(double.infinity) == "inf"); 7| assert(formatNumber(double.infinity, 0) == "inf"); 7| assert(formatNumber(double.infinity, 1) == "inf"); 7| assert(formatNumber(double.infinity, 9) == "inf"); | | // Float. Mix negative and type qualifiers in. 
7| assert(formatNumber(0.0f) == "0"); 7| assert(formatNumber(0.5f) == "0.5"); 7| assert(formatNumber(0.123412f, 0) == "0"); 7| assert(formatNumber(0.123412f, 1) == "0.1"); 7| assert(formatNumber(-0.123412f, 2) == "-0.12"); 7| assert(formatNumber(9.123412f, 5) == "9.12341"); 7| assert(formatNumber(9.123412f, 6) == "9.123412"); 7| assert(formatNumber(-99.123412f, 5) == "-99.12341"); 7| assert(formatNumber(99.123412f, 7) == "99.12341"); 7| assert(formatNumber(-999.123412f, 5) == "-999.12341"); | 14| float c1 = 999.123412f; assert(formatNumber(c1, 7) == "999.1234"); 14| float c2 = 999.1234f; assert(formatNumber!(float, 9)(c2, 3) == "999.123"); 14| const float c3 = 9001.0f; assert(formatNumber(c3) == "9001"); 14| const float c4 = -12345678.0f; assert(formatNumber(c4) == "-12345678"); 14| immutable float c5 = 12345678.0f; assert(formatNumber(c5, 0) == "12345678"); 14| immutable float c6 = 12345678.0f; assert(formatNumber(c6, 1) == "12345678"); | 7| float flarge = 2.0^^(float.mant_dig - 2) - 10.0; 7| float fhuge = 2.0^^(float.mant_dig + 1) + 1000.0; | 7| assert(flarge.formatNumber == format("%d", flarge.to!long)); 7| assert(fhuge.formatNumber!(float, 12) == format("%.12g", fhuge)); | | // Reals - No special formatting 14| real d1 = 2.0^^(double.mant_dig) - 1000.0; assert(formatNumber(d1) == format("%.12g", d1)); 14| real d2 = 123456789.12341234L; assert(formatNumber(d2, 12) == format("%.12g", d2)); |} | |/** |rangeMedian. Finds the median. Modifies the range via topN or sort in the process. | |Note: topN is the preferred algorithm, but the version prior to Phobos 2.073 |is pathologically slow on certain data sets. Use topN in 2.073 and later, |sort in earlier versions. 
| |See: https://issues.dlang.org/show_bug.cgi?id=16517 | https://github.com/dlang/phobos/pull/4815 | http://forum.dlang.org/post/ujuugklmbibuheptdwcn@forum.dlang.org |*/ |static if (__VERSION__ >= 2073) |{ | version = rangeMedianViaTopN; |} |else |{ | version = rangeMedianViaSort; |} | |auto rangeMedian (Range) (Range r) |if (isRandomAccessRange!Range && hasLength!Range && hasSlicing!Range) |{ | version(rangeMedianViaSort) | { | version(rangeMedianViaTopN) | { | assert(0, "Both rangeMedianViaSort and rangeMedianViaTopN assigned as versions. Assign only one."); | } | } | else version(rangeMedianViaTopN) | { | } | else | { | static assert(0, "A version of rangeMedianViaSort or rangeMedianViaTopN must be assigned."); | } | | import std.traits : isFloatingPoint; | 42117| ElementType!Range median; | 42117| if (r.length > 0) | { 41971| size_t medianIndex = r.length / 2; | | version(rangeMedianViaSort) | { | import std.algorithm : sort; | sort(r); | median = r[medianIndex]; | | static if (isFloatingPoint!(ElementType!Range)) | { | if (r.length % 2 == 0) | { | /* Even number of values. Split the difference. */ | median = (median + r[medianIndex - 1]) / 2.0; | } | } | } | else version(rangeMedianViaTopN) | { | import std.algorithm : maxElement, topN; 41971| topN(r, medianIndex); 41971| median = r[medianIndex]; | | static if (isFloatingPoint!(ElementType!Range)) | { 41831| if (r.length % 2 == 0) | { | /* Even number of values. Split the difference. */ 5374| if (r[medianIndex - 1] < median) | { 5319| median = (median + r[0..medianIndex].maxElement) / 2.0; | } | } | } | } | else | { | static assert(0, "A version of rangeMedianViaSort or rangeMedianViaTopN must be assigned."); | } | } | 42117| return median; |} | |/* rangeMedian unit tests. */ |@safe unittest |{ | import std.math : isNaN; | import std.algorithm : all, permutations; | | // Median of empty range is (type).init. 
Zero for int, nan for floats/doubles 7| assert(rangeMedian(new int[0]) == int.init); 14| assert(rangeMedian(new double[0]).isNaN && double.init.isNaN); 7| assert(rangeMedian(new string[0]) == ""); | 7| assert(rangeMedian([3]) == 3); 7| assert(rangeMedian([3.0]) == 3.0); 7| assert(rangeMedian([3.5]) == 3.5); 7| assert(rangeMedian(["aaa"]) == "aaa"); | | /* Even number of elements: Split the difference for floating point, but not other types. */ 7| assert(rangeMedian([3, 4]) == 4); 7| assert(rangeMedian([3.0, 4.0]) == 3.5); | 7| assert(rangeMedian([3, 6, 12]) == 6); 7| assert(rangeMedian([3.0, 6.5, 12.5]) == 6.5); | | // Do the rest with permutations 21| assert([4, 7].permutations.all!(x => (x.rangeMedian == 7))); 21| assert([4.0, 7.0].permutations.all!(x => (x.rangeMedian == 5.5))); 21| assert(["aaa", "bbb"].permutations.all!(x => (x.rangeMedian == "bbb"))); | 49| assert([4, 7, 19].permutations.all!(x => (x.rangeMedian == 7))); 49| assert([4.5, 7.5, 19.5].permutations.all!(x => (x.rangeMedian == 7.5))); 49| assert(["aaa", "bbb", "ccc"].permutations.all!(x => (x.rangeMedian == "bbb"))); | 175| assert([4.5, 7.5, 19.5, 21.0].permutations.all!(x => (x.rangeMedian == 13.5))); 847| assert([4.5, 7.5, 19.5, 20.5, 36.0].permutations.all!(x => (x.rangeMedian == 19.5))); 5047| assert([4.5, 7.5, 19.5, 24.0, 24.5, 25.0].permutations.all!(x => (x.rangeMedian == 21.75))); 35287| assert([1.5, 3.25, 3.55, 4.5, 24.5, 25.0, 25.6].permutations.all!(x => (x.rangeMedian == 4.5))); |} | |/// Quantiles | |/** The different quantile interpolation methods. 
| * See: https://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html | */ |enum QuantileInterpolation |{ | R1 = 1, /// R quantile type 1 | R2 = 2, /// R quantile type 2 | R3 = 3, /// R quantile type 3 | R4 = 4, /// R quantile type 4 | R5 = 5, /// R quantile type 5 | R6 = 6, /// R quantile type 6 | R7 = 7, /// R quantile type 7 | R8 = 8, /// R quantile type 8 | R9 = 9, /// R quantile type 9 |} | | |import std.traits : isFloatingPoint, isNumeric, Unqual; |import std.range; | |/** |Returns the quantile in a data vector for a cumulative probability. | |Takes a data vector and a probability and returns the quantile cut point for the |probability. The vector must be sorted and the probability in the range [0.0, 1.0]. |The interpolation methods available are the same as in R and available in a number |of statistical packages. See the R documentation or wikipedia for details |(https://en.wikipedia.org/wiki/Quantile). | |Examples: |---- |double data = [22, 57, 73, 97, 113]; |double median = quantile(0.5, data); // 73 |auto q1 = [0.25, 0.5, 0.75].map!(p => p.quantile(data)); // 57, 73, 97 |auto q2 = [0.25, 0.5, 0.75].map!(p => p.quantile(data), QuantileInterpolation.R8); //45.3333, 73, 102.333 |---- |*/ |double quantile(ProbType, Range) | (const ProbType prob, Range data, QuantileInterpolation method = QuantileInterpolation.R7) |if (isRandomAccessRange!Range && hasLength!Range && isNumeric!(ElementType!Range) && | isFloatingPoint!ProbType) |in |{ | import std.algorithm : isSorted; 35354| assert(0.0 <= prob && prob <= 1.0); 35354| assert(method >= QuantileInterpolation.min && method <= QuantileInterpolation.max); 17677| assert(data.isSorted); |} |do |{ | import core.stdc.math : modf; | import std.algorithm : max, min; | import std.conv : to; | import std.math : ceil, lrint; | | /* Note: In the implementation below, 'h1' is the 1-based index into the data vector. | * This follows the wikipedia notation for the interpolation methods. 
One will be | * subtracted before the vector is accessed. | */ | 17677| double q = double.nan; // The return value. | 18601| if (data.length == 1) q = data[0].to!double; 16753| else if (data.length > 1) | { 15829| if (method == QuantileInterpolation.R1) | { 1750| q = data[((data.length * prob).ceil - 1.0).to!long.max(0)].to!double; | } 14079| else if (method == QuantileInterpolation.R2) | { 1729| immutable double h1 = data.length * prob + 0.5; 1729| immutable size_t lo = ((h1 - 0.5).ceil.to!long - 1).max(0); 1729| immutable size_t hi = ((h1 + 0.5).to!size_t - 1).min(data.length - 1); 1729| q = (data[lo].to!double + data[hi].to!double) / 2.0; | } 12350| else if (method == QuantileInterpolation.R3) | { | /* Implementation notes: | * - R3 uses 'banker's rounding', where 0.5 is rounded to the nearest even | * value. The 'lrint' routine does this. | * - DMD will sometimes choose the incorrect 0.5 rounding if the calculation | * is done as a single step. The separate calculation of 'h1' avoids this. | */ 1729| immutable double h1 = data.length * prob; 1729| q = data[h1.lrint.max(1) - 1].to!double; | } 10621| else if ((method == QuantileInterpolation.R4) || 8892| (method == QuantileInterpolation.R5) || 7163| (method == QuantileInterpolation.R6) || 5434| (method == QuantileInterpolation.R7) || 3458| (method == QuantileInterpolation.R8) || 1729| (method == QuantileInterpolation.R9)) | { | /* Methods 4-9 have different formulas for generating the real-valued index, | * but work the same after that, choosing the final value by linear interpolation. 
| */ 10621| double h1; 10621| switch (method) | { 5187| case QuantileInterpolation.R4: h1 = data.length * prob; break; 5187| case QuantileInterpolation.R5: h1 = data.length * prob + 0.5; break; 5187| case QuantileInterpolation.R6: h1 = (data.length + 1) * prob; break; 5928| case QuantileInterpolation.R7: h1 = (data.length - 1) * prob + 1.0; break; 5187| case QuantileInterpolation.R8: h1 = (data.length.to!double + 1.0/3.0) * prob + 1.0/3.0; break; 5187| case QuantileInterpolation.R9: h1 = (data.length + 0.25) * prob + 3.0/8.0; break; 0000000| default: assert(0); | } | 10621| double h1IntegerPart; 10621| immutable double h1FractionPart = modf(h1, &h1IntegerPart); 10621| immutable size_t lo = (h1IntegerPart - 1.0).to!long.max(0).min(data.length - 1); 10621| q = data[lo]; 10621| if (h1FractionPart > 0.0) | { 8788| immutable size_t hi = h1IntegerPart.to!long.min(data.length - 1); 8788| q += h1FractionPart * (data[hi].to!double - data[lo].to!double); | } | } 0000000| else assert(0); | } 17677| return q; |} | |unittest |{ | import std.algorithm : equal, map; | import std.array : array; | import std.traits : EnumMembers; | | /* A couple simple tests. */ 7| assert(quantile(0.5, [22, 57, 73, 97, 113]) == 73); 7| assert(quantile(0.5, [22.5, 57.5, 73.5, 97.5, 113.5]) == 73.5); 28| assert([0.25, 0.5, 0.75].map!(p => p.quantile([22, 57, 73, 97, 113])).array == [57.0, 73.0, 97.0]); 28| assert([0.25, 0.5, 0.75].map!(p => p.quantile([22, 57, 73, 97, 113], QuantileInterpolation.R1)).array == [57.0, 73.0, 97.0]); | | /* Data arrays. 
*/ 7| double[] d1 = []; 7| double[] d2 = [5.5]; 7| double[] d3 = [0.0, 1.0]; 7| double[] d4 = [-1.0, 1.0]; 7| double[] d5 = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]; 7| double[] d6 = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]; 7| double[] d7 = [ 31.79, 64.19, 81.77]; 7| double[] d8 = [-94.43, -74.55, -50.81, 27.45, 78.79]; 7| double[] d9 = [-89.17, 20.93, 38.51, 48.03, 76.43, 77.02]; 7| double[] d10 = [-99.53, -76.87, -76.69, -67.81, -40.26, -11.29, 21.02]; 7| double[] d11 = [-78.32, -52.22, -50.86, 13.45, 15.96, 17.25, 46.35, 85.00]; 7| double[] d12 = [-81.36, -70.87, -53.56, -42.14, -9.18, 7.23, 49.52, 80.43, 98.50]; 7| double[] d13 = [ 38.37, 44.36, 45.70, 50.69, 51.36, 55.66, 56.91, 58.95, 62.01, 65.25]; | | /* Spot check a few other data types. Same expected outputs.*/ 7| int[] d3Int = [0, 1]; 7| int[] d4Int = [-1, 1]; 7| int[] d5Int = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; 7| size_t[] d6Size_t = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; 7| float[] d7Float = [ 31.79f, 64.19f, 81.77f]; 7| float[] d8Float = [-94.43f, -74.55f, -50.81f, 27.45f, 78.79f]; 7| float[] d9Float = [-89.17f, 20.93f, 38.51f, 48.03f, 76.43f, 77.02f]; 7| float[] d10Float = [-99.53f, -76.87f, -76.69f, -67.81f, -40.26f, -11.29f, 21.02f]; | | /* Probability values. */ 7| double[] probs = [0.0, 0.05, 0.1, 0.25, 0.4, 0.49, 0.5, 0.51, 0.75, 0.9, 0.95, 0.98, 1.0]; | | /* Expected values for each data array, for 'probs'. One expected result for each of the nine methods. | * The expected values were generated by R and Octave. | */ 7| double[13][9] d1_expected; // All values double.nan, the default. 
7| double[13][9] d2_expected = [ | [5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5], | [5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5], | [5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5], | [5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5], | [5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5], | [5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5], | [5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5], | [5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5], | [5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5], | ]; 7| double[13][9] d3_expected = [ | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0], | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02, 0.5, 0.8, 0.9, 0.96, 1.0], | [0.0, 0.0, 0.0, 0.0, 0.3, 0.48, 0.5, 0.52, 1.0, 1.0, 1.0, 1.0, 1.0], | [0.0, 0.0, 0.0, 0.0, 0.2, 0.47, 0.5, 0.53, 1.0, 1.0, 1.0, 1.0, 1.0], | [0.0, 0.05, 0.1, 0.25, 0.4, 0.49, 0.5, 0.51, 0.75, 0.9, 0.95, 0.98, 1.0], | [0.0, 0.0, 0.0, 0.0, 0.2666667, 0.4766667, 0.5, 0.5233333, 1.0, 1.0, 1.0, 1.0, 1.0], | [0.0, 0.0, 0.0, 0.0, 0.275, 0.4775, 0.5, 0.5225, 1.0, 1.0, 1.0, 1.0, 1.0], | ]; 7| double[13][9] d4_expected = [ | [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], | [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], | [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0], | [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -0.96, 0.0, 0.6, 0.8, 0.92, 1.0], | [-1.0, -1.0, -1.0, -1.0, -0.4, -0.04, 0.0, 0.04, 1.0, 1.0, 1.0, 1.0, 1.0], | [-1.0, -1.0, -1.0, -1.0, -0.6, -0.06, 0.0, 0.06, 1.0, 1.0, 1.0, 1.0, 1.0], | [-1.0, -0.9, -0.8, -0.5, -0.2, -0.02, 0.0, 0.02, 0.5, 0.8, 0.9, 0.96, 1.0], | [-1.0, -1.0, -1.0, -1.0, -0.4666667, -0.04666667, -4.440892e-16, 
0.04666667, 1.0, 1.0, 1.0, 1.0, 1.0], | [-1.0, -1.0, -1.0, -1.0, -0.45, -0.045, 0.0, 0.045, 1.0, 1.0, 1.0, 1.0, 1.0], | ]; 7| double[13][9] d5_expected = [ | [0.0, 0.0, 1.0, 2.0, 4.0, 5.0, 5.0, 5.0, 8.0, 9.0, 10.0, 10.0, 10.0], | [0.0, 0.0, 1.0, 2.0, 4.0, 5.0, 5.0, 5.0, 8.0, 9.0, 10.0, 10.0, 10.0], | [0.0, 0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 5.0, 7.0, 9.0, 9.0, 10.0, 10.0], | [0.0, 0.0, 0.1, 1.75, 3.4, 4.39, 4.5, 4.61, 7.25, 8.9, 9.45, 9.78, 10.0], | [0.0, 0.05, 0.6, 2.25, 3.9, 4.89, 5.0, 5.11, 7.75, 9.4, 9.95, 10.0, 10.0], | [0.0, 0.0, 0.2, 2.0, 3.8, 4.88, 5.0, 5.12, 8.0, 9.8, 10.0, 10.0, 10.0], | [0.0, 0.5, 1.0, 2.5, 4.0, 4.9, 5.0, 5.1, 7.5, 9.0, 9.5, 9.8, 10.0], | [0.0, 0.0, 0.4666667, 2.166667, 3.866667, 4.886667, 5.0, 5.113333, 7.833333, 9.533333, 10.0, 10.0, 10.0], | [0.0, 0.0, 0.5, 2.1875, 3.875, 4.8875, 5.0, 5.1125, 7.8125, 9.5, 10.0, 10.0, 10.0], | ]; 7| double[13][9] d6_expected = [ | [0.0, 0.0, 1.0, 2.0, 4.0, 5.0, 5.0, 6.0, 8.0, 10.0, 11.0, 11.0, 11.0], | [0.0, 0.0, 1.0, 2.5, 4.0, 5.0, 5.5, 6.0, 8.5, 10.0, 11.0, 11.0, 11.0], | [0.0, 0.0, 0.0, 2.0, 4.0, 5.0, 5.0, 5.0, 8.0, 10.0, 10.0, 11.0, 11.0], | [0.0, 0.0, 0.2, 2.0, 3.8, 4.88, 5.0, 5.12, 8.0, 9.8, 10.4, 10.76, 11.0], | [0.0, 0.1, 0.7, 2.5, 4.3, 5.38, 5.5, 5.62, 8.5, 10.3, 10.9, 11.0, 11.0], | [0.0, 0.0, 0.3, 2.25, 4.2, 5.37, 5.5, 5.63, 8.75, 10.7, 11.0, 11.0, 11.0], | [0.0, 0.55, 1.1, 2.75, 4.4, 5.39, 5.5, 5.61, 8.25, 9.9, 10.45, 10.78, 11.0], | [0.0, 0.0, 0.5666667, 2.416667, 4.266667, 5.376667, 5.5, 5.623333, 8.583333, 10.43333, 11.0, 11.0, 11.0], | [0.0, 0.0, 0.6, 2.4375, 4.275, 5.3775, 5.5, 5.6225, 8.5625, 10.4, 11.0, 11.0, 11.0], | ]; 7| double[13][9] d7_expected = [ | [31.79, 31.79, 31.79, 31.79, 64.19, 64.19, 64.19, 64.19, 81.77, 81.77, 81.77, 81.77, 81.77], | [31.79, 31.79, 31.79, 31.79, 64.19, 64.19, 64.19, 64.19, 81.77, 81.77, 81.77, 81.77, 81.77], | [31.79, 31.79, 31.79, 31.79, 31.79, 31.79, 64.19, 64.19, 64.19, 81.77, 81.77, 81.77, 81.77], | [31.79, 31.79, 31.79, 31.79, 38.27, 47.018, 47.99, 
48.962, 68.585, 76.496, 79.133, 80.7152, 81.77], | [31.79, 31.79, 31.79, 39.89, 54.47, 63.218, 64.19, 64.7174, 77.375, 81.77, 81.77, 81.77, 81.77], | [31.79, 31.79, 31.79, 31.79, 51.23, 62.894, 64.19, 64.8932, 81.77, 81.77, 81.77, 81.77, 81.77], | [31.79, 35.03, 38.27, 47.99, 57.71, 63.542, 64.19, 64.5416, 72.98, 78.254, 80.012, 81.0668, 81.77], | [31.79, 31.79, 31.79, 37.19, 53.39, 63.11, 64.19, 64.776, 78.84, 81.77, 81.77, 81.77, 81.77], | [31.79, 31.79, 31.79, 37.865, 53.66, 63.137, 64.19, 64.76135, 78.47375, 81.77, 81.77, 81.77, 81.77], | ]; 7| double[13][9] d8_expected = [ | [-94.43, -94.43, -94.43, -74.55, -74.55, -50.81, -50.81, -50.81, 27.45, 78.79, 78.79, 78.79, 78.79], | [-94.43, -94.43, -94.43, -74.55, -62.68, -50.81, -50.81, -50.81, 27.45, 78.79, 78.79, 78.79, 78.79], | [-94.43, -94.43, -94.43, -94.43, -74.55, -74.55, -74.55, -50.81, 27.45, 27.45, 78.79, 78.79, 78.79], | [-94.43, -94.43, -94.43, -89.46, -74.55, -63.867, -62.68, -61.493, 7.885, 53.12, 65.955, 73.656, 78.79], | [-94.43, -94.43, -94.43, -79.52, -62.68, -51.997, -50.81, -46.897, 40.285, 78.79, 78.79, 78.79, 78.79], | [-94.43, -94.43, -94.43, -84.49, -65.054, -52.2344, -50.81, -46.1144, 53.12, 78.79, 78.79, 78.79, 78.79], | [-94.43, -90.454, -86.478, -74.55, -60.306, -51.7596, -50.81, -47.6796, 27.45, 58.254, 68.522, 74.6828, 78.79], | [-94.43, -94.43, -94.43, -81.17667, -63.47133, -52.07613, -50.81, -46.63613, 44.56333, 78.79, 78.79, 78.79, 78.79], | [-94.43, -94.43, -94.43, -80.7625, -63.2735, -52.05635, -50.81, -46.70135, 43.49375, 78.79, 78.79, 78.79, 78.79], | ]; 7| double[13][9] d9_expected = [ | [-89.17, -89.17, -89.17, 20.93, 38.51, 38.51, 38.51, 48.03, 76.43, 77.02, 77.02, 77.02, 77.02], | [-89.17, -89.17, -89.17, 20.93, 38.51, 38.51, 43.27, 48.03, 76.43, 77.02, 77.02, 77.02, 77.02], | [-89.17, -89.17, -89.17, 20.93, 20.93, 38.51, 38.51, 38.51, 48.03, 76.43, 77.02, 77.02, 77.02], | [-89.17, -89.17, -89.17, -34.12, 27.962, 37.4552, 38.51, 39.0812, 62.23, 76.666, 76.843, 76.9492, 
77.02], | [-89.17, -89.17, -78.16, 20.93, 36.752, 42.6988, 43.27, 43.8412, 76.43, 76.961, 77.02, 77.02, 77.02], | [-89.17, -89.17, -89.17, -6.595, 34.994, 42.6036, 43.27, 43.9364, 76.5775, 77.02, 77.02, 77.02, 77.02], | [-89.17, -61.645, -34.12, 25.325, 38.51, 42.794, 43.27, 43.746, 69.33, 76.725, 76.8725, 76.961, 77.02], | [-89.17, -89.17, -89.17, 11.755, 36.166, 42.66707, 43.27, 43.87293, 76.47917, 77.02, 77.02, 77.02, 77.02], | [-89.17, -89.17, -89.17, 14.04875, 36.3125, 42.675, 43.27, 43.865, 76.46688, 77.02, 77.02, 77.02, 77.02], | ]; 7| double[13][9] d10_expected = [ | [-99.53, -99.53, -99.53, -76.87, -76.69, -67.81, -67.81, -67.81, -11.29, 21.02, 21.02, 21.02, 21.02], | [-99.53, -99.53, -99.53, -76.87, -76.69, -67.81, -67.81, -67.81, -11.29, 21.02, 21.02, 21.02, 21.02], | [-99.53, -99.53, -99.53, -76.87, -76.69, -76.69, -67.81, -67.81, -40.26, -11.29, 21.02, 21.02, 21.02], | [-99.53, -99.53, -99.53, -82.535, -76.726, -72.8716, -72.25, -71.6284, -33.0175, -1.597, 9.7115, 16.4966, 21.02], | [-99.53, -99.53, -94.998, -76.825, -74.026, -68.4316, -67.81, -65.8815, -18.5325, 14.558, 21.02, 21.02, 21.02], | [-99.53, -99.53, -99.53, -76.87, -74.914, -68.5204, -67.81, -65.606, -11.29, 21.02, 21.02, 21.02, 21.02], | [-99.53, -92.732, -85.934, -76.78, -73.138, -68.3428, -67.81, -66.157, -25.775, 1.634, 11.327, 17.1428, 21.02], | [-99.53, -99.53, -98.01933, -76.84, -74.322, -68.4612, -67.81, -65.78967, -16.11833, 18.866, 21.02, 21.02, 21.02], | [-99.53, -99.53, -97.264, -76.83625, -74.248, -68.4538, -67.81, -65.81263, -16.72187, 17.789, 21.02, 21.02, 21.02], | ]; 7| double[13][9] d11_expected = [ | [-78.32, -78.32, -78.32, -52.22, 13.45, 13.45, 13.45, 15.96, 17.25, 85.0, 85.0, 85.0, 85.0], | [-78.32, -78.32, -78.32, -51.54, 13.45, 13.45, 14.705, 15.96, 31.8, 85.0, 85.0, 85.0, 85.0], | [-78.32, -78.32, -78.32, -52.22, -50.86, 13.45, 13.45, 13.45, 17.25, 46.35, 85.0, 85.0, 85.0], | [-78.32, -78.32, -78.32, -52.22, -37.998, 8.3052, 13.45, 13.6508, 17.25, 54.08, 69.54, 
78.816, 85.0], | [-78.32, -78.32, -70.49, -51.54, -5.843, 14.5042, 14.705, 14.9058, 31.8, 73.405, 85.0, 85.0, 85.0], | [-78.32, -78.32, -78.32, -51.88, -12.274, 14.4791, 14.705, 14.9309, 39.075, 85.0, 85.0, 85.0, 85.0], | [-78.32, -69.185, -60.05, -51.2, 0.588, 14.5293, 14.705, 14.8807, 24.525, 57.945, 71.4725, 79.589, 85.0], | [-78.32, -78.32, -73.97, -51.65333, -7.986667, 14.49583, 14.705, 14.91417, 34.225, 78.55833, 85.0, 85.0, 85.0], | [-78.32, -78.32, -73.1, -51.625, -7.45075, 14.49792, 14.705, 14.91208, 33.61875, 77.27, 85.0, 85.0, 85.0], | ]; 7| double[13][9] d12_expected = [ | [-81.36, -81.36, -81.36, -53.56, -42.14, -9.18, -9.18, -9.18, 49.52, 98.5, 98.5, 98.5, 98.5], | [-81.36, -81.36, -81.36, -53.56, -42.14, -9.18, -9.18, -9.18, 49.52, 98.5, 98.5, 98.5, 98.5], | [-81.36, -81.36, -81.36, -70.87, -42.14, -42.14, -42.14, -9.18, 49.52, 80.43, 98.5, 98.5, 98.5], | [-81.36, -81.36, -81.36, -66.5425, -46.708, -28.6264, -25.66, -22.6936, 38.9475, 82.237, 90.3685, 95.2474, 98.5], | [-81.36, -81.36, -77.164, -57.8875, -38.844, -12.1464, -9.18, -7.7031, 57.2475, 91.272, 98.5, 98.5, 98.5], | [-81.36, -81.36, -81.36, -62.215, -42.14, -12.476, -9.18, -7.539, 64.975, 98.5, 98.5, 98.5, 98.5], | [-81.36, -77.164, -72.968, -53.56, -35.548, -11.8168, -9.18, -7.8672, 49.52, 84.044, 91.272, 95.6088, 98.5], | [-81.36, -81.36, -78.56267, -59.33, -39.94267, -12.25627, -9.18, -7.6484, 59.82333, 93.68133, 98.5, 98.5, 98.5], | [-81.36, -81.36, -78.213, -58.96938, -39.668, -12.2288, -9.18, -7.662075, 59.17938, 93.079, 98.5, 98.5, 98.5], | ]; 7| double[13][9] d13_expected = [ | [38.37, 38.37, 38.37, 45.7, 50.69, 51.36, 51.36, 55.66, 58.95, 62.01, 65.25, 65.25, 65.25], | [38.37, 38.37, 41.365, 45.7, 51.025, 51.36, 53.51, 55.66, 58.95, 63.63, 65.25, 65.25, 65.25], | [38.37, 38.37, 38.37, 44.36, 50.69, 51.36, 51.36, 51.36, 58.95, 62.01, 65.25, 65.25, 65.25], | [38.37, 38.37, 38.37, 45.03, 50.69, 51.293, 51.36, 51.79, 57.93, 62.01, 63.63, 64.602, 65.25], | [38.37, 38.37, 41.365, 45.7, 
51.025, 53.08, 53.51, 53.94, 58.95, 63.63, 65.25, 65.25, 65.25], | [38.37, 38.37, 38.969, 45.365, 50.958, 53.037, 53.51, 53.983, 59.715, 64.926, 65.25, 65.25, 65.25], | [38.37, 41.0655, 43.761, 46.9475, 51.092, 53.123, 53.51, 53.897, 58.44, 62.334, 63.792, 64.6668, 65.25], | [38.37, 38.37, 40.56633, 45.58833, 51.00267, 53.06567, 53.51, 53.95433, 59.205, 64.062, 65.25, 65.25, 65.25], | [38.37, 38.37, 40.766, 45.61625, 51.00825, 53.06925, 53.51, 53.95075, 59.14125, 63.954, 65.25, 65.25, 65.25], | ]; | | void compareResults(const double[] actual, const double[] expected, string dataset, QuantileInterpolation method) | { | import std.conv : to; | import std.format : format; | import std.math : approxEqual, isNaN; | import std.range : lockstep; | 35721| foreach (i, actualValue, expectedValue; lockstep(actual, expected)) | { 18837| assert(actualValue.approxEqual(expectedValue) || (actualValue.isNaN && expectedValue.isNaN), | format("Quantile unit test failure, dataset %s, method: %s, index: %d, expected: %g, actual: %g", | dataset, method.to!string, i, expectedValue, actualValue)); | } | } | | foreach(methodIndex, method; EnumMembers!QuantileInterpolation) | { 882| compareResults(probs.map!(p => p.quantile(d1, method)).array, d1_expected[methodIndex], "d1", method); 882| compareResults(probs.map!(p => p.quantile(d2, method)).array, d2_expected[methodIndex], "d2", method); 882| compareResults(probs.map!(p => p.quantile(d3, method)).array, d3_expected[methodIndex], "d3", method); 882| compareResults(probs.map!(p => p.quantile(d3Int, method)).array, d3_expected[methodIndex], "d3Int", method); 882| compareResults(probs.map!(p => p.quantile(d4, method)).array, d4_expected[methodIndex], "d4", method); 882| compareResults(probs.map!(p => p.quantile(d4Int, method)).array, d4_expected[methodIndex], "d4Int", method); 882| compareResults(probs.map!(p => p.quantile(d5, method)).array, d5_expected[methodIndex], "d5", method); 882| compareResults(probs.map!(p => p.quantile(d5Int, 
method)).array, d5_expected[methodIndex], "d5Int", method); 882| compareResults(probs.map!(p => p.quantile(d6, method)).array, d6_expected[methodIndex], "d6", method); 882| compareResults(probs.map!(p => p.quantile(d6Size_t, method)).array, d6_expected[methodIndex], "d6Size_t", method); 882| compareResults(probs.map!(p => p.quantile(d7, method)).array, d7_expected[methodIndex], "d7", method); 882| compareResults(probs.map!(p => p.quantile(d7Float, method)).array, d7_expected[methodIndex], "d7Float", method); 882| compareResults(probs.map!(p => p.quantile(d8, method)).array, d8_expected[methodIndex], "d8", method); 882| compareResults(probs.map!(p => p.quantile(d8Float, method)).array, d8_expected[methodIndex], "d8Float", method); 882| compareResults(probs.map!(p => p.quantile(d9, method)).array, d9_expected[methodIndex], "d9", method); 882| compareResults(probs.map!(p => p.quantile(d9Float, method)).array, d9_expected[methodIndex], "d9Float", method); 882| compareResults(probs.map!(p => p.quantile(d10, method)).array, d10_expected[methodIndex], "d10", method); 882| compareResults(probs.map!(p => p.quantile(d10Float, method)).array, d10_expected[methodIndex], "d10Float", method); 882| compareResults(probs.map!(p => p.quantile(d11, method)).array, d11_expected[methodIndex], "d11", method); 882| compareResults(probs.map!(p => p.quantile(d12, method)).array, d12_expected[methodIndex], "d12", method); 882| compareResults(probs.map!(p => p.quantile(d13, method)).array, d13_expected[methodIndex], "d13", method); | } |} common/src/tsv_utils/common/numerics.d is 99% covered <<<<<< EOF # path=./csv2tsv-src-tsv_utils-csv2tsv.lst |/** |Convert CSV formatted data to TSV format. | |This program converts comma-separated value data to tab-separated format. | |Copyright (c) 2016-2020, eBay Inc. 
|Initially written by Jon Degenhardt | |License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) |*/ | |module tsv_utils.csv2tsv; | |import std.stdio; |import std.exception : enforce; |import std.format : format; |import std.range; |import std.traits : isArray, Unqual; |import std.typecons : tuple; | |immutable helpText = q"EOS |Synopsis: csv2tsv [options] [file...] | |csv2tsv converts comma-separated text (CSV) to tab-separated format (TSV). Records |are read from files or standard input, converted records written to standard output. |Use '--help-verbose' for details the CSV formats accepted. | |Options: |EOS"; | |immutable helpTextVerbose = q"EOS |Synopsis: csv2tsv [options] [file...] | |csv2tsv converts CSV (comma-separated) text to TSV (tab-separated) format. Records |are read from files or standard input, converted records written to standard output. | |Both formats represent tabular data, each record on its own line, fields separated |by a delimiter character. The key difference is that CSV uses escape sequences to |represent newlines and field separators in the data, whereas TSV disallows these |characters in the data. The most common field delimiters are comma for CSV and tab |for TSV, but any character can be used. | |Conversion to TSV is done by removing CSV escape syntax, changing field delimiters, |and replacing newlines and field delimiters in the data. By default, newlines and |field delimiters in the data are replaced by spaces. Most details are customizable. | |There is no single spec for CSV, any number of variants can be found. The escape |syntax is common enough: fields containing newlines or field delimiters are placed |in double quotes. Inside a quoted field, a double quote is represented by a pair of |double quotes. As with field separators, the quoting character is customizable. | |Behaviors of this program that often vary between CSV implementations: | * Newlines are supported in quoted fields. 
| * Double quotes are permitted in a non-quoted field. However, a field starting | with a quote must follow quoting rules. | * Each record can have a different numbers of fields. | * The three common forms of newlines are supported: CR, CRLF, LF. Output is | written using Unix newlines (LF). | * A newline will be added if the file does not end with one. | * A UTF-8 Byte Order Mark (BOM) at the start of a file will be removed. | * No whitespace trimming is done. | |This program does not validate CSV correctness, but will terminate with an error |upon reaching an inconsistent state. Improperly terminated quoted fields are the |primary cause. | |UTF-8 input is assumed. Convert other encodings prior to invoking this tool. | |Options: |EOS"; | |/** Container for command line options. | */ |struct Csv2tsvOptions |{ | string programName; | bool helpVerbose = false; // --help-verbose | bool hasHeader = false; // --H|header | char csvQuoteChar = '"'; // --q|quote | char csvDelimChar = ','; // --c|csv-delim | char tsvDelimChar = '\t'; // --t|tsv-delim | string tsvDelimReplacement = " "; // --r|tab-replacement | string newlineReplacement = " "; // --n|newline-replacement | bool versionWanted = false; // --V|version | | auto processArgs (ref string[] cmdArgs) | { | import std.algorithm : canFind; | import std.getopt; | import std.path : baseName, stripExtension; | 86| programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; | | try | { 43| auto r = getopt( | cmdArgs, | "help-verbose", " Print full help.", &helpVerbose, | std.getopt.config.caseSensitive, | "H|header", " Treat the first line of each file as a header. Only the header of the first file is output.", &hasHeader, | std.getopt.config.caseSensitive, | "q|quote", "CHR Quoting character in CSV data. Default: double-quote (\")", &csvQuoteChar, | "c|csv-delim", "CHR Field delimiter in CSV data. Default: comma (,).", &csvDelimChar, | "t|tsv-delim", "CHR Field delimiter in TSV data. 
Default: TAB", &tsvDelimChar, | "r|tab-replacement", "STR Replacement for TSV field delimiters (typically TABs) found in CSV input. Default: Space.", &tsvDelimReplacement, | "n|newline-replacement", "STR Replacement for newlines found in CSV input. Default: Space.", &newlineReplacement, | std.getopt.config.caseSensitive, | "V|version", " Print version information and exit.", &versionWanted, | std.getopt.config.caseInsensitive, | ); | 42| if (r.helpWanted) | { 1| defaultGetoptPrinter(helpText, r.options); 1| return tuple(false, 0); | } 41| else if (helpVerbose) | { 1| defaultGetoptPrinter(helpTextVerbose, r.options); 1| return tuple(false, 0); | } 40| else if (versionWanted) | { | import tsv_utils.common.tsvutils_version; 2| writeln(tsvutilsVersionNotice("csv2tsv")); 2| return tuple(false, 0); | } | | /* Consistency checks. */ 75| enforce(csvQuoteChar != '\n' && csvQuoteChar != '\r', 2| "CSV quote character cannot be newline (--q|quote)."); | 36| enforce(csvQuoteChar != csvDelimChar, 1| "CSV quote and CSV field delimiter characters must be different (--q|quote, --c|csv-delim)."); | 35| enforce(csvQuoteChar != tsvDelimChar, 1| "CSV quote and TSV field delimiter characters must be different (--q|quote, --t|tsv-delim)."); | 67| enforce(csvDelimChar != '\n' && csvDelimChar != '\r', 2| "CSV field delimiter cannot be newline (--c|csv-delim)."); | 63| enforce(tsvDelimChar != '\n' && tsvDelimChar != '\r', 2| "TSV field delimiter cannot be newline (--t|tsv-delim)."); | 171| enforce(!canFind!(c => (c == '\n' || c == '\r' || c == tsvDelimChar))(tsvDelimReplacement), 5| "Replacement character cannot contain newlines or TSV field delimiters (--r|tab-replacement)."); | 142| enforce(!canFind!(c => (c == '\n' || c == '\r' || c == tsvDelimChar))(newlineReplacement), 4| "Replacement character cannot contain newlines or TSV field delimiters (--n|newline-replacement)."); | } | catch (Exception exc) | { 18| stderr.writefln("[%s] Error processing command line arguments: %s", programName, 
exc.msg); 18| return tuple(false, 1); | } 21| return tuple(true, 0); | } |} | |static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; | |version(unittest) |{ | // No main in unittest |} |else |{ | int main(string[] cmdArgs) | { | /* When running in DMD code coverage mode, turn on report merging. */ | version(D_Coverage) version(DigitalMars) | { | import core.runtime : dmd_coverSetMerge; 43| dmd_coverSetMerge(true); | } | 43| Csv2tsvOptions cmdopt; 43| const r = cmdopt.processArgs(cmdArgs); 65| if (!r[0]) return r[1]; | version(LDC_Profile) | { | import ldc.profile : resetAll; | resetAll(); | } 21| try csv2tsvFiles(cmdopt, cmdArgs[1..$]); | catch (Exception exc) | { 3| writeln(); 3| stdin.flush(); 3| stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 3| return 1; | } | 18| return 0; | } |} | |void csv2tsvFiles(const ref Csv2tsvOptions cmdopt, const string[] inputFiles) |{ | import tsv_utils.common.utils : BufferedOutputRange; | 21| ubyte[1024 * 128] fileRawBuf; 42| auto stdoutWriter = BufferedOutputRange!(typeof(stdout))(stdout); 21| bool firstFile = true; | 195| foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"]) | { 116| auto inputStream = (filename == "-") ? stdin : filename.File; 76| auto printFileName = (filename == "-") ? "stdin" : filename; | 94| auto skipLines = (firstFile || !cmdopt.hasHeader) ? 0 : 1; | 38| csv2tsv(inputStream, stdoutWriter, fileRawBuf, printFileName, skipLines, | cmdopt.csvQuoteChar, cmdopt.csvDelimChar, | cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement, | cmdopt.newlineReplacement); | 36| firstFile = false; | } |} | |/* csv2tsv buffered conversion approach | |This version of csv2tsv uses a buffered approach to csv-to-tsv conversion. This is a |change from the original version, which used a character-at-a-time approach, with |characters coming from an infinite stream of characters. 
The character-at-a-time |approach was nice from a simplicity perspective, but the approach didn't optimize well. |Note that the original version read input in blocks and wrote to stdout in blocks, it |was the conversion algorithm itself that was character oriented. | |The idea is to convert a buffer at a time, writing larger blocks to the output stream |rather than one character at a time. In addition, the read buffer is modified in-place |when the only change is to convert a single character. The notable case is converting |the field delimiter character, typically comma to TAB. The result is writing longer |blocks to the output stream (BufferedOutputRange). | |Performance improvements from the new algorithm are notable. This is especially true |versus the previous version 2.0.0. Note though that the more recent versions of |csv2tsv were slower due to degradations coming from compiler and/or language version. |Version 1.1.19 was quite a bit faster. Regardless of version, the performance |improvement is especially good when run against "simple" CSV files, with limited |amounts of CSV escape syntax. In these files the main change is converting the field |delimiter character, typically comma to TAB. | |In some benchmarks on Mac OS, the new version was 40% faster than csv2tsv 2.0.0 on |files with significant CSV escapes, and 60% faster on files with limited CSV escapes. |Versus csv2tsv version 1.1.19, the new version is 10% and 40% faster on the same |files. On the "simple CSV" file, where Unix 'tr' is an option, 'tr' was still faster, |by about 20%. But getting into the 'tr' ballpark while retaining safety of correct |csv2tsv conversion is a good result. | |Algorithm notes: | |The algorithm works by reading an input block, then examining each byte in-order to |identify needed modifications. The region of consecutive characters without a change |is tracked. Single character changes are done in-place, in the read buffer. 
This |allows assembling longer blocks before write is needed. The region being tracked is |written to the output stream when it can no longer be extended in a continuous |fashion. At this point a new region is started. When the current read buffer has |been processed the current region is written out and a new block of data read in. | |The read buffer uses fixed size blocks. This means the algorithm is actually |operating on bytes (UTF-8 code units), and not characters. This works because all |delimiters and CSV escape syntax characters are single byte UTF-8 characters. These |are the only characters requiring interpretation. The main nuisance is the 2-byte |CRLF newline sequence, as this might be split across two read buffers. This is |handled by embedding 'CR' states in the finite state machine. | |Processing CSV escapes will often cause the character removals and additions. These |will not be representable in a continuous stream of bytes without moving bytes around |Instead of moving bytes, these cases are handled by immediately writing to the output |stream. This allows restarting a new block of contiguous characters. Handling by the |new algorithm is described below. Note that the length of the replacement characters |for TSV field and record delimiters (e.g. TAB, newline) affects the processing. | |All replacement character lengths: | |* Windows newline (CRLF) at the end of a line - Replace the CRLF with LF. | | Replace the CR with LF, add it to the current write region and terminate it. The | next write region starts at the character after the LF. | |* Double quote starting or ending a field - Drop the double quote. | | Terminate the current write region, next write region starts at the next character. | |* Double quote pair inside a quoted field - Drop one of the double quotes. | | The algorithm drops the first double quote and keep the second. This avoids | look-ahead and both field terminating double quote and double quote pair can | handled the same way. 
Terminate the current write region without adding the double | quote. The next write region starts at the next character. | |Single byte replacement characters: | |* Windows newline (CRLF) in a quoted field | | Replace the CR with the replacement char, add it to the current write region and | terminate it. The next write region starts at the character after the LF. | |Multi-byte replacement sequences: | |* TSV Delimiter (TAB by default) in a field | | Terminate the current write region, write it out and the replacement. The next | write region starts at the next character. | |* LF, CR, or CRLF in a quoted field | | Terminate the current write region, write it and the replacement. The next write | region starts at the next character. | |csv2tsv API | |At the API level, it is desirable to handle at both open files and input streams. |Open files are the key requirement, but handling input streams simplifies unit |testing, and in-memory conversion is likely to be useful anyway. Internally, it |should be easy enough to encapsulate the differences between input streams and files. |Reading files can be done using File.byChunk and reading from input streams can be |done using std.range.chunks. | |This has been handled by creating a new range that can iterate either files or |input streams chunk-by-chunk. |*/ | |/** Defines the 'bufferable' input sources supported by inputSourceByChunk. | * | * This includes std.stdio.File objects and mutable dynamic ubyte arrays (inputRange | * with slicing). | * | * Note: The mutable, dynamic arrays restriction is based on what is supported by | * std.range.chunks. This could be extended to include any type of array with ubyte | * elements, but it would require custom code in inputSourceByChunk. A test could be | * added as '(isArray!(R) && is(Unqual!(typeof(R.init[0])) == ubyte))'. 
| */ |enum bool isBufferableInputSource(R) = | isFileHandle!(Unqual!R) || | (isInputRange!R && is(ElementEncodingType!R == ubyte) && hasSlicing!R); | |@safe unittest |{ | static assert(isBufferableInputSource!(File)); | static assert(isBufferableInputSource!(typeof(stdin))); | static assert(isBufferableInputSource!(ubyte[])); | static assert(!isBufferableInputSource!(char[])); | static assert(!isBufferableInputSource!(string)); | 1| ubyte[10] x1; 1| const ubyte[1] x2; 1| immutable ubyte[1] x3; 1| ubyte[] x4 = new ubyte[](10); 1| const ubyte[] x5 = new ubyte[](10); 1| immutable ubyte[] x6 = new ubyte[](10); | | static assert(!isBufferableInputSource!(typeof(x1))); | static assert(!isBufferableInputSource!(typeof(x2))); | static assert(!isBufferableInputSource!(typeof(x3))); | static assert(isBufferableInputSource!(typeof(x4))); | static assert(!isBufferableInputSource!(typeof(x5))); | static assert(!isBufferableInputSource!(typeof(x6))); | | static assert(is(Unqual!(ElementType!(typeof(x1))) == ubyte)); | static assert(is(Unqual!(ElementType!(typeof(x2))) == ubyte)); | static assert(is(Unqual!(ElementType!(typeof(x3))) == ubyte)); | static assert(is(Unqual!(ElementType!(typeof(x4))) == ubyte)); | static assert(is(Unqual!(ElementType!(typeof(x5))) == ubyte)); | static assert(is(Unqual!(ElementType!(typeof(x6))) == ubyte)); | | struct S1 | { | void popFront(); | @property bool empty(); | @property ubyte front(); | } | | struct S2 | { | @property ubyte front(); | void popFront(); | @property bool empty(); 1| @property auto save() { return this; } | @property size_t length(); | S2 opSlice(size_t, size_t); | } | | static assert(isInputRange!S1); | static assert(!isBufferableInputSource!S1); | | static assert(isInputRange!S2); | static assert(is(ElementEncodingType!S2 == ubyte)); | static assert(hasSlicing!S2); | static assert(isBufferableInputSource!S2); | | /* For code coverage. 
*/ 1| S2 s2; 1| auto x = s2.save; |} | |/** inputSourceByChunk returns a range that reads either a file handle (File) or a | * ubyte[] array a chunk at a time. | * | * This is a cover for File.byChunk that allows passing an in-memory array as well. | * At present the motivation is primarily to enable unit testing of chunk-based | * algorithms using in-memory strings. At present the in-memory input types are | * limited. In the future this may be changed to accept any type of character or | * ubyte array. | * | * inputSourceByChunk takes either a File open for reading or a ubyte[] array | * containing input data. Data is read a buffer at a time. The buffer can be | * user provided, or allocated by inputSourceByChunk based on a caller provided | * buffer size. | * | * A ubyte[] input source must satisfy isBufferableInputSource, which at present | * means that it is a dynamic, mutable ubyte[]. | * | * The chunks are returned as an input range. | */ | |auto inputSourceByChunk(InputSource)(InputSource source, size_t size) |{ 1482| return inputSourceByChunk(source, new ubyte[](size)); |} | |/// Ditto |auto inputSourceByChunk(InputSource)(InputSource source, ubyte[] buffer) |if (isBufferableInputSource!InputSource) |{ | static if (isFileHandle!(Unqual!InputSource)) | { 1520| return source.byChunk(buffer); | } | else | { | static struct BufferedChunk | { | private Chunks!InputSource _chunks; | private ubyte[] _buffer; | | private void readNextChunk() | { 17347| if (_chunks.empty) | { 3175| _buffer.length = 0; | } | else | { 14172| size_t len = _chunks.front.length; 14172| _buffer[0 .. len] = _chunks.front[]; 14172| _chunks.popFront; | | /* Only the last chunk should be shorter than the buffer. 
*/ 16360| assert(_buffer.length == len || _chunks.empty); | 16360| if (_buffer.length != len) _buffer.length = len; | } | } | 3175| this(InputSource source, ubyte[] buffer) | { 3175| enforce(buffer.length > 0, "buffer size must be larger than 0"); 3175| _chunks = source.chunks(buffer.length); 3175| _buffer = buffer; 3175| readNextChunk(); | } | | @property bool empty() | { 156081| return (_buffer.length == 0); | } | | @property ubyte[] front() | { 20142| assert(!empty, "Attempting to fetch the front of an empty inputSourceByChunks"); 20142| return _buffer; | } | | void popFront() | { 14172| assert(!empty, "Attempting to popFront an empty inputSourceByChunks"); 14172| readNextChunk(); | } | } | 3175| return BufferedChunk(source, buffer); | } |} | |unittest // inputSourceByChunk |{ | import tsv_utils.common.unittest_utils; // tsv-utils unit test helpers | import std.file : mkdir, rmdirRecurse; | import std.path : buildPath; | 1| auto testDir = makeUnittestTempDir("csv2tsv_inputSourceByChunk"); 1| scope(exit) testDir.rmdirRecurse; | | import std.algorithm : equal, joiner; | import std.format; | import std.string : representation; | 1| auto charData = "abcde,ßÀß,あめりか物語,012345"; 1| ubyte[] ubyteData = charData.dup.representation; | 1| ubyte[1024] rawBuffer; // Must be larger than largest bufferSize in tests. | | void writeFileData(string filePath, ubyte[] data) | { | import std.stdio; | 76| auto f = filePath.File("w"); 38| f.rawWrite(data); 38| f.close; | } | 117| foreach (size_t dataSize; 0 .. ubyteData.length) | { 38| auto data = ubyteData[0 .. dataSize]; 38| auto filePath = buildPath(testDir, format("data_%d.txt", dataSize)); 38| writeFileData(filePath, data); | 2337| foreach (size_t bufferSize; 1 .. dataSize + 2) | { 741| assert(data.inputSourceByChunk(bufferSize).joiner.equal(data), | format("[Test-A] dataSize: %d, bufferSize: %d", dataSize, bufferSize)); | 741| assert (rawBuffer.length >= bufferSize); | 741| ubyte[] buffer = rawBuffer[0 .. 
bufferSize]; 741| assert(data.inputSourceByChunk(buffer).joiner.equal(data), | format("[Test-B] dataSize: %d, bufferSize: %d", dataSize, bufferSize)); | | { 1482| auto inputStream = filePath.File; 741| assert(inputStream.inputSourceByChunk(bufferSize).joiner.equal(data), | format("[Test-C] dataSize: %d, bufferSize: %d", dataSize, bufferSize)); 741| inputStream.close; | } | | { 1482| auto inputStream = filePath.File; 741| assert(inputStream.inputSourceByChunk(buffer).joiner.equal(data), | format("[Test-D] dataSize: %d, bufferSize: %d", dataSize, bufferSize)); 741| inputStream.close; | } | } | } |} | |/** Read CSV from an input source, covert to TSV and write to an output source. | * | * Params: | * inputSource = A "bufferable" input source, either a file open for | * read, or a dynamic, mutable ubyte array. | * outputStream = An output range to write TSV bytes to. | * readBuffer = A buffer to use for reading. | * filename = Name of file to use when reporting errors. A descriptive | * name can be used in lieu of a file name. | * skipLines = Number of lines to skip before outputting records. | * Typically used to skip writing header lines. | * csvQuote = The quoting character used in the CSV input. | * csvDelim = The field delimiter character used in the CSV input. | * tsvDelim = The field delimiter character to use in the TSV output. | * tsvDelimReplacement = String to use when replacing TSV field delimiters | * (e.g. TABs) found in the CSV data fields. | * tsvNewlineReplacement = String to use when replacing newlines found in the CSV | * data fields. | * discardBOM = If true (the default), a UTF-8 Byte Order Mark found at the | * start of the input stream will be dropped. | * | * Throws: Exception on finding inconsistent CSV. Exception text includes the filename and | * line number where the error was identified. 
| */ |void csv2tsv(InputSource, OutputRange)( | InputSource inputSource, | auto ref OutputRange outputStream, | ubyte[] readBuffer, | string filename = "(none)", | size_t skipLines = 0, | const char csvQuote = '"', | const char csvDelim = ',', | const char tsvDelim = '\t', | const string tsvDelimReplacement = " ", | const string tsvNewlineReplacement = " ", | bool discardBOM = true, |) |if (isBufferableInputSource!InputSource && | isOutputRange!(OutputRange, char)) |{ | import std.conv: hexString; | 1731| assert (readBuffer.length >= 1); | | enum char LF = '\n'; | enum char CR = '\r'; | | enum ubyte[3] UTF8_BOM = cast(ubyte[3])hexString!"efbbbf"; | | /* Process state information - These variables are defined either in the outer | * context or within one of the foreach loops. | * | * * recordNum - The current CSV input line/record number. Starts at one. | * * fieldNum - Field number in the current line/record. Field numbers are | * one-upped. The field number set to zero at the start of a new record, | * prior to processing the first character of the first field on the record. | * * byteIndex - Read buffer index of the current byte being processed. | * * csvState - The current state of CSV processing. In particular, the state | * of the finite state machine. | * * writeRegionStart - Read buffer index where the next write starts from. | * * nextIndex - The index of the current input ubyte being processed. The | * current write region extends from the writeRegionStart to nextIndex. | * * nextChar - The current input ubyte. The ubyte/char at nextIndex. | */ | | enum CSVState | { | FieldEnd, // Start of input or after consuming a field or record delimiter. 
| NonQuotedField, // Processing a non-quoted field | QuotedField, // Processing a quoted field | QuoteInQuotedField, // Last char was a quote in a quoted field | CRAtFieldEnd, // Last char was a CR terminating a record/line | CRInQuotedField, // Last char was a CR in a quoted field | } | 1731| CSVState csvState = CSVState.FieldEnd; 1731| size_t recordNum = 1; 1731| size_t fieldNum = 0; | 44678| foreach (chunkIndex, inputChunkComplete; inputSource.inputSourceByChunk(readBuffer).enumerate) | { 8236| size_t writeRegionStart = 0; | | /* Discard byte order marks at the start of input. | * Note: Slicing the chunk in this fashion generates very good code, better | * other approaches like manipulating indices. | */ 8236| auto inputChunk = | (discardBOM && 8008| chunkIndex == 0 && 1614| inputChunkComplete.length >= UTF8_BOM.length && 765| inputChunkComplete[0 .. UTF8_BOM.length] == UTF8_BOM | ) 25| ? inputChunkComplete[UTF8_BOM.length .. $] 8211| : inputChunkComplete[]; | | /* flushCurrentRegion flushes the current write region and moves the start of | * the next write region one byte past the end of the current region. If | * appendChars are provided they are ouput as well. | * | * This routine is called when the current character (byte) terminates the | * current write region and should not itself be output. That is why the next | * write region always starts one byte past the current region end. | * | * This routine is also called when the 'skiplines' region has been processed. | * This is done to flush the region without actually writing it. This is done | * by explicit checks in the finite state machine when newline characters | * that terminate a record are processed. It would be nice to refactor this. | */ | void flushCurrentRegion(size_t regionEnd, const char[] appendChars = "") | { 6147| assert(regionEnd <= inputChunk.length); | 6147| if (recordNum > skipLines) | { 5477| if (regionEnd > writeRegionStart) | { 2403| outputStream.put(inputChunk[writeRegionStart .. 
regionEnd]); | } 5477| if (appendChars.length > 0) | { 691| outputStream.put(appendChars); | } | } | 6147| writeRegionStart = regionEnd + 1; | } | 120078| foreach (size_t nextIndex, char nextChar; inputChunk) | { 23843| OuterSwitch: final switch (csvState) | { 5528| case CSVState.FieldEnd: | /* Start of input or after consuming a field terminator. */ 5528| ++fieldNum; | | /* Note: Can't use switch due to the 'goto case' to the OuterSwitch. */ 5528| if (nextChar == csvQuote) | { 1997| flushCurrentRegion(nextIndex); 1997| csvState = CSVState.QuotedField; 1997| break OuterSwitch; | } | else | { | /* Processing state change only. Don't consume the character. */ 3531| csvState = CSVState.NonQuotedField; 3531| goto case CSVState.NonQuotedField; | } | 11034| case CSVState.NonQuotedField: 11034| switch (nextChar) | { 7761| default: 7761| break OuterSwitch; 1624| case csvDelim: 1624| inputChunk[nextIndex] = tsvDelim; 1624| csvState = CSVState.FieldEnd; 1624| break OuterSwitch; 947| case LF: 1017| if (recordNum == skipLines) flushCurrentRegion(nextIndex); 947| ++recordNum; 947| fieldNum = 0; 947| csvState = CSVState.FieldEnd; 947| break OuterSwitch; 517| case CR: 517| inputChunk[nextIndex] = LF; 637| if (recordNum == skipLines) flushCurrentRegion(nextIndex); 517| ++recordNum; 517| fieldNum = 0; 517| csvState = CSVState.CRAtFieldEnd; 517| break OuterSwitch; 185| case tsvDelim: 185| if (tsvDelimReplacement.length == 1) | { 100| inputChunk[nextIndex] = tsvDelimReplacement[0]; | } | else | { 85| flushCurrentRegion(nextIndex, tsvDelimReplacement); | } 185| break OuterSwitch; | } | 8043| case CSVState.QuotedField: 8043| switch (nextChar) | { 4046| default: 4046| break OuterSwitch; 2342| case csvQuote: | /* | * Flush the current region, without the double quote. Switch state | * to QuoteInQuotedField, which determines whether to output a quote. 
| */ 2342| flushCurrentRegion(nextIndex); 2342| csvState = CSVState.QuoteInQuotedField; 2342| break OuterSwitch; | 245| case tsvDelim: 245| if (tsvDelimReplacement.length == 1) | { 144| inputChunk[nextIndex] = tsvDelimReplacement[0]; | } | else | { 101| flushCurrentRegion(nextIndex, tsvDelimReplacement); | } 245| break OuterSwitch; 480| case LF: | /* Newline in a quoted field. */ 480| if (tsvNewlineReplacement.length == 1) | { 305| inputChunk[nextIndex] = tsvNewlineReplacement[0]; | } | else | { 175| flushCurrentRegion(nextIndex, tsvNewlineReplacement); | } 480| break OuterSwitch; 930| case CR: | /* Carriage Return in a quoted field. */ 930| if (tsvNewlineReplacement.length == 1) | { 600| inputChunk[nextIndex] = tsvNewlineReplacement[0]; | } | else | { 330| flushCurrentRegion(nextIndex, tsvNewlineReplacement); | } 930| csvState = CSVState.CRInQuotedField; 930| break OuterSwitch; | } | 1932| case CSVState.QuoteInQuotedField: | /* Just processed a quote in a quoted field. The buffer, without the | * quote, was just flushed. Only legal characters here are quote, | * comma (field delimiter), newline (record delimiter). | */ 1932| switch (nextChar) | { 346| case csvQuote: 346| csvState = CSVState.QuotedField; 346| break OuterSwitch; 1127| case csvDelim: 1127| inputChunk[nextIndex] = tsvDelim; 1127| csvState = CSVState.FieldEnd; 1127| break OuterSwitch; 204| case LF: 234| if (recordNum == skipLines) flushCurrentRegion(nextIndex); 204| ++recordNum; 204| fieldNum = 0; 204| csvState = CSVState.FieldEnd; 204| break OuterSwitch; 254| case CR: 254| inputChunk[nextIndex] = LF; 314| if (recordNum == skipLines) flushCurrentRegion(nextIndex); 254| ++recordNum; 254| fieldNum = 0; 254| csvState = CSVState.CRAtFieldEnd; 254| break OuterSwitch; 1| default: 1| throw new Exception( | format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d", 1| (filename == "-") ? 
"Standard Input" : filename, | recordNum)); | } | 930| case CSVState.CRInQuotedField: 930| if (nextChar == LF) | { 457| flushCurrentRegion(nextIndex); 457| csvState = CSVState.QuotedField; 457| break OuterSwitch; | } | else { | /* Naked CR. State change only, don't consume current character. */ 473| csvState = CSVState.QuotedField; 473| goto case CSVState.QuotedField; | } | 628| case CSVState.CRAtFieldEnd: 628| if (nextChar == LF) | { 380| flushCurrentRegion(nextIndex); 380| csvState = CSVState.FieldEnd; 380| break OuterSwitch; | } | else { | /* Naked CR. State change only, don't consume current character. */ 248| csvState = CSVState.FieldEnd; 248| goto case CSVState.FieldEnd; | } | } | } | | /* End of buffer. */ 14029| if (writeRegionStart < inputChunk.length && recordNum > skipLines) | { 5622| outputStream.put(inputChunk[writeRegionStart .. $]); | } | 8235| writeRegionStart = 0; | } | 1730| enforce(csvState != CSVState.QuotedField, 1| format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d", 1| (filename == "-") ? "Standard Input" : filename, | recordNum)); | | /* Output a newline if the CSV input did not have a terminating newline. */ 3633| if (fieldNum > 0 && recordNum > skipLines) put(outputStream, '\n'); |} | |unittest |{ | /* Unit tests for the csv2tsv function. | * | * These unit tests exercise different CSV combinations and escaping cases. The CSV | * data content is the same for each corresponding test string, except the delimiters | * have been changed. e.g csv6a and csv6b have the same data content. | * | * A property used in these tests is that changing the CSV delimiters doesn't change | * the resulting TSV. However, changing the TSV delimiters will change the TSV result, | * as TSV doesn't support having it's delimiters in the data. This allows having a | * single TSV expected set that is generated by CSVs with different delimter sets. | * | * This test set does not test main, file handling, or error messages. 
These are | * handled by tests run against the executable. | * | * Note: unittest is non @safe due to the casts from string to ubyte[]. This can | * probably be rewritten to use std.string.representation instead, which is @safe. | */ | | /* Default CSV. */ 1| auto csv1a = "a,b,c"; 1| auto csv2a = "a,bc,,,def"; 1| auto csv3a = ",a, b , cd ,"; 1| auto csv4a = "ß,ßÀß,あめりか物語,书名: 五色石"; 1| auto csv5a = "\"\n\",\"\n\n\",\"\n\n\n\""; 1| auto csv6a = "\"\t\",\"\t\t\",\"\t\t\t\""; 1| auto csv7a = "\",\",\",,\",\",,,\""; 1| auto csv8a = "\"\",\"\"\"\",\"\"\"\"\"\""; 1| auto csv9a = "\"ab, de\tfg\"\"\nhij\""; 1| auto csv10a = ""; 1| auto csv11a = ","; 1| auto csv12a = ",,"; 1| auto csv13a = "\"\r\",\"\r\r\",\"\r\r\r\""; 1| auto csv14a = "\"\r\n\",\"\r\n\r\n\",\"\r\n\r\n\r\n\""; 1| auto csv15a = "\"ab, de\tfg\"\"\rhij\""; 1| auto csv16a = "\"ab, de\tfg\"\"\r\nhij\""; 1| auto csv17a = "ab\",ab\"cd"; 1| auto csv18a = "\n\n\n"; 1| auto csv19a = "\t"; 1| auto csv20a = "\t\t"; 1| auto csv21a = "a\n"; 1| auto csv22a = "a,\n"; 1| auto csv23a = "a,b\n"; 1| auto csv24a = ",\n"; 1| auto csv25a = "#"; 1| auto csv26a = "^"; 1| auto csv27a = "#^#"; 1| auto csv28a = "^#^"; 1| auto csv29a = "$"; 1| auto csv30a = "$,$\n\"$\",\"$$\",$$\n^#$,$#^,#$^,^$#\n"; 1| auto csv31a = "1-1\n2-1,2-2\n3-1,3-2,3-3\n\n,5-2\n,,6-3\n"; 1| auto csv32a = ",1-2,\"1-3\"\n\"2-1\",\"2-2\",\n\"3-1\",,\"3-3\""; | | // Newlines terminating a line ending a non-quoted field 1| auto csv33a = "\rX\r\nX\n\r\nX\r\n"; | | // Newlines inside a quoted field and terminating a line following a quoted field 1| auto csv34a = "\"\r\",\"X\r\",\"X\rY\",\"\rY\"\r\"\r\n\",\"X\r\n\",\"X\r\nY\",\"\r\nY\"\r\n\"\n\",\"X\n\",\"X\nY\",\"\nY\"\n"; | | // CR at field end 1| auto csv35a = "abc,def\r\"ghi\",\"jkl\"\r\"mno\",pqr\r"; | | /* Set B has the same data and TSV results as set A, but uses # for quote and ^ for comma. 
*/ 1| auto csv1b = "a^b^c"; 1| auto csv2b = "a^bc^^^def"; 1| auto csv3b = "^a^ b ^ cd ^"; 1| auto csv4b = "ß^ßÀß^あめりか物語^书名: 五色石"; 1| auto csv5b = "#\n#^#\n\n#^#\n\n\n#"; 1| auto csv6b = "#\t#^#\t\t#^#\t\t\t#"; 1| auto csv7b = "#,#^#,,#^#,,,#"; 1| auto csv8b = "##^#\"#^#\"\"#"; 1| auto csv9b = "#ab, de\tfg\"\nhij#"; 1| auto csv10b = ""; 1| auto csv11b = "^"; 1| auto csv12b = "^^"; 1| auto csv13b = "#\r#^#\r\r#^#\r\r\r#"; 1| auto csv14b = "#\r\n#^#\r\n\r\n#^#\r\n\r\n\r\n#"; 1| auto csv15b = "#ab, de\tfg\"\rhij#"; 1| auto csv16b = "#ab, de\tfg\"\r\nhij#"; 1| auto csv17b = "ab\"^ab\"cd"; 1| auto csv18b = "\n\n\n"; 1| auto csv19b = "\t"; 1| auto csv20b = "\t\t"; 1| auto csv21b = "a\n"; 1| auto csv22b = "a^\n"; 1| auto csv23b = "a^b\n"; 1| auto csv24b = "^\n"; 1| auto csv25b = "####"; 1| auto csv26b = "#^#"; 1| auto csv27b = "###^###"; 1| auto csv28b = "#^##^#"; 1| auto csv29b = "$"; 1| auto csv30b = "$^$\n#$#^#$$#^$$\n#^##$#^#$##^#^###$^#^#^$###\n"; 1| auto csv31b = "1-1\n2-1^2-2\n3-1^3-2^3-3\n\n^5-2\n^^6-3\n"; 1| auto csv32b = "^1-2^#1-3#\n#2-1#^#2-2#^\n#3-1#^^#3-3#"; 1| auto csv33b = "\rX\r\nX\n\r\nX\r\n"; 1| auto csv34b = "#\r#^#X\r#^#X\rY#^#\rY#\r#\r\n#^#X\r\n#^#X\r\nY#^#\r\nY#\r\n#\n#^#X\n#^#X\nY#^#\nY#\n"; 1| auto csv35b = "abc^def\r#ghi#^#jkl#\r#mno#^pqr\r"; | | /* The expected results for csv sets A and B. 
This is for the default TSV delimiters.*/ 1| auto tsv1 = "a\tb\tc\n"; 1| auto tsv2 = "a\tbc\t\t\tdef\n"; 1| auto tsv3 = "\ta\t b \t cd \t\n"; 1| auto tsv4 = "ß\tßÀß\tあめりか物語\t书名: 五色石\n"; 1| auto tsv5 = " \t \t \n"; 1| auto tsv6 = " \t \t \n"; 1| auto tsv7 = ",\t,,\t,,,\n"; 1| auto tsv8 = "\t\"\t\"\"\n"; 1| auto tsv9 = "ab, de fg\" hij\n"; 1| auto tsv10 = ""; 1| auto tsv11 = "\t\n"; 1| auto tsv12 = "\t\t\n"; 1| auto tsv13 = " \t \t \n"; 1| auto tsv14 = " \t \t \n"; 1| auto tsv15 = "ab, de fg\" hij\n"; 1| auto tsv16 = "ab, de fg\" hij\n"; 1| auto tsv17 = "ab\"\tab\"cd\n"; 1| auto tsv18 = "\n\n\n"; 1| auto tsv19 = " \n"; 1| auto tsv20 = " \n"; 1| auto tsv21 = "a\n"; 1| auto tsv22 = "a\t\n"; 1| auto tsv23 = "a\tb\n"; 1| auto tsv24 = "\t\n"; 1| auto tsv25 = "#\n"; 1| auto tsv26 = "^\n"; 1| auto tsv27 = "#^#\n"; 1| auto tsv28 = "^#^\n"; 1| auto tsv29 = "$\n"; 1| auto tsv30 = "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n"; 1| auto tsv31 = "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n"; 1| auto tsv32 = "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n"; 1| auto tsv33 = "\nX\nX\n\nX\n"; 1| auto tsv34 = " \tX \tX Y\t Y\n \tX \tX Y\t Y\n \tX \tX Y\t Y\n"; 1| auto tsv35 = "abc\tdef\nghi\tjkl\nmno\tpqr\n"; | | /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab. | * This will also result in different replacements when TAB and $ appear in the CSV. 
| */ 1| auto tsv1_x = "a$b$c\n"; 1| auto tsv2_x = "a$bc$$$def\n"; 1| auto tsv3_x = "$a$ b $ cd $\n"; 1| auto tsv4_x = "ß$ßÀß$あめりか物語$书名: 五色石\n"; 1| auto tsv5_x = " $ $ \n"; 1| auto tsv6_x = "\t$\t\t$\t\t\t\n"; 1| auto tsv7_x = ",$,,$,,,\n"; 1| auto tsv8_x = "$\"$\"\"\n"; 1| auto tsv9_x = "ab, de\tfg\" hij\n"; 1| auto tsv10_x = ""; 1| auto tsv11_x = "$\n"; 1| auto tsv12_x = "$$\n"; 1| auto tsv13_x = " $ $ \n"; 1| auto tsv14_x = " $ $ \n"; 1| auto tsv15_x = "ab, de\tfg\" hij\n"; 1| auto tsv16_x = "ab, de\tfg\" hij\n"; 1| auto tsv17_x = "ab\"$ab\"cd\n"; 1| auto tsv18_x = "\n\n\n"; 1| auto tsv19_x = "\t\n"; 1| auto tsv20_x = "\t\t\n"; 1| auto tsv21_x = "a\n"; 1| auto tsv22_x = "a$\n"; 1| auto tsv23_x = "a$b\n"; 1| auto tsv24_x = "$\n"; 1| auto tsv25_x = "#\n"; 1| auto tsv26_x = "^\n"; 1| auto tsv27_x = "#^#\n"; 1| auto tsv28_x = "^#^\n"; 1| auto tsv29_x = " \n"; 1| auto tsv30_x = " $ \n $ $ \n^# $ #^$# ^$^ #\n"; 1| auto tsv31_x = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n"; 1| auto tsv32_x = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n"; 1| auto tsv33_x = "\nX\nX\n\nX\n"; 1| auto tsv34_x = " $X $X Y$ Y\n $X $X Y$ Y\n $X $X Y$ Y\n"; 1| auto tsv35_x = "abc$def\nghi$jkl\nmno$pqr\n"; | | /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab, | * and with the delimiter/newline replacement string being |--|. Basically, newlines | * and '$' in the original data are replaced by |--|. 
| */ 1| auto tsv1_y = "a$b$c\n"; 1| auto tsv2_y = "a$bc$$$def\n"; 1| auto tsv3_y = "$a$ b $ cd $\n"; 1| auto tsv4_y = "ß$ßÀß$あめりか物語$书名: 五色石\n"; 1| auto tsv5_y = "|--|$|--||--|$|--||--||--|\n"; 1| auto tsv6_y = "\t$\t\t$\t\t\t\n"; 1| auto tsv7_y = ",$,,$,,,\n"; 1| auto tsv8_y = "$\"$\"\"\n"; 1| auto tsv9_y = "ab, de\tfg\"|--|hij\n"; 1| auto tsv10_y = ""; 1| auto tsv11_y = "$\n"; 1| auto tsv12_y = "$$\n"; 1| auto tsv13_y = "|--|$|--||--|$|--||--||--|\n"; 1| auto tsv14_y = "|--|$|--||--|$|--||--||--|\n"; 1| auto tsv15_y = "ab, de\tfg\"|--|hij\n"; 1| auto tsv16_y = "ab, de\tfg\"|--|hij\n"; 1| auto tsv17_y = "ab\"$ab\"cd\n"; 1| auto tsv18_y = "\n\n\n"; 1| auto tsv19_y = "\t\n"; 1| auto tsv20_y = "\t\t\n"; 1| auto tsv21_y = "a\n"; 1| auto tsv22_y = "a$\n"; 1| auto tsv23_y = "a$b\n"; 1| auto tsv24_y = "$\n"; 1| auto tsv25_y = "#\n"; 1| auto tsv26_y = "^\n"; 1| auto tsv27_y = "#^#\n"; 1| auto tsv28_y = "^#^\n"; 1| auto tsv29_y = "|--|\n"; 1| auto tsv30_y = "|--|$|--|\n|--|$|--||--|$|--||--|\n^#|--|$|--|#^$#|--|^$^|--|#\n"; 1| auto tsv31_y = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n"; 1| auto tsv32_y = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n"; 1| auto tsv33_y = "\nX\nX\n\nX\n"; 1| auto tsv34_y = "|--|$X|--|$X|--|Y$|--|Y\n|--|$X|--|$X|--|Y$|--|Y\n|--|$X|--|$X|--|Y$|--|Y\n"; 1| auto tsv35_y = "abc$def\nghi$jkl\nmno$pqr\n"; | | /* The TSV results for CSV sets 1a and 1b, but with the TAB replacement as |TAB| | * and newline replacement |NL|. 
| */ 1| auto tsv1_z = "a\tb\tc\n"; 1| auto tsv2_z = "a\tbc\t\t\tdef\n"; 1| auto tsv3_z = "\ta\t b \t cd \t\n"; 1| auto tsv4_z = "ß\tßÀß\tあめりか物語\t书名: 五色石\n"; 1| auto tsv5_z = "\t\t\n"; 1| auto tsv6_z = "\t\t\n"; 1| auto tsv7_z = ",\t,,\t,,,\n"; 1| auto tsv8_z = "\t\"\t\"\"\n"; 1| auto tsv9_z = "ab, defg\"hij\n"; 1| auto tsv10_z = ""; 1| auto tsv11_z = "\t\n"; 1| auto tsv12_z = "\t\t\n"; 1| auto tsv13_z = "\t\t\n"; 1| auto tsv14_z = "\t\t\n"; 1| auto tsv15_z = "ab, defg\"hij\n"; 1| auto tsv16_z = "ab, defg\"hij\n"; 1| auto tsv17_z = "ab\"\tab\"cd\n"; 1| auto tsv18_z = "\n\n\n"; 1| auto tsv19_z = "\n"; 1| auto tsv20_z = "\n"; 1| auto tsv21_z = "a\n"; 1| auto tsv22_z = "a\t\n"; 1| auto tsv23_z = "a\tb\n"; 1| auto tsv24_z = "\t\n"; 1| auto tsv25_z = "#\n"; 1| auto tsv26_z = "^\n"; 1| auto tsv27_z = "#^#\n"; 1| auto tsv28_z = "^#^\n"; 1| auto tsv29_z = "$\n"; 1| auto tsv30_z = "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n"; 1| auto tsv31_z = "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n"; 1| auto tsv32_z = "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n"; 1| auto tsv33_z = "\nX\nX\n\nX\n"; 1| auto tsv34_z = "\tX\tXY\tY\n\tX\tXY\tY\n\tX\tXY\tY\n"; 1| auto tsv35_z = "abc\tdef\nghi\tjkl\nmno\tpqr\n"; | | /* Aggregate the test data into parallel arrays. 
*/ 1| auto csvSet1a = [csv1a, csv2a, csv3a, csv4a, csv5a, csv6a, csv7a, csv8a, csv9a, csv10a, | csv11a, csv12a, csv13a, csv14a, csv15a, csv16a, csv17a, csv18a, csv19a, csv20a, | csv21a, csv22a, csv23a, csv24a, csv25a, csv26a, csv27a, csv28a, csv29a, csv30a, | csv31a, csv32a, csv33a, csv34a, csv35a]; | 1| auto csvSet1b = [csv1b, csv2b, csv3b, csv4b, csv5b, csv6b, csv7b, csv8b, csv9b, csv10b, | csv11b, csv12b, csv13b, csv14b, csv15b, csv16b, csv17b, csv18b, csv19b, csv20b, | csv21b, csv22b, csv23b, csv24b, csv25b, csv26b, csv27b, csv28b, csv29b, csv30b, | csv31b, csv32b, csv33b, csv34b, csv35b]; | 1| auto tsvSet1 = [tsv1, tsv2, tsv3, tsv4, tsv5, tsv6, tsv7, tsv8, tsv9, tsv10, | tsv11, tsv12, tsv13, tsv14, tsv15, tsv16, tsv17, tsv18, tsv19, tsv20, | tsv21, tsv22, tsv23, tsv24, tsv25, tsv26, tsv27, tsv28, tsv29, tsv30, | tsv31, tsv32, tsv33, tsv34, tsv35]; | 1| auto tsvSet1_x = [tsv1_x, tsv2_x, tsv3_x, tsv4_x, tsv5_x, tsv6_x, tsv7_x, tsv8_x, tsv9_x, tsv10_x, | tsv11_x, tsv12_x, tsv13_x, tsv14_x, tsv15_x, tsv16_x, tsv17_x, tsv18_x, tsv19_x, tsv20_x, | tsv21_x, tsv22_x, tsv23_x, tsv24_x, tsv25_x, tsv26_x, tsv27_x, tsv28_x, tsv29_x, tsv30_x, | tsv31_x, tsv32_x, tsv33_x, tsv34_x, tsv35_x]; | 1| auto tsvSet1_y = [tsv1_y, tsv2_y, tsv3_y, tsv4_y, tsv5_y, tsv6_y, tsv7_y, tsv8_y, tsv9_y, tsv10_y, | tsv11_y, tsv12_y, tsv13_y, tsv14_y, tsv15_y, tsv16_y, tsv17_y, tsv18_y, tsv19_y, tsv20_y, | tsv21_y, tsv22_y, tsv23_y, tsv24_y, tsv25_y, tsv26_y, tsv27_y, tsv28_y, tsv29_y, tsv30_y, | tsv31_y, tsv32_y, tsv33_y, tsv34_y, tsv35_y]; | 1| auto tsvSet1_z = [tsv1_z, tsv2_z, tsv3_z, tsv4_z, tsv5_z, tsv6_z, tsv7_z, tsv8_z, tsv9_z, tsv10_z, | tsv11_z, tsv12_z, tsv13_z, tsv14_z, tsv15_z, tsv16_z, tsv17_z, tsv18_z, tsv19_z, tsv20_z, | tsv21_z, tsv22_z, tsv23_z, tsv24_z, tsv25_z, tsv26_z, tsv27_z, tsv28_z, tsv29_z, tsv30_z, | tsv31_z, tsv32_z, tsv33_z, tsv34_z, tsv35_z]; | | /* The tests. 
*/ 1| auto bufferSizeTests = [1, 2, 3, 8, 128]; | 18| foreach (bufferSize; bufferSizeTests) | { 5| ubyte[] readBuffer = new ubyte[](bufferSize); | 1055| foreach (i, csva, csvb, tsv, tsv_x, tsv_y, tsv_z; lockstep(csvSet1a, csvSet1b, tsvSet1, tsvSet1_x, tsvSet1_y, tsvSet1_z)) | { | import std.conv : to; | | /* Byte streams for csv2tsv. Consumed by csv2tsv, so need to be reset when re-used. */ 175| ubyte[] csvInputA = cast(ubyte[])csva; 175| ubyte[] csvInputB = cast(ubyte[])csvb; | | /* CSV Set A vs TSV expected. */ 175| auto tsvResultA = appender!(char[])(); 175| csv2tsv(csvInputA, tsvResultA, readBuffer, "csvInputA_defaultTSV"); 175| assert(tsv == tsvResultA.data, | format("Unittest failure. tsv != tsvResultA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", | i + 1, csva, tsv, tsvResultA.data)); | | /* CSV Set B vs TSV expected. Different CSV delimiters, same TSV results as CSV Set A.*/ 175| auto tsvResultB = appender!(char[])(); 175| csv2tsv(csvInputB, tsvResultB, readBuffer, "csvInputB_defaultTSV", 0, '#', '^'); 175| assert(tsv == tsvResultB.data, | format("Unittest failure. tsv != tsvResultB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", | i + 1, csvb, tsv, tsvResultB.data)); | | /* CSV Set A and TSV with $ separator.*/ 175| csvInputA = cast(ubyte[])csva; 175| auto tsvResult_XA = appender!(char[])(); 175| csv2tsv(csvInputA, tsvResult_XA, readBuffer, "csvInputA_TSV_WithDollarDelimiter", 0, '"', ',', '$'); 175| assert(tsv_x == tsvResult_XA.data, | format("Unittest failure. tsv_x != tsvResult_XA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", | i + 1, csva, tsv_x, tsvResult_XA.data)); | | /* CSV Set B and TSV with $ separator. Same TSV results as CSV Set A.*/ 175| csvInputB = cast(ubyte[])csvb; 175| auto tsvResult_XB = appender!(char[])(); 175| csv2tsv(csvInputB, tsvResult_XB, readBuffer, "csvInputB__TSV_WithDollarDelimiter", 0, '#', '^', '$'); 175| assert(tsv_x == tsvResult_XB.data, | format("Unittest failure. tsv_x != tsvResult_XB.data. 
Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", | i + 1, csvb, tsv_x, tsvResult_XB.data)); | | /* CSV Set A and TSV with $ separator and tsv delimiter/newline replacement. */ 175| csvInputA = cast(ubyte[])csva; 175| auto tsvResult_YA = appender!(char[])(); 175| csv2tsv(csvInputA, tsvResult_YA, readBuffer, "csvInputA_TSV_WithDollarAndDelimReplacement", 0, '"', ',', '$', "|--|", "|--|"); 175| assert(tsv_y == tsvResult_YA.data, | format("Unittest failure. tsv_y != tsvResult_YA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", | i + 1, csva, tsv_y, tsvResult_YA.data)); | | /* CSV Set B and TSV with $ separator and tsv delimiter/newline replacement. Same TSV as CSV Set A.*/ 175| csvInputB = cast(ubyte[])csvb; 175| auto tsvResult_YB = appender!(char[])(); 175| csv2tsv(csvInputB, tsvResult_YB, readBuffer, "csvInputB__TSV_WithDollarAndDelimReplacement", 0, '#', '^', '$', "|--|", "|--|"); 175| assert(tsv_y == tsvResult_YB.data, | format("Unittest failure. tsv_y != tsvResult_YB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", | i + 1, csvb, tsv_y, tsvResult_YB.data)); | | /* CSV Set A and TSV with TAB replacement as and newline replacement as . Same TSV as CSV Set A.*/ 175| csvInputA = cast(ubyte[])csva; 175| auto tsvResult_ZA = appender!(char[])(); 175| csv2tsv(csvInputA, tsvResult_ZA, readBuffer, "csvInputA_TSV_WithDifferentTABandNLReplacements", 0, '"', ',', '\t', "", ""); 175| assert(tsv_z == tsvResult_ZA.data, | format("Unittest failure. tsv_z != tsvResult_ZA.data. 
Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", | i + 1, csva, tsv_z, tsvResult_ZA.data)); | } | } |} | |// csv2tsv skiplines tests |unittest |{ | import std.string : representation; | 1| auto csv1 = ""; 1| auto csv2 = "a"; | 1| auto csv3 = "\n"; 1| auto csv4 = "\n\n"; 1| auto csv5 = "\n\n\n"; | 1| auto csv6 = "a\n"; 1| auto csv7 = "a\nb\n"; 1| auto csv8 = "a\nb\nc\n"; | 1| auto csv9 = "\"\n\"\n"; 1| auto csv10 = "\"\n\"\n\"\n\"\n"; 1| auto csv11 = "\"\n\"\n\"\n\"\n\"\n\"\n"; | 1| auto csv12 = "\r"; 1| auto csv13 = "\r\r"; 1| auto csv14 = "\r\r\r"; | 1| auto csv15 = "a\r"; 1| auto csv16 = "a\rb\r"; 1| auto csv17 = "a\rb\rc\r"; | 1| auto csv18 = "\"\r\"\r"; 1| auto csv19 = "\"\r\"\r\"\r\"\r"; 1| auto csv20 = "\"\r\"\r\"\r\"\r\"\r\"\r"; | 1| auto csv21 = "\r\n"; 1| auto csv22 = "\r\n\r\n"; 1| auto csv23 = "\r\n\r\n\r\n"; | 1| auto csv24 = "a\r\n"; 1| auto csv25 = "a\r\nb\r\n"; 1| auto csv26 = "a\r\nb\r\nc\r\n"; | 1| auto csv27 = "\"\r\n\"\r\n"; 1| auto csv28 = "\"\r\n\"\r\n\"\r\n\"\r\n"; 1| auto csv29 = "\"\r\n\"\r\n\"\r\n\"\r\n\"\r\n\"\r\n"; | | /* The Skip 1 expected results. */ 1| auto tsv1Skip1 = ""; 1| auto tsv2Skip1 = ""; | 1| auto tsv3Skip1 = ""; 1| auto tsv4Skip1 = "\n"; 1| auto tsv5Skip1 = "\n\n"; | 1| auto tsv6Skip1 = ""; 1| auto tsv7Skip1 = "b\n"; 1| auto tsv8Skip1 = "b\nc\n"; | 1| auto tsv9Skip1 = ""; 1| auto tsv10Skip1 = " \n"; 1| auto tsv11Skip1 = " \n \n"; | 1| auto tsv12Skip1 = ""; 1| auto tsv13Skip1 = "\n"; 1| auto tsv14Skip1 = "\n\n"; | 1| auto tsv15Skip1 = ""; 1| auto tsv16Skip1 = "b\n"; 1| auto tsv17Skip1 = "b\nc\n"; | 1| auto tsv18Skip1 = ""; 1| auto tsv19Skip1 = " \n"; 1| auto tsv20Skip1 = " \n \n"; | 1| auto tsv21Skip1 = ""; 1| auto tsv22Skip1 = "\n"; 1| auto tsv23Skip1 = "\n\n"; | 1| auto tsv24Skip1 = ""; 1| auto tsv25Skip1 = "b\n"; 1| auto tsv26Skip1 = "b\nc\n"; | 1| auto tsv27Skip1 = ""; 1| auto tsv28Skip1 = " \n"; 1| auto tsv29Skip1 = " \n \n"; | | /* The Skip 2 expected results. 
*/ 1| auto tsv1Skip2 = ""; 1| auto tsv2Skip2 = ""; | 1| auto tsv3Skip2 = ""; 1| auto tsv4Skip2 = ""; 1| auto tsv5Skip2 = "\n"; | 1| auto tsv6Skip2 = ""; 1| auto tsv7Skip2 = ""; 1| auto tsv8Skip2 = "c\n"; | 1| auto tsv9Skip2 = ""; 1| auto tsv10Skip2 = ""; 1| auto tsv11Skip2 = " \n"; | 1| auto tsv12Skip2 = ""; 1| auto tsv13Skip2 = ""; 1| auto tsv14Skip2 = "\n"; | 1| auto tsv15Skip2 = ""; 1| auto tsv16Skip2 = ""; 1| auto tsv17Skip2 = "c\n"; | 1| auto tsv18Skip2 = ""; 1| auto tsv19Skip2 = ""; 1| auto tsv20Skip2 = " \n"; | 1| auto tsv21Skip2 = ""; 1| auto tsv22Skip2 = ""; 1| auto tsv23Skip2 = "\n"; | 1| auto tsv24Skip2 = ""; 1| auto tsv25Skip2 = ""; 1| auto tsv26Skip2 = "c\n"; | 1| auto tsv27Skip2 = ""; 1| auto tsv28Skip2 = ""; 1| auto tsv29Skip2 = " \n"; | 1| auto csvSet = | [csv1, csv2, csv3, csv4, csv5, csv6, csv7, csv8, csv9, csv10, | csv11, csv12, csv13, csv14, csv15, csv16, csv17, csv18, csv19, csv20, | csv21, csv22, csv23, csv24, csv25, csv26, csv27, csv28, csv29]; | 1| auto tsvSkip1Set = | [tsv1Skip1, tsv2Skip1, tsv3Skip1, tsv4Skip1, tsv5Skip1, tsv6Skip1, tsv7Skip1, tsv8Skip1, tsv9Skip1, tsv10Skip1, | tsv11Skip1, tsv12Skip1, tsv13Skip1, tsv14Skip1, tsv15Skip1, tsv16Skip1, tsv17Skip1, tsv18Skip1, tsv19Skip1, tsv20Skip1, | tsv21Skip1, tsv22Skip1, tsv23Skip1, tsv24Skip1, tsv25Skip1, tsv26Skip1, tsv27Skip1, tsv28Skip1, tsv29Skip1]; | 1| auto tsvSkip2Set = | [tsv1Skip2, tsv2Skip2, tsv3Skip2, tsv4Skip2, tsv5Skip2, tsv6Skip2, tsv7Skip2, tsv8Skip2, tsv9Skip2, tsv10Skip2, | tsv11Skip2, tsv12Skip2, tsv13Skip2, tsv14Skip2, tsv15Skip2, tsv16Skip2, tsv17Skip2, tsv18Skip2, tsv19Skip2, tsv20Skip2, | tsv21Skip2, tsv22Skip2, tsv23Skip2, tsv24Skip2, tsv25Skip2, tsv26Skip2, tsv27Skip2, tsv28Skip2, tsv29Skip2]; | 1| auto bufferSizeTests = [1, 2, 3, 4, 8, 128]; | 21| foreach (bufferSize; bufferSizeTests) | { 6| ubyte[] readBuffer = new ubyte[](bufferSize); | 528| foreach (i, csv, tsvSkip1, tsvSkip2; lockstep(csvSet, tsvSkip1Set, tsvSkip2Set)) | { 174| ubyte[] csvInput = 
csv.dup.representation; 174| auto csvToTSVSkip1 = appender!(char[])(); 174| auto csvToTSVSkip2 = appender!(char[])(); | 174| csv2tsv(csvInput, csvToTSVSkip1, readBuffer, "csvToTSVSkip1", 1); | 174| assert(tsvSkip1 == csvToTSVSkip1.data, | format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", | i + 1, bufferSize, csv, tsvSkip1, csvToTSVSkip1.data)); | 174| csv2tsv(csvInput, csvToTSVSkip2, readBuffer, "csvToTSVSkip2", 2); | 174| assert(tsvSkip2 == csvToTSVSkip2.data, | format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", | i + 1, bufferSize, csv, tsvSkip2, csvToTSVSkip2.data)); | } | } |} | |// csv2tsv BOM tests. Note: std.range.lockstep prevents use of @safe |unittest |{ | import std.conv : hexString; | import std.string : representation; | | enum utf8BOM = hexString!"efbbbf"; | 1| auto csv1 = ""; 1| auto csv2 = "a"; 1| auto csv3 = "ab"; 1| auto csv4 = "a,b"; 1| auto csv5 = "a,b\ncdef,ghi\njklmn,opqrs\ntuv,wxyz"; | 1| auto csv1BOM = utf8BOM ~ csv1; 1| auto csv2BOM = utf8BOM ~ csv2; 1| auto csv3BOM = utf8BOM ~ csv3; 1| auto csv4BOM = utf8BOM ~ csv4; 1| auto csv5BOM = utf8BOM ~ csv5; | 1| auto tsv1 = ""; 1| auto tsv2 = "a\n"; 1| auto tsv3 = "ab\n"; 1| auto tsv4 = "a\tb\n"; 1| auto tsv5 = "a\tb\ncdef\tghi\njklmn\topqrs\ntuv\twxyz\n"; | | /* Note: csv1 is the empty string, so tsv1 does not have a trailing newline. | * However, with the BOM prepended the tsv gets a trailing newline. 
| */ 1| auto tsv1BOM = utf8BOM ~ tsv1 ~ "\n"; 1| auto tsv2BOM = utf8BOM ~ tsv2; 1| auto tsv3BOM = utf8BOM ~ tsv3; 1| auto tsv4BOM = utf8BOM ~ tsv4; 1| auto tsv5BOM = utf8BOM ~ tsv5; | 1| auto csvSet = [csv1, csv2, csv3, csv4, csv5]; 1| auto csvBOMSet = [csv1BOM, csv2BOM, csv3BOM, csv4BOM, csv5BOM]; | 1| auto tsvSet = [tsv1, tsv2, tsv3, tsv4, tsv5]; 1| auto tsvBOMSet = [tsv1BOM, tsv2BOM, tsv3BOM, tsv4BOM, tsv5BOM]; | 1| auto bufferSizeTests = [1, 2, 3, 4, 8, 128]; | 21| foreach (bufferSize; bufferSizeTests) | { 6| ubyte[] readBuffer = new ubyte[](bufferSize); | 126| foreach (i, csv, csvBOM, tsv, tsvBOM; lockstep(csvSet, csvBOMSet, tsvSet, tsvBOMSet)) | { 30| ubyte[] csvInput = csv.dup.representation; 30| ubyte[] csvBOMInput = csvBOM.dup.representation; | 30| auto csvToTSV = appender!(char[])(); 30| auto csvToTSV_NoBOMRemoval = appender!(char[])(); 30| auto csvBOMToTSV = appender!(char[])(); 30| auto csvBOMToTSV_NoBOMRemoval = appender!(char[])(); | 30| csv2tsv(csvInput, csvToTSV, readBuffer, "csvToTSV", 0, '"', ',', '\t', " ", " ", true); 30| assert(tsv == csvToTSV.data, | format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", | i + 1, bufferSize, csv, tsv, csvToTSV.data)); | 30| csv2tsv(csvInput, csvToTSV_NoBOMRemoval, readBuffer, "csvToTSV_NoBOMRemoval", 0, '"', ',', '\t', " ", " ", false); 30| assert(tsv == csvToTSV_NoBOMRemoval.data, | format("Unittest failure. tsv != csvToTSV_NoBOMRemoval.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", | i + 1, bufferSize, csv, tsv, csvToTSV_NoBOMRemoval.data)); | 30| csv2tsv(csvBOMInput, csvBOMToTSV, readBuffer, "csvBOMToTSV", 0, '"', ',', '\t', " ", " ", true); 30| if (readBuffer.length < utf8BOM.length) | { | /* Removing BOMs, but didn't provide enough buffer, so no removal. */ 10| assert(tsvBOM == csvBOMToTSV.data, | format("Unittest failure. tsvBOM != csvBOMToTSV.data. 
(Small buffer) Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", | i + 1, bufferSize, csv, tsv, csvBOMToTSV.data)); | } | else | { 20| assert(tsv == csvBOMToTSV.data, | format("Unittest failure. tsv != csvBOMToTSV.data. Test: Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", | i + 1, bufferSize, csv, tsv, csvBOMToTSV.data)); | } | 30| csv2tsv(csvBOMInput, csvBOMToTSV_NoBOMRemoval, readBuffer, "csvBOMToTSV_NoBOMRemoval", 0, '"', ',', '\t', " ", " ", false); 30| assert(tsvBOM == csvBOMToTSV_NoBOMRemoval.data, | format("Unittest failure. tsvBOM != csvBOMToTSV_NoBOMRemoval.data. Test: Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", | i + 1, bufferSize, csv, tsv, csvBOMToTSV_NoBOMRemoval.data)); | } | } |} csv2tsv/src/tsv_utils/csv2tsv.d is 100% covered <<<<<< EOF