mrc-ide / SIMPLEGEN

@@ -35,3 +35,240 @@
Loading
35 35
}
36 36
#endif
37 37
38 +
//------------------------------------------------
39 +
// read in transmission record and prune to only those events that run up to the
40 +
// final sample
41 +
#ifdef RCPP_ACTIVE
42 +
void prune_transmission_record_cpp(Rcpp::List args) {
43 +
  
44 +
  // start timer
45 +
  chrono::high_resolution_clock::time_point t1 = chrono::high_resolution_clock::now();
46 +
  
47 +
  // extract input args
48 +
  string transmission_record_location = rcpp_to_string(args["transmission_record_location"]);
49 +
  string pruned_record_location = rcpp_to_string(args["pruned_record_location"]);
50 +
  bool silent = rcpp_to_bool(args["silent"]);
51 +
  vector<int> inoc_IDs = rcpp_to_vector_int(args["inoc_IDs"]);
52 +
  
53 +
  // open filestream to transmission record and check that opened
54 +
  if (!silent) {
55 +
    print("Opening filestream to transmission record");
56 +
  }
57 +
  ifstream infile;
58 +
  infile.open(transmission_record_location);
59 +
  if (!infile.is_open()) {
60 +
    Rcpp::stop("unable to open filestream at specified location. Check the path exists, and that you have read access");
61 +
  }
62 +
  
63 +
  // open filestream to pruned record and check that opened
64 +
  if (!silent) {
65 +
    print("Opening filestream to pruned transmission record");
66 +
  }
67 +
  ofstream outfile;
68 +
  outfile.open(pruned_record_location);
69 +
  if (!outfile.is_open()) {
70 +
    Rcpp::stop("unable to open filestream at specified location. Check the path exists, and that you have write access");
71 +
  }
72 +
  
73 +
  // start by getting positions of all line breaks, and the first ID in every
74 +
  // line
75 +
  vector<int> line_starts(1);
76 +
  vector<pair<int, int>> first_IDs;
77 +
  stringstream ss;
78 +
  int ID;
79 +
  bool new_line = true;
80 +
  int i = 0;
81 +
  int t = 0;
82 +
  char c;
83 +
  while (infile.get(c)) {
84 +
    i++;
85 +
    if (c == '\n') {
86 +
      line_starts.push_back(i);
87 +
      new_line = true;
88 +
      t++;
89 +
    }
90 +
    if (new_line) {
91 +
      if (c == ' ' || c == ';') {
92 +
        ss >> ID;
93 +
        first_IDs.push_back(make_pair(t, ID));
94 +
        ss.clear();
95 +
        ss.str(std::string());
96 +
        new_line = false;
97 +
      } else {
98 +
        ss << c;
99 +
      }
100 +
    }
101 +
  }
102 +
  
103 +
  // count the number of generations. Note that the line_starts vector contains
104 +
  // (max_time+1) elements, as the final element represents the newlinw at the
105 +
  // end of the file
106 +
  int max_time = int(line_starts.size()) - 1;
107 +
  
108 +
  // create a vector of vectors for storing IDs we are interested in at each
109 +
  // time point. Initialise with inoc_IDs
110 +
  vector<vector<int>> pop(max_time);
111 +
  add_to_pop(inoc_IDs, first_IDs, pop);
112 +
  
113 +
  // create a map for storing final results. First value in the vector will
114 +
  // store the time
115 +
  map<int, vector<int>> pruned;
116 +
  
117 +
  // loop through the file again, this time reading lines in reverse order
118 +
  infile.clear();
119 +
  infile.seekg(0, ios::beg);
120 +
  ss.clear();
121 +
  ss.str(std::string());
122 +
  for (int t = (max_time-1); t >= 0; --t) {
123 +
    
124 +
    // some housekeeping
125 +
    int this_line_start = line_starts[t];
126 +
    int this_line_end = line_starts[t+1] - 1;
127 +
    int popsize = int(pop[t].size());
128 +
    inoc_IDs.clear();
129 +
    
130 +
    // go to start of line t
131 +
    infile.seekg(this_line_start, ios::beg);
132 +
    
133 +
    // loop through characters until reach end of line
134 +
    bool save_IDs_on = false;
135 +
    bool new_block = true;
136 +
    int ID_key;
137 +
    for (int i = this_line_start; i < this_line_end; ++i) {
138 +
      infile.get(c);
139 +
      
140 +
      // process character, either storing or discarding
141 +
      if (new_block) {
142 +
        if (c == ' ' || c == ';') {
143 +
          ss >> ID_key;
144 +
          ss.clear();
145 +
          ss.str(std::string());
146 +
          if (c == ' ') {
147 +
            new_block = false;
148 +
          }
149 +
          
150 +
          // if ID_key is in pop[t] then store time, and activate ID saving for
151 +
          // remainder of this block
152 +
          save_IDs_on = false;
153 +
          for (int j = 0; j < popsize; ++j) {
154 +
            if (pop[t][j] == ID_key) {
155 +
              pruned[ID_key].push_back(t);
156 +
              save_IDs_on = true;
157 +
              break;
158 +
            }
159 +
          }
160 +
          
161 +
        } else {
162 +
          ss << c;
163 +
        }
164 +
      } else {
165 +
        if (save_IDs_on) {
166 +
          if (c == ' ' || c == ';') {
167 +
            ss >> ID;
168 +
            pruned[ID_key].push_back(ID);
169 +
            ss.clear();
170 +
            ss.str(std::string());
171 +
            inoc_IDs.push_back(ID);
172 +
            if (c == ';') {
173 +
              new_block = true;
174 +
            }
175 +
          } else {
176 +
            ss << c;
177 +
          }
178 +
        } else if (c == ';') {
179 +
          ss.clear();
180 +
          ss.str(std::string());
181 +
          new_block = true;
182 +
        }
183 +
      }
184 +
      
185 +
      
186 +
    }  // end i loop
187 +
    
188 +
    // add new inoc_IDs to pop
189 +
    add_to_pop(inoc_IDs, first_IDs, pop);
190 +
    
191 +
  }  // end t loop
192 +
  
193 +
  // write pruned record to file
194 +
  t = 0;
195 +
  for (const auto & x : pruned) {
196 +
    while (x.second[0] > t) {
197 +
      outfile << "\n";
198 +
      t++;
199 +
    }
200 +
    outfile << x.first;
201 +
    for (int i = 1; i < int(x.second.size()); ++i) {
202 +
      outfile << " " << x.second[i];
203 +
    }
204 +
    outfile << ";";
205 +
  }
206 +
  outfile << "\n";
207 +
  
208 +
  // close filestreams
209 +
  if (!silent) {
210 +
    print("Closing filestreams");
211 +
  }
212 +
  infile.close();
213 +
  outfile.close();
214 +
  
215 +
  // end timer
216 +
  chrono_timer(t1);
217 +
  
218 +
}
219 +
#endif
220 +
221 +
//------------------------------------------------
222 +
// read in a vector of inoc_IDs. These are put in ascending order, and searched
223 +
// for using the information in first_IDs to find the time at which each inoc_ID
224 +
// is first created. Finally, inoc_IDs are added to the population array at
225 +
// these time points. IDs are only added if they are not already present in the
226 +
// population array at that time point.
227 +
void add_to_pop(vector<int> &inoc_IDs, const vector<pair<int, int>> &first_IDs,
228 +
                vector<vector<int>> &pop, bool print_pop) {
229 +
  
230 +
  // sort inoc_IDs
231 +
  sort(inoc_IDs.begin(), inoc_IDs.end());
232 +
  
233 +
  // push back IDs to population at correct time
234 +
  int j = 0;
235 +
  bool break_i = false;
236 +
  for (int i = 0; i < (int(first_IDs.size()) - 1); ++i) {
237 +
    if (break_i) {
238 +
      break;
239 +
    }
240 +
    while (inoc_IDs[j] < first_IDs[i+1].second) {
241 +
      int t = first_IDs[i].first;
242 +
      if (find(pop[t].begin(), pop[t].end(), inoc_IDs[j]) == pop[t].end()) {
243 +
        pop[t].push_back(inoc_IDs[j]);
244 +
      }
245 +
      j++;
246 +
      if (j > (int(inoc_IDs.size()) - 1)) {
247 +
        break_i = true;
248 +
        break;
249 +
      }
250 +
    }
251 +
  }
252 +
  
253 +
  // any leftover IDs must be beyond the end of first_IDs, and so are added to
254 +
  // final timepoint
255 +
  for (int i = j; i < int(inoc_IDs.size()); ++i) {
256 +
    int t = first_IDs[first_IDs.size()-1].first;
257 +
    if (find(pop[t].begin(), pop[t].end(), inoc_IDs[i]) == pop[t].end()) {
258 +
      pop[t].push_back(inoc_IDs[i]);
259 +
    }
260 +
  }
261 +
  
262 +
  // optionally print pop array
263 +
  if (print_pop) {
264 +
    for (int t = 0; t < int(pop.size()); ++t) {
265 +
      Rcpp::Rcout << "t = " << t << ": ";
266 +
      for (int i = 0; i < int(pop[t].size()); ++i) {
267 +
        Rcpp::Rcout << pop[t][i] << " ";
268 +
      }
269 +
      Rcpp::Rcout << "\n";
270 +
    }
271 +
  }
272 +
  
273 +
}
274 +

@@ -604,3 +604,63 @@
Loading
604 604
  invisible(project)
605 605
}
606 606
607 +
#------------------------------------------------
608 +
#' @title Prune the transmission record
609 +
#'
610 +
#' @description TODO
611 +
#'
612 +
#' @param project a SIMPLEGEN project, as produced by the
613 +
#'   \code{simplegen_project()} function.
614 +
#' @param transmission_record_location the file path to a transmission record
615 +
#'   already written to file.
616 +
#' @param pruned_record_location the file path that the pruned transmission
617 +
#'   record will be written to.
618 +
#' @param overwrite_pruned_record if \code{TRUE} the pruned transmission record
619 +
#'   will overwrite any existing file by the same name. \code{FALSE} by default.
620 +
#' @param silent whether to suppress written messages to the console.
621 +
#' 
622 +
#' @export
623 +
624 +
prune_transmission_record <- function(project,
625 +
                                      transmission_record_location = "",
626 +
                                      pruned_record_location = "",
627 +
                                      overwrite_pruned_record = FALSE,
628 +
                                      silent = FALSE) {
629 +
  
630 +
  # check inputs
631 +
  assert_custom_class(project, "simplegen_project")
632 +
  assert_string(transmission_record_location)
633 +
  assert_string(pruned_record_location)
634 +
  assert_single_logical(overwrite_pruned_record)
635 +
  assert_single_logical(silent)
636 +
  
637 +
  # check transmission record exists
638 +
  if (!file.exists(transmission_record_location)) {
639 +
    stop(sprintf("could not find file at %s", transmission_record_location))
640 +
  }
641 +
  
642 +
  # optionally return warning if will overwrite pruned transmission record file
643 +
  if (!overwrite_pruned_record) {
644 +
    if (file.exists(pruned_record_location)) {
645 +
      stop(sprintf("file already exists at %s. Change target location, or use argument `overwrite_pruned_record = TRUE` to manually override this warning", pruned_record_location))
646 +
    }
647 +
  }
648 +
  
649 +
  # subset sample details to a vector of inoc_IDs
650 +
  inoc_IDs <- unlist(project$sample_details$inoc_IDs)
651 +
  if (length(inoc_IDs) == 0) {
652 +
    stop("no malaria positive hosts in sample")
653 +
  }
654 +
  
655 +
  # define argument list
656 +
  args <- list(transmission_record_location = transmission_record_location,
657 +
               pruned_record_location = pruned_record_location,
658 +
               inoc_IDs = inoc_IDs,
659 +
               silent = silent)
660 +
  
661 +
  # run efficient C++ code
662 +
  prune_transmission_record_cpp(args)
663 +
  
664 +
  # return project unchanged
665 +
  invisible(project)
666 +
}
Files Coverage
R 65.35%
src 56.76%
Project Totals (24 files) 59.28%
1
comment: false
2

3
coverage:
4
  status:
5
    project:
6
      default:
7
        target: auto
8
        threshold: 1%
9
    patch:
10
      default:
11
        target: auto
12
        threshold: 1%
Sunburst
The inner-most circle is the entire project, moving away from the center are folders then, finally, a single file. The size and color of each slice is representing the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project. Proceeding with folders and finally individual files. The size and color of each slice is representing the number of statements and the coverage, respectively.
Grid
Each block represents a single file in the project. The size and color of each block is represented by the number of statements and the coverage, respectively.
Loading