No flags found
Use flags to group coverage reports by test type, project and/or folders.
Then set up custom commit statuses and notifications for each flag.
e.g., #unittest #integration
#production #enterprise
#frontend #backend
0177850
... +45 ...
a0c8ffe
Use flags to group coverage reports by test type, project and/or folders.
Then set up custom commit statuses and notifications for each flag.
e.g., #unittest #integration
#production #enterprise
#frontend #backend
153 | 153 | #' raw diff data and not the printed output of \code{diffobj}, but do not wish |
|
154 | 154 | #' to manually parse the \code{ses} output. Whether it is faster than |
|
155 | 155 | #' \code{ses} or not depends on the ratio of matching to non-matching values as |
|
156 | - | #' \code{ses_dat} includes matching values whereas \code{ses} does not. See |
|
157 | - | #' examples. |
|
156 | + | #' \code{ses_dat} includes matching values whereas \code{ses} does not. |
|
157 | + | #' \code{ses_dat} objects have a print method that makes it easy to interpret |
|
158 | + | #' the diff, but are actually data.frames. You can see the underlying data by |
|
159 | + | #' using \code{as.data.frame}, removing the "ses_dat" class, etc. |
|
158 | 160 | #' |
|
159 | 161 | #' @export |
|
160 | 162 | #' @param a character |
166 | 168 | #' @param warn TRUE (default) or FALSE whether to warn if we hit |
|
167 | 169 | #' \code{max.diffs}. |
|
168 | 170 | #' @return character shortest edit script, or a machine readable version of it |
|
169 | - | #' as a \code{data.frame} with columns \code{op} (factor, values |
|
170 | - | #' \dQuote{Match}, \dQuote{Insert}, or \dQuote{Delete}), \code{val} character |
|
171 | - | #' corresponding to the value taken from either \code{a} or \code{b}, |
|
172 | - | #' and if \code{extra} is TRUE, integer columns \code{id.a} and \code{id.b} |
|
173 | - | #' corresponding to the indices in \code{a} or \code{b} that \code{val} was |
|
174 | - | #' taken from. See Details. |
|
171 | + | #' as a \code{ses_dat} object, which is a \code{data.frame} with columns |
|
172 | + | #' \code{op} (factor, values \dQuote{Match}, \dQuote{Insert}, or |
|
173 | + | #' \dQuote{Delete}), \code{val} character corresponding to the value taken |
|
174 | + | #' from either \code{a} or \code{b}, and if \code{extra} is TRUE, integer |
|
175 | + | #' columns \code{id.a} and \code{id.b} corresponding to the indices in |
|
176 | + | #' \code{a} or \code{b} that \code{val} was taken from. See Details. |
|
175 | 177 | #' @examples |
|
176 | 178 | #' a <- letters[1:6] |
|
177 | 179 | #' b <- c('b', 'CC', 'DD', 'd', 'f') |
|
178 | 180 | #' ses(a, b) |
|
179 | 181 | #' (dat <- ses_dat(a, b)) |
|
182 | + | #' str(dat) # data.frame with a print method |
|
180 | 183 | #' |
|
181 | 184 | #' ## use `ses_dat` output to construct a minimal diff |
|
182 | 185 | #' ## color with ANSI CSI SGR |
237 | 240 | values <- character(length(id)) |
|
238 | 241 | values[use.a] <- a[id2[use.a]] |
|
239 | 242 | values[use.b] <- b[id2[use.b]] |
|
240 | - | if(extra) { |
|
243 | + | res <- if(extra) { |
|
241 | 244 | id.a <- id.b <- rep(NA_integer_, length(values)) |
|
242 | 245 | id.a[use.a] <- id2[use.a] |
|
243 | 246 | id.b[use.b] <- id2[use.b] |
248 | 251 | } else { |
|
249 | 252 | data.frame(op=type2, val=values, stringsAsFactors=FALSE) |
|
250 | 253 | } |
|
254 | + | structure(res, class=c('ses_dat', class(res))) |
|
251 | 255 | } |
|
256 | + | #' @export |
|
257 | + | ||
258 | + | print.ses_dat <- function(x, quote=FALSE, ...) { |
|
259 | + | op <- x[['op']] |
|
260 | + | diff <- matrix( |
|
261 | + | "", 3, nrow(x), |
|
262 | + | dimnames=list(c('D:', 'M:', 'I:'), character(nrow(x))) |
|
263 | + | ) |
|
264 | + | d <- op == 'Delete' |
|
265 | + | m <- op == 'Match' |
|
266 | + | i <- op == 'Insert' |
|
267 | + | diff[1, d] <- x[['val']][d] |
|
268 | + | diff[2, m] <- x[['val']][m] |
|
269 | + | diff[3, i] <- x[['val']][i] |
|
270 | + | writeLines( |
|
271 | + | sprintf( |
|
272 | + | "\"ses_dat\" object (Match: %d, Delete: %d, Insert: %d):", |
|
273 | + | sum(m), sum(d), sum(i) |
|
274 | + | ) ) |
|
275 | + | print(diff, quote=quote, ...) |
|
276 | + | invisible(x) |
|
277 | + | } |
|
278 | + | ||
252 | 279 | # Internal validation fun for ses_* |
|
253 | 280 | ||
254 | 281 | ses_prep <- function(a, b, max.diffs, warn) { |
299 | 326 | #' @param a character |
|
300 | 327 | #' @param b character |
|
301 | 328 | #' @param max.diffs integer(1L) how many differences before giving up; set to |
|
302 | - | #' zero to allow as many as there are |
|
329 | + | #' -1 to allow as many as there are up to the maximum allowed (~INT_MAX/4). |
|
303 | 330 | #' @param warn TRUE or FALSE, whether to warn if we hit `max.diffs`. |
|
304 | 331 | #' @return MyersMbaSes object |
|
305 | 332 | #' @useDynLib diffobj, .registration=TRUE, .fixes="DIFFOBJ_" |
|
306 | 333 | ||
307 | - | diff_myers <- function(a, b, max.diffs=0L, warn=FALSE) { |
|
334 | + | diff_myers <- function(a, b, max.diffs=-1L, warn=FALSE) { |
|
308 | 335 | stopifnot( |
|
309 | 336 | is.character(a), is.character(b), all(!is.na(c(a, b))), is.int.1L(max.diffs), |
|
310 | 337 | is.TF(warn) |
76 | 76 | * script rather than simply saying the shortest edit script is longer than |
|
77 | 77 | * allowable diffs; this is all the `faux_snake` stuff. This algorithm tries |
|
78 | 78 | * to salvage whatever the myers algo computed up to the point of max diffs |
|
79 | - | * - Adding lots of comments as we worked through the logic |
|
79 | + | * - Comments. |
|
80 | + | */ |
|
81 | + | /* |
|
82 | + | * Terms |
|
83 | + | * |
|
84 | + | * * A: first string |
|
85 | + | * * B: second string |
|
86 | + | * * N: length of A |
|
87 | + | * * M: length of B |
|
88 | + | * * K: grid-diagonal, numbered from -M to N. For each grid-diagonal K, |
|
89 | + | * X - Y == K, i.e. (3, 1) is on diagonal K == 2. |
|
90 | + | * * D: Number of differences in a path. |
|
91 | + | * * Snake: sequence of diagonal moves (i.e. matching substrings). An edit |
|
92 | + | * script (path) will combine snakes with (possibly zero) right/down moves. A |
|
93 | + | * D-path may have up to D + 1 snakes, some or all zero length. |
|
94 | + | * * Middle Snake: Possibly zero length snake in which the forward and reverse |
|
95 | + | * paths meet in the Linear Space Refinement algorithm. Defined in terms of |
|
96 | + | * (x, y, u, v) where (x, y) are the coordinates from (0, 0), and (u, v) those |
|
97 | + | * from (M, N). |
|
80 | 98 | */ |
|
81 | 99 | ||
82 | 100 | ||
83 | 101 | #include <stdlib.h> |
|
84 | 102 | #include "diffobj.h" |
|
85 | 103 | ||
104 | + | // See _v and _setv for details |
|
105 | + | ||
86 | 106 | #define FV(k) _v(ctx, (k), 0) |
|
87 | 107 | #define RV(k) _v(ctx, (k), 1) |
|
88 | 108 |
124 | 144 | } |
|
125 | 145 | */ |
|
126 | 146 | /* |
|
127 | - | * k = diagonal number |
|
128 | - | * val = x value |
|
129 | - | * r = presumably whether we are looking up in reverse snakes |
|
147 | + | * For each diagonal k, we only store the path that up to this point has gotten |
|
148 | + | * furthest on it (well, two really, one from the forward and one from the |
|
149 | + | * reverse directions) |
|
150 | + | * |
|
151 | + | * @param k = diagonal number |
|
152 | + | * @param val = x value |
|
153 | + | * @param r = 0 for forward snake, 1 for backward snake |
|
130 | 154 | */ |
|
131 | 155 | static void |
|
132 | 156 | _setv(struct _ctx *ctx, int k, int r, int val) |
151 | 175 | } |
|
152 | 176 | /* |
|
153 | 177 | * For any given `k` diagonal, return the x coordinate of the furthest reaching |
|
154 | - | * path we've found. Use `r` to look for the x coordinate for the paths that |
|
155 | - | * are starting from the bottom right instead of top left |
|
178 | + | * path we've found. |
|
179 | + | * |
|
180 | + | * See `_setv`. |
|
156 | 181 | */ |
|
157 | 182 | static int |
|
158 | 183 | _v(struct _ctx *ctx, int k, int r) |
175 | 200 | * |
|
176 | 201 | * Needs to account for special case when indices are oob for the strings. The |
|
177 | 202 | * oob checks seem to be necessary since algo is requesting reads past length |
|
178 | - | * of string, but not sure if that is intentional or not. In theory it should |
|
179 | - | * not be, but perhaps this was handled gracefully by the varray business b4 |
|
180 | - | * we changed it. |
|
203 | + | * of string (maybe this worked with the varrays). As the current algo is |
|
204 | + | * written, it does not seem to be the case that we even attempt oob reads. |
|
181 | 205 | */ |
|
182 | 206 | int _comp_chr(SEXP a, int aidx, SEXP b, int bidx) { |
|
183 | 207 | int alen = XLENGTH(a); |
189 | 213 | comp = 1; |
|
190 | 214 | // nocov end |
|
191 | 215 | } else if(aidx >= alen || bidx >= blen) { |
|
192 | - | comp = 0; |
|
216 | + | comp = 0; // nocov |
|
193 | 217 | } else comp = STRING_ELT(a, aidx) == STRING_ELT(b, bidx); |
|
194 | 218 | return(comp); |
|
195 | 219 | } |
|
196 | 220 | /* |
|
197 | 221 | * Handle cases where differences exceed maximum allowable differences |
|
198 | 222 | * |
|
199 | - | * General logic is to create a faux snake that instead of moving down |
|
200 | - | * diagonally will chain right and down moves until it hits a path coming |
|
201 | - | * from the other direction. This snake is stored in `ctx`, and is then |
|
202 | - | * written by `ses` to the `ses` list. |
|
223 | + | * General logic is to try to connect the prior furthest points by naively |
|
224 | + | * incrementing in each dimension and hoping for some diagonal runs. |
|
225 | + | * |
|
226 | + | * @param a character vector |
|
227 | + | * @param b character vector |
|
228 | + | * @param d number of differences associated with the backward snake implicit in |
|
229 | + | * the ms.u/v values. |
|
230 | + | * forward loops in difference seeking; this is not the same as the |
|
231 | + | * number of differences as in each loop we find a forward and backward |
|
232 | + | * difference. |
|
203 | 233 | */ |
|
204 | 234 | static int |
|
205 | 235 | _find_faux_snake( |
|
206 | - | SEXP a, int aoff, int n, SEXP b, int boff, int m, struct _ctx *ctx, |
|
207 | - | struct middle_snake *ms, int d, diff_op ** faux_snake |
|
236 | + | SEXP a, int aoff, int n, SEXP b, int boff, int m, |
|
237 | + | struct middle_snake *ms, diff_op ** faux_snake, int d |
|
208 | 238 | ) { |
|
209 | - | /* normally we would record k/x values at the end of the furthest reaching |
|
210 | - | * snake, but here we need pick a path from top left and extend it until |
|
211 | - | * we hit something coming from bottom right. |
|
212 | - | */ |
|
213 | - | /* start by finding which diagonal has the furthest reaching value |
|
214 | - | * when looking from top left |
|
215 | - | */ |
|
216 | - | int k_max_f = 0, x_max_f = -1; |
|
217 | - | int x_f, y_f, k_f; |
|
218 | - | int delta = n - m; |
|
219 | - | ||
220 | - | for (int k = d - 1; k >= -d + 1; k -= 2) { /* might need to shift by 1 */ |
|
221 | - | int x_f = FV(k); |
|
222 | - | int f_dist = x_f - abs(k); |
|
239 | + | int x = ms->x; |
|
240 | + | int y = ms->y; |
|
241 | + | // Should switch to unsigned int... |
|
242 | + | if(x < 0 || y < 0 || ms->u < 0 || ms->v < 0) |
|
243 | + | error("Internal Error: fake snake with -ve start; contact maintainer."); // nocov |
|
223 | 244 | ||
224 | - | if(x_f > n || x_f - k > m) continue; |
|
225 | - | ||
226 | - | if(f_dist > x_max_f - abs(k_max_f)) { |
|
227 | - | x_max_f = x_f; |
|
228 | - | k_max_f = k; |
|
229 | - | } |
|
230 | - | } |
|
231 | - | /* didn't find a path so use origin */ |
|
232 | - | if(x_max_f < 0) { |
|
233 | - | // nocov start |
|
234 | - | error(err_msg_ubrnch, 2); |
|
235 | - | x_f = y_f = k_f = 0; |
|
236 | - | // nocov end |
|
237 | - | } else { |
|
238 | - | k_f = k_max_f; |
|
239 | - | x_f = x_max_f; |
|
240 | - | y_f = x_f - k_max_f; |
|
241 | - | } |
|
242 | - | /* |
|
243 | - | * now look for the furthest reaching point in any diagonal that is |
|
244 | - | * below the diagonal we found above since those are the only ones we |
|
245 | - | * can connect to |
|
246 | - | * |
|
247 | - | */ |
|
248 | - | int k_max_r = 0, x_max_r = n + 1; |
|
249 | - | int x_r, y_r; |
|
250 | - | ||
251 | - | for (int k = -d; k <= k_max_f - delta; k += 2) { |
|
252 | - | int x_r = RV(k); |
|
253 | - | int r_dist = n - x_r - abs(k); |
|
254 | - | /* skip reverse snakes that overshoot our forward snake |
|
255 | - | * ---\ |
|
256 | - | * \ |
|
257 | - | * \ |
|
258 | - | * \ |
|
259 | - | * \--- |
|
260 | - | * where there should be a decent path we can use, but because we are only |
|
261 | - | * tracking the last coordinates we don't really have a way connecting this |
|
262 | - | * type of path so we just go straight to the origin even though that is |
|
263 | - | * even more sub-optinal; not even sure if this is a possible scenario |
|
264 | - | */ |
|
265 | - | if(x_r < x_f || x_r - k - delta < y_f) continue; |
|
266 | - | ||
267 | - | /* since buffer is init to zero, an x_r value of zero means nothing, and |
|
268 | - | * not all the way to the left of the graph; also, in reverse snakes the |
|
269 | - | * snake should end at x == 1 in the leftmost case (we think) |
|
270 | - | */ |
|
271 | - | if(r_dist > n - x_max_r - abs(k_max_r) && x_r) { |
|
272 | - | x_max_r = x_r; |
|
273 | - | k_max_r = k; |
|
274 | - | } |
|
275 | - | } |
|
276 | - | if(x_max_r >= n) { |
|
277 | - | x_r = n; y_r = m; |
|
278 | - | } else { |
|
279 | - | x_r = x_max_r; |
|
280 | - | /* not 100% sure about this one; seems like k_max_r is relative to the |
|
281 | - | * bottom right origin, so maybe this should be x_r - k_max_r - delta? |
|
282 | - | */ |
|
283 | - | y_r = x_r - k_max_r - delta; |
|
284 | - | } |
|
285 | - | /* |
|
286 | - | * attempt to connect the two paths we found. We need to store this |
|
287 | - | * information as our "faux" snake since it will have to be processed |
|
288 | - | * in a manner similar as the middle snake would be processed; start by |
|
289 | - | * figuring out max number of steps it would take to connect the two |
|
290 | - | * paths |
|
291 | - | */ |
|
292 | - | int max_steps = x_r - x_f + y_r - y_f + 1; |
|
293 | 245 | int steps = 0; |
|
294 | - | int diffs = 0; |
|
246 | + | int diffs = 0; // only diffs from fake snake |
|
295 | 247 | int step_dir = 1; /* last direction we moved in, 1 is down */ |
|
296 | - | int x_sn = x_f, y_sn = y_f; |
|
297 | 248 | ||
298 | - | /* initialize the fake snake */ |
|
299 | - | if(max_steps < 0) |
|
249 | + | if(x > ms->u || y > ms->v) { |
|
250 | + | // Overshot backward snake, e.g. you hit a long diagonal run that overshoots |
|
251 | + | // the prior backward closest point. In this case toss backward snake. |
|
252 | + | ms->u = n; |
|
253 | + | ms->v = m; |
|
254 | + | diffs -= d; // we're also tossing accrued differences from back snake |
|
255 | + | if(x > ms->u || y > ms->v) |
|
256 | + | error("Internal Error: can't correct fwd snake overshoot; contact maintainer"); // nocov |
|
257 | + | } |
|
258 | + | if(ms->u > INT_MAX - ms->v - 1) // x/y positive, so this is conservative |
|
300 | 259 | error("Logic Error: fake snake step overflow? Contact maintainer."); // nocov |
|
301 | 260 | ||
261 | + | int max_steps; |
|
262 | + | max_steps = (ms->u - x) + (ms->v - y) + 1; |
|
263 | + | ||
302 | 264 | diff_op * faux_snake_tmp = (diff_op*) R_alloc(max_steps, sizeof(diff_op)); |
|
303 | 265 | for(int i = 0; i < max_steps; i++) *(faux_snake_tmp + i) = DIFF_NULL; |
|
304 | - | ||
305 | - | /* we have a further reaching reverse snake: |
|
306 | - | * not entirely sure if this should happen, but it seems it does |
|
307 | - | */ |
|
308 | - | while(x_sn < x_r || y_sn < y_r) { |
|
309 | - | if(x_sn > x_r || y_sn > y_r) { |
|
310 | - | error("Logic Error: Exceeded buffer for finding fake snake; contact maintainer."); // nocov |
|
311 | - | } |
|
312 | - | /* check to see if we could possibly move on a diagonal, and do so |
|
313 | - | * if possible, if not alternate going down and right*/ |
|
266 | + | while((x < ms->u) || (y < ms->v)) { |
|
314 | 267 | if( |
|
315 | - | x_sn <= x_r && y_sn <= y_r && |
|
316 | - | _comp_chr(a, aoff + x_sn, b, boff + y_sn) |
|
268 | + | x < ms->u && y < ms->v && |
|
269 | + | _comp_chr(a, aoff + x, b, boff + y) |
|
317 | 270 | ) { |
|
318 | - | x_sn++; y_sn++; |
|
271 | + | x++; y++; |
|
319 | 272 | *(faux_snake_tmp + steps) = DIFF_MATCH; |
|
320 | - | } else if (x_sn < x_r && (step_dir || y_sn >= y_r)) { |
|
321 | - | x_sn++; |
|
273 | + | } else if (x < ms->u && (step_dir || y >= ms->v)) { |
|
274 | + | x++; |
|
322 | 275 | diffs++; |
|
323 | 276 | step_dir = !step_dir; |
|
324 | 277 | *(faux_snake_tmp + steps) = DIFF_DELETE; |
|
325 | - | } else if (y_sn < y_r && (!step_dir || x_sn >= x_r)) { |
|
326 | - | y_sn++; |
|
278 | + | } else if (y < ms->v && (!step_dir || x >= ms->u)) { |
|
279 | + | y++; |
|
327 | 280 | diffs++; |
|
328 | 281 | *(faux_snake_tmp + steps) = DIFF_INSERT; |
|
329 | 282 | step_dir = !step_dir; |
332 | 285 | } |
|
333 | 286 | steps++; |
|
334 | 287 | } |
|
335 | - | /* corner cases; must absolutely make sure steps LT max_steps since we rely |
|
336 | - | * on at least one zero at the end of the faux_snake when we read it to know |
|
337 | - | * to stop reading it |
|
338 | - | */ |
|
339 | - | if(x_sn != x_r || y_sn != y_r || steps >= max_steps) { |
|
288 | + | if(x != ms->u || y != ms->v || steps >= max_steps) { |
|
340 | 289 | error("Logic Error: faux snake process failed; contact maintainer."); // nocov |
|
341 | 290 | } |
|
342 | - | /* modify the pointer to the pointer so we can return in by ref */ |
|
343 | - | ||
344 | 291 | *faux_snake = faux_snake_tmp; |
|
345 | - | ||
346 | - | /* record the coordinates of our faux snake using `ms` */ |
|
347 | - | ms->x = x_f; |
|
348 | - | ms->y = y_f; |
|
349 | - | ms->u = x_r; |
|
350 | - | ms->v = y_r; |
|
351 | - | ||
352 | 292 | return diffs; |
|
353 | 293 | } |
|
354 | 294 |
360 | 300 | * the maximum number of differences, we return to `_ses` with the number of |
|
361 | 301 | * differences found. `_ses` will then attempt to stitch back the snakes |
|
362 | 302 | * together. |
|
303 | + | * |
|
304 | + | * @param ms tracks beginning (x,y) and end (u,v) coords of the middle snake |
|
363 | 305 | */ |
|
364 | 306 | static int |
|
365 | 307 | _find_middle_snake( |
|
366 | 308 | SEXP a, int aoff, int n, SEXP b, int boff, int m, struct _ctx *ctx, |
|
367 | 309 | struct middle_snake *ms, diff_op ** faux_snake |
|
368 | 310 | ) { |
|
369 | 311 | int delta, odd, mid, d; |
|
312 | + | int x_max, y_max, v_max, u_max; |
|
313 | + | ms->x = x_max = 0; |
|
314 | + | ms->y = y_max = 0; |
|
315 | + | ms->u = u_max = n; |
|
316 | + | ms->v = v_max = m; |
|
317 | + | double dist = (x_max - u_max) * (x_max - u_max) + |
|
318 | + | (y_max - v_max) * (y_max - v_max); |
|
370 | 319 | ||
371 | 320 | delta = n - m; |
|
372 | 321 | odd = delta & 1; |
|
373 | - | mid = (n + m) / 2; |
|
322 | + | mid = (n + m) / 2; // we check in `diff` that this won't overflow int |
|
374 | 323 | mid += odd; |
|
375 | 324 | ||
376 | 325 | _setv(ctx, 1, 0, 0); |
|
377 | 326 | _setv(ctx, delta - 1, 1, n); |
|
378 | 327 | ||
379 | 328 | /* For each number of differences `d`, compute the farthest reaching paths |
|
380 | 329 | * from both the top left and bottom right of the edit graph |
|
330 | + | * |
|
331 | + | * First loop does NOT actually find a difference, which makes all the |
|
332 | + | * difference calculations weird. |
|
381 | 333 | */ |
|
382 | 334 | for (d = 0; d <= mid; d++) { |
|
383 | 335 | int k, x, y; |
|
384 | 336 | ||
385 | - | /* reached maximum allowable differences before real exit condition*/ |
|
386 | - | if ((2 * d - 1) >= ctx->dmax) { |
|
337 | + | /* Reached maximum allowable differences before real exit condition. |
|
338 | + | * Each loop iteration finds up to 2 d differences (one forward, one |
|
339 | + | * backward). |
|
340 | + | * |
|
341 | + | * We know there is going to be at least one more difference because there |
|
342 | + | * must be at least one for us to get here, and there might be two if the |
|
343 | + | * extra forward difference doesn't find the end. |
|
344 | + | */ |
|
345 | + | if (2 * (d - 1) > ctx->dmax - 1) { |
|
346 | + | // So far we've found 2*(d - 1) differences |
|
387 | 347 | ctx->dmaxhit = 1; |
|
388 | - | return _find_faux_snake(a, aoff, n, b, boff, m, ctx, ms, d, faux_snake); |
|
348 | + | ms->x = x_max; ms->y = y_max; ms->u = u_max; ms->v = v_max; |
|
349 | + | return 2 * (d - 1) + _find_faux_snake( |
|
350 | + | a, aoff, n, b, boff, m, ms, faux_snake, d - 1 |
|
351 | + | ); |
|
389 | 352 | } |
|
390 | - | /* Forward (from top left) paths*/ |
|
391 | - | ||
353 | + | /* Forward (from top left) paths */ |
|
354 | + | ||
355 | + | // // Alternate looping picks path closest to middle diagonal. If we change |
|
356 | + | // // this we also should change it for backward paths. This leads to more |
|
357 | + | // // compact diffs, but TBD whether this is good IRL so we abandon it for |
|
358 | + | // // now to avoid introducing behavior change. |
|
359 | + | // int ki = 0; |
|
360 | + | // k = d % 2 ? 1 : 0; |
|
361 | + | // for (; |
|
362 | + | // k >= -d && k <= d; |
|
363 | + | // ki++, k += 2 * ki * (ki % 2 ? -1 : 1) |
|
364 | + | // ) { |
|
392 | 365 | for (k = d; k >= -d; k -= 2) { |
|
366 | + | // If at lowest possible diag, or not at highest and next diag up is |
|
367 | + | // further along in x, move to the right, otherwise move down. |
|
393 | 368 | if (k == -d || (k != d && FV(k - 1) < FV(k + 1))) { |
|
394 | - | x = FV(k + 1); |
|
369 | + | x = FV(k + 1); // move to the right, effectively |
|
395 | 370 | } else { |
|
396 | - | x = FV(k - 1) + 1; |
|
371 | + | x = FV(k - 1) + 1; // move down, effectively |
|
397 | 372 | } |
|
398 | 373 | y = x - k; |
|
399 | 374 | ||
400 | 375 | ms->x = x; |
|
401 | 376 | ms->y = y; |
|
402 | 377 | while(x < n && y < m && _comp_chr(a, aoff + x, b, boff + y)) { |
|
403 | - | /* matching characters, just walk down diagonal */ |
|
404 | - | x++; y++; |
|
378 | + | x++; y++; /* matching characters, just walk down diagonal */ |
|
379 | + | } |
|
380 | + | double dist_new = (x - u_max) * (x - u_max) + (y - v_max) * (y - v_max); |
|
381 | + | if(x <= n && y <= m && dist_new < dist) { |
|
382 | + | dist = dist_new; |
|
383 | + | x_max = x; |
|
384 | + | y_max = y; |
|
405 | 385 | } |
|
406 | 386 | _setv(ctx, k, 0, x); |
|
407 | 387 | ||
408 | - | /* for this diagonal we (think we) are now at farthest reaching point for |
|
409 | - | * a given d. Then return if: |
|
410 | - | * - If we're at the edge of the addressable part of the graph |
|
411 | - | * - The reverse snakes are already overlapping in the `x` coordinate |
|
388 | + | /* For this diagonal k we are now at farthest reaching point for a given |
|
389 | + | * `d`. Then return if: |
|
412 | 390 | * |
|
413 | - | * then it means that the only way to get to the snake coming from the |
|
414 | - | * other direction is by either moving down or across for every remaining |
|
415 | - | * move, so record the current coord as `u` and `v` and return |
|
391 | + | * - We're at the edge of the addressable part of the graph |
|
392 | + | * - The reverse snakes are already overlapping in the `x` coordinate |
|
416 | 393 | * |
|
417 | - | * Note that for the backward snake we reverse xy and uv so that the |
|
418 | - | * matching snake is always defined in `ms` as starting at `ms.(xy)` and |
|
419 | - | * ending at `ms.(uv)` |
|
394 | + | * For the backward snake we reverse xy and uv so that the matching snake |
|
395 | + | * is defined as starting at `ms.(xy)` and ending at `ms.(uv)` |
|
420 | 396 | */ |
|
421 | 397 | if (odd && k >= (delta - (d - 1)) && k <= (delta + (d - 1))) { |
|
422 | 398 | if (x >= RV(k)) { |
426 | 402 | } |
|
427 | 403 | } |
|
428 | 404 | } |
|
429 | - | /* Backwards (from bottom right) paths*/ |
|
430 | - | ||
405 | + | // Check again if we'd go over by engaging the reverse snake |
|
406 | + | if (2 * d > ctx->dmax) { |
|
407 | + | // So far we've found 2*(d - 1) differences |
|
408 | + | ctx->dmaxhit = 1; |
|
409 | + | ms->x = x_max; ms->y = y_max; ms->u = u_max; ms->v = v_max; |
|
410 | + | return 2 * (d - 1) + 1 + _find_faux_snake( |
|
411 | + | a, aoff, n, b, boff, m, ms, faux_snake, d - 1 |
|
412 | + | ); |
|
413 | + | } |
|
414 | + | /* Backwards (from bottom right) paths (see forward loop). The two loops |
|
415 | + | * are very similar so it is tempting to fold them into each other now, but |
|
416 | + | * would require some work + ensuring no performance degradation. |
|
417 | + | */ |
|
431 | 418 | for (k = d; k >= -d; k -= 2) { |
|
432 | 419 | int kr = (n - m) + k; |
|
433 | 420 |
445 | 432 | /* matching characters, just walk up diagonal */ |
|
446 | 433 | x--; y--; |
|
447 | 434 | } |
|
435 | + | double dist_new = (x_max - x) * (x_max - x) + (y_max - y) * (y_max - y); |
|
436 | + | if(x >= 0 && y >= 0 && dist_new < dist) { |
|
437 | + | dist = dist_new; |
|
438 | + | u_max = x; |
|
439 | + | v_max = y; |
|
440 | + | } |
|
448 | 441 | _setv(ctx, kr, 1, x); |
|
449 | 442 | ||
450 | 443 | /* see comments in forward section */ |
524 | 517 | struct middle_snake ms; |
|
525 | 518 | int d; |
|
526 | 519 | ||
527 | - | //Rprintf("m: %d n: %d\n", m, n); |
|
528 | 520 | if (n == 0) { |
|
529 | 521 | _edit(ctx, DIFF_INSERT, boff, m); |
|
530 | 522 | d = m; |
|
531 | 523 | } else if (m == 0) { |
|
532 | 524 | _edit(ctx, DIFF_DELETE, aoff, n); |
|
533 | 525 | d = n; |
|
534 | 526 | } else { |
|
535 | - | /* Find the middle "snake" around which we |
|
536 | - | * recursively solve the sub-problems. Note this modifies `ms` by ref to |
|
537 | - | * set the beginning and end coordinates of the snake of the furthest |
|
538 | - | * reaching path. The beginning is always the top left part of the snake, |
|
539 | - | * irrespective of whether it was found on a forward or reverse path as |
|
540 | - | * f_m_s will flip the coordinates when appropriately when recording them |
|
541 | - | * in `ms` |
|
527 | + | /* Find the middle "snake" around which we recursively solve the |
|
528 | + | * sub-problems. Note this modifies `ms` by ref to set the beginning and |
|
529 | + | * end coordinates of the snake of the furthest reaching path. The |
|
530 | + | * beginning is always the top left part of the snake, irrespective of |
|
531 | + | * whether it was found on a forward or reverse path as f_m_s will flip the |
|
532 | + | * coordinates appropriately when recording them in `ms` |
|
542 | 533 | * |
|
543 | 534 | * Additionally, if diffs exceed max.diffs, then `faux.snake` will also |
|
544 | 535 | * be set. `faux_snake` is a pointer to a pointer that points to the |
549 | 540 | diff_op fsv = DIFF_NULL; |
|
550 | 541 | diff_op * faux_snake; |
|
551 | 542 | faux_snake = &fsv; |
|
552 | - | // |
|
553 | - | // d |
|
554 | - | // diff_op * fsp = NULL; |
|
555 | - | // diff_op fsv = DIFF_NULL; |
|
556 | - | // *fsp = fsv; |
|
557 | - | // **faux_snake = *fsp; |
|
558 | 543 | ||
559 | 544 | d = _find_middle_snake(a, aoff, n, b, boff, m, ctx, &ms, &faux_snake); |
|
560 | - | //Rprintf("d: %d\n", d); |
|
561 | 545 | if (d == -1) { |
|
562 | 546 | // nocov start |
|
563 | 547 | error( |
569 | 553 | error(err_msg_ubrnch, 6); |
|
570 | 554 | return d; |
|
571 | 555 | // nocov end |
|
572 | - | } else if (d > 1) { |
|
556 | + | } else if (d != 1) { |
|
573 | 557 | /* in this case we have something along the lines of (note the non- |
|
574 | 558 | * diagonal bits are just non-diagonal, we're making no claims about |
|
575 | 559 | * whether they should or could be of the horizontal variety) |
579 | 563 | * \- ... |
|
580 | 564 | * so we will record the snake (diagonal) in the middle, and recurse |
|
581 | 565 | * on the stub at the beginning and on the stub at the end separately |
|
566 | + | * |
|
567 | + | * Also have d == 0 case which can happen when the backward snake only has |
|
568 | + | * matches. |
|
582 | 569 | */ |
|
583 | 570 | ||
584 | 571 | /* Beginning stub */ |
613 | 600 | boff += ms.v; |
|
614 | 601 | n -= ms.u; |
|
615 | 602 | m -= ms.v; |
|
616 | - | if (_ses(a, aoff, n, b, boff, m, ctx) == -1) { |
|
603 | + | if(_ses(a, aoff, n, b, boff, m, ctx) == -1) { |
|
617 | 604 | // nocov start |
|
618 | 605 | error("Logic error: failed trying to run ses 2; contact maintainer."); |
|
619 | 606 | // nocov end |
|
620 | 607 | } |
|
621 | - | } else { |
|
608 | + | } else if (d == 1) { |
|
622 | 609 | int x = ms.x; |
|
623 | 610 | int u = ms.u; |
|
624 | 611 | ||
625 | - | /* There are only 4 base cases when the |
|
626 | - | * edit distance is 1. Having a hard time finding cases that trigger the |
|
627 | - | * x == u, possibly because the algo eats leading matches, although |
|
628 | - | * apparently we do achieve it somewhere in the test suite. |
|
612 | + | /* There are only 4 base cases when the edit distance is 1. Having a hard |
|
613 | + | * time finding cases that trigger the x == u, possibly because the algo |
|
614 | + | * eats leading matches, although apparently we do achieve it somewhere in |
|
615 | + | * the test suite. |
|
629 | 616 | * |
|
630 | 617 | * n > m m > n |
|
631 | 618 | * |
638 | 625 | * - | |
|
639 | 626 | */ |
|
640 | 627 | ||
641 | - | //Rprintf("x: %d u: %d y: %d v: %d\n", ms.x, ms.u, ms.y, ms.v); |
|
642 | 628 | if (m > n) { |
|
643 | 629 | if (x == u) { |
|
644 | 630 | _edit(ctx, DIFF_MATCH, aoff, n); |
659 | 645 | // Should never get here since this should be a D 2 case |
|
660 | 646 | // nocov start |
|
661 | 647 | error( |
|
662 | - | "Very special case n %d m %d aoff %d boff %d u %d\n", n, m, |
|
663 | - | aoff, boff, ms.u |
|
648 | + | "%s d %d n %d m %d aoff %d boff %d u %d; contact maintainer\n", |
|
649 | + | "Logic Error: special case", d, n, m, aoff, boff, ms.u |
|
664 | 650 | ); |
|
665 | 651 | // nocov end |
|
666 | 652 | } |
681 | 667 | ) { |
|
682 | 668 | if(n < 0 || m < 0) |
|
683 | 669 | error("Logic Error: negative lengths; contact maintainer."); // nocov |
|
670 | + | if(n > INT_MAX - m) |
|
671 | + | error("Combined length of diffed vectors exeeds INT_MAX (%d)", INT_MAX); // nocov |
|
684 | 672 | struct _ctx ctx; |
|
685 | 673 | int d, x, y; |
|
686 | 674 | struct diff_edit *e = NULL; |
|
687 | 675 | int delta = n - m; |
|
688 | 676 | if(delta < 0) delta = -delta; |
|
677 | + | if(n + m > INT_MAX - delta) |
|
678 | + | error("Logic Error: exceeded max allowable combined string length."); // nocov |
|
679 | + | if(n + m + delta > INT_MAX / 4 - 1) |
|
680 | + | error("Logic Error: exceeded max allowable combined string length."); // nocov |
|
689 | 681 | int bufmax = 4 * (n + m + delta) + 1; // see _setv |
|
690 | - | if(bufmax < n || bufmax < m) |
|
691 | - | error("Logic Error: exceeded maximum allowable combined string length."); // nocov |
|
692 | 682 | ||
693 | 683 | int *tmp = (int *) R_alloc(bufmax, sizeof(int)); |
|
694 | 684 | for(int i = 0; i < bufmax; i++) *(tmp + i) = 0; |
702 | 692 | ctx.ses = ses; |
|
703 | 693 | ctx.si = 0; |
|
704 | 694 | ctx.simax = n + m; |
|
705 | - | ctx.dmax = dmax ? dmax : INT_MAX; |
|
695 | + | ctx.dmax = dmax >= 0 ? dmax : INT_MAX; |
|
706 | 696 | ctx.dmaxhit = 0; |
|
707 | 697 | ||
708 | 698 | /* initialize first ses edit struct*/ |
712 | 702 | } |
|
713 | 703 | e->op = 0; |
|
714 | 704 | } |
|
715 | - | ||
716 | 705 | /* The _ses function assumes the SES will begin or end with a delete |
|
717 | 706 | * or insert. The following will ensure this is true by eating any |
|
718 | 707 | * beginning matches. This is also a quick to process sequences |
|
719 | 708 | * that match entirely. |
|
720 | 709 | */ |
|
721 | 710 | x = y = 0; |
|
711 | + | if(boff > INT_MAX - m || aoff > INT_MAX - n) |
|
712 | + | error("Internal error: overflow for a/boff; contact maintainer"); //nocov |
|
713 | + | ||
722 | 714 | while (x < n && y < m) { |
|
723 | - | if(boff > INT_MAX - y) |
|
724 | - | error("Internal error: overflow for boff; contact maintainer"); //nocov |
|
725 | - | if(aoff > INT_MAX - x) |
|
726 | - | error("Internal error: overflow for aoff; contact maintainer"); //nocov |
|
727 | 715 | if(!_comp_chr(a, aoff + x, b, boff + y)) break; |
|
728 | 716 | x++; y++; |
|
729 | 717 | } |
164 | 164 | #' methods do. |
|
165 | 165 | #' |
|
166 | 166 | #' Strings are re-encoded to UTF-8 with \code{\link{enc2utf8}} prior to |
|
167 | - | #' comparison to avoid spurious encoding-only differences. |
|
167 | + | #' comparison to avoid encoding-only differences. |
|
168 | + | #' |
|
169 | + | #' The text representation of `target` and `current` should each have no more |
|
170 | + | #' than ~INT_MAX/4 lines. |
|
168 | 171 | #' |
|
169 | 172 | #' @section Matrices and Data Frames: |
|
170 | 173 | #' |
350 | 353 | #' particular diff is a function of how many differences, and also how much |
|
351 | 354 | #' \code{context} is used since context can cause two hunks to bleed into |
|
352 | 355 | #' each other and become one. |
|
353 | - | #' @param max.diffs integer(1L), number of \emph{differences} after which we |
|
354 | - | #' abandon the \code{O(n^2)} diff algorithm in favor of a naive element by |
|
355 | - | #' element comparison. Set to \code{-1L} to always stick to the original |
|
356 | - | #' algorithm (defaults to 50000L). |
|
356 | + | #' @param max.diffs integer(1L), number of \emph{differences} (default 50000L) |
|
357 | + | #' after which we abandon the \code{O(n^2)} diff algorithm in favor of a naive |
|
358 | + | #' \code{O(n)} one. Set to \code{-1L} to stick to the original algorithm up to |
|
359 | + | #' the maximum allowed (~INT_MAX/4). |
|
357 | 360 | #' @param disp.width integer(1L) number of display columns to take up; note that |
|
358 | 361 | #' in \dQuote{sidebyside} \code{mode} the effective display width is half this |
|
359 | 362 | #' number (set to 0L to use default widths which are \code{getOption("width")} |
34 | 34 | error("Logic Error: `max` not integer(1L) and not NA"); // nocov |
|
35 | 35 | ||
36 | 36 | int max_i = asInteger(max); |
|
37 | - | if(max_i < 0) max_i = 0; |
|
37 | + | if(max_i < 0) max_i = -1; |
|
38 | 38 | ||
39 | 39 | struct diff_edit *ses = (struct diff_edit *) |
|
40 | 40 | R_alloc(n + m + 1, sizeof(struct diff_edit)); |
Learn more Showing 1 files with coverage changes found.
R/text.R
a0c8ffe
a9fd6c5
431e17b
759b27e
4927783
5408af2
1a3b04f
bd47c53
f75a7fd
e4b2738
69919f3
e9c3c05
7b9302e
b47fc65
3bc3c6e
d6812cb
6ff14be
075eec8
99f3a81
2fce078
c0e5a32
e2fd3f8
0f965e7
9b5635e
2ee025c
cf8fbef
02bc5d4
5b90ad7
f75961e
375067d
baaa5f3
46a9fe3
e64e330
af33801
37606e3
ef0bb72
fc79c96
af1bf48
7d6b003
22b54fa
55af81e
979f92a
a243a5a
e718369
c5bfcf1
24fbb41
0177850