brodieG / diffobj

Compare 0177850 ... +45 ... a0c8ffe

No flags found

Use flags to group coverage reports by test type, project and/or folders.
Then setup custom commit statuses and notifications for each flag.

e.g., #unittest #integration

#production #enterprise

#frontend #backend

Learn more about Codecov Flags here.


@@ -153,8 +153,10 @@
Loading
153 153
#' raw diff data and not the printed output of \code{diffobj}, but do not wish
154 154
#' to manually parse the \code{ses} output.  Whether it is faster than
155 155
#' \code{ses} or not depends on the ratio of matching to non-matching values as
156 -
#' \code{ses_dat} includes matching values whereas \code{ses} does not.  See
157 -
#' examples.
156 +
#' \code{ses_dat} includes matching values whereas \code{ses} does not.
157 +
#' \code{ses_dat} objects have a print method that makes it easy to interpret
158 +
#' the diff, but are actually data.frames.  You can see the underlying data by
159 +
#' using \code{as.data.frame}, removing the "ses_dat" class, etc.
158 160
#'
159 161
#' @export
160 162
#' @param a character
@@ -166,17 +168,18 @@
Loading
166 168
#' @param warn TRUE (default) or FALSE whether to warn if we hit
167 169
#'   \code{max.diffs}.
168 170
#' @return character shortest edit script, or a machine readable version of it
169 -
#'   as a \code{data.frame} with columns \code{op} (factor, values
170 -
#'   \dQuote{Match}, \dQuote{Insert}, or \dQuote{Delete}), \code{val} character
171 -
#'   corresponding to the value taken from either \code{a} or \code{b},
172 -
#'   and if \code{extra} is TRUE, integer columns \code{id.a} and \code{id.b}
173 -
#'   corresponding to the indices in \code{a} or \code{b} that \code{val} was
174 -
#'   taken from.  See Details.
171 +
#'   as a \code{ses_dat} object, which is a \code{data.frame} with columns
172 +
#'   \code{op} (factor, values \dQuote{Match}, \dQuote{Insert}, or
173 +
#'   \dQuote{Delete}), \code{val} character corresponding to the value taken
174 +
#'   from either \code{a} or \code{b}, and if \code{extra} is TRUE, integer
175 +
#'   columns \code{id.a} and \code{id.b} corresponding to the indices in
176 +
#'   \code{a} or \code{b} that \code{val} was taken from.  See Details.
175 177
#' @examples
176 178
#' a <- letters[1:6]
177 179
#' b <- c('b', 'CC', 'DD', 'd', 'f')
178 180
#' ses(a, b)
179 181
#' (dat <- ses_dat(a, b))
182 +
#' str(dat)                 # data.frame with a print method
180 183
#'
181 184
#' ## use `ses_dat` output to construct a minimal diff
182 185
#' ## color with ANSI CSI SGR
@@ -237,7 +240,7 @@
Loading
237 240
  values <- character(length(id))
238 241
  values[use.a] <- a[id2[use.a]]
239 242
  values[use.b] <- b[id2[use.b]]
240 -
  if(extra) {
243 +
  res <- if(extra) {
241 244
    id.a <- id.b <- rep(NA_integer_, length(values))
242 245
    id.a[use.a] <- id2[use.a]
243 246
    id.b[use.b] <- id2[use.b]
@@ -248,7 +251,31 @@
Loading
248 251
  } else {
249 252
    data.frame(op=type2, val=values, stringsAsFactors=FALSE)
250 253
  }
254 +
  structure(res, class=c('ses_dat', class(res)))
251 255
}
256 +
#' @export
257 +
258 +
print.ses_dat <- function(x, quote=FALSE, ...) {
259 +
  op <- x[['op']]
260 +
  diff <- matrix(
261 +
    "", 3, nrow(x),
262 +
    dimnames=list(c('D:', 'M:', 'I:'), character(nrow(x)))
263 +
  )
264 +
  d <- op == 'Delete'
265 +
  m <- op == 'Match'
266 +
  i <- op == 'Insert'
267 +
  diff[1, d] <- x[['val']][d]
268 +
  diff[2, m] <- x[['val']][m]
269 +
  diff[3, i] <- x[['val']][i]
270 +
  writeLines(
271 +
    sprintf(
272 +
      "\"ses_dat\" object (Match: %d, Delete: %d, Insert: %d):",
273 +
      sum(m), sum(d), sum(i)
274 +
  ) )
275 +
  print(diff, quote=quote, ...)
276 +
  invisible(x)
277 +
}
278 +
252 279
# Internal validation fun for ses_*
253 280
254 281
ses_prep <- function(a, b, max.diffs, warn) {
@@ -299,12 +326,12 @@
Loading
299 326
#' @param a character
300 327
#' @param b character
301 328
#' @param max.diffs integer(1L) how many differences before giving up; set to
302 -
#'   zero to allow as many as there are
329 +
#'   -1 to allow as many as there are up to the maximum allowed (~INT_MAX/4).
303 330
#' @param warn TRUE or FALSE, whether to warn if we hit `max.diffs`.
304 331
#' @return MyersMbaSes object
305 332
#' @useDynLib diffobj, .registration=TRUE, .fixes="DIFFOBJ_"
306 333
307 -
diff_myers <- function(a, b, max.diffs=0L, warn=FALSE) {
334 +
diff_myers <- function(a, b, max.diffs=-1L, warn=FALSE) {
308 335
  stopifnot(
309 336
    is.character(a), is.character(b), all(!is.na(c(a, b))), is.int.1L(max.diffs),
310 337
    is.TF(warn)

@@ -285,7 +285,7 @@
Loading
285 285
            gsub(paste0(".*", ansi_regex, ".*"), "\\1", tail(x, 1L), perl=TRUE)
286 286
        ) }
287 287
        res
288 -
      } else x
288 +
      } else x # nocov has.cr elements can't have length zero after split...
289 289
    },
290 290
    character(1L)
291 291
  )

@@ -76,13 +76,33 @@
Loading
76 76
 *   script rather than simply saying the shortest edit script is longer than
77 77
 *   allowable diffs; this is all the `faux_snake` stuff.  This algorithm tries
78 78
 *   to salvage whatever the myers algo computed up to the point of max diffs
79 -
 * - Adding lots of comments as we worked through the logic
79 +
 * - Comments.
80 +
 */
81 +
/*
82 +
 * Terms
83 +
 *
84 +
 * * A: first string
85 +
 * * B: second string
86 +
 * * N: length of A
87 +
 * * M: length of B
88 +
 * * K: grid-diagonal, numbered from -M to N.  For each grid-diagonal K,
89 +
 *   X - Y == K, i.e. (3, 1) is on diagonal K == 2.
90 +
 * * D: Number of differences in a path.
91 +
 * * Snake: sequence of diagonal moves (i.e. matching substrings).  An edit
92 +
 *   script (path) will combine snakes with (possibly zero) right/down moves.  A
93 +
 *   D-path may have up to D + 1 snakes, some or all zero length.
94 +
 * * Middle Snake: Possibly zero length snake in which the forward and reverse
95 +
 *   paths meet in the Linear Space Refinement algorithm.  Defined in terms of
96 +
 *   (x, y, u, v) where (x, y) are the coordinates from (0, 0), and (u, v) those
97 +
 *   from (M, N).
80 98
 */
81 99
82 100
83 101
#include <stdlib.h>
84 102
#include "diffobj.h"
85 103
104 +
// See _v and _setv for details
105 +
86 106
#define FV(k) _v(ctx, (k), 0)
87 107
#define RV(k) _v(ctx, (k), 1)
88 108
@@ -124,9 +144,13 @@
Loading
124 144
}
125 145
*/
126 146
/*
127 -
 * k = diagonal number
128 -
 * val = x value
129 -
 * r = presumably whether we are looking up in reverse snakes
147 +
 * For each diagonal k, we only store the path that up to this point has gotten
148 +
 * furthest on it (well, two really, one from the forward and one from the
149 +
 * reverse directions)
150 +
 *
151 +
 * @param k = diagonal number
152 +
 * @param val = x value
153 +
 * @param r = 0 for forward snake, 1 for backward snake
130 154
 */
131 155
  static void
132 156
_setv(struct _ctx *ctx, int k, int r, int val)
@@ -151,8 +175,9 @@
Loading
151 175
}
152 176
/*
153 177
 * For any given `k` diagonal, return the x coordinate of the furthest reaching
154 -
 * path we've found.  Use `r` to look for the x coordinate for the paths that
155 -
 * are starting from the bottom right instead of top left
178 +
 * path we've found.
179 +
 *
180 +
 * See `_setv`.
156 181
 */
157 182
  static int
158 183
_v(struct _ctx *ctx, int k, int r)
@@ -175,9 +200,8 @@
Loading
175 200
 *
176 201
 * Needs to account for special case when indices are oob for the strings.  The
177 202
 * oob checks seem to be necessary since algo is requesting reads past length
178 -
 * of string, but not sure if that is intentional or not.  In theory it should
179 -
 * not be, but perhaps this was handled gracefully by the varray business b4
180 -
 * we changed it.
203 +
 * of string (maybe this worked with the varrays).  As the current algo is
204 +
 * written, it does not seem to be the case that we even attempt oob reads.
181 205
 */
182 206
int _comp_chr(SEXP a, int aidx, SEXP b, int bidx) {
183 207
  int alen = XLENGTH(a);
@@ -189,141 +213,70 @@
Loading
189 213
    comp = 1;
190 214
    // nocov end
191 215
  } else if(aidx >= alen || bidx >= blen) {
192 -
    comp = 0;
216 +
    comp = 0;  // nocov
193 217
  } else comp = STRING_ELT(a, aidx) == STRING_ELT(b, bidx);
194 218
  return(comp);
195 219
}
196 220
/*
197 221
 * Handle cases where differences exceed maximum allowable differences
198 222
 *
199 -
 * General logic is to create a faux snake that instead of moving down
200 -
 * diagonally will chain right and down moves until it hits a path coming
201 -
 * from the other direction.  This snake is stored in `ctx`, and is then
202 -
 * written by `ses` to the `ses` list.
223 +
 * General logic is to try to connect the prior furthest points by naively
224 +
 * incrementing in each dimension and hoping for some diagonal runs.
225 +
 *
226 +
 * @param a character vector
227 +
 * @param b character vector
228 +
 * @param d number of differences associated with the backward snake implicit in
229 +
 *   the ms.u/v values.
230 +
 * forward loops in difference seeking; this is not the same as the
231 +
 *   number of differences as in each loop we find a forward and backward
232 +
 *   difference.
203 233
 */
204 234
static int
205 235
_find_faux_snake(
206 -
  SEXP a, int aoff, int n, SEXP b, int boff, int m, struct _ctx *ctx,
207 -
  struct middle_snake *ms, int d, diff_op ** faux_snake
236 +
  SEXP a, int aoff, int n, SEXP b, int boff, int m,
237 +
  struct middle_snake *ms, diff_op ** faux_snake, int d
208 238
) {
209 -
  /* normally we would record k/x values at the end of the furthest reaching
210 -
   * snake, but here we need pick a path from top left  and extend it until
211 -
   * we hit something coming from bottom right.
212 -
   */
213 -
  /* start by finding which diagonal has the furthest reaching value
214 -
   * when looking from top left
215 -
   */
216 -
  int k_max_f = 0, x_max_f = -1;
217 -
  int x_f, y_f, k_f;
218 -
  int delta = n - m;
219 -
220 -
  for (int k = d - 1; k >= -d + 1; k -= 2) { /* might need to shift by 1 */
221 -
    int x_f = FV(k);
222 -
    int f_dist = x_f - abs(k);
239 +
  int x = ms->x;
240 +
  int y = ms->y;
241 +
  // Should switch to unsigned int...
242 +
  if(x < 0 || y < 0 || ms->u < 0 || ms->v < 0) 
243 +
      error("Internal Error: fake snake with -ve start; contact maintainer.");  // nocov
223 244
224 -
    if(x_f > n || x_f - k > m) continue;
225 -
226 -
    if(f_dist > x_max_f - abs(k_max_f)) {
227 -
      x_max_f = x_f;
228 -
      k_max_f = k;
229 -
    }
230 -
  }
231 -
  /* didn't find a path so use origin */
232 -
  if(x_max_f < 0) {
233 -
    // nocov start
234 -
    error(err_msg_ubrnch, 2);
235 -
    x_f = y_f = k_f = 0;
236 -
    // nocov end
237 -
  } else {
238 -
    k_f = k_max_f;
239 -
    x_f = x_max_f;
240 -
    y_f = x_f - k_max_f;
241 -
  }
242 -
  /*
243 -
   * now look for the furthest reaching point in any diagonal that is
244 -
   * below the diagonal we found above since those are the only ones we
245 -
   * can connect to
246 -
   *
247 -
   */
248 -
  int k_max_r = 0, x_max_r = n + 1;
249 -
  int x_r, y_r;
250 -
251 -
  for (int k = -d; k <= k_max_f - delta; k += 2) {
252 -
    int x_r = RV(k);
253 -
    int r_dist = n - x_r - abs(k);
254 -
    /* skip reverse snakes that overshoot our forward snake
255 -
     * ---\
256 -
     *     \
257 -
     *   \
258 -
     *    \
259 -
     *     \---
260 -
     * where there should be a decent path we can use, but because we are only
261 -
     * tracking the last coordinates we don't really have a way connecting this
262 -
     * type of path so we just go straight to the origin even though that is
263 -
     * even more sub-optinal; not even sure if this is a possible scenario
264 -
     */
265 -
    if(x_r < x_f || x_r - k - delta < y_f) continue;
266 -
267 -
    /* since buffer is init to zero, an x_r value of zero means nothing, and
268 -
     * not all the way to the left of the graph; also, in reverse snakes the
269 -
     * snake should end at x == 1 in the leftmost case (we think)
270 -
     */
271 -
    if(r_dist > n - x_max_r - abs(k_max_r) && x_r) {
272 -
      x_max_r = x_r;
273 -
      k_max_r = k;
274 -
    }
275 -
  }
276 -
  if(x_max_r >= n) {
277 -
    x_r = n; y_r = m;
278 -
  } else {
279 -
    x_r = x_max_r;
280 -
    /* not 100% sure about this one; seems like k_max_r is relative to the
281 -
     * bottom right origin, so maybe this should be x_r - k_max_r - delta?
282 -
     */
283 -
    y_r = x_r - k_max_r - delta;
284 -
  }
285 -
  /*
286 -
   * attempt to connect the two paths we found.  We need to store this
287 -
   * information as our "faux" snake since it will have to be processed
288 -
   * in a manner similar as the middle snake would be processed; start by
289 -
   * figuring out max number of steps it would take to connect the two
290 -
   * paths
291 -
   */
292 -
  int max_steps = x_r - x_f + y_r - y_f + 1;
293 245
  int steps = 0;
294 -
  int diffs = 0;
246 +
  int diffs = 0;    // only diffs from fake snake
295 247
  int step_dir = 1; /* last direction we moved in, 1 is down */
296 -
  int x_sn = x_f, y_sn = y_f;
297 248
298 -
  /* initialize the fake snake */
299 -
  if(max_steps < 0)
249 +
  if(x > ms->u || y > ms->v) {
250 +
    // Overshot backward snake, e.g. you hit a long diagonal run that overshoots
251 +
    // the prior backward closest point.  In this case toss backward snake.
252 +
    ms->u = n;
253 +
    ms->v = m;
254 +
    diffs -= d;  // we're also tossing accrued differences from back snake
255 +
    if(x > ms->u || y > ms->v)
256 +
      error("Internal Error: can't correct fwd snake overshoot; contact maintainer"); // nocov
257 +
  }
258 +
  if(ms->u > INT_MAX - ms->v - 1) // x/y positive, so this is conservative
300 259
    error("Logic Error: fake snake step overflow? Contact maintainer."); // nocov
301 260
261 +
  int max_steps;
262 +
  max_steps = (ms->u - x) + (ms->v - y) + 1;
263 +
302 264
  diff_op * faux_snake_tmp = (diff_op*) R_alloc(max_steps, sizeof(diff_op));
303 265
  for(int i = 0; i < max_steps; i++) *(faux_snake_tmp + i) = DIFF_NULL;
304 -
305 -
  /* we have a further reaching reverse snake:
306 -
   * not entirely sure if this should happen, but it seems it does
307 -
   */
308 -
  while(x_sn < x_r || y_sn < y_r) {
309 -
    if(x_sn > x_r || y_sn > y_r) {
310 -
      error("Logic Error: Exceeded buffer for finding fake snake; contact maintainer.");  // nocov
311 -
    }
312 -
    /* check to see if we could possibly move on a diagonal, and do so
313 -
     * if possible, if not alternate going down and right*/
266 +
  while((x < ms->u) || (y < ms->v)) {
314 267
    if(
315 -
        x_sn <= x_r && y_sn <= y_r &&
316 -
        _comp_chr(a, aoff + x_sn, b, boff + y_sn)
268 +
      x < ms->u && y < ms->v &&
269 +
      _comp_chr(a, aoff + x, b, boff + y)
317 270
    ) {
318 -
      x_sn++; y_sn++;
271 +
      x++; y++;
319 272
      *(faux_snake_tmp + steps) = DIFF_MATCH;
320 -
    } else if (x_sn < x_r && (step_dir || y_sn >= y_r)) {
321 -
      x_sn++;
273 +
    } else if (x < ms->u && (step_dir || y >= ms->v)) {
274 +
      x++;
322 275
      diffs++;
323 276
      step_dir = !step_dir;
324 277
      *(faux_snake_tmp + steps) = DIFF_DELETE;
325 -
    } else if (y_sn < y_r && (!step_dir || x_sn >= x_r)) {
326 -
      y_sn++;
278 +
    } else if (y < ms->v && (!step_dir || x >= ms->u)) {
279 +
      y++;
327 280
      diffs++;
328 281
      *(faux_snake_tmp + steps) = DIFF_INSERT;
329 282
      step_dir = !step_dir;
@@ -332,23 +285,10 @@
Loading
332 285
    }
333 286
    steps++;
334 287
  }
335 -
  /* corner cases; must absolutely make sure steps LT max_steps since we rely
336 -
   * on at least one zero at the end of the faux_snake when we read it to know
337 -
   * to stop reading it
338 -
   */
339 -
  if(x_sn != x_r || y_sn != y_r || steps >= max_steps) {
288 +
  if(x != ms->u || y != ms->v || steps >= max_steps) {
340 289
    error("Logic Error: faux snake process failed; contact maintainer."); // nocov
341 290
  }
342 -
  /* modify the pointer to the pointer so we can return in by ref */
343 -
344 291
  *faux_snake = faux_snake_tmp;
345 -
346 -
  /* record the coordinates of our faux snake using `ms` */
347 -
  ms->x = x_f;
348 -
  ms->y = y_f;
349 -
  ms->u = x_r;
350 -
  ms->v = y_r;
351 -
352 292
  return diffs;
353 293
}
354 294
@@ -360,63 +300,99 @@
Loading
360 300
 * the maximum number of differences, we return to `_ses` with the number of
361 301
 * differences found.  `_ses` will then attempt to stitch back the snakes
362 302
 * together.
303 +
 *
304 +
 * @param ms tracks beginning (x,y) and end (u,v) coords of the middle snake
363 305
 */
364 306
  static int
365 307
_find_middle_snake(
366 308
  SEXP a, int aoff, int n, SEXP b, int boff, int m, struct _ctx *ctx,
367 309
  struct middle_snake *ms, diff_op ** faux_snake
368 310
) {
369 311
  int delta, odd, mid, d;
312 +
  int x_max, y_max, v_max, u_max;
313 +
  ms->x = x_max = 0;
314 +
  ms->y = y_max = 0;
315 +
  ms->u = u_max = n;
316 +
  ms->v = v_max = m;
317 +
  double dist = (x_max - u_max) * (x_max - u_max) +
318 +
    (y_max - v_max) * (y_max - v_max);
370 319
371 320
  delta = n - m;
372 321
  odd = delta & 1;
373 -
  mid = (n + m) / 2;
322 +
  mid = (n + m) / 2;  // we check in `diff` that this won't overflow int
374 323
  mid += odd;
375 324
376 325
  _setv(ctx, 1, 0, 0);
377 326
  _setv(ctx, delta - 1, 1, n);
378 327
379 328
  /* For each number of differences `d`, compute the farthest reaching paths
380 329
   * from both the top left and bottom right of the edit graph
330 +
   *
331 +
   * First loop does NOT actually find a difference, which makes all the
332 +
   * difference calculations weird.
381 333
   */
382 334
  for (d = 0; d <= mid; d++) {
383 335
    int k, x, y;
384 336
385 -
    /* reached maximum allowable differences before real exit condition*/
386 -
    if ((2 * d - 1) >= ctx->dmax) {
337 +
    /* Reached maximum allowable differences before real exit condition.
338 +
     * Each loop iteration finds up to 2 d differences (one forward, one
339 +
     * backward).
340 +
     *
341 +
     * We know there is going to be at least one more difference because there
342 +
     * must be at least one for us to get here, and there might be two if the
343 +
     * extra forward difference doesn't find the end.
344 +
     */
345 +
    if (2 * (d - 1) > ctx->dmax - 1) {
346 +
      // So far we've found 2*(d - 1) differences
387 347
      ctx->dmaxhit = 1;
388 -
      return _find_faux_snake(a, aoff, n, b, boff, m, ctx, ms, d, faux_snake);
348 +
      ms->x = x_max; ms->y = y_max; ms->u = u_max; ms->v = v_max;
349 +
      return 2 * (d - 1) + _find_faux_snake(
350 +
        a, aoff, n, b, boff, m, ms, faux_snake, d - 1
351 +
      );
389 352
    }
390 -
    /* Forward (from top left) paths*/
391 -
353 +
    /* Forward (from top left) paths */
354 +
355 +
    // // Alternate looping picks path closest to middle diagonal.  If we change
356 +
    // // this we also should change it for backward paths.  This leads to more
357 +
    // // compact diffs, but TBD whether this is good IRL so we abandon it for
358 +
    // // now to avoid introducing behavior change.
359 +
    // int ki = 0;
360 +
    // k = d % 2 ? 1 : 0;
361 +
    // for (;
362 +
    //   k >= -d && k <= d;
363 +
    //   ki++, k += 2 * ki * (ki % 2 ? -1 : 1)
364 +
    // ) {
392 365
    for (k = d; k >= -d; k -= 2) {
366 +
      // If at lowest possible diag, or not at highest and next diag up is
367 +
      // further along in x, move to the right, otherwise move down.
393 368
      if (k == -d || (k != d && FV(k - 1) < FV(k + 1))) {
394 -
        x = FV(k + 1);
369 +
        x = FV(k + 1);      // move to the right, effectively
395 370
      } else {
396 -
        x = FV(k - 1) + 1;
371 +
        x = FV(k - 1) + 1;  // move down, effectively
397 372
      }
398 373
      y = x - k;
399 374
400 375
      ms->x = x;
401 376
      ms->y = y;
402 377
      while(x < n && y < m && _comp_chr(a, aoff + x, b, boff + y)) {
403 -
        /* matching characters, just walk down diagonal */
404 -
        x++; y++;
378 +
        x++; y++;  /* matching characters, just walk down diagonal */
379 +
      }
380 +
      double dist_new = (x - u_max) * (x - u_max) + (y - v_max) * (y - v_max);
381 +
      if(x <= n && y <= m && dist_new < dist) {
382 +
        dist = dist_new;
383 +
        x_max = x;
384 +
        y_max = y;
405 385
      }
406 386
      _setv(ctx, k, 0, x);
407 387
408 -
      /* for this diagonal we (think we) are now at farthest reaching point for
409 -
       * a given d.  Then return if:
410 -
       * - If we're at the edge of the addressable part of the graph
411 -
       * - The reverse snakes are already overlapping in the `x` coordinate
388 +
      /* For this diagonal k we are now at farthest reaching point for a given
389 +
       * `d`.  Then return if:
412 390
       *
413 -
       * then it means that the only way to get to the snake coming from the
414 -
       * other direction is by either moving down or across for every remaining
415 -
       * move, so record the current coord as `u` and `v` and return
391 +
       * - We're at the edge of the addressable part of the graph
392 +
       * - The reverse snakes are already overlapping in the `x` coordinate
416 393
       *
417 -
       * Note that for the backward snake we reverse xy and uv so that the
418 -
       * matching snake is always defined in `ms` as starting at `ms.(xy)` and
419 -
       * ending at `ms.(uv)`
394 +
       * For the backward snake we reverse xy and uv so that the matching snake
395 +
       * is defined as starting at `ms.(xy)` and ending at `ms.(uv)`
420 396
       */
421 397
      if (odd && k >= (delta - (d - 1)) && k <= (delta + (d - 1))) {
422 398
        if (x >= RV(k)) {
@@ -426,8 +402,19 @@
Loading
426 402
        }
427 403
      }
428 404
    }
429 -
    /* Backwards (from bottom right) paths*/
430 -
405 +
    // Check again if we'd go over by engaging the reverse snake
406 +
    if (2 * d > ctx->dmax) {
407 +
      // So far we've found 2*(d - 1) differences
408 +
      ctx->dmaxhit = 1;
409 +
      ms->x = x_max; ms->y = y_max; ms->u = u_max; ms->v = v_max;
410 +
      return 2 * (d - 1) + 1 + _find_faux_snake(
411 +
        a, aoff, n, b, boff, m, ms, faux_snake, d - 1
412 +
      );
413 +
    }
414 +
    /* Backwards (from bottom right) paths (see forward loop).  The two loops
415 +
     * are very similar so it is tempting to fold them into each other now, but
416 +
     * would require some work + ensuring no performance degradation.
417 +
     */
431 418
    for (k = d; k >= -d; k -= 2) {
432 419
      int kr = (n - m) + k;
433 420
@@ -445,6 +432,12 @@
Loading
445 432
        /* matching characters, just walk up diagonal */
446 433
        x--; y--;
447 434
      }
435 +
      double dist_new = (x_max - x) * (x_max - x) + (y_max - y) * (y_max - y);
436 +
      if(x >= 0 && y >= 0 && dist_new < dist) {
437 +
        dist = dist_new;
438 +
        u_max = x;
439 +
        v_max = y;
440 +
      }
448 441
      _setv(ctx, kr, 1, x);
449 442
450 443
      /* see comments in forward section */
@@ -524,21 +517,19 @@
Loading
524 517
  struct middle_snake ms;
525 518
  int d;
526 519
527 -
  //Rprintf("m: %d n: %d\n", m, n);
528 520
  if (n == 0) {
529 521
    _edit(ctx, DIFF_INSERT, boff, m);
530 522
    d = m;
531 523
  } else if (m == 0) {
532 524
    _edit(ctx, DIFF_DELETE, aoff, n);
533 525
    d = n;
534 526
  } else {
535 -
    /* Find the middle "snake" around which we
536 -
     * recursively solve the sub-problems.  Note this modifies `ms` by ref to
537 -
     * set the beginning and end coordinates of the snake of the furthest
538 -
     * reaching path.  The beginning is always the top left part of the snake,
539 -
     * irrespective of whether it was found on a forward or reverse path as
540 -
     * f_m_s will flip the coordinates when appropriately when recording them
541 -
     * in `ms`
527 +
    /* Find the middle "snake" around which we recursively solve the
528 +
     * sub-problems.  Note this modifies `ms` by ref to set the beginning and
529 +
     * end coordinates of the snake of the furthest reaching path.  The
530 +
     * beginning is always the top left part of the snake, irrespective of
531 +
     * whether it was found on a forward or reverse path as f_m_s will flip the
532 +
     * coordinates appropriately when recording them in `ms`
542 533
     *
543 534
     * Additionally, if diffs exceed max.diffs, then `faux.snake` will also
544 535
     * be set.  `faux_snake` is a pointer to a pointer that points to the
@@ -549,15 +540,8 @@
Loading
549 540
    diff_op fsv = DIFF_NULL;
550 541
    diff_op * faux_snake;
551 542
    faux_snake = &fsv;
552 -
    //
553 -
    // d
554 -
    // diff_op * fsp = NULL;
555 -
    // diff_op fsv = DIFF_NULL;
556 -
    // *fsp = fsv;
557 -
    // **faux_snake = *fsp;
558 543
559 544
    d = _find_middle_snake(a, aoff, n, b, boff, m, ctx, &ms, &faux_snake);
560 -
    //Rprintf("d: %d\n", d);
561 545
    if (d == -1) {
562 546
      // nocov start
563 547
      error(
@@ -569,7 +553,7 @@
Loading
569 553
      error(err_msg_ubrnch, 6);
570 554
      return d;
571 555
      // nocov end
572 -
    } else if (d > 1) {
556 +
    } else if (d != 1) {
573 557
      /* in this case we have something along the lines of (note the non-
574 558
       * diagonal bits are just non-diagonal, we're making no claims about
575 559
       * whether they should or could be of the horizontal variety)
@@ -579,6 +563,9 @@
Loading
579 563
       *        \- ...
580 564
       * so we will record the snake (diagonal) in the middle, and recurse
581 565
       * on the stub at the beginning and on the stub at the end separately
566 +
       *
567 +
       * Also have d == 0 case which can happen when the backward snake only has
568 +
       * matches.
582 569
       */
583 570
584 571
      /* Beginning stub */
@@ -613,19 +600,19 @@
Loading
613 600
      boff += ms.v;
614 601
      n -= ms.u;
615 602
      m -= ms.v;
616 -
      if (_ses(a, aoff, n, b, boff, m, ctx) == -1) {
603 +
      if(_ses(a, aoff, n, b, boff, m, ctx) == -1) {
617 604
        // nocov start
618 605
        error("Logic error: failed trying to run ses 2; contact maintainer.");
619 606
        // nocov end
620 607
      }
621 -
    } else {
608 +
    } else if (d == 1) {
622 609
      int x = ms.x;
623 610
      int u = ms.u;
624 611
625 -
      /* There are only 4 base cases when the
626 -
       * edit distance is 1.  Having a hard time finding cases that trigger the
627 -
       * x == u, possibly because the algo eats leading matches, although
628 -
       * apparently we do achieve it somewhere in the test suite.
612 +
      /* There are only 4 base cases when the edit distance is 1.  Having a hard
613 +
       * time finding cases that trigger the x == u, possibly because the algo
614 +
       * eats leading matches, although apparently we do achieve it somewhere in
615 +
       * the test suite.
629 616
       *
630 617
       * n > m   m > n
631 618
       *
@@ -638,7 +625,6 @@
Loading
638 625
       *     -       |
639 626
       */
640 627
641 -
      //Rprintf("x: %d u: %d y: %d v: %d\n",  ms.x, ms.u, ms.y, ms.v);
642 628
      if (m > n) {
643 629
        if (x == u) {
644 630
          _edit(ctx, DIFF_MATCH, aoff, n);
@@ -659,8 +645,8 @@
Loading
659 645
        // Should never get here since this should be a D 2 case
660 646
        // nocov start
661 647
        error(
662 -
          "Very special case n %d m %d aoff %d boff %d u %d\n", n, m,
663 -
          aoff, boff, ms.u
648 +
          "%s d %d n %d m %d aoff %d boff %d u %d; contact maintainer\n",
649 +
          "Logic Error: special case", d, n, m, aoff, boff, ms.u
664 650
        );
665 651
        // nocov end
666 652
      }
@@ -681,14 +667,18 @@
Loading
681 667
) {
682 668
  if(n < 0 || m < 0)
683 669
    error("Logic Error: negative lengths; contact maintainer.");  // nocov
670 +
  if(n > INT_MAX - m)
671 +
    error("Combined length of diffed vectors exeeds INT_MAX (%d)", INT_MAX);  // nocov
684 672
  struct _ctx ctx;
685 673
  int d, x, y;
686 674
  struct diff_edit *e = NULL;
687 675
  int delta = n - m;
688 676
  if(delta < 0) delta = -delta;
677 +
  if(n + m > INT_MAX - delta)
678 +
    error("Logic Error: exceeded max allowable combined string length.");  // nocov
679 +
  if(n + m + delta > INT_MAX / 4 - 1)
680 +
    error("Logic Error: exceeded max allowable combined string length.");  // nocov
689 681
  int bufmax = 4 * (n + m + delta) + 1;  // see _setv
690 -
  if(bufmax < n || bufmax < m)
691 -
    error("Logic Error: exceeded maximum allowable combined string length.");  // nocov
692 682
693 683
  int *tmp = (int *) R_alloc(bufmax, sizeof(int));
694 684
  for(int i = 0; i < bufmax; i++) *(tmp + i) = 0;
@@ -702,7 +692,7 @@
Loading
702 692
  ctx.ses = ses;
703 693
  ctx.si = 0;
704 694
  ctx.simax = n + m;
705 -
  ctx.dmax = dmax ? dmax : INT_MAX;
695 +
  ctx.dmax = dmax >= 0 ? dmax : INT_MAX;
706 696
  ctx.dmaxhit = 0;
707 697
708 698
  /* initialize first ses edit struct*/
@@ -712,18 +702,16 @@
Loading
712 702
    }
713 703
    e->op = 0;
714 704
  }
715 -
716 705
  /* The _ses function assumes the SES will begin or end with a delete
717 706
   * or insert. The following will ensure this is true by eating any
718 707
   * beginning matches. This is also a quick to process sequences
719 708
   * that match entirely.
720 709
   */
721 710
  x = y = 0;
711 +
  if(boff > INT_MAX - m || aoff > INT_MAX - n)
712 +
      error("Internal error: overflow for a/boff; contact maintainer"); //nocov
713 +
722 714
  while (x < n && y < m) {
723 -
    if(boff > INT_MAX - y)
724 -
      error("Internal error: overflow for boff; contact maintainer"); //nocov
725 -
    if(aoff > INT_MAX - x)
726 -
      error("Internal error: overflow for aoff; contact maintainer"); //nocov
727 715
    if(!_comp_chr(a, aoff + x, b, boff + y)) break;
728 716
    x++; y++;
729 717
  }

@@ -164,7 +164,10 @@
Loading
164 164
#' methods do.
165 165
#'
166 166
#' Strings are re-encoded to UTF-8 with \code{\link{enc2utf8}} prior to
167 -
#' comparison to avoid spurious encoding-only differences.
167 +
#' comparison to avoid encoding-only differences.
168 +
#'
169 +
#' The text representation of `target` and `current` should each have no more
170 +
#' than ~INT_MAX/4 lines.
168 171
#'
169 172
#' @section Matrices and Data Frames:
170 173
#'
@@ -350,10 +353,10 @@
Loading
350 353
#'   particular diff is a function of how many differences, and also how much
351 354
#'   \code{context} is used since context can cause two hunks to bleed into
352 355
#'   each other and become one.
353 -
#' @param max.diffs integer(1L), number of \emph{differences} after which we
354 -
#'   abandon the \code{O(n^2)} diff algorithm in favor of a naive element by
355 -
#'   element comparison. Set to \code{-1L} to always stick to the original
356 -
#'   algorithm (defaults to 50000L).
356 +
#' @param max.diffs integer(1L), number of \emph{differences} (default 50000L)
357 +
#'   after which we abandon the \code{O(n^2)} diff algorithm in favor of a naive
358 +
#'   \code{O(n)} one. Set to \code{-1L} to stick to the original algorithm up to
359 +
#'   the maximum allowed (~INT_MAX/4).
357 360
#' @param disp.width integer(1L) number of display columns to take up; note that
358 361
#'   in \dQuote{sidebyside} \code{mode} the effective display width is half this
359 362
#'   number (set to 0L to use default widths which are \code{getOption("width")}

@@ -34,7 +34,7 @@
Loading
34 34
    error("Logic Error: `max` not integer(1L) and not NA"); // nocov
35 35
36 36
  int max_i = asInteger(max);
37 -
  if(max_i < 0) max_i = 0;
37 +
  if(max_i < 0) max_i = -1;
38 38
39 39
  struct diff_edit *ses = (struct diff_edit *)
40 40
    R_alloc(n + m + 1, sizeof(struct diff_edit));

Learn more Showing 1 files with coverage changes found.

Changes in R/text.R
-1
+1
Loading file...

47 Commits

Hiding 43 contextual commits
+18
+18
Files Coverage
R 0.06% 99.64%
src -0.71% 90.77%
Project Totals (28 files) 99.01%
Loading