brodieG / diffobj

@@ -164,7 +164,10 @@
Loading
164 164
#' methods do.
165 165
#'
166 166
#' Strings are re-encoded to UTF-8 with \code{\link{enc2utf8}} prior to
167 -
#' comparison to avoid spurious encoding-only differences.
167 +
#' comparison to avoid encoding-only differences.
168 +
#'
169 +
#' The text representation of `target` and `current` should each have no more
170 +
#' than ~INT_MAX/4 lines.
168 171
#'
169 172
#' @section Matrices and Data Frames:
170 173
#'
@@ -350,10 +353,10 @@
Loading
350 353
#'   particular diff is a function of how many differences, and also how much
351 354
#'   \code{context} is used since context can cause two hunks to bleed into
352 355
#'   each other and become one.
353 -
#' @param max.diffs integer(1L), number of \emph{differences} after which we
354 -
#'   abandon the \code{O(n^2)} diff algorithm in favor of a naive element by
355 -
#'   element comparison. Set to \code{-1L} to always stick to the original
356 -
#'   algorithm (defaults to 50000L).
356 +
#' @param max.diffs integer(1L), number of \emph{differences} (default 50000L)
357 +
#'   after which we abandon the \code{O(n^2)} diff algorithm in favor of a naive
358 +
#'   \code{O(n)} one. Set to \code{-1L} to stick to the original algorithm up to
359 +
#'   the maximum allowed (~INT_MAX/4).
357 360
#' @param disp.width integer(1L) number of display columns to take up; note that
358 361
#'   in \dQuote{sidebyside} \code{mode} the effective display width is half this
359 362
#'   number (set to 0L to use default widths which are \code{getOption("width")}

@@ -153,8 +153,10 @@
Loading
153 153
#' raw diff data and not the printed output of \code{diffobj}, but do not wish
154 154
#' to manually parse the \code{ses} output.  Whether it is faster than
155 155
#' \code{ses} or not depends on the ratio of matching to non-matching values as
156 -
#' \code{ses_dat} includes matching values whereas \code{ses} does not.  See
157 -
#' examples.
156 +
#' \code{ses_dat} includes matching values whereas \code{ses} does not.
157 +
#' \code{ses_dat} objects have a print method that makes it easy to interpret
158 +
#' the diff, but are actually data.frames.  You can see the underlying data by
159 +
#' using \code{as.data.frame}, removing the "ses_dat" class, etc.
158 160
#'
159 161
#' @export
160 162
#' @param a character
@@ -166,17 +168,18 @@
Loading
166 168
#' @param warn TRUE (default) or FALSE whether to warn if we hit
167 169
#'   \code{max.diffs}.
168 170
#' @return character shortest edit script, or a machine readable version of it
169 -
#'   as a \code{data.frame} with columns \code{op} (factor, values
170 -
#'   \dQuote{Match}, \dQuote{Insert}, or \dQuote{Delete}), \code{val} character
171 -
#'   corresponding to the value taken from either \code{a} or \code{b},
172 -
#'   and if \code{extra} is TRUE, integer columns \code{id.a} and \code{id.b}
173 -
#'   corresponding to the indices in \code{a} or \code{b} that \code{val} was
174 -
#'   taken from.  See Details.
171 +
#'   as a \code{ses_dat} object, which is a \code{data.frame} with columns
172 +
#'   \code{op} (factor, values \dQuote{Match}, \dQuote{Insert}, or
173 +
#'   \dQuote{Delete}), \code{val} character corresponding to the value taken
174 +
#'   from either \code{a} or \code{b}, and if \code{extra} is TRUE, integer
175 +
#'   columns \code{id.a} and \code{id.b} corresponding to the indices in
176 +
#'   \code{a} or \code{b} that \code{val} was taken from.  See Details.
175 177
#' @examples
176 178
#' a <- letters[1:6]
177 179
#' b <- c('b', 'CC', 'DD', 'd', 'f')
178 180
#' ses(a, b)
179 181
#' (dat <- ses_dat(a, b))
182 +
#' str(dat)                 # data.frame with a print method
180 183
#'
181 184
#' ## use `ses_dat` output to construct a minimal diff
182 185
#' ## color with ANSI CSI SGR
@@ -237,7 +240,7 @@
Loading
237 240
  values <- character(length(id))
238 241
  values[use.a] <- a[id2[use.a]]
239 242
  values[use.b] <- b[id2[use.b]]
240 -
  if(extra) {
243 +
  res <- if(extra) {
241 244
    id.a <- id.b <- rep(NA_integer_, length(values))
242 245
    id.a[use.a] <- id2[use.a]
243 246
    id.b[use.b] <- id2[use.b]
@@ -248,7 +251,31 @@
Loading
248 251
  } else {
249 252
    data.frame(op=type2, val=values, stringsAsFactors=FALSE)
250 253
  }
254 +
  structure(res, class=c('ses_dat', class(res)))
251 255
}
256 +
#' @export
257 +
258 +
print.ses_dat <- function(x, quote=FALSE, ...) {
259 +
  op <- x[['op']]
260 +
  diff <- matrix(
261 +
    "", 3, nrow(x),
262 +
    dimnames=list(c('D:', 'M:', 'I:'), character(nrow(x)))
263 +
  )
264 +
  d <- op == 'Delete'
265 +
  m <- op == 'Match'
266 +
  i <- op == 'Insert'
267 +
  diff[1, d] <- x[['val']][d]
268 +
  diff[2, m] <- x[['val']][m]
269 +
  diff[3, i] <- x[['val']][i]
270 +
  writeLines(
271 +
    sprintf(
272 +
      "\"ses_dat\" object (Match: %d, Delete: %d, Insert: %d):",
273 +
      sum(m), sum(d), sum(i)
274 +
  ) )
275 +
  print(diff, quote=quote, ...)
276 +
  invisible(x)
277 +
}
278 +
252 279
# Internal validation fun for ses_*
253 280
254 281
ses_prep <- function(a, b, max.diffs, warn) {
@@ -299,12 +326,12 @@
Loading
299 326
#' @param a character
300 327
#' @param b character
301 328
#' @param max.diffs integer(1L) how many differences before giving up; set to
302 -
#'   zero to allow as many as there are
329 +
#'   -1 to allow as many as there are up to the maximum allowed (~INT_MAX/4).
303 330
#' @param warn TRUE or FALSE, whether to warn if we hit `max.diffs`.
304 331
#' @return MyersMbaSes object
305 332
#' @useDynLib diffobj, .registration=TRUE, .fixes="DIFFOBJ_"
306 333
307 -
diff_myers <- function(a, b, max.diffs=0L, warn=FALSE) {
334 +
diff_myers <- function(a, b, max.diffs=-1L, warn=FALSE) {
308 335
  stopifnot(
309 336
    is.character(a), is.character(b), all(!is.na(c(a, b))), is.int.1L(max.diffs),
310 337
    is.TF(warn)

@@ -76,13 +76,33 @@
Loading
76 76
 *   script rather than simply saying the shortest edit script is longer than
77 77
 *   allowable diffs; this is all the `faux_snake` stuff.  This algorithm tries
78 78
 *   to salvage whatever the myers algo computed up to the point of max diffs
79 -
 * - Adding lots of comments as we worked through the logic
79 +
 * - Comments.
80 +
 */
81 +
/*
82 +
 * Terms
83 +
 *
84 +
 * * A: first string
85 +
 * * B: second string
86 +
 * * N: length of A
87 +
 * * M: length of B
88 +
 * * K: grid-diagonal, numbered from -M to N.  For each grid-diagonal K,
89 +
 *   X - Y == K, i.e. (3, 1) is on diagonal K == 2.
90 +
 * * D: Number of differences in a path.
91 +
 * * Snake: sequence of diagonal moves (i.e. matching substrings).  An edit
92 +
 *   script (path) will combine snakes with (possibly zero) right/down moves.  A
93 +
 *   D-path may have up to D + 1 snakes, some or all zero length.
94 +
 * * Middle Snake: Possibly zero length snake in which the forward and reverse
95 +
 *   paths meet in the Linear Space Refinement algorithm.  Defined in terms of
96 +
 *   (x, y, u, v) where (x, y) are the coordinates from (0, 0), and (u, v) those
97 +
 *   from (M, N).
80 98
 */
81 99
82 100
83 101
#include <stdlib.h>
84 102
#include "diffobj.h"
85 103
104 +
// See _v and _setv for details
105 +
86 106
#define FV(k) _v(ctx, (k), 0)
87 107
#define RV(k) _v(ctx, (k), 1)
88 108
@@ -124,9 +144,13 @@
Loading
124 144
}
125 145
*/
126 146
/*
127 -
 * k = diagonal number
128 -
 * val = x value
129 -
 * r = presumably whether we are looking up in reverse snakes
147 +
 * For each diagonal k, we only store the path that up to this point has gotten
148 +
 * furthest on it (well, two really, one from the forward and one from the
149 +
 * reverse directions)
150 +
 *
151 +
 * @param k = diagonal number
152 +
 * @param val = x value
153 +
 * @param r = 0 for forward snake, 1 for backward snake
130 154
 */
131 155
  static void
132 156
_setv(struct _ctx *ctx, int k, int r, int val)
@@ -151,8 +175,9 @@
Loading
151 175
}
152 176
/*
153 177
 * For any given `k` diagonal, return the x coordinate of the furthest reaching
154 -
 * path we've found.  Use `r` to look for the x coordinate for the paths that
155 -
 * are starting from the bottom right instead of top left
178 +
 * path we've found.
179 +
 *
180 +
 * See `_setv`.
156 181
 */
157 182
  static int
158 183
_v(struct _ctx *ctx, int k, int r)
@@ -175,9 +200,8 @@
Loading
175 200
 *
176 201
 * Needs to account for special case when indices are oob for the strings.  The
177 202
 * oob checks seem to be necessary since algo is requesting reads past length
178 -
 * of string, but not sure if that is intentional or not.  In theory it should
179 -
 * not be, but perhaps this was handled gracefully by the varray business b4
180 -
 * we changed it.
203 +
 * of string (maybe this worked with the varrays).  As the current algo is
204 +
 * written, it does not seem to be the case that we even attempt oob reads.
181 205
 */
182 206
int _comp_chr(SEXP a, int aidx, SEXP b, int bidx) {
183 207
  int alen = XLENGTH(a);
@@ -189,141 +213,70 @@
Loading
189 213
    comp = 1;
190 214
    // nocov end
191 215
  } else if(aidx >= alen || bidx >= blen) {
192 -
    comp = 0;
216 +
    comp = 0;  // nocov
193 217
  } else comp = STRING_ELT(a, aidx) == STRING_ELT(b, bidx);
194 218
  return(comp);
195 219
}
196 220
/*
197 221
 * Handle cases where differences exceed maximum allowable differences
198 222
 *
199 -
 * General logic is to create a faux snake that instead of moving down
200 -
 * diagonally will chain right and down moves until it hits a path coming
201 -
 * from the other direction.  This snake is stored in `ctx`, and is then
202 -
 * written by `ses` to the `ses` list.
223 +
 * General logic is to try to connect the prior furthest points by naively
224 +
 * incrementing in each dimension and hoping for some diagonal runs.
225 +
 *
226 +
 * @param a character vector
227 +
 * @param b character vector
228 +
 * @param d number of differences associated with the backward snake implicit in
229 +
 *   the ms.u/v values.
230 +
 *   This is a count of loops in difference seeking; this is not the same as the
231 +
 *   number of differences as in each loop we find a forward and backward
232 +
 *   difference.
203 233
 */
204 234
static int
205 235
_find_faux_snake(
206 -
  SEXP a, int aoff, int n, SEXP b, int boff, int m, struct _ctx *ctx,
207 -
  struct middle_snake *ms, int d, diff_op ** faux_snake
236 +
  SEXP a, int aoff, int n, SEXP b, int boff, int m,
237 +
  struct middle_snake *ms, diff_op ** faux_snake, int d
208 238
) {
209 -
  /* normally we would record k/x values at the end of the furthest reaching
210 -
   * snake, but here we need pick a path from top left  and extend it until
211 -
   * we hit something coming from bottom right.
212 -
   */
213 -
  /* start by finding which diagonal has the furthest reaching value
214 -
   * when looking from top left
215 -
   */
216 -
  int k_max_f = 0, x_max_f = -1;
217 -
  int x_f, y_f, k_f;
218 -
  int delta = n - m;
219 -
220 -
  for (int k = d - 1; k >= -d + 1; k -= 2) { /* might need to shift by 1 */
221 -
    int x_f = FV(k);
222 -
    int f_dist = x_f - abs(k);
239 +
  int x = ms->x;
240 +
  int y = ms->y;
241 +
  // Should switch to unsigned int...
242 +
  if(x < 0 || y < 0 || ms->u < 0 || ms->v < 0) 
243 +
      error("Internal Error: fake snake with -ve start; contact maintainer.");  // nocov
223 244
224 -
    if(x_f > n || x_f - k > m) continue;
225 -
226 -
    if(f_dist > x_max_f - abs(k_max_f)) {
227 -
      x_max_f = x_f;
228 -
      k_max_f = k;
229 -
    }
230 -
  }
231 -
  /* didn't find a path so use origin */
232 -
  if(x_max_f < 0) {
233 -
    // nocov start
234 -
    error(err_msg_ubrnch, 2);
235 -
    x_f = y_f = k_f = 0;
236 -
    // nocov end
237 -
  } else {
238 -
    k_f = k_max_f;
239 -
    x_f = x_max_f;
240 -
    y_f = x_f - k_max_f;
241 -
  }
242 -
  /*
243 -
   * now look for the furthest reaching point in any diagonal that is
244 -
   * below the diagonal we found above since those are the only ones we
245 -
   * can connect to
246 -
   *
247 -
   */
248 -
  int k_max_r = 0, x_max_r = n + 1;
249 -
  int x_r, y_r;
250 -
251 -
  for (int k = -d; k <= k_max_f - delta; k += 2) {
252 -
    int x_r = RV(k);
253 -
    int r_dist = n - x_r - abs(k);
254 -
    /* skip reverse snakes that overshoot our forward snake
255 -
     * ---\
256 -
     *     \
257 -
     *   \
258 -
     *    \
259 -
     *     \---
260 -
     * where there should be a decent path we can use, but because we are only
261 -
     * tracking the last coordinates we don't really have a way connecting this
262 -
     * type of path so we just go straight to the origin even though that is
263 -
     * even more sub-optinal; not even sure if this is a possible scenario
264 -
     */
265 -
    if(x_r < x_f || x_r - k - delta < y_f) continue;
266 -
267 -
    /* since buffer is init to zero, an x_r value of zero means nothing, and
268 -
     * not all the way to the left of the graph; also, in reverse snakes the
269 -
     * snake should end at x == 1 in the leftmost case (we think)
270 -
     */
271 -
    if(r_dist > n - x_max_r - abs(k_max_r) && x_r) {
272 -
      x_max_r = x_r;
273 -
      k_max_r = k;
274 -
    }
275 -
  }
276 -
  if(x_max_r >= n) {
277 -
    x_r = n; y_r = m;
278 -
  } else {
279 -
    x_r = x_max_r;
280 -
    /* not 100% sure about this one; seems like k_max_r is relative to the
281 -
     * bottom right origin, so maybe this should be x_r - k_max_r - delta?
282 -
     */
283 -
    y_r = x_r - k_max_r - delta;
284 -
  }
285 -
  /*
286 -
   * attempt to connect the two paths we found.  We need to store this
287 -
   * information as our "faux" snake since it will have to be processed
288 -
   * in a manner similar as the middle snake would be processed; start by
289 -
   * figuring out max number of steps it would take to connect the two
290 -
   * paths
291 -
   */
292 -
  int max_steps = x_r - x_f + y_r - y_f + 1;
293 245
  int steps = 0;
294 -
  int diffs = 0;
246 +
  int diffs = 0;    // only diffs from fake snake
295 247
  int step_dir = 1; /* last direction we moved in, 1 is down */
296 -
  int x_sn = x_f, y_sn = y_f;
297 248
298 -
  /* initialize the fake snake */
299 -
  if(max_steps < 0)
249 +
  if(x > ms->u || y > ms->v) {
250 +
    // Overshot backward snake, e.g. you hit a long diagonal run that overshoots
251 +
    // the prior backward closest point.  In this case toss backward snake.
252 +
    ms->u = n;
253 +
    ms->v = m;
254 +
    diffs -= d;  // we're also tossing accrued differences from back snake
255 +
    if(x > ms->u || y > ms->v)
256 +
      error("Internal Error: can't correct fwd snake overshoot; contact maintainer"); // nocov
257 +
  }
258 +
  if(ms->u > INT_MAX - ms->v - 1) // x/y positive, so this is conservative
300 259
    error("Logic Error: fake snake step overflow? Contact maintainer."); // nocov
301 260
261 +
  int max_steps;
262 +
  max_steps = (ms->u - x) + (ms->v - y) + 1;
263 +
302 264
  diff_op * faux_snake_tmp = (diff_op*) R_alloc(max_steps, sizeof(diff_op));
303 265
  for(int i = 0; i < max_steps; i++) *(faux_snake_tmp + i) = DIFF_NULL;
304 -
305 -
  /* we have a further reaching reverse snake:
306 -
   * not entirely sure if this should happen, but it seems it does
307 -
   */
308 -
  while(x_sn < x_r || y_sn < y_r) {
309 -
    if(x_sn > x_r || y_sn > y_r) {
310 -
      error("Logic Error: Exceeded buffer for finding fake snake; contact maintainer.");  // nocov
311 -
    }
312 -
    /* check to see if we could possibly move on a diagonal, and do so
313 -
     * if possible, if not alternate going down and right*/
266 +
  while((x < ms->u) || (y < ms->v)) {
314 267
    if(
315 -
        x_sn <= x_r && y_sn <= y_r &&
316 -
        _comp_chr(a, aoff + x_sn, b, boff + y_sn)
268 +
      x < ms->u && y < ms->v &&
269 +
      _comp_chr(a, aoff + x, b, boff + y)
317 270
    ) {
318 -
      x_sn++; y_sn++;
271 +
      x++; y++;
319 272
      *(faux_snake_tmp + steps) = DIFF_MATCH;
320 -
    } else if (x_sn < x_r && (step_dir || y_sn >= y_r)) {
321 -
      x_sn++;
273 +
    } else if (x < ms->u && (step_dir || y >= ms->v)) {
274 +
      x++;
322 275
      diffs++;
323 276
      step_dir = !step_dir;
324 277
      *(faux_snake_tmp + steps) = DIFF_DELETE;
325 -
    } else if (y_sn < y_r && (!step_dir || x_sn >= x_r)) {
326 -
      y_sn++;
278 +
    } else if (y < ms->v && (!step_dir || x >= ms->u)) {
279 +
      y++;
327 280
      diffs++;
328 281
      *(faux_snake_tmp + steps) = DIFF_INSERT;
329 282
      step_dir = !step_dir;
@@ -332,23 +285,10 @@
Loading
332 285
    }
333 286
    steps++;
334 287
  }
335 -
  /* corner cases; must absolutely make sure steps LT max_steps since we rely
336 -
   * on at least one zero at the end of the faux_snake when we read it to know
337 -
   * to stop reading it
338 -
   */
339 -
  if(x_sn != x_r || y_sn != y_r || steps >= max_steps) {
288 +
  if(x != ms->u || y != ms->v || steps >= max_steps) {
340 289
    error("Logic Error: faux snake process failed; contact maintainer."); // nocov
341 290
  }
342 -
  /* modify the pointer to the pointer so we can return in by ref */
343 -
344 291
  *faux_snake = faux_snake_tmp;
345 -
346 -
  /* record the coordinates of our faux snake using `ms` */
347 -
  ms->x = x_f;
348 -
  ms->y = y_f;
349 -
  ms->u = x_r;
350 -
  ms->v = y_r;
351 -
352 292
  return diffs;
353 293
}
354 294
@@ -360,6 +300,8 @@
Loading
360 300
 * the maximum number of differences, we return to `_ses` with the number of
361 301
 * differences found.  `_ses` will then attempt to stitch back the snakes
362 302
 * together.
303 +
 *
304 +
 * @param ms tracks beginning (x,y) and end (u,v) coords of the middle snake
363 305
 */
364 306
  static int
365 307
_find_middle_snake(
@@ -367,10 +309,17 @@
Loading
367 309
  struct middle_snake *ms, diff_op ** faux_snake
368 310
) {
369 311
  int delta, odd, mid, d;
312 +
  int x_max, y_max, v_max, u_max;
313 +
  ms->x = x_max = 0;
314 +
  ms->y = y_max = 0;
315 +
  ms->u = u_max = n;
316 +
  ms->v = v_max = m;
317 +
  double dist = (x_max - u_max) * (x_max - u_max) +
318 +
    (y_max - v_max) * (y_max - v_max);
370 319
371 320
  delta = n - m;
372 321
  odd = delta & 1;
373 -
  mid = (n + m) / 2;
322 +
  mid = (n + m) / 2;  // we check in `diff` that this won't overflow int
374 323
  mid += odd;
375 324
376 325
  _setv(ctx, 1, 0, 0);
@@ -378,45 +327,72 @@
Loading
378 327
379 328
  /* For each number of differences `d`, compute the farthest reaching paths
380 329
   * from both the top left and bottom right of the edit graph
330 +
   *
331 +
   * First loop does NOT actually find a difference, which makes all the
332 +
   * difference calculations weird.
381 333
   */
382 334
  for (d = 0; d <= mid; d++) {
383 335
    int k, x, y;
384 336
385 -
    /* reached maximum allowable differences before real exit condition*/
386 -
    if ((2 * d - 1) >= ctx->dmax) {
337 +
    /* Reached maximum allowable differences before real exit condition.
338 +
     * Each loop iteration finds up to 2 d differences (one forward, one
339 +
     * backward).
340 +
     *
341 +
     * We know there is going to be at least one more difference because there
342 +
     * must be at least one for us to get here, and there might be two if the
343 +
     * extra forward difference doesn't find the end.
344 +
     */
345 +
    if (2 * (d - 1) > ctx->dmax - 1) {
346 +
      // So far we've found 2*(d - 1) differences
387 347
      ctx->dmaxhit = 1;
388 -
      return _find_faux_snake(a, aoff, n, b, boff, m, ctx, ms, d, faux_snake);
348 +
      ms->x = x_max; ms->y = y_max; ms->u = u_max; ms->v = v_max;
349 +
      return 2 * (d - 1) + _find_faux_snake(
350 +
        a, aoff, n, b, boff, m, ms, faux_snake, d - 1
351 +
      );
389 352
    }
390 -
    /* Forward (from top left) paths*/
391 -
353 +
    /* Forward (from top left) paths */
354 +
355 +
    // // Alternate looping picks path closest to middle diagonal.  If we change
356 +
    // // this we also should change it for backward paths.  This leads to more
357 +
    // // compact diffs, but TBD whether this is good IRL so we abandon it for
358 +
    // // now to avoid introducing behavior change.
359 +
    // int ki = 0;
360 +
    // k = d % 2 ? 1 : 0;
361 +
    // for (;
362 +
    //   k >= -d && k <= d;
363 +
    //   ki++, k += 2 * ki * (ki % 2 ? -1 : 1)
364 +
    // ) {
392 365
    for (k = d; k >= -d; k -= 2) {
366 +
      // If at lowest possible diag, or not at highest and next diag up is
367 +
      // further along in x, move to the right, otherwise move down.
393 368
      if (k == -d || (k != d && FV(k - 1) < FV(k + 1))) {
394 -
        x = FV(k + 1);
369 +
        x = FV(k + 1);      // move to the right, effectively
395 370
      } else {
396 -
        x = FV(k - 1) + 1;
371 +
        x = FV(k - 1) + 1;  // move down, effectively
397 372
      }
398 373
      y = x - k;
399 374
400 375
      ms->x = x;
401 376
      ms->y = y;
402 377
      while(x < n && y < m && _comp_chr(a, aoff + x, b, boff + y)) {
403 -
        /* matching characters, just walk down diagonal */
404 -
        x++; y++;
378 +
        x++; y++;  /* matching characters, just walk down diagonal */
379 +
      }
380 +
      double dist_new = (x - u_max) * (x - u_max) + (y - v_max) * (y - v_max);
381 +
      if(x <= n && y <= m && dist_new < dist) {
382 +
        dist = dist_new;
383 +
        x_max = x;
384 +
        y_max = y;
405 385
      }
406 386
      _setv(ctx, k, 0, x);
407 387
408 -
      /* for this diagonal we (think we) are now at farthest reaching point for
409 -
       * a given d.  Then return if:
410 -
       * - If we're at the edge of the addressable part of the graph
411 -
       * - The reverse snakes are already overlapping in the `x` coordinate
388 +
      /* For this diagonal k we are now at farthest reaching point for a given
389 +
       * `d`.  Then return if:
412 390
       *
413 -
       * then it means that the only way to get to the snake coming from the
414 -
       * other direction is by either moving down or across for every remaining
415 -
       * move, so record the current coord as `u` and `v` and return
391 +
       * - We're at the edge of the addressable part of the graph
392 +
       * - The reverse snakes are already overlapping in the `x` coordinate
416 393
       *
417 -
       * Note that for the backward snake we reverse xy and uv so that the
418 -
       * matching snake is always defined in `ms` as starting at `ms.(xy)` and
419 -
       * ending at `ms.(uv)`
394 +
       * For the backward snake we reverse xy and uv so that the matching snake
395 +
       * is defined in `ms` as starting at `ms.(xy)` and ending at `ms.(uv)`
420 396
       */
421 397
      if (odd && k >= (delta - (d - 1)) && k <= (delta + (d - 1))) {
422 398
        if (x >= RV(k)) {
@@ -426,8 +402,19 @@
Loading
426 402
        }
427 403
      }
428 404
    }
429 -
    /* Backwards (from bottom right) paths*/
430 -
405 +
    // Check again if we'd go over by engaging the reverse snake
406 +
    if (2 * d > ctx->dmax) {
407 +
      // So far we've found 2*(d - 1) differences
408 +
      ctx->dmaxhit = 1;
409 +
      ms->x = x_max; ms->y = y_max; ms->u = u_max; ms->v = v_max;
410 +
      return 2 * (d - 1) + 1 + _find_faux_snake(
411 +
        a, aoff, n, b, boff, m, ms, faux_snake, d - 1
412 +
      );
413 +
    }
414 +
    /* Backwards (from bottom right) paths (see forward loop).  The two loops
415 +
     * are very similar so it is tempting to fold them into each other now, but
416 +
     * would require some work + ensuring no performance degradation.
417 +
     */
431 418
    for (k = d; k >= -d; k -= 2) {
432 419
      int kr = (n - m) + k;
433 420
@@ -445,6 +432,12 @@
Loading
445 432
        /* matching characters, just walk up diagonal */
446 433
        x--; y--;
447 434
      }
435 +
      double dist_new = (x_max - x) * (x_max - x) + (y_max - y) * (y_max - y);
436 +
      if(x >= 0 && y >= 0 && dist_new < dist) {
437 +
        dist = dist_new;
438 +
        u_max = x;
439 +
        v_max = y;
440 +
      }
448 441
      _setv(ctx, kr, 1, x);
449 442
450 443
      /* see comments in forward section */
@@ -524,7 +517,6 @@
Loading
524 517
  struct middle_snake ms;
525 518
  int d;
526 519
527 -
  //Rprintf("m: %d n: %d\n", m, n);
528 520
  if (n == 0) {
529 521
    _edit(ctx, DIFF_INSERT, boff, m);
530 522
    d = m;
@@ -532,13 +524,12 @@
Loading
532 524
    _edit(ctx, DIFF_DELETE, aoff, n);
533 525
    d = n;
534 526
  } else {
535 -
    /* Find the middle "snake" around which we
536 -
     * recursively solve the sub-problems.  Note this modifies `ms` by ref to
537 -
     * set the beginning and end coordinates of the snake of the furthest
538 -
     * reaching path.  The beginning is always the top left part of the snake,
539 -
     * irrespective of whether it was found on a forward or reverse path as
540 -
     * f_m_s will flip the coordinates when appropriately when recording them
541 -
     * in `ms`
527 +
    /* Find the middle "snake" around which we recursively solve the
528 +
     * sub-problems.  Note this modifies `ms` by ref to set the beginning and
529 +
     * end coordinates of the snake of the furthest reaching path.  The
530 +
     * beginning is always the top left part of the snake, irrespective of
531 +
     * whether it was found on a forward or reverse path as f_m_s will flip the
532 +
     * coordinates appropriately when recording them in `ms`
542 533
     *
543 534
     * Additionally, if diffs exceed max.diffs, then `faux_snake` will also
544 535
     * be set.  `faux_snake` is a pointer to a pointer that points to the
@@ -549,15 +540,8 @@
Loading
549 540
    diff_op fsv = DIFF_NULL;
550 541
    diff_op * faux_snake;
551 542
    faux_snake = &fsv;
552 -
    //
553 -
    // d
554 -
    // diff_op * fsp = NULL;
555 -
    // diff_op fsv = DIFF_NULL;
556 -
    // *fsp = fsv;
557 -
    // **faux_snake = *fsp;
558 543
559 544
    d = _find_middle_snake(a, aoff, n, b, boff, m, ctx, &ms, &faux_snake);
560 -
    //Rprintf("d: %d\n", d);
561 545
    if (d == -1) {
562 546
      // nocov start
563 547
      error(
@@ -569,7 +553,7 @@
Loading
569 553
      error(err_msg_ubrnch, 6);
570 554
      return d;
571 555
      // nocov end
572 -
    } else if (d > 1) {
556 +
    } else if (d != 1) {
573 557
      /* in this case we have something along the lines of (note the non-
574 558
       * diagonal bits are just non-diagonal, we're making no claims about
575 559
       * whether they should or could be of the horizontal variety)
@@ -579,6 +563,9 @@
Loading
579 563
       *        \- ...
580 564
       * so we will record the snake (diagonal) in the middle, and recurse
581 565
       * on the stub at the beginning and on the stub at the end separately
566 +
       *
567 +
       * Also have d == 0 case which can happen when the backward snake only has
568 +
       * matches.
582 569
       */
583 570
584 571
      /* Beginning stub */
@@ -613,19 +600,19 @@
Loading
613 600
      boff += ms.v;
614 601
      n -= ms.u;
615 602
      m -= ms.v;
616 -
      if (_ses(a, aoff, n, b, boff, m, ctx) == -1) {
603 +
      if(_ses(a, aoff, n, b, boff, m, ctx) == -1) {
617 604
        // nocov start
618 605
        error("Logic error: failed trying to run ses 2; contact maintainer.");
619 606
        // nocov end
620 607
      }
621 -
    } else {
608 +
    } else if (d == 1) {
622 609
      int x = ms.x;
623 610
      int u = ms.u;
624 611
625 -
      /* There are only 4 base cases when the
626 -
       * edit distance is 1.  Having a hard time finding cases that trigger the
627 -
       * x == u, possibly because the algo eats leading matches, although
628 -
       * apparently we do achieve it somewhere in the test suite.
612 +
      /* There are only 4 base cases when the edit distance is 1.  Having a hard
613 +
       * time finding cases that trigger the x == u, possibly because the algo
614 +
       * eats leading matches, although apparently we do achieve it somewhere in
615 +
       * the test suite.
629 616
       *
630 617
       * n > m   m > n
631 618
       *
@@ -638,7 +625,6 @@
Loading
638 625
       *     -       |
639 626
       */
640 627
641 -
      //Rprintf("x: %d u: %d y: %d v: %d\n",  ms.x, ms.u, ms.y, ms.v);
642 628
      if (m > n) {
643 629
        if (x == u) {
644 630
          _edit(ctx, DIFF_MATCH, aoff, n);
@@ -659,8 +645,8 @@
Loading
659 645
        // Should never get here since this should be a D 2 case
660 646
        // nocov start
661 647
        error(
662 -
          "Very special case n %d m %d aoff %d boff %d u %d\n", n, m,
663 -
          aoff, boff, ms.u
648 +
          "%s d %d n %d m %d aoff %d boff %d u %d; contact maintainer\n",
649 +
          "Logic Error: special case", d, n, m, aoff, boff, ms.u
664 650
        );
665 651
        // nocov end
666 652
      }
@@ -681,14 +667,18 @@
Loading
681 667
) {
682 668
  if(n < 0 || m < 0)
683 669
    error("Logic Error: negative lengths; contact maintainer.");  // nocov
670 +
  if(n > INT_MAX - m)
671 +
    error("Combined length of diffed vectors exeeds INT_MAX (%d)", INT_MAX);  // nocov
684 672
  struct _ctx ctx;
685 673
  int d, x, y;
686 674
  struct diff_edit *e = NULL;
687 675
  int delta = n - m;
688 676
  if(delta < 0) delta = -delta;
677 +
  if(n + m > INT_MAX - delta)
678 +
    error("Logic Error: exceeded max allowable combined string length.");  // nocov
679 +
  if(n + m + delta > INT_MAX / 4 - 1)
680 +
    error("Logic Error: exceeded max allowable combined string length.");  // nocov
689 681
  int bufmax = 4 * (n + m + delta) + 1;  // see _setv
690 -
  if(bufmax < n || bufmax < m)
691 -
    error("Logic Error: exceeded maximum allowable combined string length.");  // nocov
692 682
693 683
  int *tmp = (int *) R_alloc(bufmax, sizeof(int));
694 684
  for(int i = 0; i < bufmax; i++) *(tmp + i) = 0;
@@ -702,7 +692,7 @@
Loading
702 692
  ctx.ses = ses;
703 693
  ctx.si = 0;
704 694
  ctx.simax = n + m;
705 -
  ctx.dmax = dmax ? dmax : INT_MAX;
695 +
  ctx.dmax = dmax >= 0 ? dmax : INT_MAX;
706 696
  ctx.dmaxhit = 0;
707 697
708 698
  /* initialize first ses edit struct*/
@@ -712,18 +702,16 @@
Loading
712 702
    }
713 703
    e->op = 0;
714 704
  }
715 -
716 705
  /* The _ses function assumes the SES will begin or end with a delete
717 706
   * or insert. The following will ensure this is true by eating any
718 707
   * beginning matches. This is also a quick to process sequences
719 708
   * that match entirely.
720 709
   */
721 710
  x = y = 0;
711 +
  if(boff > INT_MAX - m || aoff > INT_MAX - n)
712 +
      error("Internal error: overflow for a/boff; contact maintainer"); //nocov
713 +
722 714
  while (x < n && y < m) {
723 -
    if(boff > INT_MAX - y)
724 -
      error("Internal error: overflow for boff; contact maintainer"); //nocov
725 -
    if(aoff > INT_MAX - x)
726 -
      error("Internal error: overflow for aoff; contact maintainer"); //nocov
727 715
    if(!_comp_chr(a, aoff + x, b, boff + y)) break;
728 716
    x++; y++;
729 717
  }

@@ -285,7 +285,7 @@
Loading
285 285
            gsub(paste0(".*", ansi_regex, ".*"), "\\1", tail(x, 1L), perl=TRUE)
286 286
        ) }
287 287
        res
288 -
      } else x
288 +
      } else x # nocov has.cr elements can't have length zero after split...
289 289
    },
290 290
    character(1L)
291 291
  )

@@ -34,7 +34,7 @@
Loading
34 34
    error("Logic Error: `max` not integer(1L) and not NA"); // nocov
35 35
36 36
  int max_i = asInteger(max);
37 -
  if(max_i < 0) max_i = 0;
37 +
  if(max_i < 0) max_i = -1;
38 38
39 39
  struct diff_edit *ses = (struct diff_edit *)
40 40
    R_alloc(n + m + 1, sizeof(struct diff_edit));
Files Coverage
R 99.64%
src 90.77%
Project Totals (28 files) 99.01%
Sunburst
The inner-most circle is the entire project, moving away from the center are folders then, finally, a single file. The size and color of each slice is representing the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project. Proceeding with folders and finally individual files. The size and color of each slice is representing the number of statements and the coverage, respectively.
Grid
Each block represents a single file in the project. The size and color of each block is represented by the number of statements and the coverage, respectively.
Loading