1
|
|
#' @export
|
2
|
|
|
3
|
|
mgsub_censor = function(string, pattern, censor="*", split=any(nchar(censor) > 1), seed=NULL, ...) {
|
4
|
|
#' @title Safe, multiple censoring of text strings
|
5
|
|
#'
|
6
|
|
#' @description \code{mgsub_censor} - A safe, simultaneous, multiple global string censoring
|
7
|
|
#' (replace matches with a censoring character like '*')
|
8
|
|
#'
|
9
|
|
#' @param string a character vector to censor
|
10
|
|
#' @param pattern regular expressions used to identify where to censor
|
11
|
|
#' @param censor character to use in censoring - see details
|
12
|
|
#' @param split if a multicharacter censor pattern is provided, should it be
|
13
|
|
#' split to preserve original string length
|
14
|
|
#' @param seed optional parameter to fix sampling of multicharacter censors
|
15
|
|
#' @param \dots arguments to pass to \code{\link[base:grep]{regexpr}} /
|
16
|
|
#' \code{\link[base:grep]{sub}}
|
17
|
|
#' @rdname mgsub_censor
|
18
|
|
#' @return Censored string.
|
19
|
|
#' @details When censor is provided as a >1 length vector or as a multicharacter
|
20
|
|
#' string with split = TRUE, it will be sampled to return random censoring patterns.
|
21
|
|
#' This can be helpful if you want to create cartoonish swear censoring. If
|
22
|
|
#' needed, the randomization can be controlled with the seed argument.
|
23
|
|
#'
|
24
|
|
#' @examples
|
25
|
|
#' mgsub_censor("Flowers for a friend", pattern=c("low"), censor="*")
|
26
|
|
|
27
|
6
|
if (all(is.na(string))) return(string)
|
28
|
6
|
sna = !is.na(string)
|
29
|
6
|
result = vapply(string[sna],
|
30
|
6
|
censor_worker,
|
31
|
6
|
c(""),
|
32
|
6
|
USE.NAMES = FALSE,
|
33
|
6
|
pattern = pattern,
|
34
|
6
|
censor = censor,
|
35
|
6
|
split = split,
|
36
|
6
|
seed = seed,
|
37
|
|
...)
|
38
|
6
|
string[sna] = result
|
39
|
6
|
return(string)
|
40
|
|
}
|
41
|
|
|
42
|
|
censor_worker = function(string, pattern, censor, split=any(nchar(censor) > 1), seed=NULL, ...) {
|
43
|
|
#' @title mgsub_censor worker
|
44
|
|
#'
|
45
|
|
#' @description The hard worker doing everything for mgsub_censor
|
46
|
|
#'
|
47
|
|
#' @param string a character vector where replacements are sought
|
48
|
|
#' @param pattern Character string to be matched in the given character vector
|
49
|
|
#' @param censor character to use in censoring - see details
|
50
|
|
#' @param split if a multicharacter censor pattern is provided, should it be
|
51
|
|
#' split to preserve original string length
|
52
|
|
#' @param seed optional parameter to fix sampling of multicharacter censors
|
53
|
|
#' @param \dots arguments to pass to regexpr family
|
54
|
|
|
55
|
6
|
x0 = do.call(rbind, lapply(seq_along(pattern),
|
56
|
6
|
get_matches,
|
57
|
6
|
string = string,
|
58
|
6
|
pattern = pattern,
|
59
|
|
...))
|
60
|
6
|
x0 = matrix(x0[x0[, 2] != -1, ], ncol = 4)
|
61
|
6
|
if (nrow(x0) == 0) return(string)
|
62
|
6
|
if (nrow(x0) > 1) {
|
63
|
6
|
x = x0[order(x0[, 3], decreasing = T), ]
|
64
|
6
|
x = filter_overlap(x) #nolint
|
65
|
6
|
x = x[order(x[, 2]), , drop = FALSE]
|
66
|
|
} else {
|
67
|
6
|
x = x0
|
68
|
|
}
|
69
|
6
|
for (i in rev(seq_len(nrow(x)))) {
|
70
|
6
|
s = x[i, 2]
|
71
|
6
|
e = x[i, 4]
|
72
|
6
|
p = pattern[x[i, 1]]
|
73
|
6
|
if (split) censor = unlist(strsplit(censor, ""))
|
74
|
6
|
if (!is.null(seed)) set.seed(seed)
|
75
|
6
|
r = if (length(censor) > 1) {
|
76
|
6
|
paste(sample(censor, x[i, 3], replace = TRUE), collapse = "")
|
77
|
|
} else {
|
78
|
6
|
paste(rep(censor, x[i, 3]), collapse = "")
|
79
|
|
}
|
80
|
6
|
pre = if (s > 1) substr(string, 1, s - 1) else ""
|
81
|
6
|
r0 = sub(p, r, substr(string, s, e), ...)
|
82
|
6
|
end = if (e < nchar(string)) substr(string, e + 1, nchar(string)) else ""
|
83
|
6
|
string = paste0(pre, r0, end)
|
84
|
|
}
|
85
|
6
|
return(string)
|
86
|
|
}
|