carloscinelli / benford.analysis

Compare 357f30c ... +3 ... 1a375bf

No flags found

Use flags to group coverage reports by test type, project and/or folders.
Then setup custom commit statuses and notifications for each flag.

e.g., #unittest #integration

#production #enterprise

#frontend #backend

Learn more about Codecov Flags here.

Showing 2 of 8 files from the diff.

@@ -1,4 +1,18 @@
Loading
1 -
#### Benford ####
1 +
### TODO
2 +
# - last-two digits
3 +
# - verificar se os números do last-two são inteiros (resto da divisão por 1)
4 +
# - possível solução: se todos forem inteiros, "arrendodar" e não pegar "depois do ponto"
5 +
#   - emitir warning falando que todos numeros parecem ser inteiros, então last-two não pegou decimal
6 +
# - tem que ter uma opção para definir quantos decimais existem no número (inteiro = zero, dinheiro = 2)
7 +
#    - argumento "round", "digits".
8 +
9 +
10 +
11 +
12 +
13 +
# Tests -------------------------------------------------------------------
14 +
15 +
2 16
3 17
DF <- function(data){
4 18
  data <- data[data >= 10]
@@ -109,6 +123,15 @@
Loading
109 123
  return(out)
110 124
}
111 125
126 +
127 +
128 +
129 +
# Extraction functions ----------------------------------------------------
130 +
131 +
132 +
133 +
134 +
112 135
#' @title Extracts the leading digits from the data
113 136
#' @description It extracts the leading digits from the data.
114 137
#' 
@@ -168,21 +191,47 @@
Loading
168 191
  return(results)
169 192
}
170 193
194 +
#' @title Trunc decimal values.
195 +
#' @description It trunc decimal values from the data.
196 +
#' 
197 +
#'This function is used by the main function of the package \code{\link{benford}}
198 +
#' to trunc the values in its ndec argument to the specified number of decimal places. 
199 +
#'
200 +
#' @param x a numeric vector. 
201 +
#' @param n.dec integer indicating the number of decimal places (trunc) to be used.
202 +
#' @return 
203 +
#' @export
204 +
205 +
truncDec <- function(x, n.dec = 2){
206 +
  int <- floor(x)
207 +
  isDec <- grepl("\\.", as.character(x))
208 +
  dec <- sub("^[0-9]*\\.", "", as.character(x))
209 +
  truncateDec <- substr(dec, 1, n.dec)
210 +
  truncatedNumber <- vector("character", length = length(dec))
211 +
  truncatedNumber[isDec] <- paste0(int[isDec], ".", dec[isDec])
212 +
  truncatedNumber[!isDec] <- as.character(int[!isDec])
213 +
  truncatedNumber <- as.numeric(truncatedNumber)
214 +
  return(truncatedNumber)
215 +
} 
216 +
217 +
171 218
#' @title Extracts the last two digits from the data
172 219
#' @description It extracts the last two digits from the data.
173 220
#' 
174 221
#'This function is used by the main function of the package \code{\link{benford}} to extract the 
175 222
#'ast two digits of the data.
176 223
#'
177 224
#' @param data a numeric vector. 
178 -
#' @param sign  The default value for sign is "positive" and it analyzes only data greater than zero. 
225 +
#' @param ndec it specifies the number of decimals to be used (default 2).
226 +
#' @param sign The default value for sign is "positive" and it analyzes only data greater than zero. 
179 227
#' There are also the options "negative" and "both" that will analyze only negative values or both positive and negative values of the data,
180 228
#' respectively. For large datasets with both positive and negative numbers, 
181 229
#' it is usually recommended to perform a separate analysis for each group,
182 230
#' for the incentives to manipulate the numbers are usually different.
183 231
#' @return A data.frame with the data and the last digits.
184 232
#' @export
185 -
last.two.digits <- function(data, sign="positive") {
233 +
234 +
last.two.digits <- function(data, sign="positive", ndec = 2) {
186 235
  
187 236
  if (!is.numeric(data)) stop("Data must be a numeric vector")
188 237
  
@@ -191,23 +240,49 @@
Loading
191 240
  if (sign == "negative")  positives <- data[data < 0 & !is.na(data)]*(-1)
192 241
  if (sign == "both")      positives <- abs(data[data != 0 & !is.na(data)]) 
193 242
  
194 -
  digits.as.str <- as.character(positives)
195 -
  nozeros <- grepl("\\.", digits.as.str)
196 -
  digits.as.str.nz <- digits.as.str[nozeros]
197 -
  dgts.after.dot <- sub("^[0-9]*\\.", "", digits.as.str.nz)
198 -
  ltd.dgts.after.dot <- substr(dgts.after.dot, nchar(dgts.after.dot)-1, nchar(dgts.after.dot))
199 -
  which.dgts.mult.10 <- grepl("^0", ltd.dgts.after.dot)
200 -
  dgts.mult.10 <- as.character(as.numeric(ltd.dgts.after.dot[which.dgts.mult.10])*10)
201 -
  ltd.dgts.after.dot[which.dgts.mult.10] <- dgts.mult.10
202 -
  digits.as.str[!nozeros] <- 0
203 -
  digits.as.str[nozeros] <- ltd.dgts.after.dot
204 -
  ltd <- as.numeric(digits.as.str)
243 +
  remainder <- positives %% 1
244 +
  if (all(remainder == 0)){
245 +
    warning("Data appears to be integers, so the argument 'ndec' is set to zero.")
246 +
    ndec <- 0
247 +
  } 
205 248
  
206 -
  results <- data.frame(data = positives,
249 +
  truncated.values <- truncDec(positives, ndec)
250 +
  values.as.str <- as.character(truncated.values)
251 +
  if (ndec == 0){
252 +
    nchar.values <- nchar(truncated.values)
253 +
    ltd <- substr(truncated.values, nchar.values - 1, nchar.values)
254 +
    ltd <- as.numeric(ltd)
255 +
  }else{
256 +
    nozeros <- grepl("\\.", values.as.str)
257 +
    values.as.str.nz <- values.as.str[nozeros]
258 +
    dgts.after.dot <- sub("^[0-9]*\\.", "", values.as.str.nz)
259 +
    ltd.dgts.after.dot <- substr(dgts.after.dot, nchar(dgts.after.dot) - 1, nchar(dgts.after.dot))
260 +
    which.dgts.mult.10 <- nchar(ltd.dgts.after.dot) == 1
261 +
    dgts.mult.10 <- as.character(as.numeric(ltd.dgts.after.dot[which.dgts.mult.10])*10)
262 +
    ltd.dgts.after.dot[which.dgts.mult.10] <- dgts.mult.10
263 +
    values.as.str[!nozeros] <- 0
264 +
    values.as.str[nozeros] <- ltd.dgts.after.dot
265 +
    ltd <- as.numeric(values.as.str)
266 +
  }
267 +
  
268 +
  results <- data.frame(data = truncated.values,
207 269
                        data.digits = ltd)
208 270
  return(results)
209 271
}
210 272
273 +
274 +
extract.mantissa <- function(positives) {
275 +
  log <- log10(positives)
276 +
  log[log < 0] <- log[log < 0] + as.integer(log[log < 0])*(-1) + 1
277 +
  mantissa <- log - trunc(log)
278 +
  return(mantissa)
279 +
}
280 +
281 +
282 +
# Digits probability ------------------------------------------------------
283 +
284 +
285 +
211 286
#' @title Probability of a digit sequence
212 287
#' @description It calculates the probability of a digit sequence "d".
213 288
#' @usage
@@ -229,6 +304,11 @@
Loading
229 304
}
230 305
231 306
307 +
308 +
309 +
310 +
311 +
232 312
#' @title Probability of a digit at the nth position
233 313
#' @description It calculates the probability of digit "d" at the "n"th position.
234 314
#' @usage
@@ -264,6 +344,12 @@
Loading
264 344
  return(sum)
265 345
}
266 346
347 +
348 +
349 +
# Generating functions ----------------------------------------------------
350 +
351 +
352 +
267 353
generate.benford.digits <- function(number.of.digits) {
268 354
  number.of.digits <- as.integer(number.of.digits)
269 355
  begin <- 10^(number.of.digits - 1)
@@ -300,12 +386,7 @@
Loading
300 386
  return(results)
301 387
}
302 388
303 -
extract.mantissa <- function(positives) {
304 -
  log <- log10(positives)
305 -
  log[log < 0] <- log[log < 0] + as.integer(log[log < 0])*(-1) + 1
306 -
  mantissa <- log - trunc(log)
307 -
  return(mantissa)
308 -
}
389 +
309 390
310 391
generate.summation <- function(benford.digits, data, data.digits) {
311 392
  x <- NULL
@@ -328,9 +409,12 @@
Loading
328 409
329 410
#### Basic Calculations ####
330 411
331 -
excess.kurtosis <- function(x) 
412 +
excess.kurtosis <- function(x){ 
332 413
  (mean((x - mean(x))^4)/(mean((x - mean(x))^2)^2)) - 3
414 +
}
333 415
334 416
335 -
skewness <- function(x) (mean((x - mean(x))^3)/(mean((x - mean(x))^2)^(3/2)))
417 +
skewness <- function(x) {
418 +
  (mean((x - mean(x))^3)/(mean((x - mean(x))^2)^(3/2)))
419 +
}
336 420

@@ -33,7 +33,7 @@
Loading
33 33
                         select = c("digits", "second order", "summation", "chi squared", "ex summation"), 
34 34
                         except = NULL, 
35 35
                         multiple = TRUE,  
36 -
                         col.bar = "lightblue", 
36 +
                         col.bar = "turquoise3", 
37 37
                         err.bounds = FALSE, 
38 38
                         alpha = 0.05, 
39 39
                         grid = TRUE,
@@ -115,7 +115,7 @@
Loading
115 115
    #par(mfrow = c(rows, cols))
116 116
    nslots <- rows*cols
117 117
    plot_this <- plots
118 -
    lg_size <- ifelse(rows > 1, 1, ifelse(err.bounds, 0.4, 0.7))/rows
118 +
    lg_size <- ifelse(rows > 1, 1, 0.7)/rows
119 119
    
120 120
    for (i in 1:length(plot_this)) {
121 121
      plot.switch(plot_this[i], x, col.bar, grid, err.bounds, alpha, exp.benford, freq)
@@ -128,7 +128,7 @@
Loading
128 128
    old.par <- par(no.readonly = TRUE)
129 129
    #on.exit(par(old.par))
130 130
    plot_this <- plots
131 -
    lg_size <- ifelse(err.bounds, 0.4, 0.7)
131 +
    lg_size <- 0.7
132 132
    
133 133
    for (i in 1:length(plot_this)) {
134 134
      plot.switch(plot_this[i], x, col.bar, grid, err.bounds, alpha, exp.benford, freq)
@@ -142,16 +142,16 @@
Loading
142 142
# Separate plots --------------------------------------------------------------------------------------
143 143
144 144
histogram.Benford <- function(x,
145 -
                                  obs.freq = "digits",
146 -
                                  main = NULL,
147 -
                                  xlab = NULL,
148 -
                                  ylab = NULL,
149 -
                                  grid = TRUE,
150 -
                                  col.bar = "lightblue",
151 -
                                  err.bounds = FALSE,
152 -
                                  alpha = 0.05,
153 -
                                  exp.benford = TRUE,
154 -
                                  freq = TRUE, ...){
145 +
                              obs.freq = "digits",
146 +
                              main = NULL,
147 +
                              xlab = NULL,
148 +
                              ylab = NULL,
149 +
                              grid = TRUE,
150 +
                              col.bar = "lightblue",
151 +
                              err.bounds = FALSE,
152 +
                              alpha = 0.05,
153 +
                              exp.benford = TRUE,
154 +
                              freq = TRUE, ...){
155 155
  
156 156
  digits <- x[["bfd"]]$digits
157 157
  obs.freq <- tolower(obs.freq)
@@ -255,16 +255,16 @@
Loading
255 255
256 256
257 257
rootogram.Benford <- function(x,
258 -
                                  obs.freq = "digits",
259 -
                                  main = NULL,
260 -
                                  xlab = NULL,
261 -
                                  ylab = NULL,
262 -
                                  grid = TRUE,
263 -
                                  col.bar = "lightblue",
264 -
                                  err.bounds = FALSE,
265 -
                                  alpha = 0.05,
266 -
                                  exp.benford = TRUE,
267 -
                                  freq = TRUE, ...){
258 +
                              obs.freq = "digits",
259 +
                              main = NULL,
260 +
                              xlab = NULL,
261 +
                              ylab = NULL,
262 +
                              grid = TRUE,
263 +
                              col.bar = "lightblue",
264 +
                              err.bounds = FALSE,
265 +
                              alpha = 0.05,
266 +
                              exp.benford = TRUE,
267 +
                              freq = TRUE, ...){
268 268
  digits <- x[["bfd"]]$digits
269 269
  obs.freq <- tolower(obs.freq)
270 270
  
@@ -301,11 +301,13 @@
Loading
301 301
    ylim <- c(min(exp_freq - obs_freq)*1.1, max(abs(exp_freq - obs_freq)*0.5, exp_freq)*1.1)
302 302
  }
303 303
  
304 -
  if(is.null(xlab)){
305 -
    xlab <- c("First Digit", "First-Two Digits", "First-Three Digits", "First-Order Digits")[ifelse(x$info$number.of.digits <= 3, x$info$number.of.digits, 4)]
304 +
  if (is.null(xlab)) {
305 +
    xlab.options <- c("First Digit", "First-Two Digits", "First-Three Digits", "First-Order Digits")
306 +
    lab.picker <- ifelse(x$info$number.of.digits <= 3, x$info$number.of.digits, 4)
307 +
    xlab <- xlab.options[lab.picker]
306 308
  }
307 309
  
308 -
  if(is.null(ylab)){
310 +
  if (is.null(ylab)) {
309 311
    ylab <- "Frequency"
310 312
  }
311 313
  
@@ -317,7 +319,7 @@
Loading
317 319
       ylim = ylim,
318 320
       yaxs = 'i', xaxs = 'i', xaxt = "n", type = 'n',
319 321
       panel.first = {
320 -
         if(grid){
322 +
         if (grid) {
321 323
           grid(nx = NA, ny = NULL, lty = 1, col = "gray90")
322 324
           axis(1, at = xmarks[seq(1, length(xmarks), ifelse(length(digits) <= 90, 1, 10))], tck = 1, col.ticks = "gray90", labels = F)
323 325
         }
@@ -488,8 +490,7 @@
Loading
488 490
           col = plot_colors, 
489 491
           cex = size,
490 492
           lwd = c(rep(2, 2), 2),
491 -
           lty = c(rep(1, 2), 2),
492 -
           horiz = TRUE)
493 +
           lty = c(rep(1, 2), 2))
493 494
  }else{
494 495
    plot_colors <- c("lightblue","red")
495 496
    legend(x = "topright",
@@ -499,8 +500,7 @@
Loading
499 500
           col = plot_colors, 
500 501
           cex = size,
501 502
           lwd = 2,
502 -
           lty = rep(1, 2),
503 -
           horiz = TRUE)
503 +
           lty = rep(1, 2))
504 504
  }
505 505
}
506 506

Everything is accounted for!

No changes detected that need to be reviewed.
What changes does Codecov check for?
Lines, not adjusted in diff, that have changed coverage data.
Files that introduced coverage data that had none before.
Files that have missing coverage data that once were tracked.
Files Coverage
R 0.01% 99.51%
Project Totals (4 files) 99.51%
Loading