carloscinelli / benford.analysis
Showing 3 of 4 files from the diff.

@@ -152,14 +152,16 @@
Loading
152 152
    
153 153
  benford.dist <- generate.benford.distribution(benford.digits)
154 154
  
155 -
  empirical.distribution <- generate.empirical.distribution(data, number.of.digits,sign, second.order = FALSE, benford.digits)
155 +
  empirical.distribution <- generate.empirical.distribution(data, number.of.digits, sign = sign, second.order = FALSE, benford.digits = benford.digits)
156 156
  
157 157
  n <- length(empirical.distribution$data)
158 158
  
159 -
  second.order <- generate.empirical.distribution(data, number.of.digits,sign, second.order = TRUE, benford.digits, discrete = discrete, round = round)
159 +
  second.order <- generate.empirical.distribution(data, number.of.digits, sign = sign, second.order = TRUE, benford.digits = benford.digits, discrete = discrete, round = round)
160 160
  
161 161
  n.second.order <- length(second.order$data)
162 162
  
163 +
  last.two.digits <- generate.empirical.distribution(data, sign = sign, last.two.dgts = TRUE, benford.digits = benford.digits)
164 +
  
163 165
  benford.dist.freq <- benford.dist*n
164 166
  
165 167
  ## calculating useful summaries and differences
@@ -226,6 +228,9 @@
Loading
226 228
                 s.o.data = data.table(second.order = second.order$data,
227 229
                                       data.second.order.digits = second.order$data.digits),
228 230
                 
231 +
                 last.two.digits = data.table(last.two.digits = 0:99,
232 +
                                              data.dist.freq = last.two.digits$dist.freq),
233 +
                 
229 234
                 bfd = data.table(digits = benford.digits,
230 235
                                  data.dist = empirical.distribution$dist,
231 236
                                  data.second.order.dist = second.order$dist,

@@ -192,9 +192,16 @@
Loading
192 192
  if (sign == "both")      positives <- abs(data[data != 0 & !is.na(data)]) 
193 193
  
194 194
  digits.as.str <- as.character(positives)
195 -
  digits.as.str <- gsub("\\.", "", digits.as.str)
196 -
  ltd <- as.integer(substr(digits.as.str, nchar(digits.as.str) - 1, nchar(digits.as.str)))
197 -
  ltd[ltd < 10] <- ltd[ltd < 10]*10
195 +
  nozeros <- grepl("\\.", digits.as.str)
196 +
  digits.as.str.nz <- digits.as.str[nozeros]
197 +
  dgts.after.dot <- sub("^[0-9]*\\.", "", digits.as.str.nz)
198 +
  ltd.dgts.after.dot <- substr(dgts.after.dot, nchar(dgts.after.dot)-1, nchar(dgts.after.dot))
199 +
  which.dgts.mult.10 <- grepl("^0", ltd.dgts.after.dot)
200 +
  dgts.mult.10 <- as.character(as.numeric(ltd.dgts.after.dot[which.dgts.mult.10])*10)
201 +
  ltd.dgts.after.dot[which.dgts.mult.10] <- dgts.mult.10
202 +
  digits.as.str[!nozeros] <- 0
203 +
  digits.as.str[nozeros] <- ltd.dgts.after.dot
204 +
  ltd <- as.numeric(digits.as.str)
198 205
  
199 206
  results <- data.frame(data = positives,
200 207
                        data.digits = ltd)
@@ -270,13 +277,19 @@
Loading
270 277
  return(benford.dist)
271 278
}
272 279
273 -
generate.empirical.distribution <- function(data, number.of.digits,sign, second.order = FALSE, benford.digits, discrete = TRUE, round = 3){
280 +
generate.empirical.distribution <- function(data, number.of.digits, sign, last.two.dgts = FALSE, second.order = FALSE, benford.digits, discrete = TRUE, round = 3){
274 281
  x <- NULL
275 282
  v <- NULL
276 -
  data.frame <- extract.digits(data, number.of.digits, sign, second.order, discrete = discrete, round = round)
283 +
  if(last.two.dgts){
284 +
    data.frame <- last.two.digits(data, sign)
285 +
    DF <- data.table(x = c(data.frame$data.digits, 0:99),
286 +
                     v = c(data.frame$data.digits, 0:99))
287 +
  }else{
288 +
    data.frame <- extract.digits(data, number.of.digits, sign, second.order, discrete = discrete, round = round) 
289 +
    DF <- data.table(x = c(data.frame$data.digits, benford.digits),
290 +
                     v = c(data.frame$data.digits, benford.digits) )
291 +
  }
277 292
  n <- length(data.frame$data.digits)
278 -
  DF <- data.table(x = c(data.frame$data.digits, benford.digits),
279 -
                   v = c(data.frame$data.digits, benford.digits) )
280 293
  DFcount <- DF[ ,length(x) - 1, by = v][order(v)]
281 294
  dist.freq <- DFcount$V1
282 295
  dist <- dist.freq/n

@@ -124,7 +124,6 @@
Loading
124 124
}
125 125
126 126
# Separate plots --------------------------------------------------------------------------------------
127 -
#utils::globalVariables(c("x", "err.bounds", "alpha", "exp.benford"))
128 127
129 128
histogram.Benford <- function(x,
130 129
                                  obs.freq = "digits",
@@ -183,8 +182,7 @@
Loading
183 182
  }
184 183
  
185 184
  if(is.null(xlab)){
186 -
    ord <- c("","-two","-tree", "-order")[ifelse(x$info$number.of.digits <= 3, x$info$number.of.digits, 4)]
187 -
    xlab <- paste0("First", ord, " Digits") 
185 +
    xlab <- c("First Digit", "First-Two Digits", "First-Three Digits", "First-Order Digits")[ifelse(x$info$number.of.digits <= 3, x$info$number.of.digits, 4)]
188 186
  }
189 187
  
190 188
  if(is.null(ylab)){
@@ -277,8 +275,7 @@
Loading
277 275
  }
278 276
  
279 277
  if(is.null(xlab)){
280 -
    ord <- c("","-two","-tree", "-order")[ifelse(x$info$number.of.digits <= 3, x$info$number.of.digits, 4)]
281 -
    xlab <- paste0("First", ord, " Digits") 
278 +
    xlab <- c("First Digit", "First-Two Digits", "First-Three Digits", "First-Order Digits")[ifelse(x$info$number.of.digits <= 3, x$info$number.of.digits, 4)]
282 279
  }
283 280
  
284 281
  if(is.null(ylab)){
@@ -341,10 +338,10 @@
Loading
341 338
  )
342 339
  
343 340
  if(is.null(xlab)){
344 -
    ord <- c("","-two","-tree", "-order")[ifelse(x$info$number.of.digits <= 3, x$info$number.of.digits, 4)]
345 -
    xlab <- paste0("First", ord, " Digits") 
341 +
    xlab <- c("First Digit", "First-Two Digits", "First-Three Digits", "First-Order Digits")[ifelse(x$info$number.of.digits <= 3, x$info$number.of.digits, 4)]
346 342
  }
347 343
  
344 +
  
348 345
  if(is.null(ylab)){
349 346
    ylab <- "Frequency"
350 347
  }
@@ -487,16 +484,6 @@
Loading
487 484
  }
488 485
}
489 486
490 -
# histogram.Benford(x,  obs.freq = "digits", freq = T, main = NULL, xlab = NULL, ylab = NULL, grid = TRUE, col.bar = "lightblue", err.bounds = FALSE, alpha = 0.05, exp.benford = TRUE, ...)
491 -
# 
492 -
# rootogram.Benford(x,obs.freq = "digits", freq = TRUE,main = NULL,xlab = NULL,ylab = NULL,grid = TRUE,col.bar = "lightblue",err.bounds = FALSE,alpha = 0.05, exp.benford = TRUE, ...)
493 -
# 
494 -
# needle.Benford(x,discrepancy = "abs diff",main = NULL,xlab = NULL,ylab = NULL,grid = TRUE,col = "blue", ...)
495 -
# 
496 -
# xyplot.Berford(x,obs.freq = "digits",freq = TRUE,main = "Expected vs observed frequencies",xlab = "Observed Frequency",ylab = "Expected Frequency",grid = TRUE,col = "blue",...)
497 -
# 
498 -
# histogram.Benford(x,  obs.freq = "digits", freq, main = NULL, xlab = NULL, ylab = NULL, grid, col.bar, err.bounds, alpha)
499 -
500 487
plot.switch <- function(plot_this, x, col.bar, grid, err.bounds, alpha, freq){
501 488
  switch(plot_this,
502 489
         "digits" = histogram.Benford(x,  obs.freq = "digits", main = "Digits distribution", xlab = NULL, ylab = NULL, grid, col.bar, err.bounds, alpha, freq),
Files Coverage
R 100.00%
Project Totals (4 files) 100.00%
Sunburst
The inner-most circle is the entire project, moving away from the center are folders then, finally, a single file. The size and color of each slice is representing the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project. Proceeding with folders and finally individual files. The size and color of each slice is representing the number of statements and the coverage, respectively.
Grid
Each block represents a single file in the project. The size and color of each block is represented by the number of statements and the coverage, respectively.
Loading