1
#' Check accurate recoding of variables
#'
#' This was written a few days after the retraction of a paper in JAMA due to an
#' error in recoding the treatment variable
#' (\url{https://jamanetwork.com/journals/jama/fullarticle/2752474}). This
#' takes a data frame or tibble, fuzzy matches variable names, and produces
#' crosstables of all matched variables. A visual inspection should reveal any
#' miscoding.
#'
#' @param .data Data frame or tibble.
#' @param dependent Optional character vector: name(s) of dependent
#'   variable(s). Each name must be a column of \code{.data}.
#' @param explanatory Optional character vector: name(s) of explanatory
#'   variable(s). Each name must be a column of \code{.data}.
#' @param include_numerics Logical. Include numeric variables in function.
#'   When \code{FALSE}, numeric columns are removed before any matching is done.
#' @param ... Pass other arguments to \code{\link{agrep}}.
#'
#' @return List of length two. The first is an index of variable combinations.
#'   The second is a nested list of crosstables as tibbles.
#' @export
#'
#' @examples
#' library(dplyr)
#' data(colon_s)
#' colon_s_small = colon_s %>%
#'   select(-id, -rx, -rx.factor) %>%
#'   mutate(
#'     age.factor2 = forcats::fct_collapse(age.factor,
#'       "<60 years" = c("<40 years", "40-59 years")),
#'     sex.factor2 = forcats::fct_recode(sex.factor,
#'     # Intentional miscode
#'       "F" = "Male",
#'       "M" = "Female")
#'   )
#'
#' # Check
#' colon_s_small %>%
#'   check_recode(include_numerics = FALSE)
#'
#' out = colon_s_small %>%
#'   select(-extent, -extent.factor,-time, -time.years) %>%
#'   check_recode()
#' out
#'
#' # Select a tibble and expand
#' out$counts[[9]] %>%
#'   print()
#' # Note this variable (node4) appears miscoded in original dataset survival::colon.
#'
#' # Choose to only include variables that you actually use.
#' # This uses standard Finalfit grammar.
#' dependent = "mort_5yr"
#' explanatory = c("age.factor2", "sex.factor2")
#' colon_s_small %>%
#'   check_recode(dependent, explanatory)
check_recode <- function(.data, dependent = NULL, explanatory = NULL, include_numerics = TRUE, ...){
	if(!is.data.frame(.data)) stop(".data is not dataframe")

	# Optionally drop numeric columns so they are neither searched for nor matched.
	if(!include_numerics){
		.data = .data %>%
			dplyr::select_if(purrr::negate(is.numeric))
	}

	# Names to search for: every column, or only the ones explicitly requested.
	if(is.null(dependent) && is.null(explanatory)){
		.varnames = names(.data)
	} else {
		# Resolve the requested names as plain strings. Passing the vectors as bare
		# names to dplyr::select() is ambiguous: a column literally called
		# "dependent" or "explanatory" would be picked instead of the variables
		# the caller asked for.
		.varnames = unique(c(dependent, explanatory))
		.missing = setdiff(.varnames, names(.data))
		if(length(.missing) > 0){
			.hint = if(include_numerics) "" else " (numeric columns were excluded)"
			stop("Variable(s) not found in .data", .hint, ": ",
					 paste(.missing, collapse = ", "))
		}
	}

	# Fuzzy-match each name against all column names, then keep one row per
	# unordered pair of distinct variables.
	.varnames_combinations = .varnames %>%
		purrr::map(., agrep, names(.data), value = TRUE, ...) %>%
		dplyr::tibble(var1 = .varnames,
									var2 = .) %>%
		tidyr::unnest(cols = c(var2)) %>%
		# A name always matches itself; that pairing is not informative.
		dplyr::filter(var1 != var2) %>%
		dplyr::mutate(
			# Order-independent key so (a, b) and (b, a) collapse to a single row.
			keep = purrr::map2_chr(var1, var2, ~toString(sort(c(.x, .y))))
		) %>%
		dplyr::distinct(keep, .keep_all = TRUE) %>%
		dplyr::select(-keep)

	# Crosstable (as counts) of two columns given by name.
	# as.name() builds the symbols with base R, so no unqualified helper is needed.
	count_stuff = function(.data, var1, var2){
		.data %>%
			dplyr::count(!! as.name(var1), !! as.name(var2))
	}

	# One crosstable per matched pair, in the same order as the index rows.
	list.out = .varnames_combinations %>%
		purrr::pmap(count_stuff, .data = .data)
	list.out = list(index = .varnames_combinations, counts = list.out)
	return(list.out)
}

Read our documentation on viewing source code .

Loading