ewenharrison / finalfit
1
#' Missing values occurence plot
2
#'
3
#' Create a plot of missing values by observations on the x-axis and variable on
4
#' the y-axis. \code{Dependent} and \code{explanatory} are for convenience and are optional.
5
#'
6
#' @param .data Data frame.
7
#' @param dependent Character vector. Optional name of dependent variable.
8
#' @param explanatory Character vector. Optional name(s) of explanatory
9
#'   variables.
10
#' @param use_labels Use variable label names in plot labelling.
11
#' @param title Character vector. Optional title for plot.
12
#' @param plot_opts A list of arguments to be appended to the ggplot call by
13
#'   "+".
14
#'
15
#' @return Heat map of missing values in dataset.
16
#' @export
17
#'
18
#' @importFrom forcats fct_rev
19
#' @importFrom tidyr gather
20
#'
21
#' @examples
22
#'
23
#' colon_s %>%
24
#'   missing_plot()
25
missing_plot <- function(.data, dependent=NULL, explanatory=NULL,
26
                         use_labels = TRUE,
27
                         title=NULL,
28
                         plot_opts = NULL){
29 1
  requireNamespace("ggplot2")
30

31 1
  if (is.null(dependent) && is.null(explanatory)) {
32 1
    df.in = .data
33
  }
34
  else {
35 0
    df.in = .data %>%
36 0
      dplyr::select(dependent, explanatory)
37
  }
38

39
  # Labels
40 1
  if(use_labels){
41 1
    vlabels = extract_labels(df.in)$vfill
42
  }
43

44
  # Replace missings with 1s
45 1
  df.in %>%
46 1
    dplyr::mutate_all(.fun = function(x){
47 1
      ifelse(is.na(x), 1, 0)
48 1
    }) -> df.in
49

50
  # Take dataframe rownames for x-axis
51 1
  df.in$.id = rownames(df.in) %>% as.numeric()
52

53
  # Gather to key and values for plot
54 1
  df.in %>%
55 1
    tidyr::gather("var", "value", -.id, factor_key = TRUE) -> plot_df
56

57
  # Plot title
58 1
  if(is.null(title)) title = paste0("Missing values map")
59

60

61 1
  ggplot(plot_df, aes(x = .id, y = forcats::fct_rev(var), fill = value))+
62 1
    geom_raster()+
63 1
    xlab("Observation")+
64 1
    scale_y_discrete("", breaks = rev(levels(plot_df$var)), labels=rev(vlabels))+
65 1
    theme_minimal()+
66 1
    theme(legend.position="none")+
67 1
    ggtitle(title)+
68 1
    plot_opts
69
}
70

71

72
#' Missing values data frame
73
#'
74
#' Create a data frame of missing vs. observed values for all variables
75
#' provided. \code{Dependent} and \code{explanatory} are for convenience and are
76
#' optional.
77
#'
78
#' @param .data Data frame.
79
#' @param dependent Character vector. Optional name of dependent variable.
80
#' @param explanatory Character vector. Optional name(s) of explanatory
81
#'   variables.
82
#'
83
#' @return Data frame of missing values for all variables.
84
#' @export
85
#' @keywords internal
86
#'
87
#' @examples
88
#' colon_s %>%
89
#'   missing_df()
90
missing_df = function(.data, dependent=NULL, explanatory=NULL){
91 1
  if (is.null(dependent) && is.null(explanatory)) {
92 1
    df.in = .data
93
  }
94
  else {
95 0
    df.in = .data %>%
96 0
      dplyr::select(dependent, explanatory)
97
  }
98 1
  df.out = df.in %>%
99 1
    is.na() %>%
100 1
    data.frame() %>%
101 1
    dplyr::mutate_all(factor, levels=c("FALSE", "TRUE"), labels=c("Obs", "Miss"))
102 1
  names(df.out) = paste0(names(df.out), "_na")
103 1
  return(df.out)
104
}
105

106

107
#' Missing values pairs plot
108
#'
109
#' Compare the occurence of missing values in all variables by each other.
110
#' Suggest limit the number of variables to a maximum of around six.
111
#' \code{Dependent} and \code{explanatory} are for convenience of variable
112
#' selection, are optional, and have no other specific function.
113
#'
114
#' @param .data Data frame.
115
#' @param dependent Character vector. Optional name of dependent variable.
116
#' @param explanatory Character vector. Optional name(s) of explanatory
117
#'   variables.
118
#' @param use_labels Use variable label names in plot labelling.
119
#' @param title Character vector. Optional title for plot.
120
#' @param position For discrete variables, choose "stack" or "fill" to show
121
#'   counts or proportions.
122
#' @param showXAxisPlotLabels Show x-axis plot labels.
123
#' @param showYAxisPlotLabels Show y-axis plot labels.
124
#'
125
#' @return A plot matrix comparing missing values in all variables against each
126
#'   other.
127
#' @export
128
#' @importFrom purrr pmap
129
#' @examples
130
#' \dontrun{
131
#' explanatory = c("age", "nodes", "age.factor", "sex.factor", "obstruct.factor", "perfor.factor")
132
#' dependent = 'mort_5yr'
133
#' colon_s %>%
134
#'   missing_pairs(dependent, explanatory)
135
#' }
136
missing_pairs = function(.data, dependent = NULL, explanatory = NULL,
137
                         use_labels = TRUE,
138
												 title=NULL,
139
												 position = "stack",
140
                         showXAxisPlotLabels = TRUE,
141
                         showYAxisPlotLabels = FALSE){
142 1
  if (is.null(dependent) && is.null(explanatory)) {
143 0
    df.in = .data
144
  }
145
  else {
146 1
    df.in = .data %>%
147 1
      dplyr::select(dependent, explanatory)
148
  }
149 1
  vars_n = length(df.in)
150 1
  df.miss = missing_df(df.in)
151 1
  df.plot = data.frame(df.in, df.miss)
152 1
  obs_vector = rep(names(df.in), vars_n)
153 1
  miss_vector = rep(names(df.miss), each=vars_n)
154 1
  if(use_labels){
155 1
    labels = extract_labels(df.in)$vfill
156
  }else{
157 0
    labels = extract_labels(df.in)$vname
158
  }
159

160
  # Plot title
161 1
  if(is.null(title)) title = paste0("Missing data matrix")
162

163
  # Everything below can be made into functions
164 1
  obs_discrete = sapply(df.in, function(x){
165 1
    any(is.factor(x),
166 1
        is.character(x),
167 1
        is.logical(x))
168
  })
169 1
  obs_discrete_vector = rep(obs_discrete, vars_n)
170

171
  # Make colours permanent
172 1
  palColours = c("lightblue", "gray50")
173 1
  names(palColours) = c("Obs", "Miss")
174 1
  colScale = scale_fill_manual(values=palColours)
175

176
  # Make list of plots
177 1
  plot_list = purrr::pmap(list(obs_vector, miss_vector, obs_discrete_vector),
178 1
                          function(obs, miss, discrete){
179 1
                            if(!discrete){
180 1
                              ggplot(data = df.plot) +
181 1
                                geom_boxplot(aes_string(x=miss, y=obs, fill=miss))+
182 1
                                colScale+
183 1
                                scale_x_discrete(limits=c("Miss", "Obs"))+
184 1
                                coord_flip()
185
                              #	geom_density(aes_string(x = miss), colour = "darkblue")
186
                            }else{
187 1
                              ggplot(data = df.plot, aes_string(x = obs, fill=miss)) +
188 1
                                geom_bar(position=position)+
189 1
                                colScale
190

191
                            }
192
                          })
193

194
  # Plot matrix
195 1
  GGally::ggmatrix(plot_list, nrow=vars_n, ncol=vars_n,
196 1
                   xAxisLabels = labels,
197 1
                   yAxisLabels = paste(labels, "(miss)"),
198 1
                   showXAxisPlotLabels = showXAxisPlotLabels,
199 1
                   showYAxisPlotLabels = showYAxisPlotLabels,
200 1
                   title = title)+
201

202 1
    theme_classic()
203
}

Read our documentation on viewing source code .

Loading