1
|
|
#' Summarise a set of factors (or continuous variables) by a dependent variable
|
2
|
|
#'
|
3
|
|
#' A function that takes a single dependent variable with a vector of
|
4
|
|
#' explanatory variable names (continuous or categorical variables) to produce a
|
5
|
|
#' summary table.
|
6
|
|
#'
|
7
|
|
#' This function aims to produce publication-ready summary tables for
|
8
|
|
#' categorical or continuous dependent variables. It usually takes a categorical
|
9
|
|
#' dependent variable to produce a cross table of counts and proportions
|
10
|
|
#' expressed as percentages or summarised continuous explanatory variables.
|
11
|
|
#' However, it will take a continuous dependent variable to produce mean
|
12
|
|
#' (standard deviation) or median (interquartile range) for use with linear
|
13
|
|
#' regression models.
|
14
|
|
#'
|
15
|
|
#' @param .data Dataframe.
|
16
|
|
#' @param dependent Character vector of length 1: name of dependent variable (2
|
17
|
|
#' to 5 factor levels).
|
18
|
|
#' @param explanatory Character vector of any length: name(s) of explanatory
|
19
|
|
#' variables.
|
20
|
|
#' @param cont Summary for continuous explanatory variables: "mean" (standard
|
21
|
|
#' deviation) or "median" (interquartile range). If "median" then
|
22
|
|
#' non-parametric hypothesis test performed (see below).
|
23
|
|
#' @param cont_nonpara Numeric vector of form e.g. \code{c(1,2)}. Specify which
|
24
|
|
#' variables to perform non-parametric hypothesis tests on and summarise with
|
25
|
|
#' "median".
|
26
|
|
#' @param cont_cut Numeric: number of unique values in continuous variable at
|
27
|
|
#' which to consider it a factor.
|
28
|
|
#' @param cont_range Logical. Median is show with 1st and 3rd quartiles.
|
29
|
|
#' @param p Logical: Include null hypothesis statistical test.
|
30
|
|
#' @param p_cont_para Character. Continuous variable parametric test. One of
|
31
|
|
#' either "aov" (analysis of variance) or "t.test" for Welch two sample
|
32
|
|
#' t-test. Note continuous non-parametric test is always Kruskal Wallis
|
33
|
|
#' (kruskal.test) which in two-group setting is equivalent to Mann-Whitney U
|
34
|
|
#' /Wilcoxon rank sum test.
|
35
|
|
#'
|
36
|
|
#' For continous dependent and continuous explanatory, the parametric test
|
37
|
|
#' p-value returned is for the Pearson correlation coefficient. The
|
38
|
|
#' non-parametric equivalent is for the p-value for the Spearman correlation
|
39
|
|
#' coefficient.
|
40
|
|
#' @param p_cat Character. Categorical variable test. One of either "chisq" or
|
41
|
|
#' "fisher".
|
42
|
|
#' @param column Logical: Compute margins by column rather than row.
|
43
|
|
#' @param total_col Logical: include a total column summing across factor
|
44
|
|
#' levels.
|
45
|
|
#' @param orderbytotal Logical: order final table by total column high to low.
|
46
|
|
#' @param digits Number of digits to round to (1) mean/median, (2) standard
|
47
|
|
#' deviation / interquartile range, (3) p-value, (4) count percentage.
|
48
|
|
#' @param na_include Logical: make explanatory variables missing data explicit
|
49
|
|
#' (\code{NA}).
|
50
|
|
#' @param na_include_dependent Logical: make dependent variable missing data
|
51
|
|
#' explicit.
|
52
|
|
#' @param na_complete_cases Logical: include only rows with complete data.
|
53
|
|
#' @param na_to_p Logical: include missing as group in statistical test.
|
54
|
|
#' @param fit_id Logical: allows merging via \code{\link{finalfit_merge}}.
|
55
|
|
#' @param add_dependent_label Add the name of the dependent label to the top
|
56
|
|
#' left of table.
|
57
|
|
#' @param dependent_label_prefix Add text before dependent label.
|
58
|
|
#' @param dependent_label_suffix Add text after dependent label.
|
59
|
|
#' @param add_col_totals Logical. Include column total n.
|
60
|
|
#' @param include_col_totals_percent Include column percentage of total.
|
61
|
|
#' @param col_totals_rowname Logical. Row name for column totals.
|
62
|
|
#' @param col_totals_prefix Character. Prefix to column totals, e.g. "N=".
|
63
|
|
#' @param add_row_totals Logical. Include row totals. Note this differs from
|
64
|
|
#' \code{total_col} above particularly for continuous explanatory variables.
|
65
|
|
#' @param include_row_missing_col Logical. Include missing data total for each
|
66
|
|
#' row. Only used when \code{add_row_totals} is \code{TRUE}.
|
67
|
|
#' @param row_totals_colname Character. Column name for row totals.
|
68
|
|
#' @param row_missing_colname Character. Column name for missing data totals for
|
69
|
|
#' each row.
|
70
|
|
#' @param catTest Deprecated. See \code{p_cat} above.
|
71
|
|
#'
|
72
|
|
#' @return Returns a \code{factorlist} dataframe.
|
73
|
|
#'
|
74
|
|
#' @family finalfit wrappers
|
75
|
|
#' @seealso \code{\link{fit2df}} \code{\link{ff_column_totals}}
|
76
|
|
#' \code{\link{ff_row_totals}} \code{\link{ff_label}} \code{\link{ff_glimpse}}
|
77
|
|
#' \code{\link{ff_percent_only}}
|
78
|
|
#' @export
|
79
|
|
#'
|
80
|
|
#' @examples
|
81
|
|
#' library(finalfit)
|
82
|
|
#' library(dplyr)
|
83
|
|
#' # Load example dataset, modified version of survival::colon
|
84
|
|
#' data(colon_s)
|
85
|
|
#'
|
86
|
|
#' # Table 1 - Patient demographics ----
|
87
|
|
#' explanatory = c("age", "age.factor", "sex.factor", "obstruct.factor")
|
88
|
|
#' dependent = "perfor.factor"
|
89
|
|
#' colon_s %>%
|
90
|
|
#' summary_factorlist(dependent, explanatory, p=TRUE)
|
91
|
|
#'
|
92
|
|
#' # summary.factorlist() is also commonly used to summarise any number of
|
93
|
|
#' # variables by an outcome variable (say dead yes/no).
|
94
|
|
#'
|
95
|
|
#' # Table 2 - 5 yr mortality ----
|
96
|
|
#' explanatory = c("age.factor", "sex.factor", "obstruct.factor", "perfor.factor")
|
97
|
|
#' dependent = "mort_5yr"
|
98
|
|
#' colon_s %>%
|
99
|
|
#' summary_factorlist(dependent, explanatory)
|
100
|
|
summary_factorlist <- function(.data,
|
101
|
|
dependent = NULL, explanatory,
|
102
|
|
cont = "mean", cont_nonpara = NULL, cont_cut = 5, cont_range = FALSE,
|
103
|
|
p = FALSE, p_cont_para = "aov", p_cat = "chisq",
|
104
|
|
column = TRUE, total_col = FALSE, orderbytotal = FALSE,
|
105
|
|
digits = c(1, 1, 3, 1),
|
106
|
|
na_include = FALSE, na_include_dependent = FALSE,
|
107
|
|
na_complete_cases = FALSE, na_to_p = FALSE,
|
108
|
|
fit_id = FALSE,
|
109
|
|
add_dependent_label = FALSE,
|
110
|
|
dependent_label_prefix = "Dependent: ", dependent_label_suffix = "",
|
111
|
|
add_col_totals = FALSE, include_col_totals_percent = TRUE,
|
112
|
|
col_totals_rowname = NULL, col_totals_prefix = "",
|
113
|
|
add_row_totals = FALSE, include_row_missing_col = TRUE,
|
114
|
|
row_totals_colname = "Total N", row_missing_colname = "Missing N",
|
115
|
|
catTest = NULL){
|
116
|
|
|
117
|
|
|
118
|
|
# Warnings/Checks --------------
|
119
|
1
|
if(!is.data.frame(.data)) stop(".data is not dataframe")
|
120
|
1
|
if(any(class(.data) %in% c("tbl_df", "tbl"))) .data = data.frame(.data)
|
121
|
1
|
if(is.null(explanatory)) stop("No explanatory variable(s) provided")
|
122
|
1
|
if(any(explanatory == ".")){
|
123
|
0
|
explanatory = .data %>%
|
124
|
0
|
dplyr::select(-dependent) %>%
|
125
|
0
|
names()
|
126
|
|
}
|
127
|
1
|
if(is.null(dependent)){
|
128
|
0
|
message("No dependent variable(s) provided; defaulting to single-level factor")
|
129
|
0
|
dependent = "all"
|
130
|
0
|
.data$all = factor(1, labels="all")
|
131
|
|
}
|
132
|
1
|
if(na_to_p & !na_include) warning("If wish to pass missing to hypothesis test (na_to_p), must have na_include = TRUE")
|
133
|
|
|
134
|
|
# Deprecated catTest from Hmisc for reverse dependencies
|
135
|
1
|
if(!is.null(catTest)){
|
136
|
0
|
message("catTest is deprecated. Using p_cat = 'fisher'")
|
137
|
0
|
p_cat = "fisher"}
|
138
|
|
|
139
|
|
# Extract explanatory terms (to support using * and :)
|
140
|
1
|
explanatory_terms = paste("~", paste(explanatory, collapse = "+")) %>%
|
141
|
1
|
formula() %>%
|
142
|
1
|
terms() %>%
|
143
|
1
|
attr("term.labels")
|
144
|
|
|
145
|
1
|
if(dependent %in% explanatory) stop("Cannot have dependent variable in explanatory list.")
|
146
|
|
|
147
|
1
|
if(!is.null(cont_nonpara) && max(cont_nonpara) > length(explanatory)) {
|
148
|
0
|
stop("cont_nonpara cannot include values greater than the number of explanatory variables")
|
149
|
|
}
|
150
|
|
|
151
|
|
# Definitions ------------------------------------------------------------
|
152
|
|
## Dependent as survival object handling
|
153
|
1
|
d_is.surv = grepl("Surv[(].*[)]", dependent)
|
154
|
|
|
155
|
1
|
if(d_is.surv){
|
156
|
1
|
message("Dependent variable is a survival object")
|
157
|
1
|
.data$all = factor(1, labels="all")
|
158
|
1
|
dependent = "all"
|
159
|
|
|
160
|
|
# Remove strata and cluster terms - keep in table for now
|
161
|
1
|
drop = grepl("cluster[(].*[)]", explanatory) |
|
162
|
1
|
grepl("strata[(].*[)]", explanatory) |
|
163
|
1
|
grepl("frailty[(].*[)]", explanatory)
|
164
|
1
|
explanatory = explanatory[!drop]
|
165
|
|
}
|
166
|
|
|
167
|
|
# Remove interactions and indicator variables
|
168
|
|
## Intentionally done separately to above line.
|
169
|
1
|
explanatory = paste("~", paste(explanatory, collapse = "+")) %>%
|
170
|
1
|
formula() %>%
|
171
|
1
|
all.vars()
|
172
|
|
|
173
|
|
## Active dataset
|
174
|
1
|
.data = .data %>%
|
175
|
1
|
dplyr::select(dependent, explanatory)
|
176
|
|
|
177
|
|
## Dependent is numeric
|
178
|
1
|
d_is.numeric = .data %>%
|
179
|
1
|
dplyr::pull(dependent) %>%
|
180
|
1
|
is.numeric()
|
181
|
|
|
182
|
1
|
if(d_is.numeric & add_col_totals){
|
183
|
0
|
add_col_totals = FALSE
|
184
|
0
|
message("Cannot have add_col_totals with numeric dependent.")
|
185
|
|
}
|
186
|
|
|
187
|
|
## Continous data to categorical if unique values below threshold
|
188
|
1
|
cont_distinct = .data %>%
|
189
|
1
|
dplyr::select(explanatory) %>%
|
190
|
1
|
dplyr::summarise_if(is.numeric, dplyr::n_distinct) %>%
|
191
|
1
|
purrr::keep(~ .x < cont_cut) %>%
|
192
|
1
|
names()
|
193
|
1
|
.data = .data %>%
|
194
|
1
|
dplyr::mutate_at(cont_distinct, as.factor)
|
195
|
|
|
196
|
|
## Explanatory variable type
|
197
|
1
|
explanatory_type = .data %>%
|
198
|
1
|
dplyr::select(explanatory) %>%
|
199
|
1
|
purrr::map(is.numeric)
|
200
|
|
|
201
|
|
# Non-parametric variables
|
202
|
1
|
explanatory_nonpara = vector(length = length(explanatory))
|
203
|
1
|
explanatory_nonpara[cont_nonpara] = TRUE
|
204
|
1
|
if(cont == "median") explanatory_nonpara = TRUE
|
205
|
|
|
206
|
|
## Labels
|
207
|
1
|
var_labels = .data %>%
|
208
|
1
|
dplyr::select(explanatory) %>%
|
209
|
1
|
extract_variable_label()
|
210
|
|
|
211
|
|
# Missing data handling ------------------------------------------------------------
|
212
|
1
|
df.in = .data
|
213
|
|
|
214
|
|
# Explanatory variables, make NA explicit for factors
|
215
|
1
|
if(na_complete_cases){
|
216
|
0
|
df.in = df.in %>%
|
217
|
0
|
tidyr::drop_na()
|
218
|
|
}
|
219
|
|
|
220
|
1
|
if(na_include){
|
221
|
0
|
df.in = df.in %>%
|
222
|
0
|
dplyr::mutate_if(names(.) %in% unlist(explanatory) &
|
223
|
0
|
sapply(., is.factor),
|
224
|
0
|
forcats::fct_explicit_na
|
225
|
|
)}
|
226
|
|
|
227
|
1
|
if(na_include_dependent & !d_is.numeric){
|
228
|
0
|
df.in = df.in %>%
|
229
|
0
|
dplyr::mutate(
|
230
|
0
|
!! sym(dependent) := forcats::fct_explicit_na(!! sym(dependent))
|
231
|
|
)
|
232
|
1
|
} else if(!na_include_dependent & !d_is.numeric){
|
233
|
1
|
df.in = df.in %>%
|
234
|
1
|
tidyr::drop_na(dependent)
|
235
|
1
|
} else if(na_include_dependent & d_is.numeric){
|
236
|
0
|
warnings("Dependent is numeric and missing values cannot be made explicit.
|
237
|
0
|
Make dependent a factor or use na_include_dependent = FALSE.")
|
238
|
|
}
|
239
|
|
|
240
|
|
## Missing data to p-tests or not
|
241
|
1
|
if(na_to_p){
|
242
|
0
|
df.p = df.in
|
243
|
|
} else {
|
244
|
1
|
df.p = .data
|
245
|
|
}
|
246
|
1
|
if(p && na_to_p){
|
247
|
0
|
message("Explanatory variable(s) missing data included in hypothesis test (p-value).")
|
248
|
|
}
|
249
|
|
|
250
|
1
|
if(!na_include_dependent &
|
251
|
1
|
.data %>%
|
252
|
1
|
dplyr::pull(dependent) %>%
|
253
|
1
|
is.na() %>%
|
254
|
1
|
any()) {message("Note: dependent includes missing data. These are dropped.")}
|
255
|
|
|
256
|
|
|
257
|
|
# Continous dependent --------------------------------------------------------------------
|
258
|
1
|
if(d_is.numeric){
|
259
|
|
|
260
|
|
## Hypothesis tests ---------
|
261
|
1
|
if(p){
|
262
|
1
|
p_tests = purrr::pmap(list(explanatory, explanatory_type, explanatory_nonpara),
|
263
|
|
# Categorical / parametric
|
264
|
1
|
~ if(!..2 && !..3){
|
265
|
1
|
if(p_cont_para == "aov"){
|
266
|
1
|
summary(aov(as.formula(paste(dependent, "~", ..1)), df.p))[[1]][["Pr(>F)"]][[1]] %>%
|
267
|
1
|
p_tidy(digits[3], "")
|
268
|
1
|
} else if (p_cont_para == "t.test"){
|
269
|
0
|
t.test(as.formula(paste(dependent, "~", ..1)), df.p)$p.value %>%
|
270
|
0
|
p_tidy(digits[3], "")
|
271
|
|
}
|
272
|
|
# Categorical / non-parametric
|
273
|
1
|
} else if (!..2 & ..3){
|
274
|
0
|
kruskal.test(as.formula(paste(dependent, "~", ..1)), df.p)$p.value %>%
|
275
|
0
|
p_tidy(digits[3], "")
|
276
|
|
# Continous / parametric
|
277
|
1
|
} else if (..2 & !..3){
|
278
|
1
|
cor.test(as.formula(paste("~", dependent, "+", ..1)), df.p, method="pearson")$p.value %>%
|
279
|
1
|
p_tidy(digits[3], "")
|
280
|
|
# Continous / non-parametric
|
281
|
1
|
} else if (..2 & ..3){
|
282
|
0
|
cor.test(as.formula(paste("~", dependent, "+", ..1)), df.p, method="spearman")$p.value %>%
|
283
|
0
|
p_tidy(digits[3], "")
|
284
|
|
}
|
285
|
|
)
|
286
|
|
}
|
287
|
|
|
288
|
1
|
summary_cont_name = rep("Mean (sd)", length(explanatory_nonpara))
|
289
|
1
|
summary_cont_name[explanatory_nonpara] = "Median (IQR)"
|
290
|
|
|
291
|
|
## Output table --------------
|
292
|
1
|
df.out = purrr::pmap(list(explanatory, explanatory_type, explanatory_nonpara, summary_cont_name),
|
293
|
1
|
~ if(!..2){
|
294
|
1
|
df.in %>%
|
295
|
1
|
dplyr::group_by(!! sym(..1)) %>%
|
296
|
1
|
tidyr::drop_na(!! sym(dependent)) %>%
|
297
|
1
|
dplyr::summarise(value_mean = mean(!! sym(dependent), na.rm = TRUE),
|
298
|
1
|
value_sd = sd(!! sym(dependent), na.rm = TRUE),
|
299
|
1
|
value_median = median(!! sym(dependent), na.rm = TRUE),
|
300
|
1
|
value_q1 =quantile(!! sym(dependent), 0.25, na.rm = TRUE),
|
301
|
1
|
value_q3 = quantile(!! sym(dependent), 0.75, na.rm = TRUE),
|
302
|
1
|
n = dplyr::n()) %>%
|
303
|
1
|
tidyr::drop_na() %>%
|
304
|
1
|
dplyr::ungroup() %>%
|
305
|
1
|
dplyr::mutate(
|
306
|
1
|
col_total = sum(n),
|
307
|
1
|
col_total_prop = 100 * n/col_total,
|
308
|
1
|
Total = format_n_percent(n, col_total_prop, digits[[4]]),
|
309
|
1
|
label = ..1,
|
310
|
1
|
unit = ..4,
|
311
|
|
) %>%
|
312
|
1
|
dplyr::rename(levels = 1) %>%
|
313
|
1
|
{ if(! ..3){
|
314
|
1
|
dplyr::mutate(.,
|
315
|
1
|
value = paste0(value_mean %>% round_tidy(digits[1]), " (",
|
316
|
1
|
value_sd %>% round_tidy(digits[1]), ")")
|
317
|
|
)
|
318
|
|
} else {
|
319
|
1
|
{ if(cont_range){
|
320
|
1
|
dplyr::mutate(.,
|
321
|
1
|
value = paste0(value_median %>% round_tidy(digits[1]), " (",
|
322
|
1
|
value_q1 %>% round_tidy(digits[1]), " to ",
|
323
|
1
|
value_q3 %>% round_tidy(digits[1]), ")")
|
324
|
|
)
|
325
|
|
} else {
|
326
|
1
|
dplyr::mutate(.,
|
327
|
1
|
value = paste0(value_median %>% round_tidy(digits[1]), " (",
|
328
|
1
|
{value_q3 - value_q1} %>% round_tidy(digits[1]), ")")
|
329
|
|
)
|
330
|
|
}}
|
331
|
|
|
332
|
|
}} %>%
|
333
|
1
|
{if(total_col){
|
334
|
1
|
dplyr::select(., label, levels, unit, value, Total)
|
335
|
|
} else {
|
336
|
1
|
dplyr::select(., label, levels, unit, value)
|
337
|
|
}} %>%
|
338
|
1
|
dplyr::mutate_all(as.character)
|
339
|
1
|
} else if(..2){
|
340
|
1
|
df.in %>%
|
341
|
1
|
tidyr::drop_na(!! sym(dependent)) %>%
|
342
|
1
|
dplyr::summarise(value_mean = mean(!! sym(dependent), na.rm = TRUE),
|
343
|
1
|
value_sd = sd(!! sym(dependent), na.rm = TRUE),
|
344
|
1
|
value_median = median(!! sym(dependent), na.rm = TRUE),
|
345
|
1
|
value_q1 =quantile(!! sym(dependent), 0.25, na.rm = TRUE),
|
346
|
1
|
value_q3 = quantile(!! sym(dependent), 0.75, na.rm = TRUE),
|
347
|
1
|
value_min = min(!! sym(..1), na.rm = TRUE),
|
348
|
1
|
value_max = max(!! sym(..1), na.rm = TRUE),
|
349
|
1
|
n = (!is.na(!! sym(..1))) %>% sum(),
|
350
|
|
# row_total = dplyr::n(), # think about whether prop of df length
|
351
|
|
# row_prop = 100 * n/row_total,
|
352
|
1
|
Total = format_n_percent(n, 100, digits[[4]])) %>%
|
353
|
1
|
dplyr::mutate(
|
354
|
1
|
label = ..1,
|
355
|
1
|
levels = paste0("[", value_min %>% round_tidy(digits[1]), ",",
|
356
|
1
|
value_max %>% round_tidy(digits[1]), "]"),
|
357
|
1
|
unit = ..4
|
358
|
|
) %>%
|
359
|
1
|
{ if(! ..3){
|
360
|
1
|
dplyr::mutate(.,
|
361
|
1
|
value = paste0(value_mean %>% round_tidy(digits[1]), " (",
|
362
|
1
|
value_sd %>% round_tidy(digits[1]), ")")
|
363
|
|
)
|
364
|
|
} else {
|
365
|
1
|
{ if(cont_range){
|
366
|
1
|
dplyr::mutate(.,
|
367
|
1
|
value = paste0(value_median %>% round_tidy(digits[1]), " (",
|
368
|
1
|
value_q1 %>% round_tidy(digits[1]), " to ",
|
369
|
1
|
value_q3 %>% round_tidy(digits[1]), ")")
|
370
|
|
)
|
371
|
|
} else {
|
372
|
1
|
dplyr::mutate(.,
|
373
|
1
|
value = paste0(value_median %>% round_tidy(digits[1]), " (",
|
374
|
1
|
{value_q3 - value_q1} %>% round_tidy(digits[1]), ")")
|
375
|
|
)
|
376
|
|
}}
|
377
|
|
|
378
|
|
}} %>%
|
379
|
1
|
{if(total_col){
|
380
|
1
|
dplyr::select(., label, levels, unit, value, Total)
|
381
|
|
} else{
|
382
|
1
|
dplyr::select(., label, levels, unit, value)
|
383
|
|
}} %>%
|
384
|
1
|
dplyr::mutate_all(as.character)
|
385
|
|
}
|
386
|
|
)
|
387
|
|
} else {
|
388
|
|
|
389
|
|
# Categorical dependent -----------------------------------------------------------------------------
|
390
|
|
|
391
|
|
## Hypothesis tests ---------
|
392
|
1
|
if(p){
|
393
|
1
|
p_tests = purrr::pmap(list(explanatory, explanatory_type, explanatory_nonpara),
|
394
|
1
|
~ if(!..2){
|
395
|
1
|
df.p %>%
|
396
|
1
|
{ if(p_cat == "chisq"){
|
397
|
1
|
dplyr::summarise(., chisq.test(!! sym(..1), !! sym(dependent))$p.value) %>%
|
398
|
1
|
p_tidy(digits[3], "")
|
399
|
1
|
} else if (p_cat == "fisher"){
|
400
|
0
|
dplyr::summarise(., fisher.test(!! sym(..1), !! sym(dependent))$p.value) %>%
|
401
|
0
|
p_tidy(digits[3], "")
|
402
|
|
}}
|
403
|
1
|
} else if (..2 & !..3){
|
404
|
1
|
{if (p_cont_para == "aov"){
|
405
|
1
|
summary(aov(as.formula(paste(..1, "~", dependent)), df.p))[[1]][["Pr(>F)"]][[1]] %>%
|
406
|
1
|
p_tidy(digits[3], "")
|
407
|
1
|
} else if (p_cont_para == "t.test"){
|
408
|
1
|
t.test(as.formula(paste(..1, "~", dependent)), df.p)$p.value %>%
|
409
|
1
|
p_tidy(digits[3], "")
|
410
|
|
}}
|
411
|
1
|
} else if (..2 & ..3){
|
412
|
0
|
kruskal.test(as.formula(paste(..1, "~", dependent)), df.p)$p.value %>%
|
413
|
0
|
p_tidy(digits[3], "")
|
414
|
|
}
|
415
|
|
)
|
416
|
|
}
|
417
|
|
|
418
|
|
|
419
|
|
## Output table --------------
|
420
|
1
|
df.out = purrr::pmap(list(explanatory, explanatory_type, explanatory_nonpara),
|
421
|
1
|
~ if(!..2){
|
422
|
1
|
df.in %>%
|
423
|
1
|
dplyr::group_by(!! sym(dependent)) %>%
|
424
|
1
|
dplyr::count(!! sym(..1), .drop = FALSE) %>%
|
425
|
1
|
dplyr::ungroup() %>%
|
426
|
1
|
tidyr::drop_na() %>%
|
427
|
1
|
dplyr::mutate(grand_total = sum(n)) %>%
|
428
|
1
|
dplyr::group_by_at(2) %>%
|
429
|
1
|
dplyr::mutate(row_total = sum(n),
|
430
|
1
|
col_total_prop = 100 * row_total / grand_total) %>%
|
431
|
1
|
{ if(column) {
|
432
|
1
|
dplyr::group_by(., !! sym(dependent)) %>%
|
433
|
1
|
dplyr::mutate(
|
434
|
1
|
col_total = sum(n),
|
435
|
1
|
prop = 100 * n / col_total,
|
436
|
1
|
Total = format_n_percent(row_total, col_total_prop, digits[[4]])
|
437
|
|
) %>%
|
438
|
1
|
dplyr::select(-col_total)
|
439
|
|
} else {
|
440
|
1
|
dplyr::group_by_at(., 2) %>%
|
441
|
1
|
dplyr::mutate(
|
442
|
1
|
prop = 100 * n / row_total,
|
443
|
1
|
Total = paste0(row_total, " (100)")
|
444
|
|
)
|
445
|
|
}
|
446
|
|
} %>%
|
447
|
1
|
dplyr::ungroup() %>%
|
448
|
1
|
dplyr::mutate(
|
449
|
1
|
value = format_n_percent(n, prop, digits[4])
|
450
|
|
) %>%
|
451
|
1
|
dplyr::select(-prop, -n, -grand_total, -col_total_prop) %>%
|
452
|
1
|
tidyr::pivot_wider(names_from = !! dependent, values_from = value) %>%
|
453
|
1
|
dplyr::mutate(
|
454
|
1
|
label = names(.)[1]
|
455
|
|
) %>%
|
456
|
1
|
dplyr::rename(levels = 1) %>%
|
457
|
1
|
{if(orderbytotal){
|
458
|
0
|
dplyr::arrange(., -row_total)
|
459
|
|
} else {
|
460
|
|
.
|
461
|
|
}} %>%
|
462
|
1
|
dplyr::select(-row_total) %>%
|
463
|
1
|
dplyr::select(label, levels, dplyr::everything()) %>%
|
464
|
1
|
dplyr::select(-Total, dplyr::everything()) %>%
|
465
|
1
|
dplyr::mutate_all(as.character) %>%
|
466
|
|
# Total column
|
467
|
1
|
{ if(total_col){
|
468
|
|
.
|
469
|
|
} else {
|
470
|
1
|
dplyr::select(., -Total)
|
471
|
|
}
|
472
|
|
}
|
473
|
|
} else {
|
474
|
1
|
df.in %>%
|
475
|
1
|
dplyr::mutate(
|
476
|
1
|
value_mean_total = mean(!! sym(..1), na.rm = TRUE),
|
477
|
1
|
value_sd_total = sd(!! sym(..1), na.rm = TRUE),
|
478
|
1
|
value_median_total = median(!! sym(..1), na.rm = TRUE),
|
479
|
1
|
value_q1_total = quantile(!! sym(..1), 0.25, na.rm = TRUE),
|
480
|
1
|
value_q3_total = quantile(!! sym(..1), 0.75, na.rm = TRUE)
|
481
|
|
) %>%
|
482
|
1
|
dplyr::group_by(!! sym(dependent)) %>%
|
483
|
1
|
dplyr::summarise(
|
484
|
1
|
value_mean = mean(!! sym(..1), na.rm = TRUE),
|
485
|
1
|
value_sd = sd(!! sym(..1), na.rm = TRUE),
|
486
|
1
|
value_median = median(!! sym(..1), na.rm = TRUE),
|
487
|
1
|
value_q1 = quantile(!! sym(..1), 0.25, na.rm = TRUE),
|
488
|
1
|
value_q3 = quantile(!! sym(..1), 0.75, na.rm = TRUE),
|
489
|
1
|
value_iqr = value_q3 - value_q1,
|
490
|
1
|
value_mean_total = unique(value_mean_total),
|
491
|
1
|
value_sd_total = unique(value_sd_total),
|
492
|
1
|
value_median_total = unique(value_median_total),
|
493
|
1
|
value_q1_total = unique(value_q1_total),
|
494
|
1
|
value_q3_total = unique(value_q3_total),
|
495
|
1
|
value_iqr_total = value_q3_total - value_q1_total,
|
496
|
|
|
497
|
|
) %>%
|
498
|
1
|
{ if(! ..3) {
|
499
|
1
|
dplyr::mutate(.,
|
500
|
1
|
value = paste0(value_mean %>% round_tidy(digits[1]), " (",
|
501
|
1
|
value_sd %>% round_tidy(digits[2]), ")") ,
|
502
|
1
|
Total = paste0(value_mean_total %>% round_tidy(digits[1]), " (",
|
503
|
1
|
value_sd_total %>% round_tidy(digits[2]), ")")
|
504
|
|
) %>%
|
505
|
1
|
dplyr::select(dependent, value, Total) %>%
|
506
|
1
|
tidyr::pivot_wider(names_from = !! dependent, values_from = value) %>%
|
507
|
1
|
dplyr::mutate(
|
508
|
1
|
label = .x,
|
509
|
1
|
levels = "Mean (SD)"
|
510
|
|
)
|
511
|
1
|
} else if (..3){
|
512
|
1
|
{if(cont_range){
|
513
|
0
|
dplyr::mutate(.,
|
514
|
0
|
value = paste0(value_median %>% round_tidy(digits[1]), " (",
|
515
|
0
|
value_q1 %>% round_tidy(digits[2]), " to ",
|
516
|
0
|
value_q3 %>% round_tidy(digits[2]), ")"),
|
517
|
0
|
Total = paste0(value_median_total %>% round_tidy(digits[1]), " (",
|
518
|
0
|
value_q1_total %>% round_tidy(digits[2]), " to ",
|
519
|
0
|
value_q3_total %>% round_tidy(digits[2]), ")")
|
520
|
|
)
|
521
|
|
} else {
|
522
|
1
|
dplyr::mutate(.,
|
523
|
1
|
value = paste0(value_median %>% round_tidy(digits[1]), " (",
|
524
|
1
|
value_iqr %>% round_tidy(digits[2]), ")"),
|
525
|
1
|
Total = paste0(value_median_total %>% round_tidy(digits[1]), " (",
|
526
|
1
|
value_iqr_total %>% round_tidy(digits[2]), ")")
|
527
|
|
)
|
528
|
|
}} %>%
|
529
|
1
|
dplyr::select(dependent, value, Total) %>%
|
530
|
1
|
tidyr::pivot_wider(names_from = !! dependent, values_from = value) %>%
|
531
|
1
|
dplyr::mutate(
|
532
|
1
|
label = .x,
|
533
|
1
|
levels = "Median (IQR)"
|
534
|
|
)
|
535
|
|
}
|
536
|
|
} %>%
|
537
|
1
|
dplyr::select(label, levels, dplyr::everything()) %>%
|
538
|
1
|
dplyr::select(-Total, dplyr::everything()) %>%
|
539
|
|
# Total column
|
540
|
1
|
{ if(total_col){
|
541
|
|
.
|
542
|
|
} else {
|
543
|
1
|
dplyr::select(., -Total)
|
544
|
|
}
|
545
|
|
}
|
546
|
|
}
|
547
|
|
)
|
548
|
|
}
|
549
|
1
|
df.out = df.out %>%
|
550
|
|
# Add hypothesis test
|
551
|
1
|
{ if(p){
|
552
|
1
|
purrr::map2_df(., p_tests,
|
553
|
1
|
~ dplyr::mutate(.x,
|
554
|
1
|
p = .y)
|
555
|
|
)} else {
|
556
|
1
|
dplyr::bind_rows(.)
|
557
|
|
}} %>%
|
558
|
1
|
dplyr::select(label, levels, dplyr::everything()) %>%
|
559
|
1
|
as.data.frame() %>%
|
560
|
1
|
{ if(fit_id){
|
561
|
1
|
levels_id = .$levels
|
562
|
|
# Catagorical outcome, continous explanatory
|
563
|
1
|
drop = levels_id %in% c("Mean (SD)", "Median (IQR)")
|
564
|
1
|
levels_id[drop] = ""
|
565
|
|
# Continuous outcome, continuous explanatory
|
566
|
1
|
regex_sqbracket = "^(\\[).*(\\])$"
|
567
|
1
|
drop = grepl(regex_sqbracket, levels_id)
|
568
|
1
|
levels_id[drop] = ""
|
569
|
|
# Where extra terms included, add these in, e.g. I(var) (not interactions)
|
570
|
1
|
extra_terms = explanatory_terms[-which(explanatory_terms %in% explanatory)]
|
571
|
1
|
drop = grepl(":", extra_terms)
|
572
|
1
|
extra_terms = extra_terms[!drop]
|
573
|
1
|
{ if(!identical(extra_terms, character(0))){
|
574
|
0
|
levels_id = c(levels_id, rep("", length(extra_terms)))
|
575
|
0
|
dplyr::add_row(., label = extra_terms)
|
576
|
|
} else {
|
577
|
|
.
|
578
|
|
}} %>%
|
579
|
1
|
dplyr::mutate(., fit_id = paste0(label, levels_id),
|
580
|
1
|
index = 1:dim(.)[1])
|
581
|
|
} else {
|
582
|
|
.
|
583
|
|
}} %>%
|
584
|
|
|
585
|
|
# Recode variable names to labels where available
|
586
|
1
|
dplyr::mutate(
|
587
|
1
|
label = dplyr::recode(label, !!! var_labels)
|
588
|
|
) %>%
|
589
|
|
|
590
|
|
# Remove duplicate variables/p-values
|
591
|
1
|
rm_duplicate_labels() %>%
|
592
|
|
|
593
|
|
# Add column totals
|
594
|
1
|
{ if(add_col_totals){
|
595
|
0
|
ff_column_totals(., df.in, dependent,
|
596
|
0
|
percent = include_col_totals_percent,
|
597
|
0
|
na_include_dependent = na_include_dependent,
|
598
|
0
|
digits = digits[4], label = col_totals_rowname,
|
599
|
0
|
prefix = col_totals_prefix)
|
600
|
|
} else {
|
601
|
|
.
|
602
|
|
}} %>%
|
603
|
|
|
604
|
|
# Add row totals
|
605
|
1
|
{ if(add_row_totals){
|
606
|
0
|
ff_row_totals(., .data, dependent, explanatory, missing_column = include_row_missing_col,
|
607
|
0
|
na_include_dependent = FALSE, na_complete_cases = na_complete_cases,
|
608
|
0
|
total_name = row_totals_colname, na_name = row_missing_colname)
|
609
|
|
} else {
|
610
|
|
.
|
611
|
|
}} %>%
|
612
|
|
|
613
|
|
|
614
|
|
# Add dependent label
|
615
|
1
|
{ if(add_dependent_label){
|
616
|
1
|
dependent_label(., .data, dependent,
|
617
|
1
|
prefix=dependent_label_prefix, suffix = dependent_label_suffix)
|
618
|
|
} else {
|
619
|
|
.
|
620
|
|
}} %>%
|
621
|
|
|
622
|
|
# Replace any missing values with "", e.g. in (Missing) column
|
623
|
1
|
dplyr::mutate_all(.,
|
624
|
1
|
~ ifelse(is.na(.), "", .)
|
625
|
|
)
|
626
|
1
|
class(df.out) = c("data.frame.ff", class(df.out))
|
627
|
1
|
return(df.out)
|
628
|
|
}
|