tlverse / tmle3

@@ -42,7 +42,7 @@
Loading
42 42
  public = list(
43 43
    initialize = function(maxit = 100, cvtmle = TRUE, one_dimensional = FALSE,
44 44
                              constrain_step = FALSE, delta_epsilon = 1e-4,
45 -
                              convergence_type = c("se_logn", "n_samp"),
45 +
                              convergence_type = c("scaled_var", "sample_size"),
46 46
                              verbose = FALSE) {
47 47
      private$.maxit <- maxit
48 48
      private$.cvtmle <- cvtmle
@@ -62,7 +62,10 @@
Loading
62 62
    update_step = function(likelihood, tmle_task, fold_number = "full") {
63 63
64 64
      # get new submodel fit
65 -
      all_submodels <- self$generate_submodel_data(likelihood, tmle_task, fold_number)
65 +
      all_submodels <- self$generate_submodel_data(
66 +
        likelihood, tmle_task,
67 +
        fold_number
68 +
      )
66 69
      new_epsilons <- self$fit_submodels(all_submodels)
67 70
68 71
      # update likelihoods
@@ -75,11 +78,15 @@
Loading
75 78
      # increment step count
76 79
      private$.step_number <- private$.step_number + 1
77 80
    },
78 -
    generate_submodel_data = function(likelihood, tmle_task, fold_number = "full") {
81 +
    generate_submodel_data = function(likelihood, tmle_task,
82 +
                                          fold_number = "full") {
79 83
      update_nodes <- self$update_nodes
80 84
81 -
      # todo: support not getting observed for case where we're applying updates instead of fitting them
82 -
      clever_covariates <- lapply(self$tmle_params, function(tmle_param) tmle_param$clever_covariates(tmle_task, fold_number))
85 +
      # TODO: support not getting observed for case where we're applying
86 +
      #       updates instead of fitting them
87 +
      clever_covariates <- lapply(self$tmle_params, function(tmle_param) {
88 +
        tmle_param$clever_covariates(tmle_task, fold_number)
89 +
      })
83 90
84 91
      observed_values <- lapply(update_nodes, tmle_task$get_tmle_node)
85 92
@@ -88,12 +95,17 @@
Loading
88 95
        covariates_dt <- do.call(cbind, node_covariates)
89 96
        if (self$one_dimensional) {
90 97
          observed_task <- likelihood$training_task
91 -
          estimates <- lapply(self$tmle_params, function(tmle_param) tmle_param$estimates(observed_task, fold_number))
98 +
          estimates <- lapply(self$tmle_params, function(tmle_param) {
99 +
            tmle_param$estimates(observed_task, fold_number)
100 +
          })
92 101
          covariates_dt <- self$collapse_covariates(estimates, covariates_dt)
93 102
        }
94 103
95 104
        observed <- tmle_task$get_tmle_node(update_node)
96 -
        initial <- likelihood$get_likelihood(tmle_task, update_node, fold_number)
105 +
        initial <- likelihood$get_likelihood(
106 +
          tmle_task, update_node,
107 +
          fold_number
108 +
        )
97 109
98 110
        # scale observed and predicted values for bounded continuous
99 111
        observed <- tmle_task$scale(observed, update_node)
@@ -129,7 +141,11 @@
Loading
129 141
          mean(loss)
130 142
        }
131 143
132 -
        optim_fit <- optim(par = list(epsilon = self$delta_epsilon), fn = risk, lower = 0, upper = self$delta_epsilon, method = "Brent")
144 +
        optim_fit <- optim(
145 +
          par = list(epsilon = self$delta_epsilon), fn = risk,
146 +
          lower = 0, upper = self$delta_epsilon,
147 +
          method = "Brent"
148 +
        )
133 149
        epsilon <- optim_fit$par
134 150
        risk_val <- optim_fit$value
135 151
        risk_zero <- risk(0)
@@ -139,11 +155,15 @@
Loading
139 155
        }
140 156
      } else {
141 157
        suppressWarnings({
142 -
          submodel_fit <- glm(observed ~ H - 1, submodel_data, offset = qlogis(submodel_data$initial), family = binomial())
158 +
          submodel_fit <- glm(observed ~ H - 1, submodel_data,
159 +
            offset = qlogis(submodel_data$initial),
160 +
            family = binomial()
161 +
          )
143 162
        })
144 163
        epsilon <- coef(submodel_fit)
145 164
146 -
        # this protects against collinear covariates (which we don't care about, we just want an update)
165 +
        # NOTE: this protects against collinear covariates
166 +
        # (which we don't care about, we just want an update)
147 167
        epsilon[is.na(epsilon)] <- 0
148 168
      }
149 169
@@ -174,7 +194,10 @@
Loading
174 194
      update_nodes <- self$update_nodes
175 195
176 196
      # get submodel data for all nodes
177 -
      all_submodel_data <- self$generate_submodel_data(likelihood, tmle_task, fold_number)
197 +
      all_submodel_data <- self$generate_submodel_data(
198 +
        likelihood, tmle_task,
199 +
        fold_number
200 +
      )
178 201
179 202
      # apply update to all nodes
180 203
      updated_likelihoods <- lapply(update_nodes, function(update_node) {
@@ -182,10 +205,12 @@
Loading
182 205
        epsilon <- all_epsilon[[update_node]]
183 206
        updated_likelihood <- self$apply_submodel(submodel_data, epsilon)
184 207
185 -
        # unscale to handle bounded continuous
186 -
        updated_likelihood <- tmle_task$unscale(updated_likelihood, update_node)
208 +
        # un-scale to handle bounded continuous
209 +
        updated_likelihood <- tmle_task$unscale(
210 +
          updated_likelihood,
211 +
          update_node
212 +
        )
187 213
      })
188 -
189 214
      names(updated_likelihoods) <- update_nodes
190 215
191 216
      return(updated_likelihoods)
@@ -198,21 +223,25 @@
Loading
198 223
        }
199 224
      )
200 225
201 -
      if (self$convergence_type == "se_logn") {
226 +
      if (self$convergence_type == "scaled_var") {
227 +
        # NOTE: the point of this criterion is to avoid targeting in an overly
228 +
        #       aggressive manner, as we simply need check that the following
229 +
        #       condition is met |P_n D*| / SE(D*) =< max(1/log(n), 1/10)
202 230
        IC <- do.call(cbind, lapply(estimates, `[[`, "IC"))
203 -
        se_D <- sqrt(apply(IC, 2, var) / tmle_task$nrow)
204 -
        ED_threshold <- se_D / max(log(tmle_task$nrow), 10)
205 -
      } else if (self$convergence_type == "n_samp") {
231 +
        se_Dstar <- sqrt(apply(IC, 2, var) / tmle_task$nrow)
232 +
        ED_threshold <- se_Dstar / min(log(tmle_task$nrow), 10)
233 +
      } else if (self$convergence_type == "sample_size") {
206 234
        ED_threshold <- 1 / tmle_task$nrow
207 235
      }
208 236
237 +
      # get |P_n D*| of any number of parameter estimates
209 238
      ED <- ED_from_estimates(estimates)
210 -
      ED_criterion <- max(abs(ED))
239 +
      ED_criterion <- abs(ED)
211 240
212 241
      if (self$verbose) {
213 242
        cat(sprintf("max(abs(ED)): %e\n", ED_criterion))
214 243
      }
215 -
      return(all(ED_criterion < ED_threshold))
244 +
      return(all(ED_criterion <= ED_threshold))
216 245
    },
217 246
    update = function(likelihood, tmle_task) {
218 247
      update_fold <- self$update_fold
@@ -230,7 +259,10 @@
Loading
230 259
      }
231 260
      private$.tmle_params <- c(private$.tmle_params, new_params)
232 261
      new_update_nodes <- unlist(lapply(new_params, `[[`, "update_nodes"))
233 -
      private$.update_nodes <- unique(c(private$.update_nodes, new_update_nodes))
262 +
      private$.update_nodes <- unique(c(
263 +
        private$.update_nodes,
264 +
        new_update_nodes
265 +
      ))
234 266
    }
235 267
  ),
236 268
  active = list(
@@ -243,7 +275,10 @@
Loading
243 275
          new_params <- list(new_params)
244 276
        }
245 277
        private$.tmle_params <- new_params
246 -
        private$.update_nodes <- unique(unlist(lapply(new_params, `[[`, "update_nodes")))
278 +
        private$.update_nodes <- unique(unlist(lapply(
279 +
          new_params, `[[`,
280 +
          "update_nodes"
281 +
        )))
247 282
      }
248 283
      return(private$.tmle_params)
249 284
    },

@@ -112,17 +112,17 @@
Loading
112 112
113 113
      outcome <- target_node_object$variables
114 114
      covariates <- unlist(lapply(parent_nodes, `[[`, "variables"))
115 -
      
116 -
      
115 +
116 +
117 117
118 118
      nodes <- self$nodes
119 -
      node_data <- self$get_data(,unlist(nodes))
119 +
      node_data <- self$get_data(, unlist(nodes))
120 120
      nodes$outcome <- outcome
121 121
      nodes$covariates <- covariates
122 -
      
123 -
      
124 -
      regression_data <- do.call(cbind, c(all_covariate_data, outcome_data,node_data))
125 -
      
122 +
123 +
124 +
      regression_data <- do.call(cbind, c(all_covariate_data, outcome_data, node_data))
125 +
126 126
      regression_task <- sl3_Task$new(
127 127
        regression_data,
128 128
        nodes = nodes,
Files Coverage
R 79.71%
Project Totals (36 files) 79.71%
Sunburst
The inner-most circle is the entire project, moving away from the center are folders then, finally, a single file. The size and color of each slice is representing the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project. Proceeding with folders and finally individual files. The size and color of each slice is representing the number of statements and the coverage, respectively.
Grid
Each block represents a single file in the project. The size and color of each block is represented by the number of statements and the coverage, respectively.
Loading