R/relief.R
### RELIEF
# Adapted from https://github.com/larskotthoff/fselector/blob/master/R/selector.relief.R
# Author: Piotr Romanski
# Supports classification and regression,
# continuous and discrete data.

#' RReliefF filter
#'
#' @description The algorithm finds weights of continuous and discrete attributes based on a distance between instances.
#'
#' @param formula An object of class \link{formula} with the model description.
#' @param data A \link{data.frame} accompanying the formula.
#' @param x A \link{data.frame} with attributes.
#' @param y A vector with the response variable.
#' @param neighboursCount Number of neighbours to find for every sampled instance.
#' @param sampleSize Number of instances to sample.
#'
#' @return A \link{data.frame} with attribute names in the first column (\code{attributes}) and their weights in the second (\code{importance}).
#'
#' @references
#' Igor Kononenko: Estimating Attributes: Analysis and Extensions of RELIEF. In: European Conference on Machine Learning, 171-182, 1994.
#'
#' Marko Robnik-Sikonja, Igor Kononenko: An adaptation of Relief for attribute estimation in regression. In: Fourteenth International Conference on Machine Learning, 296-304, 1997.
#'
#' @export
#'
#' @examples
#' data(iris)
#'
#' weights <- relief(Species ~ ., iris, neighboursCount = 5, sampleSize = 20)
#' print(weights)
#' subset <- cut_attrs(weights, 2)
#' f <- to_formula(subset, "Species")
#' print(f)
relief <- function(formula, data, x, y, neighboursCount = 5, sampleSize = 10) {

  if (!xor(
    all(!missing(x), !missing(y)),
    all(!missing(formula), !missing(data)))) {
    stop(paste("Please specify both `x = attributes, y = response`,",
               "XOR use both `formula = response ~ attributes, data = dataset`."))
  }
  if (sum(!missing(x), !missing(y), !missing(formula), !missing(data)) > 2) {
    stop(paste("Please specify both `x = attributes, y = response`,",
               "XOR use both `formula = response ~ attributes, data = dataset`."))
  }

  if (!missing(x) && !missing(y)) {
    if (inherits(x, "formula")) {
      stop(paste("Please use the `formula = response ~ attributes, data = dataset`",
                 "interface instead of `x = formula`."))
    }

    data <- cbind(ReliefResponseVariable = y, x)
    formula <- ReliefResponseVariable ~ .
    return(.relief(formula, data, neighboursCount, sampleSize))
  }

  if (!missing(formula) && !missing(data)) {
    return(.relief(formula, data, neighboursCount, sampleSize))
  }
}

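# For illustration, a hypothetical call through the x/y interface, equivalent
# to the formula example in the roxygen block above (assumes the iris dataset):
#   weights <- relief(x = iris[, -5], y = iris$Species,
#                     neighboursCount = 5, sampleSize = 20)
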
.relief <- function(formula, data, neighbours.count = 5, sample.size = 10) {
  # uses parent.env
  find_neighbours <- function(instance_idx) {
    instance = new_data[instance_idx, , drop = FALSE]

    # for every other instance
    for (current_idx in 1:instances_count) {
      if (instance_idx == current_idx)
        next
      current_instance = new_data[current_idx, , drop = FALSE]
      if (is.na(current_instance[1, 1]))
        next

      dist = instance_distance(instance, current_instance)

      if (classification)
        class_no = which(classes == current_instance[[1]])
      else
        class_no = 1
      if (nn_stored_count[class_no] < neighbours.count) {
        nn_stored_count[class_no] <<- nn_stored_count[class_no] + 1
        n_array[class_no, nn_stored_count[class_no], ] <<- c(dist, current_idx)
      } else {
        max_idx = which.max(n_array[class_no, , 1])
        max_value = n_array[class_no, max_idx, 1]
        if (dist < max_value) {
          n_array[class_no, max_idx, ] <<- c(dist, current_idx)
        }
      }
    }
  }

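  # Note: n_array has dimensions [class, neighbour, 2]; [, , 1] holds the
  # distance to a stored neighbour and [, , 2] its row index in new_data, so
  # find_neighbours keeps the neighbours.count nearest instances per class.
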
  # uses parent.env
  update_weights <- function(instance_idx) {
    instance = new_data[instance_idx, , drop = FALSE]
    instance_class = instance[1, 1]
    instance_class_no = which(classes == instance_class)

    if (classification) {
      # for each attribute
      for (attr_idx in 1:attributes_count) {
        col_idx = attr_idx + 1

        # nearest hits
        hits_sum = 0
        if (nn_stored_count[instance_class_no] > 0) {
          hits_sum = sum(sapply(1:nn_stored_count[instance_class_no], function(n_idx) {
            n_instance_idx = n_array[instance_class_no, n_idx, 2]
            n_instance = new_data[n_instance_idx, , drop = FALSE]
            return(field_distance(col_idx, instance, n_instance))
          }))
          hits_sum = hits_sum / nn_stored_count[instance_class_no]
        }

        # nearest misses
        misses_sum = 0
        if (class_count > 1) {
          misses_sum = sum(sapply((1:class_count)[-instance_class_no], function(class_no) {
            class_misses_sum = 0
            if (nn_stored_count[class_no] > 0) {
              class_misses_sum = sum(sapply(1:nn_stored_count[class_no], function(n_idx) {
                n_instance_idx = n_array[class_no, n_idx, 2]
                n_instance = new_data[n_instance_idx, , drop = FALSE]
                return(field_distance(col_idx, instance, n_instance))
              }))
              class_misses_sum = class_misses_sum * class_prob[class_no] / nn_stored_count[class_no]
            }
            return(class_misses_sum)
          }))

          misses_sum = misses_sum / (1 - class_prob[instance_class_no])
        }
        results[attr_idx] <<- results[attr_idx] - hits_sum + misses_sum
      }
    } else {
      if (nn_stored_count[1] > 0) {
        for (n_idx in 1:nn_stored_count[1]) {
          n_instance_idx = n_array[1, n_idx, 2]
          n_instance = new_data[n_instance_idx, , drop = FALSE]
          class_diff = field_distance(1, instance, n_instance)
          ndc <<- ndc + class_diff / nn_stored_count[1]
          for (attr_idx in 1:attributes_count) {
            col_idx = attr_idx + 1
            attr_diff_norm = field_distance(col_idx, instance, n_instance) / nn_stored_count[1]
            nda[attr_idx] <<- nda[attr_idx] + attr_diff_norm
            ndcda[attr_idx] <<- ndcda[attr_idx] + class_diff * attr_diff_norm
          }
        }
      }
    }
  }

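  # Note: for classification this is the ReliefF update: each attribute's
  # weight decreases by the mean distance to the nearest hits and increases
  # by the class-prior-weighted mean distance to the nearest misses; the
  # division by sample.size happens once, at the end of .relief.
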
  # parameters: data.frame, data.frame
  instance_distance <- function(instance1, instance2) {
    len = dim(instance1)[2]
    if (len != dim(instance2)[2])
      stop("Instances of different lengths")
    if (len <= 1)
      stop("Too few attributes")

    result = sapply(2:len, function(i) {
      return(field_distance(i, instance1, instance2))
    })
    # return(sqrt(sum(result ^ 2))) # sqrt not needed: the squared distance keeps the same ordering
    res = sum(result ^ 2)
    if (is.na(res)) {
      stop("Internal error. Distance NA.")
    }
    return(res)
  }

  # uses parent.env
  # parameters: index, data.frame, data.frame
  field_distance <- function(col_idx, instance1, instance2) {
    value1 = instance1[1, col_idx]
    value2 = instance2[1, col_idx]
    attr_idx = col_idx - 1 # skip class

    if (is.factor(value1) && is.factor(value2)) {
      if (is.na(value1) && is.na(value2)) {
        if (classification)
          return(1 - sum(p_val_in_class[[attr_idx]][, instance1[1, 1]] * p_val_in_class[[attr_idx]][, instance2[1, 1]]))
        else
          return(1 - p_same_val[[attr_idx]])
      } else if (is.na(value1) || is.na(value2)) {
        if (is.na(value1)) {
          known_value = value2
          unknown_class = instance1[1, 1]
        } else {
          known_value = value1
          unknown_class = instance2[1, 1]
        }
        if (classification)
          return(1 - p_val_in_class[[attr_idx]][known_value, unknown_class])
        else
          return(1 - p_val[[attr_idx]][known_value])
      } else if (value1 == value2) {
        return(0)
      } else { # value1 != value2
        return(1)
      }
    } else if (is.numeric(value1) && is.numeric(value2)) {
      if (is.na(value1) && is.na(value2)) {
        return(1)
      } else if (is.na(value1)) {
        return(max(value2, 1 - value2))
      } else if (is.na(value2)) {
        return(max(value1, 1 - value1))
      } else {
        return(abs(value1 - value2))
      }
    } else {
      stop("Unsupported value type")
    }
  }

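  # Note: field_distance follows Kononenko's (1994) conventions: 0/1 for
  # equal vs. different factor levels, |value1 - value2| for min-max
  # normalized numerics, and conditional probability estimates when one or
  # both values are missing.
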
  modelDesc <- formula2names(formula, data)
  new_data <- data[, c(modelDesc$y, modelDesc$x), drop = FALSE]
  new_data <- normalize_minmax(new_data)

  # for discrete classes
  class_vector = NULL
  class_count = NULL
  class_prob = NULL
  classes = NULL
  p_val_in_class = NULL
  p_val = NULL
  p_same_val = NULL

  # for continuous class
  ndc = NULL
  nda = NULL
  ndcda = NULL

  results = NULL
  n_array = NULL
  nn_stored_count = NULL
  classification = NULL
  sample_instances_idx = NULL

  instances_count = dim(new_data)[1]
  attributes_count = dim(new_data)[2] - 1
  attr_names = dimnames(new_data)[[2]][-1]

  if (neighbours.count < 1) {
    neighbours.count = 1
    warning(paste("Assumed: neighbours.count =", neighbours.count))
  }

  if (sample.size < 1) {
    sample.size = 1
    warning(paste("Assumed: sample.size =", sample.size))
    sample_instances_idx = sample(1:instances_count, 1)
  } else if (sample.size > instances_count) {
    sample.size = instances_count
    warning(paste("Assumed: sample.size =", sample.size))
    sample_instances_idx = 1:instances_count
  } else {
    sample_instances_idx = sort(sample(1:instances_count, sample.size, replace = TRUE))
  }

  classification = is.factor(new_data[[1]])
  if (classification) {
    class_vector = new_data[[1]]
    class_prob = table(class_vector)
    class_prob = class_prob / sum(class_prob)
    classes = names(class_prob)
    class_count = length(classes)

    p_val_in_class = lapply(new_data[-1], function(vec) {
      if (!is.factor(vec) || !any(is.na(vec)))
        return(NULL)
      tab = table(vec, class_vector)
      return(apply(tab, 2, function(x) {
        s = sum(x)
        if (s == 0)
          return(x)
        else
          return(x / s)
      }))
    })
  } else {
    class_count = 1
    ndc = 0
    nda = array(0, attributes_count)
    ndcda = array(0, attributes_count)

    p_val = lapply(new_data[-1], function(vec) {
      if (!is.factor(vec) || !any(is.na(vec)))
        return(NULL)
      tab = table(vec)
      if (sum(tab) != 0) {
        tab = tab / sum(tab)
      }
      return(tab)
    })
    p_same_val = lapply(p_val, function(attr) {
      if (is.null(attr))
        return(NULL)
      return(sum(attr ^ 2))
    })
  }

  n_array = array(0, c(class_count, neighbours.count, 2))
  nn_stored_count = array(0, class_count)
  results = rep(0, attributes_count)

  sapply(sample_instances_idx, function(current_instance_idx) {
    current_instance = new_data[current_instance_idx, , drop = FALSE]
    if (is.na(current_instance[[1]]))
      return(NULL)

    n_array[] <<- Inf
    nn_stored_count[] <<- 0
    find_neighbours(current_instance_idx)
    update_weights(current_instance_idx)
  })

  if (classification) {
    results = results / sample.size
    return(data.frame(attributes = attr_names, importance = results, stringsAsFactors = FALSE))
  } else {
    results = ndcda / ndc - ((nda - ndcda) / (sample.size - ndc))
    results = data.frame(attributes = attr_names, importance = results, stringsAsFactors = FALSE)
    #results = normalize_minmax(results)
    return(results)
  }
}

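# Note: the regression branch above computes the RReliefF estimate of
# Robnik-Sikonja & Kononenko (1997),
#   W(A) = N_dC&dA / N_dC - (N_dA - N_dC&dA) / (m - N_dC),
# where ndc, nda and ndcda accumulate N_dC, N_dA and N_dC&dA over the
# m = sample.size sampled instances.
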
## adapted from https://github.com/larskotthoff/fselector/blob/master/R/normalize.R
normalize_minmax <- function(data) {
  attr_count = dim(data)[2]
  if (attr_count == 0)
    return(data)
  for (i in 1:attr_count) {
    if (!is.numeric(data[, i]))
      next
    if (!any(complete.cases(data[, i])))
      next
    mm = range(data[, i], na.rm = TRUE)
    minimum = mm[1]
    maximum = mm[2]
    if (minimum == maximum)
      data[, i] = data[, i] / minimum
    else
      data[, i] = (data[, i] - minimum) / (maximum - minimum)
  }
  return(data)
}
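
# For illustration, a hypothetical regression call (continuous response);
# assumes the built-in airquality dataset, whose missing values the distance
# functions above handle explicitly:
#   data(airquality)
#   weights <- relief(Ozone ~ ., airquality, neighboursCount = 10, sampleSize = 30)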
Showing 1 file with coverage changes.
R/relief.R
Files | Change | Coverage
---|---|---
R | -12.28% | 85.74%
inst/include | | 96.09%
src | | 99.42%
Project Totals (22 files) | | 90.39%