mi2-warsaw / FSelectorRcpp
Showing 1 of 7 files from the diff.
Other files ignored by Codecov

@@ -0,0 +1,366 @@
Loading
1 +
### RELIEF
2 +
# adopted from https://github.com/larskotthoff/fselector/blob/master/R/selector.relief.R
3 +
# author Piotr Romanski
4 +
# classification and regression
5 +
# continous and discrete data
6 +
7 +
#' RReliefF filter
#'
#' @param formula An object of class \link{formula} with model description.
#' @param data A \link{data.frame} accompanying formula.
#' @param x A \link{data.frame} with attributes.
#' @param y A vector with response variable.
#' @param neighboursCount number of neighbours to find for every sampled instance
#' @param sampleSize number of instances to sample
#'
#' @description The algorithm finds weights of continuous and discrete attributes basing on a distance between instances.
#'
#' @return a data.frame containing the worth of attributes in the first column and their names as row names
#'
#' @references
#' Igor Kononenko: Estimating Attributes: Analysis and Extensions of RELIEF. In: European Conference on Machine Learning, 171-182, 1994.
#'
#' Marko Robnik-Sikonja, Igor Kononenko: An adaptation of Relief for attribute estimation in regression. In: Fourteenth International Conference on Machine Learning, 296-304, 1997.
#'
#' @details The function and its manual page taken directly from \pkg{FSelector}:
#' Piotr Romanski and Lars Kotthoff (2018). FSelector: Selecting Attributes.
#' R package version 0.31. https://CRAN.R-project.org/package=FSelector
#'
#'
#' @examples
#'
#' data(iris)
#'
#' weights <- relief(Species~., iris, neighboursCount = 5, sampleSize = 20)
#' print(weights)
#' subset <- cut_attrs(weights, 2)
#' f <- to_formula(subset, "Species")
#' print(f)
#' @export
relief <- function(formula, data, x, y, neighboursCount = 5, sampleSize = 10) {

  # Basic sanity checks on the algorithm's tuning parameters.
  stopifnot(neighboursCount >= 1)
  stopifnot(sampleSize >= 1)

  # Exactly one of the two calling interfaces must be used:
  # either `x`/`y` (attributes + response) or `formula`/`data`.
  if (!xor(
    all(!missing(x), !missing(y)),
    all(!missing(formula), !missing(data)))) {
    stop(paste("Please specify both `x = attributes, y = response`,",
               "XOR use both `formula = response ~ attributes, data = dataset`"))
  }
  # Reject mixed calls that supply pieces of both interfaces at once
  # (e.g. x, y and data), which would otherwise slip past the xor test.
  if (sum(!missing(x), !missing(y), !missing(formula), !missing(data)) > 2){
    stop(paste("Please specify both `x = attributes, y = response`,",
               "XOR use both `formula = response ~ attributes, data = dataset`"))
  }

  if (!missing(x) && !missing(y)) {
    # Guard against passing a formula through the x/y interface.
    # inherits() is used instead of `class(x) == "formula"` so the test
    # does not break (or warn/error on length > 1) when x carries
    # multiple classes.
    if (inherits(x, "formula")) {
      stop(paste("Please use `formula = response ~ attributes, data = dataset`",
                 "interface instead of `x = formula`."))
    }

    # Rebuild the formula interface internally: the response becomes the
    # first column under a reserved name, all columns of x are attributes.
    data <- cbind(ReliefResponseVariable = y, x)
    formula <- ReliefResponseVariable ~ .
    return(.relief(formula, data, neighboursCount, sampleSize))
  }

  if (!missing(formula) && !missing(data)) {
    return(.relief(formula, data, neighboursCount, sampleSize))
  }

}
72 +
73 +
74 +
# Internal RReliefF worker shared by both interfaces of relief().
#
# Estimates attribute weights by repeatedly sampling an instance, finding its
# nearest neighbours per class, and rewarding attributes that separate
# instances of different classes (classification) or that co-vary with the
# response (regression).  Handles factor and numeric attributes, including
# missing values via the probability tables built below.
#
# The local closures (find_neighbours, update_weights, field_distance)
# deliberately read and superassign (`<<-`) the state variables defined in
# this function's frame (n_array, nn_stored_count, results, ndc, nda, ndcda).
# The order of initialisation and the exact `<<-` targets are load-bearing.
.relief <- function(formula, data, neighbours.count = 5, sample.size = 10) {
  # uses parent.env
  # Fills n_array / nn_stored_count with the up-to-neighbours.count nearest
  # instances per class (distance in [,1], row index in [,2]).
  find_neighbours <- function(instance_idx) {
    instance = new_data[instance_idx,, drop = FALSE]

    # for every other instance
    for(current_idx in 1:instances_count) {
      if(instance_idx == current_idx)
        next()
      current_instance = new_data[current_idx,, drop = FALSE]
      # Skip instances with a missing class/response value (column 1).
      if(is.na(current_instance[1, 1]))
        next()

      dist = instance_distance(instance, current_instance)

      # Regression keeps a single pool of neighbours (class_no = 1);
      # classification keeps one pool per class.
      if(classification)
        class_no = which(classes == current_instance[[1]])
      else
        class_no = 1
      if(nn_stored_count[class_no] < neighbours.count) {
        # Pool not full yet: append.
        nn_stored_count[class_no] <<- nn_stored_count[class_no] + 1
        n_array[class_no, nn_stored_count[class_no], ] <<- c(dist, current_idx)
      } else {
        # Pool full: replace the current farthest neighbour if closer.
        max_idx = which.max(n_array[class_no, , 1])
        max_value = n_array[class_no, max_idx, 1]
        if(dist < max_value) {
          n_array[class_no, max_idx, ] <<- c(dist, current_idx)
        }
      }
    }
  }

  # uses parent.env
  # Accumulates the per-attribute weight contributions of one sampled
  # instance, using the neighbours collected by find_neighbours().
  update_weights <- function(instance_idx) {
    instance = new_data[instance_idx,, drop = FALSE]
    instance_class = instance[1, 1]
    instance_class_no = which(classes == instance_class)

    if(classification) {
      # Classic ReliefF update: weight -= avg distance to nearest hits
      #                         weight += prior-weighted avg distance to misses.
      # for each attribute
      for(attr_idx in 1:attributes_count) {
        col_idx = attr_idx + 1

        # nearest hits
        hits_sum = 0
        if(nn_stored_count[instance_class_no] > 0) {
          hits_sum = sum(sapply(1:nn_stored_count[instance_class_no], function(n_idx) {
            n_instance_idx = n_array[instance_class_no, n_idx, 2]
            n_instance = new_data[n_instance_idx,, drop = FALSE]
            return(field_distance(col_idx, instance, n_instance))
          }))
          hits_sum = hits_sum / nn_stored_count[instance_class_no]
        }

        # nearest misses
        misses_sum = 0
        if(class_count > 1) {
          # Each other class contributes its average attribute distance,
          # weighted by that class's prior probability.
          misses_sum = sum(sapply((1:class_count)[-instance_class_no], function(class_no) {
            class_misses_sum = 0
            if(nn_stored_count[class_no] > 0) {
              class_misses_sum = sum(sapply(1:nn_stored_count[class_no], function(n_idx) {
                n_instance_idx = n_array[class_no, n_idx, 2]
                n_instance = new_data[n_instance_idx,, drop = FALSE]
                return(field_distance(col_idx, instance, n_instance))
              }))
              class_misses_sum = class_misses_sum * class_prob[class_no] / nn_stored_count[class_no]
            }
            return(class_misses_sum)
          }))

          # Normalise by probability mass of the "miss" classes
          # (Kononenko 1994 formulation).
          misses_sum = misses_sum / (1 - class_prob[instance_class_no])
        }
        results[attr_idx] <<- results[attr_idx] - hits_sum + misses_sum
      }
    } else {
      # RReliefF (regression): accumulate the probability-of-different-class
      # (ndc), different-attribute (nda) and joint (ndcda) statistics
      # combined into weights at the end of .relief().
      if(nn_stored_count[1] > 0) {
        for(n_idx in 1:nn_stored_count[1]) {
          n_instance_idx = n_array[1, n_idx, 2]
          n_instance = new_data[n_instance_idx,, drop = FALSE]
          # Column 1 is the (normalized) response, so this is the
          # response difference to the neighbour.
          class_diff = field_distance(1, instance, n_instance)
          ndc <<- ndc + class_diff / nn_stored_count[1]
          for(attr_idx in 1:attributes_count) {
            col_idx = attr_idx + 1
            attr_diff_norm = field_distance(col_idx, instance, n_instance) / nn_stored_count[1]
            nda[attr_idx] <<- nda[attr_idx] + attr_diff_norm
            ndcda[attr_idx] <<- ndcda[attr_idx] + class_diff * attr_diff_norm
          }
        }
      }
    }
  }

  # parameters: data.frame, data.frame
  # Squared Euclidean-style distance over the attribute columns (2:len);
  # column 1 (class/response) is excluded.
  instance_distance <- function(instance1, instance2) {
    len = dim(instance1)[2]
    if(len != dim(instance2)[2])
      stop("Instances of different lengths")
    if(len <= 1)
      stop("Too few attributes")

    result = sapply(2:len, function(i) {
      return(field_distance(i, instance1, instance2))
    })
    #return(sqrt(sum(result ^ 2))) #sqrt not needed
    # sqrt is omitted because only the ordering of distances matters here.
    res = sum(result ^ 2)
    if(is.na(res)) {
      stop("Internal error. Distance NA.")
    }
    return(res)
  }

  # uses parent.env
  # parameters: index, data.frame, data.frame
  # Distance between two instances on a single column, in [0, 1].
  # Factors: 0/1 on equality, probability-based estimates for NAs.
  # Numerics: absolute difference (columns are min-max normalized upfront).
  field_distance <- function(col_idx, instance1, instance2) {
    value1 = instance1[1, col_idx]
    value2 = instance2[1, col_idx]
    attr_idx = col_idx - 1 # skip class

    if(is.factor(value1) && is.factor(value2)) {
      if(is.na(value1) && is.na(value2)) {
        # Both missing: 1 - P(both would take the same value), estimated
        # from per-class (classification) or marginal (regression) tables.
        if(classification)
          return(1 - sum(p_val_in_class[[attr_idx]][, instance1[1, 1]] * p_val_in_class[[attr_idx]][, instance2[1, 1]]))
        else
          return(1 - p_same_val[[attr_idx]])
      } else if(is.na(value1) || is.na(value2)) {
        # One missing: 1 - P(missing one equals the known value),
        # conditioned on the missing instance's class when classifying.
        if(is.na(value1)) {
          known_value = value2
          unknown_class = instance1[1, 1]
        } else {
          known_value = value1
          unknown_class = instance2[1, 1]
        }
        if(classification)
          return(1 - p_val_in_class[[attr_idx]][known_value, unknown_class])
        else
          return(1 - p_val[[attr_idx]][known_value])
      } else if(value1 == value2) {
        return(0)
      } else { #if(value1 != value2)
        return(1)
      }
    } else if(is.numeric(value1) && is.numeric(value2)) {
      if(is.na(value1) && is.na(value2)) {
        return(1)
      } else if(is.na(value1)) {
        # Pessimistic estimate: the largest possible normalized distance
        # from the known value (values lie in [0, 1] after normalization).
        return(max(value2, 1 - value2))
      } else if(is.na(value2)) {
        return(max(value1, 1 - value1))
      } else {
        return(abs(value1 - value2))
      }
    } else {
      stop("Unsupported value type")
    }
  }

  # formula2names is a package-internal helper; presumably it returns a list
  # with $y (response name) and $x (attribute names) -- TODO confirm against
  # its definition elsewhere in the package.
  modelDesc <- formula2names(formula, data)
  # Reorder so the response is always column 1; the rest of the code
  # relies on that layout.
  new_data  <- data[, c(modelDesc$y, modelDesc$x), drop = FALSE]
  new_data <- normalize_minmax(new_data)

  # for discrete classes
  class_vector = NULL
  class_count = NULL
  class_prob = NULL
  classes = NULL
  p_val_in_class = NULL
  p_val = NULL
  p_same_val = NULL

  # for continous class
  ndc = NULL
  nda = NULL
  ndcda = NULL

  # Shared mutable state for the closures above.
  results = NULL
  n_array = NULL
  nn_stored_count = NULL
  classification = NULL
  sample_instances_idx = NULL

  instances_count = dim(new_data)[1]
  attributes_count = dim(new_data)[2] - 1
  attr_names = dimnames(new_data)[[2]][-1]

  # Defensive clamping; the exported relief() already rejects these values,
  # so these branches only matter for direct internal callers.
  if(neighbours.count < 1) {
    neighbours.count = 1
    warning(paste("Assumed: neighbours.count = ", neighbours.count))
  }

  if(sample.size < 1) {
    # NOTE(review): the warning reports the pre-clamp value here and in the
    # branch below (assignment happens after warning()), unlike the
    # neighbours.count branch above -- the message shows the old value.
    warning(paste("Assumed: sample.size = ", sample.size))
    sample.size = 1
    sample_instances_idx = sample(1:instances_count, 1)
  } else if(sample.size > instances_count) {
    warning(paste("Assumed: sample.size = ", sample.size))
    sample.size = instances_count
    sample_instances_idx = 1:instances_count
  } else {
    # NOTE(review): sampling WITH replacement, so the same instance may be
    # processed more than once; kept as in the original FSelector code.
    sample_instances_idx = sort(sample(1:instances_count, sample.size, replace=TRUE))
  }

  # A factor response means classification; anything numeric is regression.
  classification = is.factor(new_data[[1]])
  if(classification) {
    class_vector = new_data[[1]]
    class_prob = table(class_vector)
    class_prob = class_prob / sum(class_prob)
    classes = names(class_prob)
    class_count = length(classes)

    # Per-attribute table P(value | class), built only for factor columns
    # that actually contain NAs (NULL otherwise); used by field_distance().
    p_val_in_class = lapply(new_data[-1], function(vec) {
      if(!is.factor(vec) || !any(is.na(vec)))
        return(NULL)
      tab = table(vec, class_vector)
      return(apply(tab, 2, function(x) {
        s = sum(x)
        if(s == 0)
          return(x)
        else
          return(x / s)
      }))
    })
  } else {
    class_count = 1
    ndc = 0
    nda = array(0, attributes_count)
    ndcda = array(0, attributes_count)

    # Marginal value probabilities per factor attribute with NAs;
    # NULL for columns where they are never needed.
    p_val = lapply(new_data[-1], function(vec) {
      if(!is.factor(vec) || !any(is.na(vec)))
        return(NULL)
      tab = table(vec)
      if(sum(tab) != 0) {
        tab = tab / sum(tab)
      }
      return(tab)
    })
    # P(two random draws take the same value) = sum of squared probabilities.
    p_same_val = lapply(p_val, function(attr) {
      if(is.null(attr))
        return(NULL)
      return(sum(attr ^ 2))
    })
  }

  # class_count x neighbours.count x 2 buffer: [,,1] distance, [,,2] row index.
  n_array = array(0, c(class_count, neighbours.count, 2))
  nn_stored_count = array(0, class_count)
  results = rep(0, attributes_count)

  # Main loop: used for its side effects on the shared state only.
  sapply(sample_instances_idx, function(current_instance_idx) {
    current_instance = new_data[current_instance_idx,, drop = FALSE]
    if(is.na(current_instance[[1]]))
      return(NULL)

    # Reset neighbour buffers; Inf so any real distance replaces a slot.
    n_array[] <<- Inf
    nn_stored_count[] <<- 0
    find_neighbours(current_instance_idx)
    update_weights(current_instance_idx)
  })


  if(classification) {
    results = results / sample.size
    return(data.frame(attributes = attr_names, importance = results, stringsAsFactors = FALSE))
  } else {
    # RReliefF weight combination (Robnik-Sikonja & Kononenko 1997).
    results = ndcda / ndc - ((nda - ndcda) / (sample.size - ndc))
    results = data.frame(attributes = attr_names, importance = results)
    #results = normalize.min.max(results)
    return(results)
  }


}
347 +
348 +
## adopted from https://github.com/larskotthoff/fselector/blob/master/R/normalize.R
349 +
# Rescale every numeric column of `data` to [0, 1] via min-max
# normalization (NAs are ignored when computing the range and are
# preserved in the output).  Non-numeric columns and all-NA columns are
# returned untouched.  A constant column is divided by its own value,
# mapping it to 1 (or NaN when the constant is 0), matching the original
# FSelector behaviour.
normalize_minmax <- function(data) {
  n_cols <- dim(data)[2]
  if (n_cols == 0) {
    return(data)
  }
  for (col in seq_len(n_cols)) {
    column <- data[, col]
    # Only numeric columns with at least one observed value are rescaled.
    if (!is.numeric(column) || !any(complete.cases(column))) {
      next
    }
    bounds <- range(column, na.rm = TRUE)
    lo <- bounds[1]
    hi <- bounds[2]
    if (lo == hi) {
      # Degenerate (constant) column: keep the original divide-by-minimum
      # behaviour rather than producing 0/0 for every entry.
      data[, col] <- column / lo
    } else {
      data[, col] <- (column - lo) / (hi - lo)
    }
  }
  data
}
Files Coverage
R 85.78%
inst/include 96.09%
src 99.42%
Project Totals (22 files) 90.41%
Notifications are pending CI completion. Periodically Codecov will check the CI state, when complete notifications will be submitted. Push notifications now.
Sunburst
The inner-most circle is the entire project, moving away from the center are folders then, finally, a single file. The size and color of each slice is representing the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project. Proceeding with folders and finally individual files. The size and color of each slice is representing the number of statements and the coverage, respectively.
Grid
Each block represents a single file in the project. The size and color of each block is represented by the number of statements and the coverage, respectively.
Loading