mi2-warsaw / FSelectorRcpp

Compare 79c442b ... +8 ... e26cb1a

Showing 1 of 6 files from the diff.
Newly tracked file
R/relief.R created.
Other files ignored by Codecov

@@ -0,0 +1,362 @@
Loading
1 +
### RELIEF
2 +
# adopted from https://github.com/larskotthoff/fselector/blob/master/R/selector.relief.R
3 +
# author Piotr Romanski
4 +
# classification and regression
5 +
# continous and discrete data
6 +
7 +
#' RReliefF filter
8 +
#'
9 +
#' @param formula An object of class \link{formula} with model description.
10 +
#' @param data A \link{data.frame} accompanying formula.
11 +
#' @param x A \link{data.frame} with attributes.
12 +
#' @param y A vector with response variable.
13 +
#' @param neighboursCount number of neighbours to find for every sampled instance
14 +
#' @param sampleSize number of instances to sample
15 +
#'
16 +
#' @description The algorithm finds weights of continuous and discrete attributes basing on a distance between instances.
17 +
#'
18 +
#' @return a data.frame containing the worth of attributes in the first column and their names as row names
19 +
#'
20 +
#' @references
21 +
#' Igor Kononenko: Estimating Attributes: Analysis and Extensions of RELIEF. In: European Conference on Machine Learning, 171-182, 1994.
22 +
#'
23 +
#' Marko Robnik-Sikonja, Igor Kononenko: An adaptation of Relief for attribute estimation in regression. In: Fourteenth International Conference on Machine Learning, 296-304, 1997.
24 +
#'
25 +
#'
26 +
#' @return
27 +
#' @export
28 +
#'
29 +
#' @examples
30 +
#'
31 +
#' data(iris)
32 +
#'
33 +
#' weights <- relief(Species~., iris, neighboursCount = 5, sampleSize = 20)
34 +
#' print(weights)
35 +
#' subset <- cut_attrs(weights, 2)
36 +
#' f <- to_formula(subset, "Species")
37 +
#' print(f)
38 +
#'
39 +
relief <- function(formula, data, x, y, neighboursCount = 5, sampleSize = 10) {
40 +
41 +
  if (!xor(
42 +
    all(!missing(x), !missing(y)),
43 +
    all(!missing(formula), !missing(data)))) {
44 +
    stop(paste("Please specify both `x = attributes, y = response`,",
45 +
               "XOR use both `formula = response ~ attributes, data = dataset"))
46 +
  }
47 +
  if (sum(!missing(x), !missing(y), !missing(formula), !missing(data)) > 2){
48 +
    stop(paste("Please specify both `x = attributes, y = response`,",
49 +
               "XOR use both `formula = response ~ attributes, data = dataset"))
50 +
  }
51 +
52 +
  if (!missing(x) && !missing(y)) {
53 +
    if (class(x) == "formula") {
54 +
      stop(paste("Please use `formula = response ~ attributes, data = dataset`",
55 +
                 "interface instead of `x = formula`."))
56 +
    }
57 +
58 +
    data <- cbind(ReliefResponseVariable = y, x)
59 +
    formula <- ReliefResponseVariable ~ .
60 +
    return(.relief(formula, data, neighboursCount, sampleSize))
61 +
  }
62 +
63 +
  if (!missing(formula) && !missing(data)) {
64 +
    return(.relief(formula, data, neighboursCount, sampleSize))
65 +
  }
66 +
67 +
}
68 +
69 +
70 +
.relief <- function(formula, data, neighbours.count = 5, sample.size = 10) {
71 +
  # uses parent.env
72 +
  find_neighbours <- function(instance_idx) {
73 +
    instance = new_data[instance_idx,, drop = FALSE]
74 +
75 +
    # for every other instance
76 +
    for(current_idx in 1:instances_count) {
77 +
      if(instance_idx == current_idx)
78 +
        next()
79 +
      current_instance = new_data[current_idx,, drop = FALSE]
80 +
      if(is.na(current_instance[1, 1]))
81 +
        next()
82 +
83 +
      dist = instance_distance(instance, current_instance)
84 +
85 +
      if(classification)
86 +
        class_no = which(classes == current_instance[[1]])
87 +
      else
88 +
        class_no = 1
89 +
      if(nn_stored_count[class_no] < neighbours.count) {
90 +
        nn_stored_count[class_no] <<- nn_stored_count[class_no] + 1
91 +
        n_array[class_no, nn_stored_count[class_no], ] <<- c(dist, current_idx)
92 +
      } else {
93 +
        max_idx = which.max(n_array[class_no, , 1])
94 +
        max_value = n_array[class_no, max_idx, 1]
95 +
        if(dist < max_value) {
96 +
          n_array[class_no, max_idx, ] <<- c(dist, current_idx)
97 +
        }
98 +
      }
99 +
    }
100 +
  }
101 +
102 +
  # uses parent.env
103 +
  update_weights <- function(instance_idx) {
104 +
    instance = new_data[instance_idx,, drop = FALSE]
105 +
    instance_class = instance[1, 1]
106 +
    instance_class_no = which(classes == instance_class)
107 +
108 +
    if(classification) {
109 +
      # for each attribute
110 +
      for(attr_idx in 1:attributes_count) {
111 +
        col_idx = attr_idx + 1
112 +
113 +
        # nearest hits
114 +
        hits_sum = 0
115 +
        if(nn_stored_count[instance_class_no] > 0) {
116 +
          hits_sum = sum(sapply(1:nn_stored_count[instance_class_no], function(n_idx) {
117 +
            n_instance_idx = n_array[instance_class_no, n_idx, 2]
118 +
            n_instance = new_data[n_instance_idx,, drop = FALSE]
119 +
            return(field_distance(col_idx, instance, n_instance))
120 +
          }))
121 +
          hits_sum = hits_sum / nn_stored_count[instance_class_no]
122 +
        }
123 +
124 +
        # nearest misses
125 +
        misses_sum = 0
126 +
        if(class_count > 1) {
127 +
          misses_sum = sum(sapply((1:class_count)[-instance_class_no], function(class_no) {
128 +
            class_misses_sum = 0
129 +
            if(nn_stored_count[class_no] > 0) {
130 +
              class_misses_sum = sum(sapply(1:nn_stored_count[class_no], function(n_idx) {
131 +
                n_instance_idx = n_array[class_no, n_idx, 2]
132 +
                n_instance = new_data[n_instance_idx,, drop = FALSE]
133 +
                return(field_distance(col_idx, instance, n_instance))
134 +
              }))
135 +
              class_misses_sum = class_misses_sum * class_prob[class_no] / nn_stored_count[class_no]
136 +
            }
137 +
            return(class_misses_sum)
138 +
          }))
139 +
140 +
141 +
          misses_sum = misses_sum / (1 - class_prob[instance_class_no])
142 +
        }
143 +
        results[attr_idx] <<- results[attr_idx] - hits_sum + misses_sum
144 +
      }
145 +
    } else {
146 +
      if(nn_stored_count[1] > 0) {
147 +
        for(n_idx in 1:nn_stored_count[1]) {
148 +
          n_instance_idx = n_array[1, n_idx, 2]
149 +
          n_instance = new_data[n_instance_idx,, drop = FALSE]
150 +
          class_diff = field_distance(1, instance, n_instance)
151 +
          ndc <<- ndc + class_diff / nn_stored_count[1]
152 +
          for(attr_idx in 1:attributes_count) {
153 +
            col_idx = attr_idx + 1
154 +
            attr_diff_norm = field_distance(col_idx, instance, n_instance) / nn_stored_count[1]
155 +
            nda[attr_idx] <<- nda[attr_idx] + attr_diff_norm
156 +
            ndcda[attr_idx] <<- ndcda[attr_idx] + class_diff * attr_diff_norm
157 +
          }
158 +
        }
159 +
      }
160 +
    }
161 +
  }
162 +
163 +
  # parameters: data.frame, data.frame
164 +
  instance_distance <- function(instance1, instance2) {
165 +
    len = dim(instance1)[2]
166 +
    if(len != dim(instance2)[2])
167 +
      stop("Instances of different lengths")
168 +
    if(len <= 1)
169 +
      stop("Too few attributes")
170 +
171 +
    result = sapply(2:len, function(i) {
172 +
      return(field_distance(i, instance1, instance2))
173 +
    })
174 +
    #return(sqrt(sum(result ^ 2))) #sqrt not needed
175 +
    res = sum(result ^ 2)
176 +
    if(is.na(res)) {
177 +
      stop("Internal error. Distance NA.")
178 +
    }
179 +
    return(res)
180 +
  }
181 +
182 +
  # uses parent.env
183 +
  # parameters: index, data.frame, data.frame
184 +
  field_distance <- function(col_idx, instance1, instance2) {
185 +
    value1 = instance1[1, col_idx]
186 +
    value2 = instance2[1, col_idx]
187 +
    attr_idx = col_idx - 1 # skip class
188 +
189 +
    if(is.factor(value1) && is.factor(value2)) {
190 +
      if(is.na(value1) && is.na(value2)) {
191 +
        if(classification)
192 +
          return(1 - sum(p_val_in_class[[attr_idx]][, instance1[1, 1]] * p_val_in_class[[attr_idx]][, instance2[1, 1]]))
193 +
        else
194 +
          return(1 - p_same_val[[attr_idx]])
195 +
      } else if(is.na(value1) || is.na(value2)) {
196 +
        if(is.na(value1)) {
197 +
          known_value = value2
198 +
          unknown_class = instance1[1, 1]
199 +
        } else {
200 +
          known_value = value1
201 +
          unknown_class = instance2[1, 1]
202 +
        }
203 +
        if(classification)
204 +
          return(1 - p_val_in_class[[attr_idx]][known_value, unknown_class])
205 +
        else
206 +
          return(1 - p_val[[attr_idx]][known_value])
207 +
      } else if(value1 == value2) {
208 +
        return(0)
209 +
      } else { #if(value1 != value2)
210 +
        return(1)
211 +
      }
212 +
    } else if(is.numeric(value1) && is.numeric(value2)) {
213 +
      if(is.na(value1) && is.na(value2)) {
214 +
        return(1)
215 +
      } else if(is.na(value1)) {
216 +
        return(max(value2, 1 - value2))
217 +
      } else if(is.na(value2)) {
218 +
        return(max(value1, 1 - value1))
219 +
      } else {
220 +
        return(abs(value1 - value2))
221 +
      }
222 +
    } else {
223 +
      stop("Unsupported value type")
224 +
    }
225 +
  }
226 +
227 +
228 +
  modelDesc <- formula2names(formula, data)
229 +
  new_data  <- data[, c(modelDesc$y, modelDesc$x), drop = FALSE]
230 +
  new_data <- normalize_minmax(new_data)
231 +
232 +
  # for discrete classes
233 +
  class_vector = NULL
234 +
  class_count = NULL
235 +
  class_prob = NULL
236 +
  classes = NULL
237 +
  p_val_in_class = NULL
238 +
  p_val = NULL
239 +
  p_same_val = NULL
240 +
241 +
  # for continous class
242 +
  ndc = NULL
243 +
  nda = NULL
244 +
  ndcda = NULL
245 +
246 +
  results = NULL
247 +
  n_array = NULL
248 +
  nn_stored_count = NULL
249 +
  classification = NULL
250 +
  sample_instances_idx = NULL
251 +
252 +
  instances_count = dim(new_data)[1]
253 +
  attributes_count = dim(new_data)[2] - 1
254 +
  attr_names = dimnames(new_data)[[2]][-1]
255 +
256 +
  if(neighbours.count < 1) {
257 +
    neighbours.count = 1
258 +
    warning(paste("Assumed: neighbours.count = ", neighbours.count))
259 +
  }
260 +
261 +
  if(sample.size < 1) {
262 +
    warning(paste("Assumed: sample.size = ", sample.size))
263 +
    sample.size = 1
264 +
    sample_instances_idx = sample(1:instances_count, 1)
265 +
  } else if(sample.size > instances_count) {
266 +
    warning(paste("Assumed: sample.size = ", sample.size))
267 +
    sample.size = instances_count
268 +
    sample_instances_idx = 1:instances_count
269 +
  } else {
270 +
    sample_instances_idx = sort(sample(1:instances_count, sample.size, replace=TRUE))
271 +
  }
272 +
273 +
  classification = is.factor(new_data[[1]])
274 +
  if(classification) {
275 +
    class_vector = new_data[[1]]
276 +
    class_prob = table(class_vector)
277 +
    class_prob = class_prob / sum(class_prob)
278 +
    classes = names(class_prob)
279 +
    class_count = length(classes)
280 +
281 +
    p_val_in_class = lapply(new_data[-1], function(vec) {
282 +
      if(!is.factor(vec) || !any(is.na(vec)))
283 +
        return(NULL)
284 +
      tab = table(vec, class_vector)
285 +
      return(apply(tab, 2, function(x) {
286 +
        s = sum(x)
287 +
        if(s == 0)
288 +
          return(x)
289 +
        else
290 +
          return(x / s)
291 +
      }))
292 +
    })
293 +
  } else {
294 +
    class_count = 1
295 +
    ndc = 0
296 +
    nda = array(0, attributes_count)
297 +
    ndcda = array(0, attributes_count)
298 +
299 +
    p_val = lapply(new_data[-1], function(vec) {
300 +
      if(!is.factor(vec) || !any(is.na(vec)))
301 +
        return(NULL)
302 +
      tab = table(vec)
303 +
      if(sum(tab) != 0) {
304 +
        tab = tab / sum(tab)
305 +
      }
306 +
      return(tab)
307 +
    })
308 +
    p_same_val = lapply(p_val, function(attr) {
309 +
      if(is.null(attr))
310 +
        return(NULL)
311 +
      return(sum(attr ^ 2))
312 +
    })
313 +
  }
314 +
315 +
  n_array = array(0, c(class_count, neighbours.count, 2))
316 +
  nn_stored_count = array(0, class_count)
317 +
  results = rep(0, attributes_count)
318 +
319 +
  sapply(sample_instances_idx, function(current_instance_idx) {
320 +
    current_instance = new_data[current_instance_idx,, drop = FALSE]
321 +
    if(is.na(current_instance[[1]]))
322 +
      return(NULL)
323 +
324 +
    n_array[] <<- Inf
325 +
    nn_stored_count[] <<- 0
326 +
    find_neighbours(current_instance_idx)
327 +
    update_weights(current_instance_idx)
328 +
  })
329 +
330 +
331 +
  if(classification) {
332 +
    results = results / sample.size
333 +
    return(data.frame(attributes = attr_names, importance = results, stringsAsFactors = FALSE))
334 +
  } else {
335 +
    results = ndcda / ndc - ((nda - ndcda) / (sample.size - ndc))
336 +
    results = data.frame(attributes = attr_names, importance = results)
337 +
    #results = normalize.min.max(results)
338 +
    return(results)
339 +
  }
340 +
341 +
342 +
}
343 +
344 +
## adopted from https://github.com/larskotthoff/fselector/blob/master/R/normalize.R
345 +
normalize_minmax <- function(data) {
346 +
  attr_count = dim(data)[2]
347 +
  if (attr_count == 0)
348 +
    return(data)
349 +
  for (i in 1:attr_count) {
350 +
    if (!is.numeric(data[, i]))
351 +
      (next)()
352 +
    if (!any(complete.cases(data[, i])))
353 +
      (next)()
354 +
    mm = range(data[, i], na.rm = TRUE)
355 +
    minimum = mm[1]
356 +
    maximum = mm[2]
357 +
    if (minimum == maximum)
358 +
      data[, i] = data[, i]/minimum
359 +
    else data[, i] = (data[, i] - minimum)/(maximum - minimum)
360 +
  }
361 +
  return(data)
362 +
}

Learn more Showing 1 files with coverage changes found.

New file R/relief.R
New
Loading file...
Files Coverage
R -12.28% 85.74%
inst/include 96.09%
src 99.42%
Project Totals (22 files) 90.39%
Loading