dgrtwo / widyr
Showing 2 of 7 files from the diff.
Newly tracked file
R/widely_hclust.R created.
Newly tracked file
R/widely_kmeans.R created.
Other files ignored by Codecov

@@ -0,0 +1,58 @@
Loading
1 +
#' Cluster pairs of items into groups using hierarchical clustering
2 +
#'
3 +
#' Reshape a table that represents pairwise distances into hierarchical clusters,
4 +
#' returning a table with \code{item} and \code{cluster} columns.
5 +
#'
6 +
#' @param tbl Table
7 +
#' @param item1 First item
8 +
#' @param item2 Second item
9 +
#' @param distance Distance column
10 +
#' @param k The desired number of groups
11 +
#' @param h Height at which to cut the hierarchically clustered tree
12 +
#'
13 +
#' @examples
14 +
#'
15 +
#' library(gapminder)
16 +
#' library(dplyr)
17 +
#'
18 +
#' # Construct Euclidean distances between countries based on life
19 +
#' # expectancy over time
20 +
#' country_distances <- gapminder %>%
21 +
#'   pairwise_dist(country, year, lifeExp)
22 +
#'
23 +
#' country_distances
24 +
#'
25 +
#' # Turn this into 5 hierarchical clusters
26 +
#' clusters <- country_distances %>%
27 +
#'   widely_hclust(item1, item2, distance, k = 8)
28 +
#'
29 +
#' # Examine a few such clusters
30 +
#' clusters %>% filter(cluster == 1)
31 +
#' clusters %>% filter(cluster == 2)
32 +
#'
33 +
#' @seealso \link{cutree}
34 +
#'
35 +
#' @export
36 +
widely_hclust <- function(tbl, item1, item2, distance, k = NULL, h = NULL) {
37 +
  col1_str <- as.character(substitute(item1))
38 +
  col2_str <- as.character(substitute(item2))
39 +
  dist_str <- as.character(substitute(distance))
40 +
41 +
  unique_items <- unique(c(as.character(tbl[[col1_str]]), as.character(tbl[[col2_str]])))
42 +
43 +
  form <- stats::as.formula(paste(col1_str, "~", col2_str))
44 +
45 +
  max_distance <- max(tbl[[dist_str]])
46 +
47 +
  tibble(item1 = match(tbl[[col1_str]], unique_items),
48 +
         item2 = match(tbl[[col2_str]], unique_items),
49 +
         distance = tbl[[dist_str]]) %>%
50 +
    reshape2::acast(item1 ~ item2, value.var = "distance", fill = max_distance) %>%
51 +
    stats::as.dist() %>%
52 +
    stats::hclust() %>%
53 +
    stats::cutree(k = k, h = h) %>%
54 +
    tibble::enframe("item", "cluster") %>%
55 +
    dplyr::mutate(item = unique_items[as.integer(item)],
56 +
                  cluster = factor(cluster)) %>%
57 +
    dplyr::arrange(cluster)
58 +
}

@@ -0,0 +1,54 @@
Loading
1 +
#' Cluster items based on k-means across features
2 +
#'
3 +
#' Given a tidy table of features describing each item, perform k-means
4 +
#' clustering using \code{\link{kmeans}} and retidy the data into
5 +
#' one-row-per-cluster.
6 +
#'
7 +
#' @param tbl Table
8 +
#' @param item Item to cluster (as a bare column name)
9 +
#' @param feature Feature column (dimension in clustering)
10 +
#' @param value Value column
11 +
#' @param k Number of clusters
12 +
#' @param fill What to fill in for missing values
13 +
#' @param ... Other arguments passed on to \code{\link{kmeans}}
14 +
#'
15 +
#' @seealso \code{\link{widely_hclust}}
16 +
#'
17 +
#' @importFrom rlang :=
18 +
#'
19 +
#' @examples
20 +
#'
21 +
#' library(gapminder)
22 +
#' library(dplyr)
23 +
#'
24 +
#' clusters <- gapminder %>%
25 +
#'   widely_kmeans(country, year, lifeExp, k = 5)
26 +
#'
27 +
#' clusters
28 +
#'
29 +
#' clusters %>%
30 +
#'   count(cluster)
31 +
#'
32 +
#' # Examine a few clusters
33 +
#' clusters %>% filter(cluster == 1)
34 +
#' clusters %>% filter(cluster == 2)
35 +
#'
36 +
#' @export
37 +
widely_kmeans <- function(tbl, item, feature, value, k, fill = 0, ...) {
38 +
  item_str <- as.character(substitute(item))
39 +
  feature_str <- as.character(substitute(feature))
40 +
  value_str <- as.character(substitute(value))
41 +
42 +
  form <- stats::as.formula(paste(item_str, "~", feature_str))
43 +
44 +
  m <- tbl %>%
45 +
    reshape2::acast(form, value.var = value_str, fill = fill)
46 +
47 +
  clustered <- stats::kmeans(m, k, ...)
48 +
49 +
  # Add the clusters to the original table
50 +
  i <- match(rownames(m), as.character(tbl[[item_str]]))
51 +
  tibble::tibble(!!sym(item_str) := tbl[[item_str]][i],
52 +
                 cluster = factor(clustered$cluster)) %>%
53 +
    dplyr::arrange(cluster)
54 +
}
Files Coverage
R 57.92%
Project Totals (13 files) 57.92%
Sunburst
The inner-most circle is the entire project, moving away from the center are folders then, finally, a single file. The size and color of each slice is representing the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project. Proceeding with folders and finally individual files. The size and color of each slice is representing the number of statements and the coverage, respectively.
Grid
Each block represents a single file in the project. The size and color of each block is represented by the number of statements and the coverage, respectively.
Loading