dgrtwo / widyr

Compare ee925e4 ... +0 ... 13b3b3c

No flags found

Use flags to group coverage reports by test type, project and/or folders.
Then setup custom commit statuses and notifications for each flag.

e.g., #unittest #integration

#production #enterprise

#frontend #backend

Learn more about Codecov Flags here.

Showing 2 of 7 files from the diff.
Newly tracked file
R/widely_hclust.R created.
Newly tracked file
R/widely_kmeans.R created.
Other files ignored by Codecov

@@ -0,0 +1,58 @@
Loading
1 +
#' Cluster pairs of items into groups using hierarchical clustering
2 +
#'
3 +
#' Reshape a table that represents pairwise distances into hierarchical clusters,
4 +
#' returning a table with \code{item} and \code{cluster} columns.
5 +
#'
6 +
#' @param tbl Table
7 +
#' @param item1 First item
8 +
#' @param item2 Second item
9 +
#' @param distance Distance column
10 +
#' @param k The desired number of groups
11 +
#' @param h Height at which to cut the hierarchically clustered tree
12 +
#'
13 +
#' @examples
14 +
#'
15 +
#' library(gapminder)
16 +
#' library(dplyr)
17 +
#'
18 +
#' # Construct Euclidean distances between countries based on life
19 +
#' # expectancy over time
20 +
#' country_distances <- gapminder %>%
21 +
#'   pairwise_dist(country, year, lifeExp)
22 +
#'
23 +
#' country_distances
24 +
#'
25 +
#' # Turn this into 5 hierarchical clusters
26 +
#' clusters <- country_distances %>%
27 +
#'   widely_hclust(item1, item2, distance, k = 8)
28 +
#'
29 +
#' # Examine a few such clusters
30 +
#' clusters %>% filter(cluster == 1)
31 +
#' clusters %>% filter(cluster == 2)
32 +
#'
33 +
#' @seealso \link{cutree}
34 +
#'
35 +
#' @export
36 +
widely_hclust <- function(tbl, item1, item2, distance, k = NULL, h = NULL) {
37 +
  col1_str <- as.character(substitute(item1))
38 +
  col2_str <- as.character(substitute(item2))
39 +
  dist_str <- as.character(substitute(distance))
40 +
41 +
  unique_items <- unique(c(as.character(tbl[[col1_str]]), as.character(tbl[[col2_str]])))
42 +
43 +
  form <- stats::as.formula(paste(col1_str, "~", col2_str))
44 +
45 +
  max_distance <- max(tbl[[dist_str]])
46 +
47 +
  tibble(item1 = match(tbl[[col1_str]], unique_items),
48 +
         item2 = match(tbl[[col2_str]], unique_items),
49 +
         distance = tbl[[dist_str]]) %>%
50 +
    reshape2::acast(item1 ~ item2, value.var = "distance", fill = max_distance) %>%
51 +
    stats::as.dist() %>%
52 +
    stats::hclust() %>%
53 +
    stats::cutree(k = k, h = h) %>%
54 +
    tibble::enframe("item", "cluster") %>%
55 +
    dplyr::mutate(item = unique_items[as.integer(item)],
56 +
                  cluster = factor(cluster)) %>%
57 +
    dplyr::arrange(cluster)
58 +
}

@@ -0,0 +1,54 @@
Loading
1 +
#' Cluster items based on k-means across features
2 +
#'
3 +
#' Given a tidy table of features describing each item, perform k-means
4 +
#' clustering using \code{\link{kmeans}} and retidy the data into
5 +
#' one-row-per-cluster.
6 +
#'
7 +
#' @param tbl Table
8 +
#' @param item Item to cluster (as a bare column name)
9 +
#' @param feature Feature column (dimension in clustering)
10 +
#' @param value Value column
11 +
#' @param k Number of clusters
12 +
#' @param fill What to fill in for missing values
13 +
#' @param ... Other arguments passed on to \code{\link{kmeans}}
14 +
#'
15 +
#' @seealso \code{\link{widely_hclust}}
16 +
#'
17 +
#' @importFrom rlang :=
18 +
#'
19 +
#' @examples
20 +
#'
21 +
#' library(gapminder)
22 +
#' library(dplyr)
23 +
#'
24 +
#' clusters <- gapminder %>%
25 +
#'   widely_kmeans(country, year, lifeExp, k = 5)
26 +
#'
27 +
#' clusters
28 +
#'
29 +
#' clusters %>%
30 +
#'   count(cluster)
31 +
#'
32 +
#' # Examine a few clusters
33 +
#' clusters %>% filter(cluster == 1)
34 +
#' clusters %>% filter(cluster == 2)
35 +
#'
36 +
#' @export
37 +
widely_kmeans <- function(tbl, item, feature, value, k, fill = 0, ...) {
38 +
  item_str <- as.character(substitute(item))
39 +
  feature_str <- as.character(substitute(feature))
40 +
  value_str <- as.character(substitute(value))
41 +
42 +
  form <- stats::as.formula(paste(item_str, "~", feature_str))
43 +
44 +
  m <- tbl %>%
45 +
    reshape2::acast(form, value.var = value_str, fill = fill)
46 +
47 +
  clustered <- stats::kmeans(m, k, ...)
48 +
49 +
  # Add the clusters to the original table
50 +
  i <- match(rownames(m), as.character(tbl[[item_str]]))
51 +
  tibble::tibble(!!sym(item_str) := tbl[[item_str]][i],
52 +
                 cluster = factor(clustered$cluster)) %>%
53 +
    dplyr::arrange(cluster)
54 +
}

Learn more Showing 2 files with coverage changes found.

New file R/widely_hclust.R
New
Loading file...
New file R/widely_kmeans.R
New
Loading file...
Files Coverage
R/cor_sparse.R 100.00%
R/pairwise_cor.R 100.00%
R/pairwise_count.R 100.00%
R/pairwise_delta.R 0.00%
R/pairwise_dist.R 100.00%
R/pairwise_pmi.R 0.00%
R/pairwise_similarity.R 100.00%
R/squarely.R 100.00%
R/utils.R 66.67%
R/widely.R 97.83%
R/widely_hclust.R
New File
0.00%
R/widely_kmeans.R
New File
0.00%
R/widely_svd.R 0.00%
Project Totals (13 files) 57.92%
Loading