slu-openGIS / censusxy
Showing 6 of 86 files from the diff.
Other files ignored by Codecov
LICENSE has changed.
docs/link.svg has changed.
R/single.R was deleted.
NAMESPACE has changed.
docs/sitemap.xml has changed.
R/stl_homicides.R has changed.
docs/index.html has changed.
docs/pkgdown.css has changed.
docs/pkgdown.yml has changed.
man/cxy_single.Rd was deleted.
index.md was deleted.
.Rbuildignore has changed.
index.Rmd has changed.
docs/logo.png has changed.
docs/docsearch.js has changed.
docs/favicon.ico has changed.
docs/404.html has changed.
docs/authors.html has changed.
docs/pkgdown.js has changed.
README.md has changed.
NEWS.md has changed.
R/internal.R was deleted.
R/options.R was deleted.
R/batch.R was deleted.
README.Rmd has changed.
DESCRIPTION has changed.

@@ -0,0 +1,43 @@
Loading
1 +
# internal function for constructing output
2 +
3 +
cxy_replace <- function(source, result, style){
4 +
5 +
  # global bindings
6 +
  TIGER_line_id = address = city = cxy_uid = id = lat = lon = match_address = quality =
7 +
    side = state = status = zip = NULL
8 +
9 +
  # rename id
10 +
  result <- dplyr::rename(result, cxy_uid = id)
11 +
12 +
  # modify output
13 +
  if (style == "minimal"){
14 +
15 +
    result <- dplyr::select(result, cxy_uid, status, quality, match_address, lon, lat)
16 +
    result <- dplyr::rename(result,
17 +
                            cxy_status = status,
18 +
                            cxy_quality = quality,
19 +
                            cxy_match = match_address)
20 +
21 +
  } else if (style == "full"){
22 +
23 +
    result <- dplyr::select(result, cxy_uid, status, quality, match_address, lon, lat, dplyr::everything())
24 +
    result <- dplyr::rename(result,
25 +
                            cxy_address = address,
26 +
                            cxy_city = city,
27 +
                            cxy_state = state,
28 +
                            cxy_zip = zip,
29 +
                            cxy_status = status,
30 +
                            cxy_quality = quality,
31 +
                            cxy_match = match_address,
32 +
                            cxy_tiger_id = TIGER_line_id,
33 +
                            cxy_side = side)
34 +
35 +
  }
36 +
37 +
  # combine with source data
38 +
  out <- dplyr::left_join(source, result, by = "cxy_uid")
39 +
40 +
  # drop unique id
41 +
  out <- dplyr::select(out, -cxy_uid)
42 +
43 +
}

@@ -0,0 +1,62 @@
Loading
1 +
# This is the internal function for geocoding
2 +
#
3 +
#' @importFrom dplyr %>% mutate
4 +
#' @importFrom httr POST upload_file timeout content
5 +
#' @importFrom readr write_csv
6 +
#' @importFrom tidyr separate
7 +
#
8 +
cxy_geocoder <- function(.data, timeout){
9 +
10 +
  # global bindings
11 +
  zip = city = state = lon = lat = NULL
12 +
13 +
  # create and store a csv in a temp dir
14 +
  tmp <- tempdir()
15 +
  readr::write_csv(.data, path = paste0(tmp, "/addresses.csv"), col_names = FALSE, na = "")
16 +
  file <- paste0(tmp, "/addresses.csv")
17 +
18 +
  # send file as request
19 +
  response <- httr::POST("https://geocoding.geo.census.gov/geocoder/locations/addressbatch",
20 +
                         body = list(addressFile = httr::upload_file(file),
21 +
                                     benchmark = "Public_AR_Current", vintage = "Current_Current"),
22 +
                         encode = "multipart", httr::timeout(timeout))
23 +
24 +
  # parse as csv
25 +
  text <- httr::content(response, as = "text", encoding = "UTF-8")
26 +
  df <- utils::read.csv(text = text, header = FALSE, stringsAsFactors = FALSE)
27 +
28 +
  # split original address into components
29 +
  df <- tidyr::separate_(df, "V2", c("address", "city" ,"state", "zip"), sep = ",")
30 +
31 +
  # coerce zip to numeric, and remove spaces from state and city
32 +
  df <- dplyr::mutate(df, zip = as.numeric(zip),
33 +
                      city = trimws(city, "left"),
34 +
                      state = trimws(state, "left"))
35 +
36 +
  # special parse case for no matches so that it can bind in large batches
37 +
  if(length(df) < 7){
38 +
39 +
    l = rep_len(NA, nrow(df))
40 +
    non <- data.frame(quality = l, match_address = l, lon = l, lat = l, TIGER_line_id = l, side = l)
41 +
    df <- cbind(df, non)
42 +
43 +
  } else {
44 +
45 +
    # split and coerce class of coords
46 +
    df <- tidyr::separate(df, "V6", c("lon", "lat"), sep = ",", fill = 'right')
47 +
    df <- dplyr::mutate(df,
48 +
                        lon = as.numeric(lon),
49 +
                        lat = as.numeric(lat))
50 +
51 +
  }
52 +
53 +
  # apply names
54 +
  names(df) <- c("id", "address", "city" ,"state", "zip", "status", "quality", "match_address", "lon", "lat", "TIGER_line_id", "side")
55 +
56 +
  # clean-up temp directories
57 +
  unlink(paste0(tmp, "/addresses.csv"))
58 +
59 +
  # return output
60 +
  return(df)
61 +
62 +
}

@@ -0,0 +1,158 @@
Loading
1 +
#' Geocode Addresses Using the Census Bureau Geocoder
2 +
#'
3 +
#' @description This is the single function of the censusxy package, allowing
4 +
#'     for the easy geocoding of US Addresses using the US Census Bureau
5 +
#'     Geocoder. This function allows for flexible input and virtually
6 +
#'     unlimited batch sizes. See the vignette \code{vignette(censusxy)}
7 +
#'     for more details
8 +
#'
9 +
#' @usage cxy_geocode(.data, address, city, state, zip, style = "minimal",
10 +
#'     output = "tibble", fill_na = TRUE, timeout = 30)
11 +
#'
12 +
#' @param .data Data frame or tibble containing address data
13 +
#' @param address Column name containing address
14 +
#' @param city Optional; column name containing city
15 +
#' @param state Optional; column name containing state
16 +
#' @param zip Optional; column name containing 5-digit zip code
17 +
#' @param style One of either \code{"minimal"} or \code{"full"}
18 +
#' @param output One of either \code{"tibble"} or \code{"sf"}
19 +
#' @param fill_na If \code{TRUE}, fill un-matched values to \code{cxy_quality}
20 +
#'     and \code{exy_status} with \code{NA} values (default); \code{FALSE} returns
21 +
#'     empty strings, which mirrors the original output of this function
22 +
#' @param timeout Maximum number of minutes for each API call to the geocoder.
23 +
#'
24 +
#' @return Either a tibble or sf object containing the census geocoder response.
25 +
#'
26 +
#' @importFrom dplyr as_tibble bind_rows filter left_join rename select
27 +
#' @importFrom rlang enquo quo_name
28 +
#' @importFrom sf st_as_sf
29 +
#'
30 +
#' @examples
31 +
#' \donttest{
32 +
#' # load sample data
33 +
#' data <- stl_homicides_small
34 +
#'
35 +
#' # geocode data
36 +
#' data <- cxy_geocode(data, address = "street_address", city = "city",
37 +
#'     state = "state", zip = "postal_code")
38 +
#'
39 +
#' # preview data
40 +
#' data
41 +
#' }
42 +
#'
43 +
#' @export
44 +
cxy_geocode <- function(.data, address, city, state, zip, style = "minimal", output = "tibble",
45 +
                        fill_na = TRUE, timeout = 30){
46 +
47 +
   # global bindings
48 +
   lon = lat = quality = match_address = NULL
49 +
50 +
   # check for missing parameters
51 +
   if (missing(.data)){
52 +
     stop("An existing data frame or tibble must be specified for '.data'.")
53 +
   }
54 +
55 +
   if(missing(address)){
56 +
     stop("A column containing address must be supplied for 'address'.")
57 +
   }
58 +
59 +
   # check for incorrectly specified parameters
60 +
   if (is.numeric(timeout) == FALSE){
61 +
     stop("A numeric value must be specified for 'timeout'.")
62 +
   }
63 +
64 +
   if (style %in% c("minimal", "full") == FALSE){
65 +
     stop("Please choose one of 'minimal' or 'full' for 'style'.")
66 +
   }
67 +
68 +
   if (output %in% c("tibble", "sf") == FALSE){
69 +
     stop("Please choose one of 'tibble' or 'sf' for 'output'.")
70 +
   }
71 +
72 +
  # non-standard evaluation
73 +
  # address
74 +
  addressX <- rlang::quo_name(rlang::enquo(address))
75 +
76 +
  # city
77 +
  if (!missing(city)) {
78 +
    cityX <- rlang::quo_name(rlang::enquo(city))
79 +
  } else {
80 +
    cityX <- NA
81 +
  }
82 +
83 +
  # state
84 +
  if (!missing(state)) {
85 +
    stateX <- rlang::quo_name(rlang::enquo(state))
86 +
  } else {
87 +
    stateX <- NA
88 +
  }
89 +
90 +
  # zip
91 +
  if (!missing(zip)) {
92 +
    zipX <- rlang::quo_name(rlang::enquo(zip))
93 +
  } else {
94 +
    zipX <- NA
95 +
  }
96 +
97 +
  # warning about missing geographies
98 +
  if(any(missing(city), missing(state), missing(zip))){
99 +
    warning("Omission of city, state or zip code greatly reduces the speed and accuracy of the geocoder")
100 +
  }
101 +
102 +
  # construct vector of valid input variables
103 +
  invars <- c(addressX, cityX, stateX, zipX)
104 +
  invars <- invars[!is.na(invars)]
105 +
106 +
  # prepare and split
107 +
  .data <- cxy_id(.data, inputs = invars)
108 +
  prep <- cxy_prep(.data, address = addressX, city = cityX, state = stateX, zip = zipX)
109 +
  split <- cxy_split(prep)
110 +
  response <- vector("list", length(split))
111 +
112 +
  # convert timeout to min
113 +
  timeout <- timeout * 60
114 +
115 +
  # iterate over splits
116 +
  for (i in seq_along(split)) {
117 +
   response[[i]] <- try(
118 +
     cxy_geocoder(split[[i]], timeout)
119 +
     )
120 +
  }
121 +
122 +
  # remove any list element of class try-catch
123 +
  response <- response[sapply(response, function(x) class(x) != "try-error")]
124 +
125 +
  # supress warning of filling with NAs, this is anticipated behavior
126 +
  suppressWarnings({response <- dplyr::bind_rows(response)})
127 +
128 +
  # error if no matches found for batch
129 +
  if(all(response[["status"]] ==  "No_Match")){
130 +
    stop("No matches found for any of the supplied addresses. Make sure to include city, state and zip for best results.")
131 +
  }
132 +
133 +
  # fill NAs
134 +
  if (fill_na == TRUE){
135 +
136 +
    # fill cxy_quality
137 +
    response <- dplyr::mutate(response, quality = ifelse(quality == "", NA, quality))
138 +
139 +
    # fill cxy_match
140 +
    response <- dplyr::mutate(response, match_address = ifelse(match_address == "", NA, match_address))
141 +
142 +
  }
143 +
144 +
  # construct output
145 +
  out <- cxy_replace(source = .data, result = response, style = style)
146 +
147 +
  # optionally convert to sf
148 +
  if (output == "sf"){
149 +
150 +
    out <- dplyr::filter(out, !is.na(lon) & !is.na(lat))
151 +
    out <- sf::st_as_sf(out, coords = c("lon", "lat"), crs = 4326)
152 +
153 +
  }
154 +
155 +
  # return result
156 +
  return(out)
157 +
158 +
}

@@ -0,0 +1,21 @@
Loading
1 +
# indentify distinct observations
2 +
cxy_id <- function(.data, inputs){
3 +
4 +
  # set global bindings
5 +
  ...address = NULL
6 +
7 +
  # convert input vector to unquoted list
8 +
  varList <- rlang::syms(inputs)
9 +
10 +
  # concatenate input elements into single variable
11 +
  .data <- dplyr::mutate(.data, ...address = paste(!!!varList))
12 +
13 +
  # add unique id numbers for each address string
14 +
  dis <- dplyr::distinct(.data, ...address)
15 +
  dis <- tibble::rowid_to_column(dis, var = "cxy_uid")
16 +
  .data <- dplyr::left_join(.data, dis, by = "...address")
17 +
18 +
  # remove ...address and place cxy_uid
19 +
  .data <- dplyr::select(.data, -...address)
20 +
21 +
}

@@ -0,0 +1,50 @@
Loading
1 +
# internal functions for preparing data for geocoding
2 +
3 +
# prepare the dataframe to the geocoder standards
4 +
cxy_prep <- function(.data, address, city, state, zip){
5 +
6 +
  # global binding
7 +
  cxy_uid = NULL
8 +
9 +
  # limit to distinct obervations
10 +
  .data <- dplyr::distinct(.data, cxy_uid, .keep_all = TRUE)
11 +
12 +
  # id
13 +
  idX <- .data[["cxy_uid"]]
14 +
15 +
  # address
16 +
  addressX <- .data[[address]]
17 +
18 +
  # city
19 +
  if (is.na(city)) {
20 +
    cityX <- NA
21 +
  } else {
22 +
    cityX <- .data[[city]]
23 +
  }
24 +
25 +
  # state
26 +
  if (is.na(state)) {
27 +
    stateX <- NA
28 +
  } else {
29 +
    stateX <- .data[[state]]
30 +
  }
31 +
32 +
  # zip
33 +
  if (is.na(zip)) {
34 +
    zipX <- NA
35 +
  } else {
36 +
    zipX <- .data[[zip]]
37 +
  }
38 +
39 +
  # prepare data frame
40 +
  prep <- data.frame(id = idX,
41 +
                     address = addressX,
42 +
                     city = cityX,
43 +
                     state = stateX,
44 +
                     zip = zipX,
45 +
                     stringsAsFactors = FALSE)
46 +
47 +
  # return result
48 +
  return(prep)
49 +
50 +
}

@@ -0,0 +1,11 @@
Loading
1 +
# split and uniques (must be result from prep due to strict naming)
2 +
cxy_split <- function(.data, rows = 1000){
3 +
4 +
  uniq <- .data[!duplicated(.data[,c("address", "city", "state", "zip")]),]
5 +
6 +
  splits <- split(uniq, (seq(nrow(uniq))-1) %/% rows)
7 +
8 +
  # returns list of length `rows` elements
9 +
  return(splits)
10 +
11 +
}
Files Coverage
R 100.00%
Project Totals (6 files) 100.00%
Sunburst
The inner-most circle is the entire project, moving away from the center are folders then, finally, a single file. The size and color of each slice is representing the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project. Proceeding with folders and finally individual files. The size and color of each slice is representing the number of statements and the coverage, respectively.
Grid
Each block represents a single file in the project. The size and color of each block is represented by the number of statements and the coverage, respectively.
Loading