jacobkap / asciiSetupReader
Showing 8 of 238 files from the diff.
Other files ignored by Codecov
LICENSE is new.
.gitignore is new.
NAMESPACE is new.
NEWS.md is new.
LICENSE.md is new.
README.md is new.
README.Rmd is new.

@@ -0,0 +1,77 @@
Loading
1 +
#' Read fixed-width ASCII file using SAS Setup file.
2 +
#'
3 +
#' sas_ascii_reader() and spss_ascii_reader() are used when you need to
4 +
#' read an fixed-width ASCII (text) file that comes with a setup file. These file
5 +
#' combinations are sometimes referred to as .txt+.sps, .txt+.sas,
6 +
#' .dat+.sps, or .dat+.sas.
7 +
#' The setup file provides instructions on how to create and name the columns,
8 +
#' and fix the key-value pairs (sometimes called value labels). This is common
9 +
#' in government data, particular data produced before 2010.
10 +
#'
11 +
#' @family ASCII Reader functions
12 +
#' @seealso \code{\link{spss_ascii_reader}} For using an SPSS setup file
13 +
#'
14 +
#' @param sas_name Name of the SAS Setup file - should be a .sas or .txt file.
15 +
#' @inheritParams spss_ascii_reader
16 +
#' @export
17 +
#' @examples
18 +
#' # Text file is zipped to save space.
19 +
#' dataset_name <- system.file("extdata", "example_data.zip",
20 +
#'   package = "asciiSetupReader")
21 +
#' sas_name <- system.file("extdata", "example_setup.sas",
22 +
#'   package = "asciiSetupReader")
23 +
#'
24 +
#' \dontrun{
25 +
#' example <- sas_ascii_reader(dataset_name = dataset_name,
26 +
#'   sas_name = sas_name)
27 +
#'
28 +
#'
29 +
#' # Does not fix value labels
30 +
#' example2 <- sas_ascii_reader(dataset_name = dataset_name,
31 +
#'   sas_name = sas_name, value_label_fix = FALSE)
32 +
#'
33 +
#' # Keeps original column names
34 +
#' example3 <- sas_ascii_reader(dataset_name = dataset_name,
35 +
#'   sas_name = sas_name, real_names = FALSE)
36 +
#'
37 +
#' }
38 +
#' # Only returns the first 5 columns
39 +
#' example <- sas_ascii_reader(dataset_name = dataset_name,
40 +
#'   sas_name = sas_name, keep_columns = 1:5)
41 +
#'
42 +
sas_ascii_reader <- function(dataset_name,
43 +
                             sas_name,
44 +
                             value_label_fix = TRUE,
45 +
                             real_names = TRUE,
46 +
                             keep_columns = NULL,
47 +
                             coerce_numeric = TRUE) {
48 +
49 +
  #  .Deprecated("read_ascii_setup")
50 +
51 +
  stopifnot(is.character(dataset_name), length(dataset_name) == 1,
52 +
            is.character(sas_name), length(sas_name) == 1,
53 +
            is.logical(value_label_fix), length(value_label_fix) == 1,
54 +
            is.logical(real_names), length(real_names) == 1)
55 +
56 +
57 +
  # SAS setup
58 +
  setup <- parse_setup(sas_name)
59 +
  setup$setup <- selected_columns(keep_columns, setup$setup)
60 +
61 +
62 +
63 +
64 +
  # Reads in Data File ------------------------------------------------------
65 +
  data <- read_data(dataset_name, setup)
66 +
67 +
  data <- fix_value_labels(data, setup, value_label_fix)
68 +
69 +
70 +
71 +
  data <- fix_names_missing_numeric(data,
72 +
                                    setup,
73 +
                                    missing,
74 +
                                    real_names,
75 +
                                    coerce_numeric)
76 +
  return(data)
77 +
}

@@ -0,0 +1,342 @@
Loading
1 +
value_label_matrixer <- function(value_label, type) {
2 +
  # Gets rid of column name row
3 +
  value_label <- value_label[2:length(value_label)]
4 +
5 +
  value_label <- gsub("'\\.$", "'", value_label)
6 +
  value_label <- gsub("=\\[(.*)\\]", "='\\1'", value_label)
7 +
  value_label <- gsub('"\\.$', '"', value_label)
8 +
  value_label <- gsub("'(BEFORE) '([0-9]{2})'", "'\\1 \\2'", value_label,
9 +
                      ignore.case = TRUE)
10 +
  value_label <- gsub('"', "'", value_label)
11 +
12 +
13 +
  # For PSID data which for some reason have the word Wife
14 +
  # surrounded by single quotes sometimes!
15 +
  value_label <- gsub("'Wife''s'", "\\/Wifes", value_label, ignore.case = TRUE)
16 +
  value_label <- gsub("\\/'Wife'", "\\/Wife", value_label,
17 +
                      ignore.case = TRUE)
18 +
  value_label <- gsub("''Wife'", "\\/Wife", value_label, ignore.case = TRUE)
19 +
  value_label <- gsub(" 'Wif'", " Wif", value_label, ignore.case = TRUE)
20 +
  value_label <- gsub(" ([[:alnum:]]+) 'Wife'", " \\1 Wife", value_label,
21 +
       ignore.case = TRUE)
22 +
23 +
24 +
25 +
  # Removes any ' inside the text
26 +
  value_label <- gsub("([[:alpha:]]) '([[:alpha:]]+-?[[:alpha:]]+)'", "\\1 \\2",
27 +
                      value_label)
28 +
  value_label <- gsub("([[:alpha:]]) '([[:alpha:]]+ .*[[:alpha:]]+)'([[:alpha:]]+ .*)'([[:alpha:]]+ .*[[:alpha:]]+)''",
29 +
                      "\\1 \\2 \\3 \\4", value_label)
30 +
  value_label <- gsub("([[:alpha:]]) '([[:alpha:]]+ [[:alpha:]]+ .*[[:alpha:]]+)'",
31 +
                      "\\1 \\2", value_label)
32 +
33 +
  # In case some labels are on multiple lines
34 +
  plus <- grep("^\\+", value_label)
35 +
  # For cases where label is on multiple line but doesn't have a plus
36 +
  apostrophe_plus <- grep("\\'$|^\\'", value_label)
37 +
  num_apostrophes <- stringr::str_count(value_label[apostrophe_plus],
38 +
                                        "'")
39 +
  apostrophe_plus <- apostrophe_plus[num_apostrophes == 1]
40 +
  apostrophe_plus <- apostrophe_plus[!is.na(apostrophe_plus)]
41 +
  value_label[apostrophe_plus] <- gsub("\\'", "", value_label[apostrophe_plus])
42 +
43 +
  no_apostrophes <- stringr::str_count(value_label, "'")
44 +
  no_apostrophes <- which(no_apostrophes == 0)
45 +
46 +
  if (type == "sas") {
47 +
    no_equal <- grep("=", value_label, invert = TRUE)
48 +
    value_label[no_equal] <- gsub("\\'", "", value_label[no_equal])
49 +
    plus <- sort(unique(c(plus, no_equal)))
50 +
  }
51 +
  plus <- sort(unique(c(plus, apostrophe_plus, no_apostrophes)))
52 +
  if (length(plus) > 0) {
53 +
    for (n in length(plus):1) {
54 +
      value_label[plus[n] - 1]  <- gsub("\\'$", "", value_label[plus[n] - 1] )
55 +
      value_label[plus[n] - 1] <- paste(value_label[plus[n] - 1],
56 +
                                        value_label[plus[n]],
57 +
                                        collapse = " ")
58 +
      value_label[plus[n] - 1] <- gsub("\\' *\\+ *\\'", "",
59 +
                                       value_label[plus[n] - 1])
60 +
    }
61 +
    value_label <- value_label[-plus]
62 +
  }
63 +
64 +
65 +
66 +
67 +
  value_label <- gsub('\\"', "\\'",
68 +
                      value_label)
69 +
  value_label <- gsub("A 'NECESSARY", "A NECESSARY",
70 +
                      value_label)
71 +
  value_label <- gsub("^''", "'####BLANK####'",
72 +
                      value_label)
73 +
  value_label <- gsub("^' '", "'####SPACE####'",
74 +
                      value_label)
75 +
  value_label <- gsub(" '' ", " '####BLANK####' ",
76 +
                      value_label)
77 +
  value_label <- gsub(" {2,}| /\\.", "", value_label)
78 +
  value_label <- gsub('"', "'", value_label)
79 +
  value_label <- gsub("'\\s+.$", "'", value_label)
80 +
  value_label <- gsub("'$|''", "", value_label,
81 +
                      perl = TRUE)
82 +
  value_label <- gsub("(?<![<>])=", " ", value_label, perl = TRUE)
83 +
  value_label <- gsub("\\s+", " ", value_label)
84 +
  value_label <- gsub("([[:alpha:]])\\'([[:alpha:]])", "\\1 \\2",
85 +
                      value_label)
86 +
87 +
  value_label <- unlist(stringr::str_split(value_label, "' '"))
88 +
  if (all(grepl("\\s", value_label))) {
89 +
    value_label <- unlist(stringr::str_split(value_label, "'"))
90 +
  }
91 +
  value_label <- gsub("'| =", "", value_label)
92 +
  value_label <- stringr::str_trim(value_label)
93 +
  value_label <- gsub("####BLANK####", "", value_label)
94 +
  value_label <- gsub("####SPACE####", " ", value_label)
95 +
96 +
  value_label <- value_label[!value_label %in% c("/", "/ .")]
97 +
98 +
  value_label <- matrix(value_label, ncol = 2, byrow = TRUE)
99 +
  values <- value_label[, 1]
100 +
  names(values) <- value_label[, 2]
101 +
102 +
  return(values)
103 +
}
104 +
105 +
106 +
fix_variable_values <- function(.data, value_label_section, column) {
107 +
108 +
  value_label_section <- single_digit(value_label_section)
109 +
  value_label_section <- double_digit(value_label_section)
110 +
  value_label_section <- value_label_section[!duplicated(value_label_section)]
111 +
112 +
  if (!is.character(.data[[column]])) {
113 +
    data.table::set(.data, j = column, value = as.character(.data[[column]]))
114 +
  }
115 +
  data.table::set(.data, j = column,
116 +
                  value = haven::as_factor(haven::labelled(.data[[column]],
117 +
                                                           value_label_section)))
118 +
119 +
120 +
  data.table::set(.data, j = column, value = as.character(.data[[column]]))
121 +
  return(.data)
122 +
}
123 +
124 +
single_digit <- function(value_label_section) {
125 +
  if (any(grepl("^[0-9]$", value_label_section))) {
126 +
    single_digit <- value_label_section[grep("^[0-9]$", value_label_section)]
127 +
    names_single_digit <- names(single_digit)
128 +
    single_digit <- paste0("0", single_digit)
129 +
    names(single_digit) <- names_single_digit
130 +
    value_label_section <- c(value_label_section, single_digit)
131 +
  }
132 +
  return(value_label_section)
133 +
}
134 +
135 +
double_digit <- function(value_label_section) {
136 +
  if (any(grepl("^0[0-9]$", value_label_section))) {
137 +
    double_digit <- value_label_section[grep("^0[0-9]$", value_label_section)]
138 +
    names_double_digit <- names(double_digit)
139 +
    double_digit <- gsub("^.", "", double_digit)
140 +
    names(double_digit) <- names_double_digit
141 +
    value_label_section <- c(value_label_section, double_digit)
142 +
  }
143 +
  return(value_label_section)
144 +
}
145 +
146 +
get_value_labels <- function(codebook, setup, type) {
147 +
  if (!any(grepl2("^value labels$|SAS FORMAT STATEMENT|^format$|\\/\\* format$",
148 +
                  codebook))) {
149 +
    return(NULL)
150 +
  }
151 +
152 +
  if (type == "sps") {
153 +
    value_labels <- get_value_labels_sps(codebook, setup)
154 +
  } else if (type == "sas") {
155 +
    value_labels <- get_value_labels_sas(codebook, setup)
156 +
  }
157 +
158 +
  for (i in 1:nrow(value_labels)) {
159 +
    if (tolower(value_labels$column[i]) %in% tolower(setup$column_number)) {
160 +
      value_labels$column[i] <-
161 +
        setup$column_number[tolower(setup$column_number) %in% tolower(value_labels$column[i])]
162 +
    }
163 +
  }
164 +
165 +
  return(value_labels)
166 +
}
167 +
168 +
get_value_labels_sas <- function(codebook, setup) {
169 +
  # Gets value labels
170 +
  f_names <- as.character(setup$f_name[!is.na(setup$f_name)])
171 +
  starting <- c()
172 +
  for (f_name in f_names) {
173 +
    result <- grep(paste("VALUE", f_name),
174 +
                   codebook, fixed = TRUE)
175 +
    if (length(result) < 1) {
176 +
      result <- grep(paste("value", f_name),
177 +
                     codebook, fixed = TRUE)
178 +
    }
179 +
    starting <- c(starting, result)
180 +
  }
181 +
  starting <- min(starting)
182 +
183 +
  value_position <- grep2("^VALUE ", codebook)
184 +
  next_section <- grep2("^$|^\\.$|^\\*/$|^;$|^; ?\\*/$", codebook)
185 +
  value_position <- value_position[value_position >= starting]
186 +
  value_labels <- codebook[(value_position[1]):(next_section[next_section >= max(value_position)][1]-1)]
187 +
  value_labels <- gsub(";\\*\\/", "", value_labels)
188 +
  value_labels <- gsub("([[:alpha:]]+);", "\\1", value_labels)
189 +
  value_labels <- unlist(strsplit(value_labels, ";"))
190 +
191 +
192 +
  for (f_name in f_names) {
193 +
    value_labels <- gsub(paste0("^VALUE ", f_name, " \\(\\S*\\)"),
194 +
                         paste0("VALUE ", f_name), value_labels,
195 +
                         ignore.case = TRUE)
196 +
  }
197 +
198 +
199 +
  value_labels <- gsub("^VALUE ", "", value_labels,
200 +
                       ignore.case = TRUE)
201 +
  value_labels <- stringr::str_trim(value_labels)
202 +
  value_labels <- gsub("&\\s+", "& ", value_labels)
203 +
204 +
205 +
  value_labels <- gsub("([[:alpha:]]+)\\s+(<?[0-9]+)", "\\1 \\2",
206 +
                       value_labels)
207 +
  value_labels <- gsub("([[:alpha:]]+)\\s+([[:alpha:]]+)", "\\1 \\2",
208 +
                       value_labels)
209 +
  value_labels <- gsub("([[:alnum:]]+\\.[[:alnum:]]+=)", "      \\1",
210 +
                       value_labels)
211 +
212 +
213 +
  value_labels <- gsub("([^\\.]-?[[:alnum:]]+=[^A-z])", "      \\1",
214 +
                       value_labels)
215 +
  # value_labels <- gsub("(\\(?<=[^\\.]-?[[:alnum:]]+=[^A-z])", "      \\1",
216 +
  #      value_labels)
217 +
218 +
219 +
  value_labels <- trimws(value_labels)
220 +
  value_labels <- unlist(strsplit(value_labels, "\\s{2,}"))
221 +
222 +
  value_labels <- data.frame(value_labels,
223 +
                             column = value_labels[1],
224 +
                             stringsAsFactors = FALSE)
225 +
226 +
  column <- value_labels$value_labels[1]
227 +
  for (i in 1:nrow(value_labels)) {
228 +
    value_labels$column[i] <- column
229 +
    if (value_labels$value_labels[i + 1] %in% setup$f_name) {
230 +
      column <- value_labels$value_labels[i + 1]
231 +
    }
232 +
  }
233 +
234 +
  final_value_labels <- data.frame(stringsAsFactors = FALSE)
235 +
  for (col in unique(value_labels$column)) {
236 +
    single_value_label <- value_labels[value_labels$column %in% col, ]
237 +
    real_names <- unique(setup$column_number[setup$f_name %in% col])
238 +
    for (real_name in real_names) {
239 +
      temp <- single_value_label
240 +
      temp$value_labels[1] <- real_name
241 +
      temp$column <- real_name
242 +
      final_value_labels <- rbind(final_value_labels, temp)
243 +
    }
244 +
  }
245 +
246 +
  return(final_value_labels)
247 +
}
248 +
249 +
get_value_labels_sps <- function(codebook, setup) {
250 +
  value_start <- grep2("^value labels$",
251 +
                       codebook)
252 +
  end_row <- grep("^\\.$|^$", codebook)
253 +
  end_row <- end_row[end_row > value_start[length(value_start)]][1] - 1
254 +
  if (is.na(end_row)) {
255 +
    end_row <- length(codebook)
256 +
  }
257 +
  value_start <- value_start[1]
258 +
  value_labels <- codebook[value_start:end_row]
259 +
   # Get rid of "Truncated value... stuff in PSID files
260 +
  value_labels <- gsub('\\/\\*.*\\*\\/', "", value_labels)
261 +
  value_labels <- stringr::str_trim(value_labels)
262 +
263 +
264 +
  value_labels <- gsub('\\s+\\"$', '"', value_labels)
265 +
  value_labels <- gsub('\\"\\s+([[:alnum:]])', '\\"\\1', value_labels)
266 +
  value_labels <- gsub("\\s+\\(", " \\(", value_labels)
267 +
  value_labels <- gsub("&\\s+", "& ", value_labels)
268 +
  value_labels <- gsub("([[:alpha:]])\\s{2,}([[:alpha:]])", "\\1 \\2",
269 +
                       value_labels)
270 +
271 +
  value_labels <- gsub('^([0-9]+)\\s{2,}\\"', '\\1 \\"', value_labels)
272 +
  value_labels <- gsub('(\\s{2,})([0-9]+)\\s{2,}\\"', '\\1\\2 \\"',
273 +
                       value_labels)
274 +
275 +
  add_spaces <- paste0(setup$column_number, "   ")
276 +
  names(add_spaces) <- paste0('^', setup$column_number, " ")
277 +
  add_spaces <- add_spaces[!duplicated(add_spaces)]
278 +
  add_spaces <- add_spaces[!grepl("^\\*", add_spaces)]
279 +
  value_labels <- stringr::str_replace_all(value_labels, add_spaces)
280 +
  value_labels <- gsub("\"([[:alnum:]]+)\\s+([0-9])", "\"\\1 \\2",
281 +
                       value_labels)
282 +
283 +
  value_labels <- gsub("([[:alpha:]])\\s\\s([0-9])",
284 +
                       "\\1 \\2", value_labels)
285 +
  value_labels <- gsub("([[:alpha:]])\\s\\s([[:punct:]])",
286 +
                       "\\1 \\2", value_labels)
287 +
  value_labels <- gsub(":  ", ": ", value_labels)
288 +
289 +
  value_labels <- unlist(strsplit(value_labels, "\\s{2,}"))
290 +
  value_labels <- value_labels[!value_labels %in% c(".", "/")]
291 +
  value_labels <- value_labels[-1]
292 +
  value_labels <- gsub('"', "'", value_labels)
293 +
  value_labels <- gsub('^/', "", value_labels)
294 +
  value_labels <- gsub("''''", "''", value_labels)
295 +
  value_labels <- gsub('\\"', "\\'", value_labels)
296 +
  value_labels <- data.frame(value_labels,
297 +
                             column = value_labels,
298 +
                             stringsAsFactors = FALSE)
299 +
300 +
301 +
  column <- value_labels$value_labels[1]
302 +
  for (i in 1:nrow(value_labels)) {
303 +
    value_labels$column[i] <- column
304 +
    if (grepl("\\' \\/$", value_labels$value_labels[i]) |
305 +
        value_labels$value_labels[i + 1] %in%
306 +
        setup$column_number |
307 +
        (!grepl("\\'", value_labels$value_labels[i + 1]) &
308 +
         !grepl("^[0-9]+$", value_labels$value_labels[i + 1]))) {
309 +
      column <- value_labels$value_labels[i + 1]
310 +
    }
311 +
  }
312 +
  return(value_labels)
313 +
}
314 +
315 +
fix_names <- function(names) {
316 +
  names <- gsub('^.* = |^.* \\"|\\W|#|\\/', "_", names)
317 +
  names <- gsub("_+", "_", names)
318 +
  names <- gsub("^_|_$", "", names)
319 +
  names <- gsub("^([0-9])", "X\\1", names)
320 +
  return(names)
321 +
}
322 +
323 +
# Make numeric columns numeric
324 +
all_numeric <- function(column) {
325 +
  column_NAs <- sum(is.na(column))
326 +
  column <- suppressWarnings(as.numeric(as.character(column)))
327 +
  return(is.numeric(column) && sum(is.na(column)) == column_NAs)
328 +
}
329 +
330 +
make_cols_numeric <- function(dataset) {
331 +
  times <- nrow(dataset) * .10
332 +
  if (times < 50000 && nrow(dataset) > 50000)  times <- 50000
333 +
  times <- sample(nrow(dataset), times, replace = FALSE)
334 +
  if (nrow(dataset) < 50000) times <- 1:nrow(dataset)
335 +
  for (i in seq_along(dataset)) {
336 +
    if ( (all_numeric(dataset[[i]][times])) ) {
337 +
      suppressWarnings(data.table::set(dataset, j = i,
338 +
                                       value = as.numeric(as.character(dataset[[i]]))))
339 +
    }
340 +
  }
341 +
  return(dataset)
342 +
}

@@ -0,0 +1,381 @@
Loading
1 +
#' Parse the setup file (.sps or .sas).
2 +
#'
3 +
#' @param setup_file Name of the SPSS or SAS setup file - should be a .sps or .sas (.txt also accepted as are these files in zipped format)
4 +
#'
5 +
#' @return A list of length 3. The first object ("setup") is a data frame
6 +
#' containing 4 columns: first the non-descriptive name of each column,
7 +
#' The second column is the descriptive name of the column. Columns three
8 +
#' and four and the beginning and ending number of the column (used to
9 +
#' determine the columns location in the fixed-with data file).
10 +
#'
11 +
#' The second object ("value_labels") in the list is list of named vectors for the value labels. The
12 +
#' list has a length equal to the number of columns with value labels. If there are
13 +
#' no value labels, this will be NULL.
14 +
#'
15 +
#' The third object ("missing") in the list is a data.frame with two columns. The first
16 +
#' column says the variable name and the second column says the value that is missing
17 +
#' and will be replaced with NA.
18 +
#' @export
19 +
#'
20 +
#' @examples
21 +
#' \dontrun{
22 +
#' sas_name <- system.file("extdata", "example_setup.sas",
23 +
#'                          package = "asciiSetupReader")
24 +
#' sas_example <- parse_setup(sas_name)
25 +
#'
26 +
#' sps_name <- system.file("extdata", "example_setup.sps",
27 +
#'                          package = "asciiSetupReader")
28 +
#' sps_example <- parse_setup(sps_name)
29 +
#' }
30 +
parse_setup <- function(setup_file) {
31 +
32 +
  if (grepl(".sps(\\.zip)?$", setup_file, ignore.case = TRUE)) {
33 +
    type <- "sps"
34 +
  } else {
35 +
    type <- "sas"
36 +
  }
37 +
38 +
  codebook  <- parse_codebook(setup_file, type = type)
39 +
  variables <- parse_column_names(codebook, type = type)
40 +
41 +
  if (type == "sps") {
42 +
    second_grep_value <- grep2("^variable labels$", codebook)
43 +
    second_grep_value <- second_grep_value[1]
44 +
    if (is.na(second_grep_value)) {
45 +
      second_grep_value <- length(codebook)
46 +
    }
47 +
    setup <- codebook[grep2("DATA LIST|/VARIABLES =$", codebook):second_grep_value]
48 +
    setup <- gsub("\\([0-9]+\\) |\\.[0-9]+ ", "", setup)
49 +
50 +
    # If starts with a number, combine with previous row
51 +
    start_with_number <- grep("^[0-9]", setup)
52 +
    if (length(start_with_number) > 0) {
53 +
      for (n in length(start_with_number):1) {
54 +
        setup[start_with_number[n] - 1] <- paste(setup[start_with_number[n] - 1],
55 +
                                                 setup[start_with_number[n]],
56 +
                                                 collapse = " ")
57 +
      }
58 +
      setup <- setup[-start_with_number]
59 +
    }
60 +
    setup <- gsub("([[:alpha:]]+[0-9]* [0-9]+) ([[:alpha:]]+[0-9]*)", "\\1   \\2",
61 +
                  setup)
62 +
63 +
    setup <- gsub("([[:alpha:]]+[0-9]*)\\s+", "\\1 ",
64 +
                  setup)
65 +
66 +
    setup <- gsub(" \\(A\\) ", " ", setup)
67 +
    setup <- gsub(" ([0-9]+-[0-9]+) ([[:alpha:]])", " \\1   \\2",
68 +
                  setup)
69 +
    setup <- gsub(" ([0-9]+) ([[:alpha:]])", " \\1   \\2",
70 +
                  setup)
71 +
    setup <- unlist(strsplit(setup, '"\\s{3,}'))
72 +
  } else {
73 +
    second_grep_value <- grep("^$|^;$", codebook)[grep("^$|^;$", codebook) > grep2("^INPUT$", codebook) + 5]
74 +
    second_grep_value <- second_grep_value[1]
75 +
    if (is.na(second_grep_value)) {
76 +
      second_grep_value <- length(codebook)
77 +
    }
78 +
    setup <- codebook[grep2("^INPUT$", codebook):second_grep_value]
79 +
    setup <- gsub("\\([0-9]+\\) |\\.[0-9]+ ", "", setup)
80 +
81 +
    # If starts with a number, combine with previous row
82 +
    start_with_number <- grep("^[0-9]", setup)
83 +
84 +
    if (length(start_with_number) > 0) {
85 +
      for (n in length(start_with_number):1) {
86 +
        setup[start_with_number[n] - 1] <- paste(setup[start_with_number[n] - 1],
87 +
                                                 setup[start_with_number[n]],
88 +
                                                 collapse = " ")
89 +
      }
90 +
      setup <- setup[-start_with_number]
91 +
    }
92 +
93 +
    setup <- gsub("([[:alpha:]]+[0-9]* [0-9]+) ([[:alpha:]]+[0-9]*)", "\\1   \\2",
94 +
                  setup)
95 +
    setup <- gsub("([[:alnum:]])\\s{2,}([0-9]+) ", "\\1 \\2 ", setup)
96 +
    setup <- gsub("([[:alnum:]])\\s{2,}([0-9]+)$", "\\1 \\2", setup)
97 +
    setup <- gsub("([[:alnum:]])\\s{2,}([0-9]+-[0-9]+) ", "\\1 \\2 ", setup)
98 +
    setup <- gsub("([[:alnum:]])\\s{2,}([0-9]+-[0-9]+)$", "\\1 \\2", setup)
99 +
  }
100 +
  setup <- get_column_spaces(setup, variables, codebook)
101 +
  setup <- setup[setup$column_number != "*", ]
102 +
  rownames(setup) <- 1:nrow(setup)
103 +
  if (any(grepl2("MISSING VALUE", codebook))) {
104 +
    if (type == "sps") {
105 +
      missing <- parse_missing_sps(codebook, setup)
106 +
    } else {
107 +
      missing <- parse_missing_sas(codebook, setup)
108 +
    }
109 +
  } else {
110 +
    missing <- NULL
111 +
  }
112 +
  missing <- missing[!duplicated(missing), ]
113 +
114 +
  value_labels <- get_value_labels(codebook, setup, type = type)
115 +
  setup <- stats::setNames(list(setup, value_labels, missing),
116 +
                           c("setup",
117 +
                             "value_labels",
118 +
                             "missing"))
119 +
  if (!is.null(setup$value_labels)) {
120 +
    setup$value_labels <- parse_value_labels(setup, type = type)
121 +
  }
122 +
  setup$setup <- setup$setup[, c("column_number",
123 +
                                 "column_name",
124 +
                                 "begin",
125 +
                                 "end")]
126 +
127 +
  return(setup)
128 +
129 +
}
130 +
131 +
132 +
parse_missing_sps <- function(codebook, setup) {
133 +
134 +
  start <- grep2("MISSING VALUES?$|Convert missing data", codebook)
135 +
  if (length(start) == 0) {
136 +
    start <- grep2("MISSING VALUE RECODE", codebook)
137 +
  }
138 +
  end <- grep2("EXECUTE|^\\*.*SPSS", codebook)
139 +
  if (length(end) == 0 | all(end <= start)) {
140 +
    end <- length(codebook)
141 +
  } else {
142 +
    end <- min(end[end > start])
143 +
  }
144 +
  missing <- codebook[start:end]
145 +
146 +
  missing <- gsub("\\*{10,}.*|.*Convert missing data to system missing.*|SAVE OUTFILE = .*", "", missing)
147 +
  missing <- missing[missing != ""]
148 +
  missing <- gsub("^RECODE ", "", missing)
149 +
  missing <- gsub("^\\(-", " -", missing)
150 +
151 +
  missing <- gsub('\\"\\s+\\"', '""', missing)
152 +
  missing <- gsub("(\\S),(\\S)", "\\1, \\2", missing)
153 +
  missing <- gsub("\\s{3,}\\(", " \\(", missing)
154 +
  missing <- gsub("\\) ", "\\)   ", missing)
155 +
  missing <- gsub(",\\s+(-?[0-9]),", ", \\1,", missing)
156 +
  missing <- gsub(",\\s+(-?[0-9])\\)", ", \\1\\)", missing)
157 +
  missing <- gsub("([0-9]),([0-9])", "\\1, \\2", missing)
158 +
  missing <- gsub("^RECODE (V\\S*) *", "\\1 ", missing)
159 +
  missing <- gsub("=SYSMIS", "", missing)
160 +
  missing <- gsub("([0-9], ) +(-?[0-9])", "\\1\\2", missing)
161 +
162 +
  missing <- gsub("([0-9])  (\\(.*\\)\\.)", "\\1 \\2", missing)
163 +
164 +
  missing <- unlist(strsplit(missing, ",|\\s{2,}"))
165 +
166 +
  missing <- data.frame(variable = gsub(" .*", "", missing),
167 +
                        values = gsub(".*\\(|\\).*", "", missing),
168 +
                        stringsAsFactors = FALSE)
169 +
  missing$variable[missing$variable == ""] <- NA
170 +
  missing$variable <- zoo::na.locf(missing$variable, na.rm = FALSE)
171 +
  missing$values <- gsub('\\"', "\\'", missing$values)
172 +
  missing$values <- gsub("\\'", "", missing$values)
173 +
  missing$values <- trimws(missing$values)
174 +
175 +
  missing <- missing[missing$variable != missing$values, ]
176 +
177 +
  for (i in 1:nrow(missing)) {
178 +
    if (tolower(missing$variable[i]) %in% tolower(setup$column_number)) {
179 +
      missing$variable[i] <-
180 +
        setup$column_number[tolower(setup$column_number) %in% tolower(missing$variable[i])]
181 +
    }
182 +
  }
183 +
184 +
  missing <- missing[missing$variable %in% c(setup$column_number), ]
185 +
  if (nrow(missing) > 0) {
186 +
    missing <- make_thru_missing_rows(missing)
187 +
    rownames(missing) <- 1:nrow(missing)
188 +
  } else {
189 +
    missing <- NULL
190 +
  }
191 +
  return(missing)
192 +
}
193 +
194 +
parse_missing_sas <- function(codebook, setup) {
195 +
196 +
  start <- grep2("MISSING VALUE", codebook)
197 +
  start <- grep2("^IF", codebook)[grep2("^IF", codebook) > max(start)][1]
198 +
  # Some .sas files have "example" code at the beginnig which screws this up
199 +
  if (is.na(start)) return(NULL)
200 +
  end <- grep2("\\*/", codebook)[grep2("\\*/", codebook) > start][1]
201 +
  if (is.na(end) | length(end) == 0 | all(end <= start)) {
202 +
    end <- length(codebook)
203 +
  } else {
204 +
    end <- min(end[end > start])
205 +
  }
206 +
  missing <- codebook[start:end]
207 +
  missing <- unlist(strsplit(missing, ";"))
208 +
  missing <- trimws(missing)
209 +
  missing <- missing[grepl2("^IF", missing)]
210 +
  missing <- gsub(" then.*= ?\\..*| then.*= ?\\''.*|^IF ", "", missing, ignore.case = TRUE)
211 +
  missing <- gsub(" >= (.+) AND [[:alnum:]]+ <= (.*)", " \\1 thru \\2", missing,
212 +
                  ignore.case = TRUE)
213 +
  missing <- gsub(" (.*) thru ", " (\\1 thru ", missing,
214 +
                  ignore.case = TRUE)
215 +
  missing <- gsub("^\\(", "", missing,
216 +
                  ignore.case = TRUE)
217 +
218 +
  missing <- gsub("(\\S),(\\S)", "\\1, \\2", missing)
219 +
  missing <- gsub(" = ", "=", missing)
220 +
  missing <- gsub("=", " \\(", missing)
221 +
  missing <- paste0(missing, ")")
222 +
  missing <- gsub("\\)\\)$", "\\)", missing)
223 +
  missing <- gsub(" in \\(", " \\(", missing, ignore.case = TRUE)
224 +
  missing <- gsub(" GE ([0-9]+)(\\s|\\))", " \\(\\1 thru highest\\) ", missing, ignore.case = TRUE)
225 +
  missing <- gsub(" OR ", "    ", missing, ignore.case = TRUE)
226 +
  missing <- gsub('\\"', "\\'", missing)
227 +
  missing <- gsub("\\' ", "\\') ", missing)
228 +
  missing <- gsub("\\'", "", missing)
229 +
230 +
231 +
  # missing <- data.frame(variable = gsub("=.*", "", missing),
232 +
  #                       values = gsub(".*=", "", missing),
233 +
  #                       stringsAsFactors = FALSE)  missing <- gsub("\\) ", "\\)   ", missing)
234 +
  missing <- unlist(strsplit(missing, ",|\\s{2,}"))
235 +
236 +
  missing <- data.frame(variable = gsub(" .*", "", missing),
237 +
                        values = gsub(".*\\(|\\).*", "", missing),
238 +
                        stringsAsFactors = FALSE)
239 +
  missing$variable[missing$variable == ""] <- NA
240 +
  missing$variable <- zoo::na.locf(missing$variable, na.rm = FALSE)
241 +
  # missing$values <- gsub("\\.$", "", missing$values)
242 +
  missing$values <- gsub('\\"', "\\'", missing$values)
243 +
  missing$values <- gsub("\\'", "", missing$values)
244 +
  missing$values <- trimws(missing$values)
245 +
246 +
  missing <- missing[missing$variable %in% setup$column_number, ]
247 +
  if (nrow(missing) > 0) {
248 +
    missing <- make_thru_missing_rows(missing)
249 +
    rownames(missing) <- 1:nrow(missing)
250 +
  } else {
251 +
    missing <- NULL
252 +
  }
253 +
  return(missing)
254 +
}
255 +
256 +
257 +
make_thru_missing_rows <- function(missing) {
258 +
  thru_rows <- missing[grep("[0-9] thru -?[0-9]", missing$values, ignore.case = TRUE), ]
259 +
  # thru_highest_rows <- missing[grep("thru hi", missing$values, ignore.case = TRUE), ]
260 +
261 +
  if (nrow(thru_rows) < 1) {
262 +
    return(missing)
263 +
  }
264 +
265 +
  for (i in 1:nrow(thru_rows)) {
266 +
    temp <- thru_rows$values[i]
267 +
    temp <- strsplit(temp, " thru | THRU ")[[1]]
268 +
    values <- temp[1]:temp[2]
269 +
    temp <- data.frame(variable = rep(thru_rows$variable[i], length(values)),
270 +
                       values = values,
271 +
                       stringsAsFactors = FALSE)
272 +
    missing <- rbind(missing, temp)
273 +
  }
274 +
275 +
  # Removes all rows with "thru"
276 +
  missing <- missing[-grep("[0-9] thru -?[0-9]", missing$values, ignore.case = TRUE), ]
277 +
  return(missing)
278 +
}
279 +
280 +
parse_codebook <- function(setup_file, type) {
281 +
  codebook <- readr::read_lines(setup_file)
282 +
  codebook <- stringr::str_trim(codebook)
283 +
284 +
  if (type == "sps") {
285 +
    if (any(grepl("^DATA LIST", codebook, ignore.case = TRUE))) {
286 +
      codebook <- codebook[-c(1:(grep2("^DATA LIST", codebook) - 1))]
287 +
    }
288 +
  }
289 +
  return(codebook)
290 +
}
291 +
292 +
293 +
parse_value_labels <- function(setup, type) {
294 +
295 +
  value_labels <- setup$value_labels
296 +
  value_labels <- value_labels[value_labels$column %in% setup$setup$column_number, ]
297 +
298 +
  value_label_order <- unique(value_labels$column)
299 +
  value_labels <- split.data.frame(value_labels, value_labels$column)
300 +
  value_label_cols <- c()
301 +
  for (i in seq_along(value_labels)) {
302 +
    column <- unique(value_labels[[i]]$column)
303 +
    value_labels[[i]] <- value_label_matrixer(value_labels[[i]][[1]], type)
304 +
    value_label_cols <- c(value_label_cols, column)
305 +
  }
306 +
307 +
  names(value_labels) <- value_label_cols
308 +
  value_labels <- value_labels[value_label_order]
309 +
  return(value_labels)
310 +
}
311 +
312 +
parse_column_names <- function(codebook, type) {
313 +
  # Get the column names
314 +
  if (type == "sps") {
315 +
    variable_label_location <- grep2("^variable labels$", codebook)
316 +
    if (length(variable_label_location) == 0) {
317 +
      return(NULL)
318 +
    }
319 +
    next_location <- grep2("^value labels$|missing values|user-defined missing values|^execute$|^.$|\\*RECODE$",
320 +
                           codebook)
321 +
    next_location <- next_location[next_location > variable_label_location]
322 +
    next_location <- next_location[1]
323 +
    next_location <- next_location - 1
324 +
    if (is.na(next_location)) {
325 +
      next_location <- length(codebook) + 1
326 +
    }
327 +
    variables <- codebook[variable_label_location:next_location]
328 +
    variables <- gsub("\\'\\'", "\\'", variables)
329 +
    variables <- gsub("( \\'[[:alnum:]])\\'([[:alnum:]])", "\\1\\2",
330 +
                      variables)
331 +
    variables <- gsub("\'", "\"", variables)
332 +
    # In case some variables are on multiple lines
333 +
    plus <- grep('\\"\\+$', variables)
334 +
    if (length(plus) > 0) {
335 +
      for (n in 1:length(plus)) {
336 +
        variables[plus[n] + 1] <- paste(variables[plus[n]],
337 +
                                        variables[plus[n] + 1],
338 +
                                        collapse = " ")
339 +
        variables[plus[n] + 1] <- gsub('\\"\\+ *\\"', "",
340 +
                                       variables[plus[n] + 1])
341 +
      }
342 +
      variables <- variables[-plus]
343 +
    }
344 +
  } else if (type == "sas") {
345 +
    variable_label_location <- grep2("^LABEL$|^ATTRIB$", codebook)
346 +
    if (length(variable_label_location) == 0) {
347 +
      return(NULL)
348 +
    }
349 +
    next_location <- grep2("^$|^;", codebook)
350 +
    next_location <- next_location[next_location > variable_label_location]
351 +
    next_location <- next_location[1]
352 +
    if (is.na(next_location)) {
353 +
      next_location <- length(codebook)
354 +
    }
355 +
356 +
357 +
    variables <- codebook[variable_label_location:next_location]
358 +
    variables <- variables[grep("=", variables)]
359 +
    variables <- gsub('\\"\\s+FORMAT=.*$', '\\"', variables)
360 +
    variables <- gsub("(\\S)=", "\\1 =", variables)
361 +
    variables <- gsub("=(\\S)", "= \\1", variables)
362 +
    variables <- gsub('([[:alpha:]]+\\") ', '\\1   ', variables)
363 +
364 +
    variables <- gsub(' LABEL =\\"', '  "', variables)
365 +
  }
366 +
  variables <- unlist(strsplit(variables, '"\\s{3,}'))
367 +
368 +
369 +
  variables <- data.frame(column_name   = fix_names(variables),
370 +
                          column_number = gsub(" .*", "",
371 +
                                               variables),
372 +
                          stringsAsFactors = FALSE)
373 +
374 +
  if (any(grepl("^$", variables$column_name))) {
375 +
    variables <- variables[1:(grep("^$", variables$column_name)[1]), ]
376 +
  }
377 +
  variables <- variables[!variables$column_number %in% "*", ]
378 +
  variables <- variables[!duplicated(variables$column_number), ]
379 +
  return(variables)
380 +
}
381 +

@@ -0,0 +1,91 @@
Loading
1 +
#' Read fixed-width ASCII file using SPSS Setup file.
2 +
#'
3 +
#' spss_ascii_reader() and sas_ascii_reader() are used when you need to
4 +
#' read an fixed-width ASCII (text) file that comes with a setup file.
5 +
#' These file combinations are sometimes referred to as .txt+.sps, .txt+.sas,
6 +
#' .dat+.sps, or .dat+.sas.
7 +
#' The setup file provides instructions on how to create and name the columns,
8 +
#' and fix the key-value pairs (sometimes called value labels). This is common
9 +
#' in government data, particular data produced before 2010.
10 +
#'
11 +
#' @family ASCII Reader functions
12 +
#' @seealso \code{\link{sas_ascii_reader}} For using an SAS setup file
13 +
#'
14 +
#' @param dataset_name Name of the ASCII (.txt) file that contains the data.
15 +
#'   This file may be zipped with a file extension of .zip.
16 +
#' @param sps_name Name of the SPSS Setup file - should be a .sps or .txt
17 +
#'   (zipped text files also work) file.
18 +
#' @param value_label_fix If TRUE, fixes value labels of the data. e.g. If a
19 +
#'   column is "sex" and has values of 0 or 1, and the setup file says 0 = male
20 +
#'   and 1 = female, it will make that change. The reader is much faster is this
21 +
#'    parameter is FALSE.
22 +
#' @param real_names If TRUE fixes column names from default column name in the
23 +
#'   SPSS setup file (e.g. V1, V2) to the name is says the column is called
24 +
#'   (e.g. age, sex, etc.).
25 +
#' @param keep_columns Specify which columns from the dataset you want. If
26 +
#'   NULL, will return all columns. Accepts the column number (e.g. 1:5),
27 +
#'   column name (e.g. V1, V2, etc.) or column label (e.g. VICTIM_NAME, CITY,
28 +
#'   etc.).
29 +
#' @param coerce_numeric
30 +
#' If TRUE (default) will make columns where all values can be made numeric
31 +
#' into numeric columns.Useful as FALSE if variables have leading zeros - such
32 +
#' as US Census FIPS codes.
33 +
#'
34 +
#' @return Data.frame of the data from the ASCII file
35 +
#' @export
36 +
#' @examples
37 +
#' # Text file is zipped to save space.
38 +
#' dataset_name <- system.file("extdata", "example_data.zip",
39 +
#'   package = "asciiSetupReader")
40 +
#' sps_name <- system.file("extdata", "example_setup.sps",
41 +
#'   package = "asciiSetupReader")
42 +
#'
43 +
#' \dontrun{
44 +
#' example <- spss_ascii_reader(dataset_name = dataset_name,
45 +
#'   sps_name = sps_name)
46 +
#'
47 +
#'
48 +
#' # Does not fix value labels
49 +
#' example2 <- spss_ascii_reader(dataset_name = dataset_name,
50 +
#'   sps_name = sps_name, value_label_fix = FALSE)
51 +
#'
52 +
#' # Keeps original column names
53 +
#' example3 <- spss_ascii_reader(dataset_name = dataset_name,
54 +
#'   sps_name = sps_name, real_names = FALSE)
55 +
#'
56 +
#' }
57 +
#' # Only returns the first 5 columns
58 +
#' example4 <- spss_ascii_reader(dataset_name = dataset_name,
59 +
#'   sps_name = sps_name, keep_columns = 1:5)
60 +
#'
61 +
spss_ascii_reader <- function(dataset_name,
62 +
                              sps_name,
63 +
                              value_label_fix = TRUE,
64 +
                              real_names = TRUE,
65 +
                              keep_columns = NULL,
66 +
                              coerce_numeric = TRUE) {
67 +
68 +
  #  .Deprecated("read_ascii_setup")
69 +
70 +
  stopifnot(is.character(dataset_name), length(dataset_name) == 1,
71 +
            is.character(sps_name), length(sps_name) == 1,
72 +
            is.logical(value_label_fix), length(value_label_fix) == 1,
73 +
            is.logical(real_names), length(real_names) == 1)
74 +
75 +
  setup <- parse_setup(sps_name)
76 +
  setup$setup <- selected_columns(keep_columns, setup$setup)
77 +
78 +
  data <- read_data(dataset_name, setup)
79 +
80 +
81 +
  data <- fix_value_labels(data, setup, value_label_fix)
82 +
83 +
84 +
  data <- fix_names_missing_numeric(data,
85 +
                                    setup,
86 +
                                    missing,
87 +
                                    real_names,
88 +
                                    coerce_numeric)
89 +
90 +
  return(data)
91 +
}

@@ -0,0 +1,84 @@
Loading
1 +
#' @title
2 +
#' Launch an RStudio addin to select options for read_ascii_setup()
3 +
#' @description
4 +
#' Launch an RStudio addin to select options for read_ascii_setup().
5 +
#' @return
6 +
#' read_ascii_setup() code to console with options based on user input
7 +
#' @export
8 +
#' @examples
9 +
#' \dontrun{
10 +
#' read_ascii_setup_addin()
11 +
#' }
12 +
read_ascii_setup_addin <- function() {
13 +
14 +
15 +
  ui <- miniUI::miniPage(
16 +
17 +
    ## Your UI items go here.
18 +
    miniUI::miniTabstripPanel(
19 +
      miniUI::miniTabPanel(miniUI::gadgetTitleBar("Read fixed-width ASCII file using SPSS or SAS Setup file"),
20 +
                           miniUI::miniContentPanel(
21 +
                             shiny::fillRow(
22 +
                               shiny::fillCol(
23 +
                                 shiny::textInput("new_data_name", label = shiny::h5("Select data name")),
24 +
                                 shiny::fluidRow(shiny::column(3, shiny::verbatimTextOutput("new_data_name"))),
25 +
                                 shiny::fileInput("data", label = shiny::h5("Data file input (.txt or .dat)")),
26 +
                                 shiny::fluidRow(shiny::column(4, shiny::verbatimTextOutput("data"))),
27 +
                                 shiny::fileInput("setup", label = shiny::h5("Setup file input (.sps or .sas)")),
28 +
                                 shiny::fluidRow(shiny::column(4, shiny::verbatimTextOutput("setup")))),
29 +
                               shiny::fillCol(
30 +
                                 shiny::checkboxInput("value_labels", label = shiny::h5("Use value labels"),
31 +
                                                      value = TRUE),
32 +
                                 shiny::checkboxInput("clean_names", label = shiny::h5("Use clean names"),
33 +
                                                      value = TRUE),
34 +
                                 shiny::checkboxInput("coerce_numeric", label = shiny::h5("Coerce numeric columns"),
35 +
                                                      value = TRUE),
36 +
                                 shiny::textInput("columns", label = shiny::h5("Select columns"),
37 +
                                                  placeholder = "Optional - select specific columns")
38 +
                               ))
39 +
                           )
40 +
      )
41 +
    )
42 +
  )
43 +
44 +
  # Server code for the gadget.
45 +
  server <- function(input, output, session) {
46 +
47 +
    shiny::observeEvent(input$done, {
48 +
      shiny::req(input$data)
49 +
      shiny::req(input$setup)
50 +
      shiny::req(input$new_data_name)
51 +
52 +
      if (input$columns == "") {
53 +
        select_columns_temp <- "NULL"
54 +
      } else {
55 +
        select_columns_temp <- input$columns
56 +
      }
57 +
58 +
      data_path <- input$data$name
59 +
      data_path <- gsub("\\\\", "/", data_path)
60 +
      setup_path <- input$setup$name
61 +
      setup_path <- gsub("\\\\", "/", setup_path)
62 +
63 +
      rstudioapi::sendToConsole(paste0(input$new_data_name,
64 +
                                       " <- read_ascii_setup(data = '",
65 +
                                       data_path,
66 +
                                       "', setup_file = '",
67 +
                                       setup_path,
68 +
                                       "', use_value_labels = ",
69 +
                                       input$value_labels,
70 +
                                       ", use_clean_names = ",
71 +
                                       input$clean_names,
72 +
                                       ", select_columns = ",
73 +
                                       select_columns_temp,
74 +
                                       ", coerce_numeric = ",
75 +
                                       input$coerce_numeric, ")"))
76 +
77 +
      shiny::stopApp()
78 +
    })
79 +
  }
80 +
81 +
  # Use a modal dialog as a viewr.
82 +
  viewer <- shiny::dialogViewer("Ascii Setup Reader")
83 +
  shiny::runGadget(ui, server, viewer = viewer)
84 +
}

@@ -0,0 +1,231 @@
Loading
1 +
get_column_spaces <- function(setup, variables, codebook) {
2 +
3 +
4 +
  setup <- setup[grep("[0-9]-[0-9]| [0-9]| \\$[0-9]", setup)]
5 +
6 +
  setup <- gsub("([0-9])- +([0-9])", "\\1-\\2", setup)
7 +
8 +
  setup <- gsub("\tF[0-9].0$|\tA[0-9]+$|\t\\(.*\\)$", "", setup)
9 +
  setup <- gsub("\t", " ", setup)
10 +
  setup <- gsub("([0-9]) - ([0-9])", "\\1-\\2", setup)
11 +
  setup <- gsub("([0-9]+-[0-9]+) ([[:alpha:]]+)", "\\1   \\2",
12 +
                setup)
13 +
  setup <- gsub("([[:alpha:]]+[0-9]* [0-9]+) ([[:alpha:]]+[0-9]*)", "\\1   \\2",
14 +
                setup)
15 +
  setup <- gsub("\\s{2,}\\$ ([0-9]+)", " \\1",
16 +
                setup)
17 +
  setup <- unlist(strsplit(setup, "\\s{2,}"))
18 +
  setup <- gsub("\\$|\\;|\\(.*|\\.[0-9]+", "", setup)
19 +
  setup <- gsub("\\.$", "", setup)
20 +
  setup <- stringr::str_trim(setup)
21 +
  setup <- setup[grep("[0-9]$", setup)]
22 +
  setup <- data.frame(column_number = setup,
23 +
                      stringsAsFactors = FALSE)
24 +
  setup$column_number <- gsub("^\\/", "", setup$column_number)
25 +
26 +
  setup$begin <- gsub(".* ", "", setup$column_number)
27 +
  setup$end   <- gsub(".*-", "", setup$begin)
28 +
  setup$begin <- gsub("-.*", "", setup$begin)
29 +
  setup <- setup[setup$column_number != setup$begin, ]
30 +
31 +
  setup$column_number <- gsub(" .*", "", setup$column_number)
32 +
  if (!is.null(variables)) {
33 +
    setup <- setup[tolower(setup$column_number) %in%
34 +
                     tolower(variables$column_number), ]
35 +
36 +
    # In the CDC SADC data set, the column number changes from lowercase
37 +
    # to uppercase depending on section of setup file.
38 +
    for (i in 1:nrow(setup)) {
39 +
      if (tolower(setup$column_number[i]) %in% tolower(variables$column_number)) {
40 +
        setup$column_number[i] <-
41 +
          variables$column_number[tolower(variables$column_number) %in%
42 +
                                    tolower(setup$column_number[i])]
43 +
      }
44 +
    }
45 +
    setup <- merge(setup, variables,
46 +
                   by = "column_number", all.x = TRUE)
47 +
  } else {
48 +
    setup$column_name <- setup$column_number
49 +
  }
50 +
51 +
52 +
  setup$begin <- as.numeric(setup$begin)
53 +
  setup$end   <- as.numeric(setup$end)
54 +
55 +
  # Checks if any begin values are > than end values
56 +
  if (any(setup$begin > setup$end)) {
57 +
58 +
    stop(paste0("The following columns have a start number greater than the end number. Please check your setup file and fix this issue before rerunning this code.",
59 +
                paste("\nColumns with issue: ",
60 +
                      setup$column_number[setup$begin > setup$end])))
61 +
  }
62 +
63 +
64 +
  format_section <- grep2("^FORMAT$|SAS FORMAT STATEMENT|\\/\\* format$", codebook)
65 +
  if (any(grepl2("^FORMAT$|SAS FORMAT STATEMENT|\\/\\* format$", codebook))) {
66 +
    # Get format - column names and column names with f ====================
67 +
    format_section = grep2("^FORMAT$|SAS FORMAT STATEMENT|\\/\\* format$", codebook)
68 +
69 +
    format <- codebook[format_section[1]:length(codebook)]
70 +
    format <- gsub("^FORMAT ", "", format, ignore.case = TRUE)
71 +
    format <- unlist(strsplit(format, "\\."))
72 +
    format <- stringr::str_trim(format)
73 +
    format <- data.frame(column_name = gsub(" .*", "", format),
74 +
                         f_name      = gsub(".* ", "", format))
75 +
    setup <- merge(setup, format, by.x = "column_number",
76 +
                   by.y = "column_name", all.x = TRUE)
77 +
  }
78 +
79 +
  setup <- setup[order(setup$begin), ]
80 +
81 +
  return(setup)
82 +
}
83 +
84 +
85 +
selected_columns <- function(keep_columns, column_spaces) {
86 +
  if (is.null(keep_columns)) {
87 +
    return(column_spaces)
88 +
  } else if (is.numeric(keep_columns)) {
89 +
    column_spaces <- column_spaces[keep_columns, ]
90 +
  } else if (all(keep_columns %in% column_spaces$column_number)) {
91 +
    column_spaces <- column_spaces[
92 +
      column_spaces$column_number %in% keep_columns, ]
93 +
  } else if (all(keep_columns %in% column_spaces$column_name)) {
94 +
    column_spaces <- column_spaces[
95 +
      column_spaces$column_name %in% keep_columns, ]
96 +
  } else {
97 +
    stop(paste0("Not all column names in 'keep_columns' are in data. ",
98 +
                "Please check spelling"))
99 +
  }
100 +
101 +
  return(column_spaces)
102 +
}
103 +
104 +
grep2 <- function(pattern, x) grep(pattern, x, ignore.case = TRUE)
105 +
grepl2 <- function(pattern, x) grepl(pattern, x, ignore.case = TRUE)
106 +
107 +
108 +
109 +
fix_missing <- function(data, missing) {
110 +
  missing <- missing[missing$variable %in% names(data), ]
111 +
112 +
  for (column in unique(missing$variable)) {
113 +
    missing_values <- missing$values[missing$variable == column]
114 +
    missing_values <- as.character(trimws(missing_values))
115 +
    missing_values <- gsub("\\'", "", missing_values)
116 +
117 +
    if (any(grepl("thru hi", missing, ignore.case = TRUE))) {
118 +
      thru_higher <- missing_values[grep("thru hi", missing_values, ignore.case = TRUE)]
119 +
      thru_higher <- gsub(" thru hi.*", "", thru_higher, ignore.case = TRUE)
120 +
      thru_higher <- as.numeric(thru_higher)
121 +
      unique_values_numeric <- suppressWarnings(as.numeric(unique(data[[column]])))
122 +
      unique_values_numeric <- unique_values_numeric[!is.na(unique_values_numeric)]
123 +
      missing_values <- c(missing_values,
124 +
                          unique_values_numeric[unique_values_numeric >= thru_higher])
125 +
      missing_values <- gsub(" thru hi.*", "", missing_values, ignore.case = TRUE)
126 +
127 +
    }
128 +
129 +
    if (any(grepl("low thru", missing, ignore.case = TRUE))) {
130 +
      low_thru <- missing_values[grep("low thru", missing_values, ignore.case = TRUE)]
131 +
      low_thru <- gsub(".*low thru ", "", low_thru, ignore.case = TRUE)
132 +
      low_thru <- as.numeric(low_thru)
133 +
      unique_values_numeric <- suppressWarnings(as.numeric(unique(data[[column]])))
134 +
      unique_values_numeric <- unique_values_numeric[!is.na(unique_values_numeric)]
135 +
      missing_values <- c(missing_values,
136 +
                          unique_values_numeric[unique_values_numeric <= low_thru])
137 +
      missing_values <- gsub(".*low thru ", "", missing_values, ignore.case = TRUE)
138 +
139 +
    }
140 +
    names(missing_values) <- NA
141 +
142 +
    # Sets character to column type in case it isn't already.
143 +
    if (!is.character(data[[column]])) {
144 +
      data.table::set(data, j = column, value = as.character(data[[column]]))
145 +
    }
146 +
147 +
    data.table::set(data, j = column,
148 +
                    value = haven::as_factor(haven::labelled(data[[column]],
149 +
                                                             missing_values)))
150 +
    # Re-saves as character type instead of factor type
151 +
    data.table::set(data, j = column, value = as.character(data[[column]]))
152 +
153 +
  }
154 +
  return(data)
155 +
}
156 +
157 +
158 +
read_data <- function(data, setup) {
159 +
  positions <- vroom::fwf_positions(setup$setup$begin,
160 +
                                    setup$setup$end,
161 +
                                    setup$setup$column_number)
162 +
163 +
  out <- tryCatch(
164 +
    {
165 +
      temp <- suppressMessages(vroom::vroom_fwf(data,
166 +
                                                col_positions = positions,
167 +
                                                col_types = vroom::cols(.default =
168 +
                                                                          vroom::col_character())))
169 +
      temp <- data.table::as.data.table(temp)
170 +
    },
171 +
    error=function(cond) {
172 +
      temp <- suppressMessages(readr::read_fwf(data,
173 +
                                               col_positions = positions,
174 +
                                               col_types = vroom::cols(.default =
175 +
                                                                         vroom::col_character())))
176 +
      temp <- data.table::as.data.table(temp)
177 +
    }
178 +
  )
179 +
180 +
  return(out)
181 +
}
182 +
183 +
184 +
fix_names_missing_numeric <- function(data,
185 +
                                      setup,
186 +
                                      missing,
187 +
                                      real_names,
188 +
                                      coerce_numeric) {
189 +
  # Fixes missing values ----------------------------------------------------
190 +
  missing <- setup$missing
191 +
  if (!is.null(missing)) {
192 +
    data <- fix_missing(data, missing)
193 +
  }
194 +
  if (real_names) {
195 +
    # Fixes column names to real names
196 +
    variables <- setup$setup[setup$setup$column_number
197 +
                             %in% names(data), ]
198 +
    data.table::setnames(data, old = variables$column_number,
199 +
                         new = variables$column_name)
200 +
  }
201 +
202 +
203 +
  # Makes columns that should be numeric numeric
204 +
  if (coerce_numeric) {
205 +
    data <- make_cols_numeric(data)
206 +
  }
207 +
  attributes(data)$spec <- NULL
208 +
  data <- as.data.frame(data)
209 +
  return(data)
210 +
}
211 +
212 +
fix_value_labels <- function(data, setup, value_label_fix) {
213 +
  # Value Labels ------------------------------------------------------------
214 +
  # Removes columns not asked for
215 +
  value_labels <- setup$value_labels
216 +
  if (!is.null(value_labels)) {
217 +
    value_labels <- value_labels[names(value_labels) %in%
218 +
                                   setup$setup$column_number]
219 +
  }
220 +
221 +
  if (value_label_fix && length(value_labels) > 0) {
222 +
    column_order <- names(data)
223 +
    for (i in seq_along(value_labels)) {
224 +
      data <- fix_variable_values(data,
225 +
                                  value_labels[[i]],
226 +
                                  names(value_labels)[i])
227 +
      data.table::setcolorder(data, column_order)
228 +
    }
229 +
  }
230 +
  return(data)
231 +
}

@@ -0,0 +1,78 @@
Loading
1 +
#' Read fixed-width ASCII file using SPSS or SAS Setup file.
2 +
#'
3 +
#' read_ascii_setup() is used when you need to
4 +
#' read an fixed-width ASCII (text) file that comes with a setup file.
5 +
#' The setup file provides instructions on how to create and name the columns,
6 +
#' and fix the key-value pairs (sometimes called value labels). This is common
7 +
#' in government data, particular data produced before 2010.
8 +
#'
9 +
#' @param data Name of the ASCII (.txt or .dat) file that contains the data.
10 +
#'   This file may be zipped with a file extension of .zip.
11 +
#' @param setup_file Name of the SPSS or SAS setup file - should be a .sps or .sas (.txt also accepted as are these files in zipped format)
12 +
#' @param use_value_labels If TRUE, fixes value labels of the data. e.g. If a
13 +
#'   column is "sex" and has values of 0 or 1, and the setup file says 0 = male
14 +
#'   and 1 = female, it will make that change. Using this parameter for enormous files may slow down the package considerably.
15 +
#' @param use_clean_names If TRUE fixes column names from default column name in the
16 +
#'   setup file (e.g. V1, V2) to the descriptive label for the column provided in the file (e.g. age, sex, etc.).
17 +
#' @param select_columns Specify which columns from the dataset you want. If
18 +
#'   NULL, will return all columns. Accepts the column number (e.g. 1:5),
19 +
#'   column name (e.g. V1, V2, etc.) or column label (e.g. VICTIM_NAME, CITY,
20 +
#'   etc.).
21 +
#' @param coerce_numeric
22 +
#' If TRUE (default) will make columns where all values can be made numeric
23 +
#' into numeric columns. Useful as FALSE if variables have leading zeros - such
24 +
#' as US Census FIPS codes.
25 +
#'
26 +
#' @return data.frame of the data from the ASCII file
27 +
#' @export
28 +
#' @examples
29 +
#' # Text file is zipped to save space.
30 +
#' dataset_name <- system.file("extdata", "example_data.zip",
31 +
#'   package = "asciiSetupReader")
32 +
#' sps_name <- system.file("extdata", "example_setup.sps",
33 +
#'   package = "asciiSetupReader")
34 +
#'
35 +
#' \dontrun{
36 +
#' example <- read_ascii_setup(data = dataset_name,
37 +
#'   setup_file = sps_name)
38 +
#'
39 +
#'
40 +
#' # Does not fix value labels
41 +
#' example2 <- read_ascii_setup(data = dataset_name,
42 +
#'   setup_file = sps_name, use_value_labels = FALSE)
43 +
#'
44 +
#' # Keeps original column names
45 +
#' example3 <- read_ascii_setup(data = dataset_name,
46 +
#'   setup_file = sps_name, use_clean_names = FALSE)
47 +
#' }
48 +
#'
49 +
#' # Only returns the first 5 columns
50 +
#' example4 <- read_ascii_setup(data = dataset_name,
51 +
#'   setup_file = sps_name, select_columns = 1:5)
52 +
read_ascii_setup <- function(data,
53 +
                             setup_file,
54 +
                             use_value_labels = TRUE,
55 +
                             use_clean_names  = TRUE,
56 +
                             select_columns   = NULL,
57 +
                             coerce_numeric   = TRUE) {
58 +
59 +
  if (grepl(".sps(\\.zip)?$", setup_file)) {
60 +
61 +
    data <- spss_ascii_reader(dataset_name   = data,
62 +
                              sps_name        = setup_file,
63 +
                              value_label_fix = use_value_labels,
64 +
                              real_names      = use_clean_names,
65 +
                              keep_columns    = select_columns,
66 +
                              coerce_numeric  = coerce_numeric)
67 +
  } else {
68 +
69 +
    data <- sas_ascii_reader(dataset_name    = data,
70 +
                             sas_name        = setup_file,
71 +
                             value_label_fix = use_value_labels,
72 +
                             real_names      = use_clean_names,
73 +
                             keep_columns    = select_columns,
74 +
                             coerce_numeric  = coerce_numeric)
75 +
  }
76 +
77 +
  return(data)
78 +
}

@@ -0,0 +1,178 @@
Loading
1 +
2 +
#' Create an SPSS setup file (.sps) to use for reading in fixed-width text files
3 +
#'
4 +
#' make_sps_setup() is used to create the setup file used in reading in
5 +
#' fixed-width text files. Often the setup file comes with the data file but
6 +
#' in some cases (usually with government data) you will need to create
7 +
#' the setup file yourself.
8 +
#'
9 +
#' @param file_name
10 +
#' Name of the file to be saved (e.g. "setup_file1"). There is no need to put
11 +
#' the .sps extension in the file name.
12 +
#' @param col_positions
13 +
#' Either a vector of strings indicating the start and end position of each column
14 +
#' (e.g. "1-3", "4-5") or a vector of the widths of the columns (e.g. 3, 2).
15 +
#' @param col_names
16 +
#' A vector of names for the columns. If none are provided, will automatically
17 +
#' create names based on column number (e.g. V1, V2, V3).
18 +
#' @param col_labels
19 +
#' A vector of labels for the columns. These are often longer and more descriptive
20 +
#' than the col_names. These are the values used as column names if
21 +
#' real_names = TRUE in reading in the data.
22 +
#' @param value_labels
23 +
#' A vector with the value first, then an ' = ' then the label. Each
24 +
#' new column should have the column named followed by ' ='.
25 +
#' @param missing_values
26 +
#' A vector of strings with the column name followed by the values to be
27 +
#' replaced by NA.
28 +
#' @return
29 +
#' Does not return any object. Saves the .sps file that is created.
30 +
#' @export
31 +
#'
32 +
#' @examples
33 +
#' \dontrun{
34 +
#'   value_labels <- c("var1 = ",
35 +
#'                      "1 = label 1",
36 +
#'                      "2 = label 2",
37 +
#'                      "3 = label 3",
38 +
#'                      "4 = label 4",
39 +
#'                      "5 = label 5",
40 +
#'                      "var3 = ",
41 +
#'                      "1A = alpha",
42 +
#'                      "1B = bravo",
43 +
#'                      "1C = cat")
44 +
#' missing_values <- c("state name", "9", "-8", "county", "-8")
45 +
#' make_sps_setup(file_name     = "example_name",
46 +
#'                col_positions = c(1, 3, 4, 2),
47 +
#'                col_names     = c("var1", "var2", "var3", "var4"),
48 +
#'                col_labels    = c("state name", "county",
49 +
#'                               "population", "census region code"),
50 +
#'                value_labels  = value_labels,
51 +
#'                missing_values = missing_values)
52 +
#' }
53 +
make_sps_setup <- function(file_name,
54 +
                           col_positions,
55 +
                           col_names = NULL,
56 +
                           col_labels = NULL,
57 +
                           value_labels = NULL,
58 +
                           missing_values = NULL) {
59 +
60 +
  stopifnot(is.character(file_name),
61 +
            (is.character(col_positions) | is.numeric(col_positions)),
62 +
            (is.null(value_labels) | is.vector(value_labels)),
63 +
            (is.null(missing_values) | is.vector(missing_values)))
64 +
65 +
66 +
67 +
68 +
69 +
  intro <- paste0("This setup file was created using the R package asciiSetupReader",
70 +
                  "(version ",
71 +
                  utils::packageVersion("asciiSetupReader"),
72 +
                  ") on ",
73 +
                  Sys.time(), ".",
74 +
                  " For any feedback or problems (or if the file looks odd), ",
75 +
                  "please make a report on https://github.com/jacobkap/asciiSetupReader/issues.",
76 +
                  " For more information on this package see here: https://jacobkap.github.io/asciiSetupReader/.")
77 +
  intro <- c(intro, "", "")
78 +
  line_break <- c(".", "")
79 +
  file_name <- gsub(".sps$", "", file_name, "")
80 +
81 +
  # Takes the input of column widths and makes the column position strings
82 +
  if (is.numeric(col_positions)) {
83 +
    col_positions       <- readr::fwf_widths(col_positions)
84 +
    col_positions$begin <- col_positions$begin + 1
85 +
    col_positions       <- paste0(col_positions$begin,
86 +
                                  "-",
87 +
                                  col_positions$end)
88 +
  }
89 +
90 +
91 +
92 +
93 +
# Make column names if none provided --------------------------------------
94 +
95 +
96 +
  if (is.null(col_names)) {
97 +
    col_names <- paste0("V", 1:length(col_positions))
98 +
  }
99 +
100 +
  data_list <- format(col_names,
101 +
                      width = max(nchar(col_names)) + 5)
102 +
  data_list <- paste(data_list, col_positions)
103 +
  data_list <- c("data list", data_list, line_break)
104 +
105 +
106 +
# Make value labels -------------------------------------------------------
107 +
  if (!is.null(value_labels)) {
108 +
    value_labels       <- stringr::str_split_fixed(value_labels, pattern = " = ", n = 2)
109 +
    val_labels_columns <- as.character(value_labels[, 1][value_labels[, 2] == ""])
110 +
    val_name_columns   <- col_names[match(val_labels_columns, col_labels)]
111 +
    val_labels_columns <- paste0("^", val_labels_columns, "$")
112 +
    if (all(!is.na(val_name_columns))) {
113 +
      names(val_name_columns) <- val_labels_columns
114 +
      value_labels[, 1] <-
115 +
        stringr::str_replace_all(value_labels[, 1], val_name_columns)
116 +
    }
117 +
    values <- value_labels[, 1]
118 +
    values[value_labels[, 2] != ""] <- paste0("'", values[value_labels[, 2] != ""] , "'")
119 +
    values <- format(values,
120 +
                     width = max(nchar(as.character(values))) + 5)
121 +
    labels   <- paste0('"', value_labels[, 2], '"')
122 +
    labels[labels == '""'] <- ""
123 +
    value_labels <- paste(values, labels)
124 +
    value_labels <- c("value labels", value_labels, line_break)
125 +
  } else {
126 +
    value_labels <- ""
127 +
  }
128 +
129 +
130 +
  if (!is.null(missing_values)) {
131 +
132 +
    temp <- rep(0, length(missing_values))
133 +
    counter <- 0
134 +
    for (i in 1:length(temp)) {
135 +
      if (missing_values[i] %in% col_labels) {
136 +
        counter <- counter + 1
137 +
        missing_values[i] <- col_names[col_labels %in% missing_values[i]]
138 +
      }
139 +
      temp[i] <- counter
140 +
    }
141 +
    split_missing <- split(missing_values, as.factor(temp))
142 +
    missing_values <- c()
143 +
    for (i in 1:length(split_missing)) {
144 +
      temp <- paste(unlist(split_missing[i]), collapse = ", ")
145 +
      temp <- stringr::str_replace(temp, ", ", "      (")
146 +
      temp <- paste0(temp, ")")
147 +
      missing_values <- c(missing_values, temp)
148 +
    }
149 +
150 +
    # Formats to make all same number of characters (so it looks better)
151 +
    missing_values <- format(missing_values,
152 +
                        width = max(nchar(missing_values)) + 5)
153 +
    missing_values <- c("missing values", missing_values, line_break)
154 +
  } else {
155 +
    missing_values <- ""
156 +
  }
157 +
158 +
  if (!is.null(col_labels)) {
159 +
    variable_labels <- format(col_names,
160 +
                              width = max(nchar(col_names)) + 5)
161 +
    col_labels      <- paste0('"', col_labels, '"')
162 +
    variable_labels <- paste(variable_labels, col_labels)
163 +
    variable_labels <- c("variable labels", variable_labels, line_break)
164 +
  }
165 +
166 +
167 +
  sink(paste0(file_name, ".sps"))
168 +
  writeLines(file_name)
169 +
  writeLines("")
170 +
  writeLines(intro)
171 +
  writeLines(data_list)
172 +
  writeLines(variable_labels)
173 +
  writeLines(value_labels)
174 +
  writeLines(missing_values)
175 +
  writeLines(c("", "execute"))
176 +
  sink()
177 +
178 +
}
Files Coverage
R 78.18%
Project Totals (8 files) 78.18%