1
|
|
###########################################################################/**
|
2
|
|
# @RdocFunction .readText
|
3
|
|
#
|
4
|
|
# @title "Reads the content of a local or an online text file"
|
5
|
|
#
|
6
|
|
# \description{
|
7
|
|
# @get "title".
|
8
|
|
# }
|
9
|
|
#
|
10
|
|
# @synopsis
|
11
|
|
#
|
12
|
|
# \arguments{
|
13
|
|
# \item{con}{A @character string specifying a local file or a URL,
|
14
|
|
# or a @connection.}
|
15
|
|
# \item{...}{Not used.}
|
16
|
|
# \item{maxAge}{A @numeric scalar specifying the maximum age (in seconds)
|
17
|
|
# of cached URL contents before downloading and recaching.
|
18
|
|
# If zero or less, the URL will always be downloaded.}
|
19
|
|
# }
|
20
|
|
#
|
21
|
|
# \value{
|
22
|
|
# Returns a @character string.
|
23
|
|
# }
|
24
|
|
#
|
25
|
|
# \section{Caching URL}{
|
26
|
|
# When reading online URLs, it may take time a significant time to
|
27
|
|
# read its content. If the content is changing rarely, it is possible
|
28
|
|
# to cache the content locally. If a cached version is found, then it
|
29
|
|
# is read instead.
|
30
|
|
# It is possible to control how often a file should be recached. If the
|
31
|
|
# cache is older than argument \code{maxAge} (in seconds), then the file
|
32
|
|
# is redownloaded and recached.
|
33
|
|
# }
|
34
|
|
#
|
35
|
|
# \section{Newline substitution}{
|
36
|
|
# All occurrences of \code{\\r\\n} and \code{\\r} are replaced with
|
37
|
|
# \code{\\n} such that all lines are ending in \code{\\n} regardless
|
38
|
|
# of encoding.
|
39
|
|
# }
|
40
|
|
#
|
41
|
|
# @author
|
42
|
|
#
|
43
|
|
# @keyword file
|
44
|
|
# @keyword IO
|
45
|
|
# @keyword internal
|
46
|
|
#*/###########################################################################
|
47
|
|
.readText <- function(con, ..., maxAge=getOption("R.rsp::downloadIfOlderThan", -Inf)) {
|
48
|
1
|
if (is.character(con)) {
|
49
|
1
|
file <- con
|
50
|
|
|
51
|
|
# Is the file local and an URL?
|
52
|
1
|
isUrl <- isUrl(file)
|
53
|
|
|
54
|
|
|
55
|
|
# (a) If URL, download to temporary directory
|
56
|
1
|
if (isUrl) {
|
57
|
0
|
url <- file
|
58
|
0
|
path <- tempdir()
|
59
|
0
|
filename <- getChecksum(url)
|
60
|
0
|
pathname <- file.path(path, filename)
|
61
|
|
|
62
|
|
# By default, download URL
|
63
|
0
|
download <- TRUE
|
64
|
|
|
65
|
|
# Unless...
|
66
|
0
|
if (isFile(pathname)) {
|
67
|
|
# Age (in seconds) when downloaded file is considered too old
|
68
|
0
|
maxAge <- as.double(maxAge)
|
69
|
0
|
if (is.na(maxAge)) maxAge <- -Inf
|
70
|
0
|
maxAge <- Arguments$getDouble(maxAge)
|
71
|
|
# Time when file was downloaded
|
72
|
0
|
mtime <- file.info(pathname)$mtime
|
73
|
|
# Age of downloaded file in seconds
|
74
|
0
|
dtime <- Sys.time() - mtime
|
75
|
0
|
units(dtime) <- "secs"
|
76
|
0
|
download <- isTRUE(dtime > maxAge)
|
77
|
|
}
|
78
|
|
|
79
|
0
|
if (download) {
|
80
|
0
|
withoutGString({
|
81
|
0
|
pathname <- downloadFile(url, filename=pathname, skip=FALSE)
|
82
|
|
})
|
83
|
|
}
|
84
|
|
|
85
|
0
|
if (isFile(pathname)) file <- pathname
|
86
|
|
} # if (isUrl)
|
87
|
|
|
88
|
|
|
89
|
|
# (b) Try to open file connection
|
90
|
1
|
con <- tryCatch({
|
91
|
1
|
suppressWarnings({
|
92
|
1
|
file(file, open="rb")
|
93
|
|
})
|
94
|
1
|
}, error = function(ex) {
|
95
|
|
# (b) If failed, try to download file first
|
96
|
0
|
if (regexpr("^https://", file, ignore.case=TRUE) == -1L) {
|
97
|
0
|
throw(ex)
|
98
|
|
}
|
99
|
0
|
url <- file
|
100
|
0
|
withoutGString({
|
101
|
0
|
pathname <- downloadFile(url, path=tempdir())
|
102
|
|
})
|
103
|
0
|
file(pathname, open="rb")
|
104
|
|
})
|
105
|
1
|
on.exit(close(con))
|
106
|
|
}
|
107
|
|
|
108
|
|
# Sanity check
|
109
|
1
|
stop_if_not(inherits(con, "connection"))
|
110
|
|
|
111
|
|
|
112
|
1
|
bfr <- NULL
|
113
|
1
|
while (TRUE) {
|
114
|
1
|
bfrT <- readChar(con, nchars=1e6)
|
115
|
1
|
if (length(bfrT) == 0L) break
|
116
|
1
|
bfrT <- gsub("\r\n", "\n", bfrT, fixed=TRUE)
|
117
|
1
|
bfrT <- gsub("\r", "\n", bfrT, fixed=TRUE)
|
118
|
1
|
bfr <- c(bfr, bfrT)
|
119
|
|
}
|
120
|
1
|
bfr <- paste(bfr, collapse="")
|
121
|
|
|
122
|
1
|
if (FALSE) {
|
123
|
0
|
bfr <- strsplit(bfr, split="\n", fixed=TRUE)
|
124
|
0
|
bfr <- unlist(bfr, use.names=FALSE)
|
125
|
|
}
|
126
|
|
|
127
|
|
## Sanity check
|
128
|
1
|
stop_if_not(is.character(bfr), length(bfr) == 1L)
|
129
|
|
|
130
|
1
|
bfr
|
131
|
|
} # .readText()
|