# ropensci / aRxiv
# This is the template for the search API methods for arXiv
# User manual: http://arxiv.org/help/api/user-manual
# A simple search: http://arxiv.org/help/api/user-manual
# A boolean search: http://export.arxiv.org/api/query?search_query=all:electron+AND+all:proton

#' The main search function for aRxiv
#'
#' Allows for programmatic searching of the arXiv pre-print repository.
#'
#' @param query Search pattern as a string; a vector of such strings
#' also allowed, in which case the elements are combined with `AND`.
#' @param id_list arXiv doc IDs, as comma-delimited string or a vector
#' of such strings
#' @param start An offset for the start of search (`NULL` is treated
#' as 0)
#' @param limit Maximum number of records to return. If `NULL`, the
#' count returned by [arxiv_count()] for the search is used.
#' @param sort_by How to sort the results (ignored if `id_list` is
#' provided)
#' @param ascending If TRUE, sort in ascending order; else descending
#' (ignored if `id_list` is provided). Must be a single, non-missing
#' value.
#' @param batchsize Maximum number of records to request at one time
#' @param force If TRUE, force search request even if it seems extreme
#' @param output_format Indicates whether output should be a data frame or a list.
#' @param sep String to use to separate multiple authors,
#' affiliations, DOI links, and categories, in the case that
#' `output_format="data.frame"`.
#'
#' @return If `output_format="data.frame"`, the result is a data
#' frame with each row being a manuscript and columns being the
#' various fields.
#'
#' If `output_format="list"`, the result is a list parsed from
#' the XML output of the search, closer to the raw output from arXiv.
#'
#' The data frame format has the following columns.
#' \tabular{rll}{
#'  \[,1\] \tab id               \tab arXiv ID \cr
#'  \[,2\] \tab submitted        \tab date first submitted \cr
#'  \[,3\] \tab updated          \tab date last updated \cr
#'  \[,4\] \tab title            \tab manuscript title \cr
#'  \[,5\] \tab summary          \tab abstract \cr
#'  \[,6\] \tab authors          \tab author names \cr
#'  \[,7\] \tab affiliations     \tab author affiliations \cr
#'  \[,8\] \tab link_abstract    \tab hyperlink to abstract \cr
#'  \[,9\] \tab link_pdf         \tab hyperlink to pdf \cr
#' \[,10\] \tab link_doi         \tab hyperlink to DOI \cr
#' \[,11\] \tab comment          \tab authors' comment \cr
#' \[,12\] \tab journal_ref      \tab journal reference \cr
#' \[,13\] \tab doi              \tab published DOI \cr
#' \[,14\] \tab primary_category \tab primary category \cr
#' \[,15\] \tab categories       \tab all categories \cr
#' }
#'
#' The contents are all strings; missing values are empty strings (`""`).
#'
#' The columns `authors`, `affiliations`, `link_doi`,
#' and `categories` may have multiple entries separated by
#' `sep` (by default, `"|"`).
#'
#' The result includes an attribute `"search_info"` that includes
#' information about the details of the search parameters, including
#' the time at which it was completed. Another attribute
#' `"total_results"` is the total number of records that match
#' the query.
#'
#' If both `query` and `id_list` are blank, an empty result is
#' returned. If the HTTP request itself fails (for example, on a
#' timeout), the package's timeout handler is invoked and, if it
#' returns, the function returns `NULL` invisibly.
#'
#' @seealso [arxiv_count()], [arxiv_open()],
#' [query_terms()], [arxiv_cats()]
#'
#' @examples
#' \dontshow{old_delay <- getOption("aRxiv_delay")
#'           options(aRxiv_delay=1)}
#' \donttest{
#' # search for author Peter Hall with deconvolution in title
#' z <- arxiv_search(query = 'au:"Peter Hall" AND ti:deconvolution', limit=2)
#' attr(z, "total_results") # total no. records matching query
#' z$title
#'
#' # search for a set of documents by arxiv identifiers
#' z <- arxiv_search(id_list = c("0710.3491v1", "0804.0713v1", "1003.0315v1"))
#' # can also use a comma-separated string
#' z <- arxiv_search(id_list = "0710.3491v1,0804.0713v1,1003.0315v1")
#' # Journal references, if available
#' z$journal_ref
#'
#' # search for a range of dates (in this case, one day)
#' z <- arxiv_search("submittedDate:[199701010000 TO 199701012400]", limit=2)
#' }
#' \dontshow{options(aRxiv_delay=old_delay)}
#'
#' @export
arxiv_search <-
function(query=NULL, id_list=NULL, start=0, limit=10,
         sort_by=c("submitted", "updated", "relevance"),
         ascending=TRUE, batchsize=100, force=FALSE,
         output_format=c("data.frame", "list"), sep="|")
{
    query_url <- "http://export.arxiv.org/api/query"

    # collapse vector inputs into the single strings sent to the API
    query <- paste_query(query)
    id_list <- paste_id_list(id_list)

    # nothing to search for
    if(is_blank(query) && is_blank(id_list)) return(empty_result())

    sort_by <- match.arg(sort_by)
    output_format <- match.arg(output_format)

    # `ascending` selects between two strings, so it has to be one
    # definite TRUE/FALSE; otherwise NA or a vector would be passed on
    # to the API as the sort order
    ascending <- as.logical(ascending)
    if(length(ascending) != 1 || is.na(ascending))
        stop("ascending must be a single TRUE or FALSE value")
    sort_order <- if(ascending) "ascending" else "descending"

    if(is.null(start)) start <- 0
    if(is.null(limit)) limit <- arxiv_count(query, id_list)

    stopifnot(start >= 0)
    stopifnot(limit >= 0)
    stopifnot(batchsize >= 1)

    # if force=FALSE, check that we aren't asking for too much
    if(!force) {
        too_many_res <- is_too_many(query, id_list, start, limit)
        if(too_many_res)
            stop("Expecting ", too_many_res, " results; refine your search")
        # NOTE(review): a further check of the form
        #   too_many_res > batchsize && batchsize > 1000
        # was dropped from this spot because it could never fire: getting
        # past the stop() above requires too_many_res to be zero/FALSE,
        # and batchsize >= 1 is enforced earlier. If a guard against very
        # large batch sizes is wanted, it needs a different condition.
    }

    if(limit > batchsize) { # use batches
        return(arxiv_search_inbatches(query=query, id_list=id_list,
                                      start=start, limit=limit,
                                      sort_by=sort_by, ascending=ascending,
                                      batchsize=batchsize, force=force,
                                      output_format=output_format, sep=sep))
    }

    delay_if_necessary()

    # do search
    # (extra messy to avoid possible problems when testing on CRAN
    #    timeout_action defined in timeout.R)
    body <- list(search_query=query, id_list=id_list,
                 start=start, max_results=limit,
                 sortBy=recode_sortby(sort_by), sortOrder=sort_order)
    body <- drop_nulls(body)
    search_result <- try(httr::POST(query_url,
                                    body=body,
                                    httr::timeout(get_arxiv_timeout())))
    if(inherits(search_result, "try-error")) {
        # the request itself failed (e.g., timed out)
        timeout_action()
        return(invisible(NULL))
    }

    set_arxiv_time() # set time for last call to arXiv

    # convert XML results to a list
    listresult <- result2list(search_result)

    # check for arXiv error (done before the generic HTTP status check
    # so that arXiv's own message is the one reported)
    error_message <- arxiv_error_message(listresult)
    if(!is.null(error_message)) {
        stop("arXiv error: ", error_message)
    }

    # check for general http error
    httr::stop_for_status(search_result)

    # total no. records matching query
    total_results <- as.integer(listresult$totalResults)

    # pull out just the entries
    results <- get_entries(listresult)

    # convert to data frame
    if(output_format=="data.frame")
        results <- listresult2df(results, sep=sep)

    attr(results, "search_info") <-
        search_attributes(query, id_list, start, limit,
                          sort_by, sort_order)

    attr(results, "total_results") <- total_results

    results
}


# search in batches
#
# Splits a request for `limit` records starting at offset `start` into
# consecutive requests of at most `batchsize` records, each made through
# arxiv_search() with output_format="list", and concatenates the
# entries. Arguments have the same meaning as in arxiv_search().
#
# Returns the combined entries (converted with listresult2df() when
# output_format="data.frame"), carrying the "search_info" attribute and,
# when at least one batch reported it, the "total_results" attribute.
# Retrieval stops early when a batch request fails (arxiv_search()
# returns NULL) or when a batch contains no entries.
arxiv_search_inbatches <-
function(query=NULL, id_list=NULL, start=0, limit=10,
         sort_by=c("submitted", "updated", "relevance"),
         ascending=TRUE, batchsize=500, force=FALSE,
         output_format=c("data.frame", "list"), sep="|")
{
    sort_by <- match.arg(sort_by)
    sort_order <- if(ascending) "ascending" else "descending"
    output_format <- match.arg(output_format)

    # offset of the first record in each batch
    starts <- seq(start, start+limit-1, by=batchsize)

    # maximum record to return
    max_record <- start + limit - 1

    # one slot per batch; slots of batches never fetched stay NULL
    batches <- vector("list", length(starts))
    total_results <- NULL

    for(i in seq_along(starts)) {

        # avoid returning more than a total of limit records
        this_limit <- min(batchsize, max_record - starts[i] + 1)
        if(this_limit == 0) break

        these_results <- arxiv_search(query=query, id_list=id_list,
                                      start=starts[i], limit=this_limit,
                                      sort_by=sort_by, ascending=ascending,
                                      batchsize=batchsize, force=force,
                                      output_format="list", sep=sep)

        # the request failed: keep whatever has been retrieved so far
        if(is.null(these_results)) break

        message("retrieved batch ", i)

        # grab total_results attribute (total no. records matching query);
        # keep the last value actually reported
        batch_total <- attr(these_results, "total_results")
        if(!is.null(batch_total)) total_results <- batch_total

        # if no more results? then return
        if(count_entries(these_results) == 0) break

        batches[[i]] <- these_results
    }

    # concatenate the batches; the leading NULL keeps this equivalent to
    # repeated c(results, these_results) starting from results = NULL
    results <- do.call(c, c(list(NULL), batches))

    if(output_format=="data.frame") {
        results <- listresult2df(results, sep=sep)
    } else if(is.null(results)) {
        # attributes cannot be set on NULL, so use an empty list
        results <- list()
    }

    attr(results, "search_info") <-
        search_attributes(query, id_list, start, limit,
                          sort_by, sort_order)

    attr(results, "total_results") <- total_results

    results
}


# Translate the user-facing `sort_by` choice into the value passed as
# the `sortBy` request parameter.
#
# sort_by: one of "submitted", "updated", "relevance" (validated with
#          match.arg(), so the default is "submitted" and any other
#          value is an error).
# Returns a single string: "submittedDate", "lastUpdatedDate" or
# "relevance".
recode_sortby <-
function(sort_by=c("submitted", "updated", "relevance"))
{
    sort_by <- match.arg(sort_by)
    switch(sort_by,
           submitted="submittedDate",
           updated="lastUpdatedDate",
           relevance="relevance")
}


# an attribute to add to the result
#
# Bundles the search parameters into one named vector with elements
# query, id_list, start, limit, sort_by, sort_order and time.
# A NULL `query` or `id_list` is recorded as the empty string "".
# `time` is the current Sys.time() followed by Sys.timezone().
# Because the pieces are combined with c(), numeric values such as
# `start` and `limit` end up as strings when the others are strings.
# (assumes query and id_list are each NULL or a single string -- TODO
# confirm against callers)
search_attributes <-
function(query, id_list, start, limit, sort_by,
         sort_order)
{
    query_text <- if(is.null(query)) "" else query
    id_text <- if(is.null(id_list)) "" else id_list

    c(query=query_text,
      id_list=id_text,
      start=start, limit=limit, sort_by=sort_by,
      sort_order=sort_order, time=paste(Sys.time(), Sys.timezone()))
}

# Read our documentation on viewing source code.

# Loading