1
make_equal_length <- function(lt) {
2 1
    len <- unlist(sapply(lt, length))
3 1
    if (is.null(len))
4 1
        return(lt)
5

6 1
    if (length(unique(len)) == 1) {
7 1
        return(lt)
8
    }
9
    else{
10 0
        maxlen <- max(len)
11 0
        return(lapply(lt, function(x) c(x, rep("NA", maxlen-length(x)))))
12
    }
13
}
14

15

16
xmlEpisodeToList <- function(episode_node) {
17 1
    traverseNode <- function(node) {
18 1
        if (is.null(node)) {
19
            #leaf node reached. Turn back
20 1
            return()
21
        }
22

23 1
        num.children <- xmlSize(node)  
24 1
        if(num.children == 0 ) {
25 1
            label <- xmlName(xmlParent(node))
26 1
            if (is.null(.which.type(label))) {
27 0
                warning("XML structure error:", label)
28
            }
29
            else {
30 1
                if (.which.type(label) == "item1d") {
31 1
                    node_env$data[[label]][[label]] <- xmlValue(node)
32
                }
33

34
                else {
35 1
                    parent_size <- xmlSize(xmlParent(xmlParent(node)))
36 1
                    sib <- getSibling(xmlParent(node))
37

38 1
                    if (parent_size == 2) {
39 1
                        if (!is.null(sib)) {
40 1
                            old_data <- node_env$data[[label]][[label]]
41

42 1
                            if(is.null(old_data))
43 1
                                node_env$data[[label]][[label]] <- vector()
44
                            
45 1
                            node_env$data[[label]][[label]] <- c(old_data, xmlValue(node))
46

47 1
                            old_sibling <- node_env$data[[label]][[xmlName(sib)]]
48

49 1
                            if (is.null(old_sibling))
50 1
                                node_env$data[[label]][[xmlName(sib)]] <- vector()
51
                                
52 1
                            node_env$data[[label]][[xmlName(sib)]] <- c(old_sibling, xmlValue(sib))
53
                        }
54
                    }
55 1
                    else if (parent_size == 3) {
56 1
                        if (!is.null(sib)){
57 1
                            sibsib <- getSibling(sib)
58 1
                            if (!is.null(sibsib)) {
59 1
                                sibling1_name <- xmlName(sib)
60 1
                                sibling2_name <- xmlName(sibsib)
61

62 1
                                node_env$data[[label]] <- make_equal_length(node_env$data[[label]])
63

64 1
                                old_data <- node_env$data[[label]][[label]]
65 1
                                old_sibling1 <- node_env$data[[label]][[sibling1_name]]
66 1
                                old_sibling2 <- node_env$data[[label]][[sibling2_name]]
67

68 1
                                if (is.null(old_sibling1))
69 1
                                    node_env$data[[label]][[sibling1_name]] <- vector()
70 1
                                if (is.null(old_sibling2))
71 1
                                    node_env$data[[label]][[sibling2_name]] <- vector()
72

73

74 1
                                node_env$data[[label]][[label]] <- c(old_data, xmlValue(node))
75 1
                                node_env$data[[label]][[sibling1_name]] <- c(old_sibling1, xmlValue(sib))
76 1
                                node_env$data[[label]][[sibling2_name]] <- c(old_sibling2, xmlValue(sibsib))
77
                            }
78
                        }
79
                    }
80

81
                }
82
            }
83
        }
84

85
        #Go one level denode_env$dataer
86 1
        for (i in 1 : num.children) {
87 1
            if (xmlValue(node) == "") {
88 0
                warning("XML structure is wrong in ", xmlName(node))
89 0
                next
90
            }
91 1
            traverseNode(node[[i]]) #the i-th child of node
92
        }
93
    }
94

95 1
    node_env <- new.env()
96 1
    node_env$data <- list() # store data from the XML traverser 
97 1
    traverseNode(episode_node)
98 1
    node_env$ccdata <- list() # format can be directely called by ccEpisode.
99
    
100
    # rearrange the vector data from node_env$data to ccdata format.
101
    # Regarding to the performance, XML traverser only put all the data in a
102
    # vector form, as vector appending is much faster than data.frame
103
    # appending.
104 1
    lapply(node_env$data, function(x) {
105 1
               len <- length(x)
106 1
               if (len == 1) { # 1d item (simple item)
107 1
                   if (.which.type(names(x)) == "item1d" & is.character(x[[1]]))
108 1
                       node_env$ccdata[[names(x)]] <- as.character(x)
109
                   else
110 0
                       stop("wrong simple data", names(x))
111
               }
112 1
               else if (len == 2) { # 2d item (items in time)
113 1
                   nm <- c(.which.type(names(x)[1]), .which.type(names(x)[2]))
114 1
                   label <- names(x)[nm == "item2d"]
115 1
                   if (length(label) == 1) {
116 1
                       node_env$ccdata[[label]] <- .simple.data.frame(x)
117 1
                       names(node_env$ccdata[[label]]) <- nm
118
                   }
119
               }
120 1
               else if (len == 3) { # time data with meta data, i.e. 3 columns
121
                   # list has the same length. In the case that 3 columns
122
                   # have missing column in the end, so putting NAs in the end.
123 1
                   x <- make_equal_length(x)                    
124
                   
125 1
                   nm <- c(.which.type(names(x)[1]), .which.type(names(x)[2]),
126 1
                           .which.type(names(x)[3]))
127 1
                   label <- names(x)[nm == "item2d"]
128
                   # usually label, i.e. item2d should be unique, however just
129
                   # in case of incomplete 4-column data in which more than 1
130
                   # item2d will be found.
131 1
                   for (i in label) {
132 1
                       node_env$ccdata[[i]] <- .simple.data.frame(x)
133 1
                       names(node_env$ccdata[[i]]) <- nm
134
                   }
135
               }
136
               # still wrong and keep it as it is for now.
137
               # 4 columns case only happens in laboratory microbiology
138
               # culture, where item labels 0186 (Site), 0187 (Organism) 
139
               # and 0189 (Sensitivity) share the same time label and being
140
               # allocated in the same XML block. In `ccdata` a 4 columns data
141
               # frame will be created and duplicated under the item names. 
142 0
               else if (len == 4) {
143 0
                   nm <- c(.which.type(names(x)[1]), .which.type(names(x)[2]), 
144 0
                           .which.type(names(x)[3]), .which.type(names(x)[4]))
145 0
                   label <- names(x)[nm == "item2d"]
146 0
                   for(i in label) {
147 0
                       node_env$ccdata[[i]] <- data.frame(x[[1]], x[[2]],
148 0
                                                          x[[3]], x[[4]])
149 0
                       names(node_env$ccdata[[i]]) <- nm
150
                   }
151
               }
152
               else{
153 0
                   print(x)
154 0
                   stop("0 or more than 4 columns here!")
155
               }
156
})
157 1
    return(node_env$ccdata)
158
}
159

160

161
#' load xml clinical data
162
#'
163
#' @param file character string. The path of the XML file.
164
#' @return the root of the xml data. 
165
xmlLoad <- function(file) {
166 1
    file.parse <- xmlParse(file)
167 1
    xml.root <- xmlRoot(file.parse)
168 1
    return(xml.root)
169
}
170

171
#' get the episode data from xml 
172
#'
173
#' @param xml.root root of xml data returned by xmlLoad()
174
#' @param id integer
175
getXmlepisode <- function(xml.root, id) {
176 1
    xml.root[[1]][[2]][[id]]
177
}
178

179
#' Extract the original file name from a path and file removing
180
#' all the suffixes.
181
#'
182
#' @param pathfile a particular file name which may have a suffix
183
#' @param removestr last bit from the original filename
184
#' @return string
185
extract_file_origin <- function(pathfile, removestr='.xml'){
186 1
  split_path <- unlist(strsplit(pathfile, "/"))
187 1
  filename <- split_path[length(split_path)]
188 1
  original <- unlist(strsplit(filename, removestr))
189 1
  return(paste(original[1], removestr, sep=""))
190
  }
191

192

193
#' Convert the XML file to ccRecord 
194
#' 
195
#' Convert the XML file to ccRecord. For more details, see ccRecord-class. 
196
#' 
197
#' @param file character string. The path of XML file. 
198
#' @param select.episode integer vector. Load only a selected number of episodes. 
199
#' It is NULL by default which loads all the episodes in a file. 
200
#' @param quiet logical. Switch on/off the progress bar. 
201
#' @param xml XML object. Usually not needed. 
202
#' @param file_origin character string. The XML file name. The file name will be 
203
#' extracted automatically while argument xml is NULL. 
204
#' @param parse_time POSIXct. By default is the time of the execution of this function. 
205
#' @return ccRecord-class 
206
#' @export xml2Data
207
xml2Data <- function (file, select.episode=NULL, quiet=TRUE, xml=NULL,
208
                      file_origin="NA", parse_time=Sys.time()){
209 1
  if (is.null(xml)) {
210 1
    if (file_origin == "NA") {
211 1
      file_origin <- extract_file_origin(file)
212
    }
213 1
    xml <- xmlLoad(file)
214
  }
215

216 1
    episode.num <- xmlSize(xml[[1]][[2]])
217 1
    if(is.null(select.episode))
218 1
        select.episode <- seq(episode.num)
219

220

221 1
    if (!quiet)
222 0
        pb <- utils::txtProgressBar(min = min(select.episode)-1, 
223 0
                             max = max(select.episode), style = 3)
224 1
    eps <- list()
225 1
    if (!quiet)
226 0
        cat("parsing XML file: ", file, "\n")
227

228 1
    for(episode.id in select.episode){
229 1
        episode <- getXmlepisode(xml, episode.id)
230 1
        episode_list <- tryCatch(xmlEpisodeToList(episode), 
231 1
                                 error=function(err) {
232 0
                                     cat(paste(err, "episode.id = ", episode.id, "\n"))
233 0
                                     stop()
234
                                 })
235 1
        eps[[episode.id]] <- new.episode(episode_list, file_origin, parse_time)
236

237 1
        if (!quiet)
238 0
            utils::setTxtProgressBar(pb, episode.id)
239
    }
240

241 1
    record <- ccRecord() + eps
242 1
    if (!quiet)
243 0
        cat("\n")
244
    
245 1
    return(record)
246
}

Read our documentation on viewing source code .

Loading