1
|
|
module DBFTables
|
2
|
|
|
3
|
|
using Printf, Tables, WeakRefStrings
|
4
|
|
|
5
|
|
"Field/column descriptor, part of the Header"
|
6
|
|
struct FieldDescriptor
|
7
|
11
|
name::Symbol
|
8
|
|
type::Type
|
9
|
|
length::UInt8
|
10
|
|
ndec::UInt8
|
11
|
|
end
|
12
|
|
|
13
|
|
"DBF header, which also holds all field definitions"
|
14
|
|
struct Header
|
15
|
11
|
version::UInt8
|
16
|
|
last_update::String
|
17
|
|
records::UInt32
|
18
|
|
hsize::UInt16
|
19
|
|
rsize::UInt16
|
20
|
|
incomplete::Bool
|
21
|
|
encrypted::Bool
|
22
|
|
mdx::Bool
|
23
|
|
lang_id::UInt8
|
24
|
|
fields::Vector{FieldDescriptor}
|
25
|
|
fieldcolumns::Dict{Symbol, Int}
|
26
|
|
end
|
27
|
|
|
28
|
|
"Struct representing the DBF Table"
|
29
|
|
struct Table
|
30
|
7
|
header::Header
|
31
|
|
data::Vector{UInt8} # WeakRefString references this
|
32
|
|
strings::StringArray{String,2}
|
33
|
|
end
|
34
|
|
|
35
|
|
"Struct representing a single row or record of the DBF Table"
|
36
|
|
struct Row
|
37
|
11
|
table::Table
|
38
|
|
row::Int
|
39
|
|
end
|
40
|
|
|
41
|
|
"Convert DBF data type characters to Julia types"
|
42
|
|
function typemap(fld::Char, ndec::UInt8)
|
43
|
|
# https://www.clicketyclick.dk/databases/xbase/format/data_types.html
|
44
|
7
|
rt = Nothing
|
45
|
11
|
if fld == 'C'
|
46
|
11
|
rt = String
|
47
|
11
|
elseif fld == 'D'
|
48
|
11
|
rt = String
|
49
|
11
|
elseif fld == 'N'
|
50
|
11
|
if ndec > 0
|
51
|
11
|
rt = Float64
|
52
|
|
else
|
53
|
|
# TODO do we want this?
|
54
|
11
|
rt = Int
|
55
|
|
end
|
56
|
11
|
elseif fld == 'F' || fld == 'O'
|
57
|
11
|
rt = Float64
|
58
|
11
|
elseif fld == 'I' || fld == '+'
|
59
|
0
|
rt = Int
|
60
|
11
|
elseif fld == 'L'
|
61
|
11
|
rt = Bool
|
62
|
|
else
|
63
|
0
|
throw(ArgumentError("Unknown record type $fld"))
|
64
|
|
end
|
65
|
11
|
return rt
|
66
|
|
end
|
67
|
|
|
68
|
|
"Read a field descriptor from the stream, and create a FieldDescriptor struct"
|
69
|
3
|
function read_dbf_field(io::IO)
|
70
|
11
|
field_name_raw = String(read!(io, Vector{UInt8}(undef, 11)))
|
71
|
7
|
field_name = Symbol(strip(replace(field_name_raw, '\0' => ' ')))
|
72
|
11
|
field_type = read(io, Char)
|
73
|
7
|
skip(io, 4) # skip
|
74
|
7
|
field_len = read(io, UInt8)
|
75
|
7
|
field_dec = read(io, UInt8)
|
76
|
7
|
skip(io, 14) # reserved
|
77
|
7
|
jltype = typemap(field_type, field_dec)
|
78
|
11
|
return FieldDescriptor(field_name, jltype, field_len, field_dec)
|
79
|
|
end
|
80
|
|
|
81
|
|
"Read a DBF header from a stream"
|
82
|
3
|
function Header(io::IO)
|
83
|
11
|
ver = read(io, UInt8)
|
84
|
7
|
date1 = read(io, UInt8)
|
85
|
7
|
date2 = read(io, UInt8)
|
86
|
7
|
date3 = read(io, UInt8)
|
87
|
11
|
last_update = @sprintf("%4d%02d%02d", date1 + 1900, date2, date3)
|
88
|
7
|
records = read(io, UInt32)
|
89
|
7
|
hsize = read(io, UInt16)
|
90
|
7
|
rsize = read(io, UInt16)
|
91
|
7
|
skip(io, 2) # reserved
|
92
|
11
|
incomplete = Bool(read(io, UInt8))
|
93
|
11
|
encrypted = Bool(read(io, UInt8))
|
94
|
7
|
skip(io, 12) # reserved
|
95
|
11
|
mdx = Bool(read(io, UInt8))
|
96
|
7
|
lang_id = read(io, UInt8)
|
97
|
7
|
skip(io, 2) # reserved
|
98
|
7
|
fields = FieldDescriptor[]
|
99
|
|
|
100
|
|
# use Dict for quicker column index lookup
|
101
|
11
|
fieldcolumns = Dict{Symbol, Int}()
|
102
|
7
|
col = 1
|
103
|
11
|
while !eof(io)
|
104
|
11
|
field = read_dbf_field(io)
|
105
|
11
|
fieldcolumns[field.name] = col
|
106
|
7
|
push!(fields, field)
|
107
|
7
|
col += 1
|
108
|
|
|
109
|
|
# peek if we are at the end
|
110
|
7
|
mark(io)
|
111
|
7
|
trm = read(io, UInt8)
|
112
|
11
|
if trm == 0xD
|
113
|
11
|
break
|
114
|
|
else
|
115
|
11
|
reset(io)
|
116
|
|
end
|
117
|
|
end
|
118
|
|
|
119
|
11
|
return Header(
|
120
|
|
ver,
|
121
|
|
last_update,
|
122
|
|
records,
|
123
|
|
hsize,
|
124
|
|
rsize,
|
125
|
|
incomplete,
|
126
|
|
encrypted,
|
127
|
|
mdx,
|
128
|
|
lang_id,
|
129
|
|
fields,
|
130
|
|
fieldcolumns,
|
131
|
|
)
|
132
|
|
end
|
133
|
|
|
134
|
11
|
miss(x) = ifelse(x === nothing, missing, x)
|
135
|
|
|
136
|
|
"Concert a DBF entry string to a Julia value"
|
137
|
3
|
function dbf_value(T::Type{Bool}, str::AbstractString)
|
138
|
11
|
char = first(str)
|
139
|
11
|
if char in "YyTt"
|
140
|
11
|
true
|
141
|
11
|
elseif char in "NnFf"
|
142
|
11
|
false
|
143
|
11
|
elseif char == '?'
|
144
|
11
|
missing
|
145
|
|
else
|
146
|
0
|
throw(ArgumentError("Unknown logical $char"))
|
147
|
|
end
|
148
|
|
end
|
149
|
|
|
150
|
11
|
dbf_value(T::Union{Type{Int},Type{Float64}}, str::AbstractString) =
|
151
|
|
miss(tryparse(T, str))
|
152
|
|
# String to avoid returning SubString{String}
|
153
|
3
|
function dbf_value(T::Type{String}, str::AbstractString)
|
154
|
7
|
stripped = rstrip(str)
|
155
|
11
|
if isempty(stripped)
|
156
|
|
# return missing rather than ""
|
157
|
11
|
return missing
|
158
|
|
else
|
159
|
11
|
return String(stripped)
|
160
|
|
end
|
161
|
|
end
|
162
|
0
|
dbf_value(T::Type{Nothing}, str::AbstractString) = missing
|
163
|
|
|
164
|
|
# define get functions using getfield since we overload getproperty
|
165
|
|
"Access the header of a DBF Table"
|
166
|
11
|
getheader(dbf::Table) = getfield(dbf, :header)
|
167
|
7
|
getfields(dbf::Table) = getheader(dbf).fields
|
168
|
11
|
getstrings(dbf::Table) = getfield(dbf, :strings)
|
169
|
11
|
getrow(row::Row) = getfield(row, :row)
|
170
|
11
|
gettable(row::Row) = getfield(row, :table)
|
171
|
|
|
172
|
8
|
Base.length(dbf::Table) = Int(getheader(dbf).records)
|
173
|
11
|
Base.size(dbf::Table) = (length(dbf), length(getfields(dbf)))
|
174
|
11
|
Base.size(dbf::Table, i) = size(dbf)[i]
|
175
|
|
|
176
|
|
"""
|
177
|
|
DBFTables.Table(source) => DBFTables.Table
|
178
|
|
|
179
|
|
Read a source, a path to a file or an opened stream, to a DBFTables.Table.
|
180
|
|
This type conforms to the Tables interface, so it can be easily converted
|
181
|
|
to other formats. It is possible to iterate through the rows of this object,
|
182
|
|
or to retrieve columns like `dbf.fieldname`.
|
183
|
|
"""
|
184
|
3
|
function Table(io::IO)
|
185
|
7
|
header = Header(io)
|
186
|
|
# consider using mmap here for big dbf files
|
187
|
3
|
data = Vector{UInt8}(undef, header.rsize * header.records)
|
188
|
3
|
read!(io, data)
|
189
|
7
|
strings = _create_stringarray(header, data)
|
190
|
7
|
Table(header, data, strings)
|
191
|
|
end
|
192
|
|
|
193
|
3
|
function Table(path::AbstractString)
|
194
|
11
|
open(path) do io
|
195
|
7
|
Table(io)
|
196
|
|
end
|
197
|
|
end
|
198
|
|
|
199
|
|
"Collect all the offsets and lenghts from the header to create a StringArray"
|
200
|
3
|
function _create_stringarray(header::Header, data::AbstractVector)
|
201
|
|
# first make the lengths and offsets for a single record
|
202
|
11
|
lengths_record = UInt32.(getfield.(header.fields, :length))
|
203
|
7
|
offsets_record = vcat(0, cumsum(lengths_record)[1:end-1]) .+ 1
|
204
|
|
|
205
|
|
# the lengths are equal for each record
|
206
|
11
|
lengths = repeat(lengths_record, 1, header.records)
|
207
|
|
# the offsets accumulate over records with the record size
|
208
|
7
|
row_offsets = range(0; length = header.records, step = header.rsize)
|
209
|
11
|
offsets = repeat(offsets_record, 1, header.records)
|
210
|
11
|
offsets .+= reshape(row_offsets, 1, :)
|
211
|
|
|
212
|
11
|
StringArray{String,2}(data, offsets, lengths)
|
213
|
|
end
|
214
|
|
|
215
|
|
"Create a NamedTuple representing a single row"
|
216
|
3
|
function Base.NamedTuple(row::Row)
|
217
|
11
|
dbf = gettable(row)
|
218
|
7
|
str = getstrings(dbf)
|
219
|
7
|
fields = getfields(dbf)
|
220
|
7
|
ncol = length(fields)
|
221
|
7
|
rowidx = getrow(row)
|
222
|
7
|
@inbounds record = @view str[:, rowidx]
|
223
|
11
|
@inbounds prs = (fields[col].name => dbf_value(
|
224
|
|
fields[col].type,
|
225
|
|
record[col],
|
226
|
|
) for col = 1:ncol)
|
227
|
11
|
return (; prs...)
|
228
|
|
end
|
229
|
|
|
230
|
3
|
function Base.show(io::IO, row::Row)
|
231
|
11
|
show(io, NamedTuple(row))
|
232
|
|
end
|
233
|
|
|
234
|
3
|
function Base.show(io::IO, dbf::Table)
|
235
|
11
|
nr = length(dbf)
|
236
|
7
|
nc = length(getfields(dbf))
|
237
|
7
|
println(io, "DBFTables.Table with $nr rows and $nc columns")
|
238
|
11
|
println(io, Tables.schema(dbf))
|
239
|
|
end
|
240
|
|
|
241
|
0
|
Base.isempty(dbf::Table) = getheader(dbf).records == 0
|
242
|
|
|
243
|
|
"Get a BitVector which is true for rows that are marked as deleted"
|
244
|
3
|
function isdeleted(dbf::Table)
|
245
|
11
|
data = getfield(dbf, :data)
|
246
|
7
|
rsize = getheader(dbf).rsize
|
247
|
7
|
nrow = getheader(dbf).records
|
248
|
7
|
idx = range(1, step = rsize, length = nrow)
|
249
|
11
|
data[idx] .== 0x2a
|
250
|
|
end
|
251
|
|
|
252
|
|
"Check if the row is marked as deleted"
|
253
|
3
|
function isdeleted(dbf::Table, row::Integer)
|
254
|
11
|
data = getfield(dbf, :data)
|
255
|
7
|
i = (row - 1) * getheader(dbf).rsize + 1
|
256
|
11
|
data[i] == 0x2a
|
257
|
|
end
|
258
|
|
|
259
|
|
"Iterate over the rows of a DBF Table, yielding a DBFTables.Row for each row"
|
260
|
3
|
function Base.iterate(dbf::Table, st = 1)
|
261
|
11
|
st > length(dbf) && return nothing
|
262
|
11
|
return Row(dbf, st), st + 1
|
263
|
|
end
|
264
|
|
|
265
|
3
|
function Base.getproperty(row::Row, name::Symbol)
|
266
|
11
|
dbf = gettable(row)
|
267
|
7
|
header = getheader(dbf)
|
268
|
7
|
str = getstrings(dbf)
|
269
|
11
|
colidx = get(header.fieldcolumns, name, nothing)
|
270
|
11
|
colidx === nothing && throw(ArgumentError("Column not present: $name"))
|
271
|
11
|
type = @inbounds getfields(dbf)[colidx].type
|
272
|
7
|
rowidx = getrow(row)
|
273
|
11
|
return @inbounds dbf_value(type, str[colidx, rowidx])
|
274
|
|
end
|
275
|
|
|
276
|
7
|
Tables.istable(::Type{Table}) = true
|
277
|
0
|
Tables.rowaccess(::Type{Table}) = true
|
278
|
0
|
Tables.columnaccess(::Type{Table}) = true
|
279
|
0
|
Tables.rows(dbf::Table) = dbf
|
280
|
7
|
Tables.columns(dbf::Table) = dbf
|
281
|
|
|
282
|
|
"Get the Tables.Schema of a DBF Table"
|
283
|
3
|
function Tables.schema(dbf::Table)
|
284
|
11
|
names = Tuple(field.name for field in getfields(dbf))
|
285
|
|
# since missing is always supported, add it to the schema types
|
286
|
11
|
types = Tuple(Union{field.type,Missing} for field in getfields(dbf))
|
287
|
11
|
Tables.Schema(names, types)
|
288
|
|
end
|
289
|
|
|
290
|
|
"List all available DBF column names"
|
291
|
11
|
Base.propertynames(dbf::Table) = getfield.(getfield(dbf, :header).fields, :name)
|
292
|
11
|
Base.propertynames(row::Row) = propertynames(gettable(row))
|
293
|
|
|
294
|
|
"Create a copy of an entire DBF column as a Vector. Usage: `dbf.myfield`"
|
295
|
3
|
function Base.getproperty(dbf::Table, name::Symbol)
|
296
|
11
|
header = getheader(dbf)
|
297
|
11
|
col = get(header.fieldcolumns, name, nothing)
|
298
|
11
|
col === nothing && throw(ArgumentError("Column not present: $name"))
|
299
|
7
|
nrow = header.records
|
300
|
11
|
@inbounds type = getfields(dbf)[col].type
|
301
|
7
|
str = getstrings(dbf)
|
302
|
11
|
@inbounds colarr = [dbf_value(type, str[col, i]) for i = 1:nrow]
|
303
|
11
|
return colarr
|
304
|
|
end
|
305
|
|
|
306
|
|
end # module
|