1 30
module ParquetFiles
2

3
using Parquet, IteratorInterfaceExtensions, TableTraits, FileIO
4
import IterableTables, DataValues, TableShowUtils
5

6
export load, File, @format_str
7

8
"""
    ParquetFile(filename)

Handle for a Parquet file, identified by its path.

Only the path is stored here; the file itself is opened when an iterator is
requested through `getiterator`.
"""
struct ParquetFile
    filename::String
end
11

12
# Plain-text display: render the file's rows as a table titled "Parquet file".
function Base.show(io::IO, source::ParquetFile)
    rows = getiterator(source)
    return TableShowUtils.printtable(io, rows, "Parquet file")
end
15

16
# HTML display: render the file's rows as an HTML table.
function Base.show(io::IO, ::MIME"text/html", source::ParquetFile)
    rows = getiterator(source)
    return TableShowUtils.printHTMLtable(io, rows)
end
19 0
# A `ParquetFile` always advertises an HTML representation.
function Base.Multimedia.showable(::MIME"text/html", ::ParquetFile)
    return true
end
20

21
# Data-resource display: emit the file's rows as `application/vnd.dataresource+json`.
function Base.show(io::IO, ::MIME"application/vnd.dataresource+json", source::ParquetFile)
    rows = getiterator(source)
    return TableShowUtils.printdataresource(io, rows)
end
24 0
# A `ParquetFile` always advertises a data-resource JSON representation.
function Base.Multimedia.showable(::MIME"application/vnd.dataresource+json", ::ParquetFile)
    return true
end
25

26
"""
    ParquetNamedTupleIterator{T,T_row}

Row iterator over a Parquet file.

`T` is the element type reported by `eltype` and constructed by `iterate`.
`T_row` is the row struct type that `getiterator` passes when it builds the
iterator; it is carried only as a type parameter.

# Fields
- `rc`: the underlying `RecCursor` that `iterate` delegates to.
- `nrows`: the value reported by `length`.
"""
struct ParquetNamedTupleIterator{T,T_row}
    # NOTE(review): if `RecCursor` is a parametric type, this field is abstractly
    # typed; confirm against the Parquet package before tightening it.
    rc::RecCursor
    nrows::Int
end
30

31
"""
    eltype(::Type{ParquetNamedTupleIterator{T,T_row}})

Return `T`, the type of the elements produced when iterating.

The method is defined on the iterator *type*, as the iteration interface
expects, so `eltype(typeof(itr))` reports `T` instead of falling back to
`Any`. Calls on an instance, `eltype(itr)`, keep working through Base's
generic `eltype(x) = eltype(typeof(x))` fallback.
"""
Base.eltype(::Type{ParquetNamedTupleIterator{T,T_row}}) where {T,T_row} = T
34

35
# The number of elements is the row count stored when the iterator was built.
Base.length(itr::ParquetNamedTupleIterator) = itr.nrows
38

39 5
# Advance the wrapped cursor by one row and repackage that row as a `T`.
#
# This is a generated function: the code before the `quote` runs once per
# concrete `T` and builds one field-access expression per field of `T`.
# Fields whose type in `T` is `String` are read from the row, copied and
# converted with `String`; all other fields are passed through unchanged.
@generated function Base.iterate(itr::ParquetNamedTupleIterator{T,T_row}, state...) where {T,T_row}
    field_exprs = [
        fieldtype(T, i) <: String ? :(String(copy(row.$(name)))) : :(row.$(name))
        for (i, name) in enumerate(fieldnames(T))
    ]

    return quote
        cursor_step = iterate(itr.rc, state...)
        # The cursor is exhausted, so this iterator is too.
        cursor_step === nothing && return nothing

        row, next_state = cursor_step
        return T(($(field_exprs...),)), next_state
    end
end
52

53
# Loader for files tagged with the "Parquet" format: wrap the file's path in a
# `ParquetFile` without reading anything yet.
fileio_load(f::FileIO.File{FileIO.format"Parquet"}) = ParquetFile(f.filename)
56

57 30
# Every `ParquetFile` can produce an iterator via `getiterator`.
function IteratorInterfaceExtensions.isiterable(::ParquetFile)
    return true
end
58 30
# Every `ParquetFile` is declared to be an iterable table.
function TableTraits.isiterabletable(::ParquetFile)
    return true
end
59

60
"""
    getiterator(file::ParquetFile)

Open `file` and return a `ParquetNamedTupleIterator` over its rows.

The element type is a `NamedTuple` whose names are the fields of the row
struct and whose types are that struct's field types, with any subtype of
`Vector{UInt8}` replaced by `String`.
"""
function IteratorInterfaceExtensions.getiterator(file::ParquetFile)
    parfile = ParFile(file.filename)

    # Fresh name for the row struct: a gensym name with its first two
    # characters dropped, prefixed with "RCType".
    rowtype_name = Symbol("RCType$(String(gensym())[3:end])")

    # NOTE(review): `schema` presumably defines a struct called `rowtype_name`
    # in this module; the `eval` below relies on that binding existing.
    schema(JuliaConverter(ParquetFiles), parfile, rowtype_name)
    T_row = eval(rowtype_name)

    fields = fieldnames(T_row)
    field_types = [ft <: Vector{UInt8} ? String : ft for ft in T_row.types]
    T = NamedTuple{fields,Tuple{field_types...}}

    cursor = RecCursor(parfile, 1:nrows(parfile), colnames(parfile), JuliaBuilder(parfile, T_row))

    return ParquetNamedTupleIterator{T,T_row}(cursor, nrows(parfile))
end
80

81
end # module

Read our documentation on viewing source code.

Loading