improve loop checks
1 |
import Tokenize.Lexers: peekchar, readchar, iswhitespace, emit, emit_error, accept_batch, eof |
|
2 |
|
|
3 |
# Shorthand aliases for the whitespace token kinds produced by the lexer.
const EmptyWS = Tokens.EMPTY_WS
const SemiColonWS = Tokens.SEMICOLON_WS
const NewLineWS = Tokens.NEWLINE_WS
const WS = Tokens.WS
# Sentinel used when no whitespace/comment trivia follows a token.
const EmptyWSToken = RawToken(EmptyWS, (0, 0), (0, 0), -1, -1)
|
8 |
|
|
9 |
"""
    Closer

A bag of boolean context flags plus the current `precedence` level.
NOTE(review): judging by the name and fields these presumably control the
conditions under which expression parsing must stop (e.g. at a newline,
comma, or closing bracket) — confirm against the parser code that reads them.
"""
mutable struct Closer
    newline::Bool
    semicolon::Bool
    tuple::Bool
    comma::Bool
    paren::Bool
    brace::Bool
    inmacro::Bool
    insquare::Bool
    inref::Bool
    inwhere::Bool
    square::Bool
    block::Bool
    ifop::Bool
    range::Bool
    ws::Bool
    wsop::Bool
    unary::Bool
    precedence::Int
end
|
|
29 | 21 |
# Default closer context: `newline` and `semicolon` set, the 15 remaining
# flags cleared, and no precedence restriction (-1).
Closer() = Closer(true, true, ntuple(_ -> false, 15)..., -1)
30 |
|
|
31 |
"""
    ParseState

Mutable state threaded through parsing: the underlying lexer plus a
lookahead window of tokens (last / current / next / next-next) and the
whitespace trivia that followed each of them.
"""
mutable struct ParseState
    l::Lexer{Base.GenericIOBuffer{Array{UInt8,1}},RawToken}
    done::Bool # Remove this
    lt::RawToken   # last token
    t::RawToken    # current token
    nt::RawToken   # next token
    nnt::RawToken  # next-next token
    lws::RawToken  # whitespace following `lt`
    ws::RawToken   # whitespace following `t`
    nws::RawToken  # whitespace following `nt`
    nnws::RawToken # whitespace following `nnt`
    closer::Closer
    errored::Bool
end
|
|
45 |
function ParseState(str::Union{IO,String})
    # Eight blank tokens fill the last/current/next/next-next token slots
    # and their four whitespace counterparts before the window is populated.
    blanks = ntuple(_ -> RawToken(), 8)
    ps = ParseState(tokenize(str, RawToken), false, blanks..., Closer(), false)
    # Advance twice so the lookahead slots hold real tokens.
    return next(next(ps))
end
|
|
49 |
|
|
50 |
function ParseState(str::Union{IO,String}, loc::Int)
    ps = ParseState(str)
    # Advance until the next token starts at or beyond byte offset `loc`.
    # `loop_check` presumably raises/flags an error when no forward progress
    # is made — TODO confirm against its definition.
    prev = position(ps)
    while ps.nt.startbyte < loc
        next(ps)
        prev = loop_check(ps, prev)
    end
    return ps
end
|
|
59 |
|
|
60 |
function Base.show(io::IO, ps::ParseState)
    println(io, "ParseState at $(position(ps.l.io))")
    # One line per slot of the token window, with its trailing whitespace kind.
    for (label, tok, trivia) in (("last ", ps.lt, ps.lws),
                                 ("current ", ps.t, ps.ws),
                                 ("next ", ps.nt, ps.nws))
        println(io, label, ": ", kindof(tok), " ($tok)", " ($(wstype(trivia)))")
    end
end
|
|
66 |
# Peek at the next character of the underlying lexer without consuming it.
peekchar(ps::ParseState) = peekchar(ps.l)
|
67 |
# Human-readable description of a whitespace token's kind (used by `show`).
function wstype(t::AbstractToken)
    k = kindof(t)
    if k == EmptyWS
        return "empty"
    elseif k == NewLineWS
        return "ws w/ newline"
    elseif k == SemiColonWS
        return "ws w/ semicolon"
    else
        return "ws"
    end
end
|
70 |
|
|
71 |
# Advance the parser by one token, shifting the lookahead window and
# lexing one new token (plus any trailing whitespace trivia) into the
# next-next slot. Returns `ps` so calls can be chained.
function next(ps::ParseState)
    # shift old tokens
    # NB: assignment order matters — each slot must be read before it is
    # overwritten (lt ← t ← nt ← nnt, and likewise for whitespace).
    ps.lt = ps.t
    ps.t = ps.nt
    ps.nt = ps.nnt
    ps.lws = ps.ws
    ps.ws = ps.nws
    ps.nws = ps.nnws

    # Pull a fresh token from the lexer into the next-next slot.
    ps.nnt = Tokenize.Lexers.next_token(ps.l)

    # combines whitespace, comments and semicolons
    if iswhitespace(peekchar(ps.l)) || peekchar(ps.l) == '#' || peekchar(ps.l) == ';'
        ps.nnws = lex_ws_comment(ps.l, readchar(ps.l))
    else
        ps.nnws = EmptyWSToken
    end

    return ps
end
|
|
91 |
|
|
92 |
function Base.seek(ps::ParseState, offset)
    # Reposition the underlying lexer, then advance twice to refill the
    # lookahead window from the new offset.
    seek(ps.l, offset)
    return next(next(ps))
end
|
|
96 |
|
|
97 | 21 |
# The byte offset at which the next (`nt`) token starts.
Base.position(ps::ParseState) = ps.nt.startbyte
98 |
|
|
99 |
"""
    read_ws_comment(l, c::Char)

Having hit an initial whitespace/comment/semicolon character, continues
collecting similar `Char`s until they end. Returns a `(newline, semicolon)`
flag pair indicating whether newlines/semicolons were encountered. A
semicolon takes precedence over line breaks, as the former is equivalent to
the latter in most cases.
"""
|
|
105 |
function read_ws_comment(l, c::Char)
    # Flags recording whether the trivia run contained a line break / semicolon.
    nl = c == '\n'
    semi = c == ';'
    if c == '#'
        nl = read_comment(l)
    else
        nl, semi = read_ws(l, nl, semi)
    end
    # Keep absorbing consecutive whitespace, comments and semicolons.
    while iswhitespace(peekchar(l)) || peekchar(l) == '#' || peekchar(l) == ';'
        ch = readchar(l)
        if ch == '#'
            read_comment(l)
            # After a comment, peek to see whether a newline/semicolon follows.
            nl = nl || peekchar(l) == '\n'
            semi = semi || peekchar(l) == ';'
        elseif ch == ';'
            semi = true
        else
            nl, semi = read_ws(l, nl, semi)
        end
    end
    return nl, semi
end
|
|
127 |
|
|
128 |
function lex_ws_comment(l::Lexer, c::Char)
    nl, semi = read_ws_comment(l, c)
    # A semicolon dominates a newline when classifying the whitespace run.
    kind = semi ? SemiColonWS : (nl ? NewLineWS : WS)
    return emit(l, kind)
end
|
|
133 |
|
|
134 |
|
|
135 |
# Consume a run of whitespace characters, updating the `newline` flag.
# (The ';' branch is kept from the original, though ';' is not whitespace
# and so cannot be reached from this loop's guard.)
function read_ws(l, newline, semicolon)
    while iswhitespace(peekchar(l))
        ch = readchar(l)
        if ch == '\n'
            newline = true
        elseif ch == ';'
            semicolon = true
        end
    end
    return newline, semicolon
end
|
|
143 |
|
|
144 |
# Consume a comment whose leading '#' has already been read. Returns `true`
# when the surrounding trivia should be treated as containing a newline
# (a line comment, or a properly terminated multi-line comment), `false`
# when EOF is hit inside an unterminated `#= ... =#` comment.
function read_comment(l)
    if peekchar(l) != '='
        # Line comment: consume up to, but not including, the newline.
        while true
            pc = peekchar(l)
            if pc == '\n' || eof(pc)
                return true
            end
            readchar(l)
        end
    else
        # Multi-line `#= ... =#` comment. These nest, so track the balance
        # of openers (`#=`) against closers (`=#`).
        c = readchar(l) # consume the '='
        n_start, n_end = 1, 0
        while true
            if eof(c)
                return false
            end
            # Examine each adjacent character pair (c, nc) for delimiters.
            nc = readchar(l)
            if c == '#' && nc == '='
                n_start += 1
            elseif c == '=' && nc == '#'
                n_end += 1
            end
            if n_start == n_end
                return true
            end
            c = nc
        end
    end
end
|
|
173 |
|
|
174 |
# Functions relating to tokens
|
|
175 | 23 |
# True if the token carries no whitespace trivia.
isemptyws(t::AbstractToken) = kindof(t) == EmptyWS
# True if the whitespace token contains a newline.
isnewlinews(t::AbstractToken) = kindof(t) === NewLineWS
# True if the whitespace token ends a line (semicolon or newline).
isendoflinews(t::AbstractToken) = kindof(t) == SemiColonWS || kindof(t) == NewLineWS
# Source text of `token`, sliced from the lexer's underlying byte buffer
# (token byte offsets are 0-based; Julia arrays are 1-based, hence the +1s).
@inline val(token::AbstractToken, ps::ParseState) = String(ps.l.io.data[token.startbyte + 1:token.endbyte + 1])
# Token kinds that can act both as an identifier-like symbol and an operator.
both_symbol_and_op(t::AbstractToken) = kindof(t) === Tokens.WHERE || kindof(t) === Tokens.IN || kindof(t) === Tokens.ISA
# String/command literal kinds — presumably those that may take an
# identifier prefix (e.g. `r"..."`); confirm against callers.
isprefixableliteral(t::AbstractToken) = (kindof(t) === Tokens.STRING || kindof(t) === Tokens.TRIPLE_STRING || kindof(t) === Tokens.CMD || kindof(t) === Tokens.TRIPLE_CMD)
# True for kinds in the assignment-operator range of the Tokens enum.
isassignment(t::AbstractToken) = Tokens.begin_assignments < kindof(t) < Tokens.end_assignments
182 |
|
|
183 | 23 |
# Simple kind predicates over tokens; range comparisons rely on the
# ordering of the Tokens kind enum.
isidentifier(t::AbstractToken) = kindof(t) === Tokens.IDENTIFIER
isliteral(t::AbstractToken) = Tokens.begin_literal < kindof(t) < Tokens.end_literal
isbool(t::AbstractToken) = Tokens.TRUE ≤ kindof(t) ≤ Tokens.FALSE
iscomma(t::AbstractToken) = kindof(t) === Tokens.COMMA
iscolon(t::AbstractToken) = kindof(t) === Tokens.COLON
iskw(t::AbstractToken) = Tokens.iskeyword(kindof(t))
# Any token that can begin a primary expression: identifier, literal,
# boolean or keyword.
isinstance(t::AbstractToken) = isidentifier(t) || isliteral(t) || isbool(t) || iskw(t)
# Commas, `end`, bracket kinds in the LSQUARE..RPAREN enum range, and `@`.
ispunctuation(t::AbstractToken) = iscomma(t) || kindof(t) === Tokens.END || Tokens.LSQUARE ≤ kindof(t) ≤ Tokens.RPAREN || kindof(t) === Tokens.AT_SIGN
191 |
|
Read our documentation on viewing source code .