ceedubs / irrec
1
package ceedubs.irrec
2
package parse
3

4
import ceedubs.irrec.regex._, Match.MatchSet
5
import ceedubs.irrec.regex.RegexC
6
import ceedubs.irrec.regex.RegexPrettyPrinter.{
7
  charClassCharsToEscape,
8
  nonCharClassCharsToEscape,
9
  specialNonCharClassCharToLit
10
}
11
import combinator._
12

13
import fastparse._, NoWhitespace._
14
import fastparse.Parsed.{Failure, Success}
15
import cats.collections.{Diet, Range}
16
import cats.implicits._
17

18
object Parser {
19

20 1
  // Lookup table from an escapable character to the literal it yields. It is the pretty
  // printer's `specialNonCharClassCharToLit` table with an extra entry mapping `-` to itself.
  private val escapableCharToLit: Map[Char, Char] =
    specialNonCharClassCharToLit.updated('-', '-')
21

22
  /**
   * Parses a single character that is a key of `escapableCharToLit` (special characters such as
   * `*` and `{` that should be escaped) and yields the value that table associates with it.
   */
  def specialChar[_: P]: P[Char] =
    CharPred(c => escapableCharToLit.contains(c)).!.map { captured =>
      // `CharPred` consumed exactly one character, so position 0 always exists.
      escapableCharToLit(captured.charAt(0))
    }.opaque(
      "special regular expression character that should be escaped such as '(', '}', '*', etc")
29

30
  /**
   * Parses exactly four hexadecimal digits and yields the `Char` whose numeric value they
   * encode. This parser does not consume a `\u` prefix; it starts at the first hex digit.
   */
  def unicodeCodePoint[_: P]: P[Char] =
    P(
      CharPred(c => CharacterClasses.hexDigit.contains(c))
        .rep(exactly = 4)
        .!
        .map(digits => java.lang.Integer.parseInt(digits, 16).toChar)
    ).opaque("A valid unicode code point in 4-digit hex form (ex: '006F')")
37

38
  /**
   * A shorthand class such as `\d` or `\w`. The leading `\` is not consumed here; parsing starts
   * at the letter that follows it. A lowercase letter allows the corresponding character class,
   * while its uppercase counterpart forbids it.
   */
  def shorthandClass[_: P]: P[Match.MatchSet[Char]] =
    CharIn("dDwWhHsS").!.map {
      case "d" => MatchSet.allow(CharacterClasses.digit)
      case "D" => MatchSet.forbid(CharacterClasses.digit)
      case "w" => MatchSet.allow(CharacterClasses.wordChar)
      case "W" => MatchSet.forbid(CharacterClasses.wordChar)
      case "h" => MatchSet.allow(CharacterClasses.horizontalWhitespaceChar)
      case "H" => MatchSet.forbid(CharacterClasses.horizontalWhitespaceChar)
      case "s" => MatchSet.allow(CharacterClasses.whitespaceChar)
      case "S" => MatchSet.forbid(CharacterClasses.whitespaceChar)
    }.opaque("""character class such as \w, \d, \s, \S, etc""")
53

54
  /**
   * Standard characters to match like `a` or `%`: any single character that is not contained in
   * `nonCharClassCharsToEscape`.
   */
  def standardMatchChar[_: P]: P[Char] =
    CharPred(c => !nonCharClassCharsToEscape.contains(c)).!.map(s => s.head)
      .opaque("""standard character to match like `a` or `%`""")
60

61
  /**
   * A single character that can appear unescaped inside a character class: anything that is not
   * contained in `charClassCharsToEscape`. This covers standard characters like `a` or `%` as
   * well as characters that aren't special within character classes such as `*` (ex: `[*+]`
   * matches on literal `*` and `+`).
   */
  def charClassStandardMatchChar[_: P]: P[Char] =
    P(CharPred(c => !charClassCharsToEscape.contains(c)).!.map(captured => captured.charAt(0)))
67

68
  /**
   * Parses the wildcard character `.` and yields `combinator.wildcard`.
   */
  def wildcard[_: P]: P[RegexC[Char]] =
    P(".").map { _ =>
      combinator.wildcard
    }
72

73
  /**
   * Non-negative integers (one or more decimal digits) that fit within Scala's `Int`. Digit
   * sequences that cannot be converted with `toInt` make the parser fail.
   */
  def posInt[_: P]: P[Int] =
    P(
      CharIn("0-9")
        .rep(1)
        .!
        .flatMap { digits =>
          // A failed conversion becomes a parse failure instead of an exception.
          Either.catchNonFatal(digits.toInt) match {
            case Right(n) => Pass(n)
            case Left(_) => Fail
          }
        }
    ).opaque(s"integer between 0 and ${Int.MaxValue}")
83

84
  /**
   * A single literal character inside a character class. The alternatives are tried in order: a
   * `\u` escape followed by four hex digits, a backslash-escaped special character, and finally
   * a character that needs no escaping within a character class.
   */
  def singleLitCharClassChar[_: P]: P[Char] =
    P(
      ("\\u" ~ unicodeCodePoint) |
        (("\\" ~ specialChar) | charClassStandardMatchChar)
    )
86

87
  /**
   * Same as [[singleLitCharClassChar]] but with the parsed character wrapped in a
   * `Match.Literal`.
   */
  def matchLitCharClassChar[_: P]: P[Match.Literal[Char]] =
    P(singleLitCharClassChar.map(c => Match.Literal(c)))
89

90
  /**
   * Character range like `a-z`: two literal character-class characters separated by `-`.
   *
   * NOTE(review): the bounds are passed to `Range` in the order they were written; nothing here
   * checks that the lower bound comes first (e.g. `z-a`) -- confirm how downstream code treats
   * that case.
   */
  def matchCharRange[_: P]: P[Range[Char]] =
    P(
      (singleLitCharClassChar ~ "-" ~ singleLitCharClassChar).map(bounds =>
        Range(bounds._1, bounds._2))
    )
99

100
  /**
   * Matches repeat counts like `{3}`, `{1,4}`, or `{3,}`.
   *
   * A range form (`{m,n}` or `{m,}`) may be followed by `?` to make it non-greedy; otherwise it
   * is greedy. The exact form (`{n}`) takes no greediness suffix here. No whitespace is accepted
   * inside the braces.
   *
   * Cuts (`~/`) are placed after the opening `{`, after the `,`, and around the closing `}`, so
   * once one of those points has been passed the parser commits to this quantifier rather than
   * backtracking to other alternatives.
   */
  def quantifier[_: P]: P[Quantifier] =
    P(
      "{" ~/ (
        (posInt ~ "," ~/ posInt.? ~/ "}" ~/ (P("?").map(_ => Greediness.NonGreedy) | Pass(
          Greediness.Greedy))).map { case (l, h, g) =>
          Quantifier.Range(l, h, g)
        } |
          (posInt.map(Quantifier.Exact(_)) ~ "}")
      )
      // Every example in the label is input that this parser accepts.
    ).opaque("repeat count such as '{3}', '{1,4}', '{1,4}?', '{3,}', or '{3,}?'")
113

114
  /**
   * Either a character range (`a-z`) or a single literal character, as an allow-set. The range
   * alternative is attempted first so that `a-z` is not read as the lone character `a`.
   */
  def charOrRange[_: P]: P[Match.MatchSet[Char]] =
    matchCharRange.map(range => MatchSet.allow(Diet.fromRange(range))) |
      singleLitCharClassChar.map(single => MatchSet.allow(Diet.one(single)))
117

118
  /**
   * The name part of a POSIX-style character class (`alnum`, `digit`, `xdigit`, ...), yielding
   * an allow-set for the corresponding entry of `CharacterClasses`. The surrounding `[:` and
   * `:]` delimiters are not consumed by this parser.
   */
  def positivePOSIXCharClass[_: P]: P[MatchSet[Char]] = {
    val classes = CharacterClasses

    P("alnum").map(_ => MatchSet.allow(classes.alphaNumeric)) |
      P("alpha").map(_ => MatchSet.allow(classes.alpha)) |
      P("ascii").map(_ => MatchSet.allow(classes.ascii)) |
      P("blank").map(_ => MatchSet.allow(classes.horizontalWhitespaceChar)) |
      P("cntrl").map(_ => MatchSet.allow(classes.controlChar)) |
      P("digit").map(_ => MatchSet.allow(classes.digit)) |
      P("graph").map(_ => MatchSet.allow(classes.graphChar)) |
      P("lower").map(_ => MatchSet.allow(classes.lowerAlpha)) |
      P("print").map(_ => MatchSet.allow(classes.printableChar)) |
      P("punct").map(_ => MatchSet.allow(classes.punctuationChar)) |
      P("space").map(_ => MatchSet.allow(classes.whitespaceChar)) |
      P("upper").map(_ => MatchSet.allow(classes.upperAlpha)) |
      P("word").map(_ => MatchSet.allow(classes.wordChar)) |
      P("xdigit").map(_ => MatchSet.allow(classes.hexDigit))
  }
133

134
  /**
   * One or more adjacent character-class members (escaped shorthand classes, ranges, or single
   * characters) combined with `union`. Parsing stops in front of `&&` so that the intersection
   * operator is left for the caller.
   */
  def positiveCharClassContent[_: P]: P[MatchSet[Char]] =
    (!"&&" ~ (("\\" ~ shorthandClass) | charOrRange))
      .rep(1)
      // `rep(1)` guarantees at least one element, so `reduce` cannot see an empty sequence.
      .map(members => members.reduce((acc, next) => acc.union(next)))
138

139
  /**
   * One building block of a character class body, tried in this order: plain content, a POSIX
   * class written as `[:name:]`, or a nested bracketed character class.
   */
  def charClassBase[_: P]: P[MatchSet[Char]] =
    P(
      positiveCharClassContent |
        ("[:" ~ positivePOSIXCharClass ~ ":]") |
        charClass
    )
144

145
  /**
   * One or more [[charClassBase]] items in a row, combined with `union`.
   */
  def charClassUnion[_: P]: P[MatchSet[Char]] =
    P(
      // `rep(1)` yields a non-empty sequence, which makes `reduce` safe.
      charClassBase.rep(1).map(parts => parts.reduce((acc, next) => acc.union(next)))
    )
147

148
  /**
   * A union of character-class items, optionally followed by `&&` and another term, in which
   * case the two sides are intersected. The cut after `&&` commits to the intersection once the
   * operator has been seen.
   */
  def charClassTerm[_: P]: P[MatchSet[Char]] =
    charClassUnion.flatMap { left =>
      ("&&" ~/ charClassTerm).map(right => left.intersect(right)) |
        Pass(left)
    }
153

154
  /**
   * Character classes like `[acz]` or `[^a-cHP-W]`.
   *
   * Alternatives are tried in order:
   *  1. `[^content&&term]`: the negated leading content intersected with the term;
   *  2. `[^content term]`: the negated leading content unioned with the term that follows;
   *  3. `[^term]`: the negation of the whole term;
   *  4. `[term]`: the term as is.
   */
  def charClass[_: P]: P[MatchSet[Char]] =
    P(
      ("[^" ~ (positiveCharClassContent.map(set => set.negate) ~ "&&" ~ charClassTerm).map {
        case (negated, rest) => negated.intersect(rest)
      } ~ "]") |
        ("[^" ~ (positiveCharClassContent.map(set => set.negate) ~ charClassTerm).map {
          case (negated, rest) => negated.union(rest)
        } ~ "]") |
        ("[^" ~ charClassTerm.map(set => set.negate) ~ "]") |
        ("[" ~ charClassTerm ~ "]")
    )
168

169
  /**
   * The smallest unit a quantifier can apply to: a standard character, a backslash escape
   * (`\uXXXX`, an escaped special character, or a shorthand class), the wildcard, a character
   * class, or a parenthesized sub-expression. The cut after `\` commits to the escape branch
   * once a backslash has been consumed.
   */
  def base[_: P]: P[RegexC[Unit]] =
    P(
      standardMatchChar.map(c => lit(c).void) |
        ("\\" ~/ (
          (("u" ~ unicodeCodePoint) | specialChar).map(c => lit(c).void) |
            shorthandClass.map(set => matching(set).void)
        )) |
        wildcard.map(w => w.void) |
        charClass.map(set => matching(set).void) |
        // TODO distinguish between capturing and not?
        ("(?:" ~ regex ~ ")") |
        ("(" ~ regex ~ ")")
    )
180

181
  /**
   * A [[base]] expression followed by an optional quantifier suffix: `*`, `+`, `?`, their
   * non-greedy forms `*?`, `+?`, `??`, or a repeat count such as `{3}` / `{1,4}`. Without a
   * suffix the base expression is returned unchanged.
   *
   * Each two-character non-greedy suffix is listed before its one-character greedy counterpart;
   * otherwise the shorter literal would match first and leave a dangling `?` behind.
   */
  def factor[_: P]: P[RegexC[Unit]] =
    P {
      base.flatMap { r =>
        P("*?").map(_ => r.star(Greediness.NonGreedy).void) |
          P("*").map(_ => r.star(Greediness.Greedy).void) |
          P("+?").map(_ => r.oneOrMore(Greediness.NonGreedy).void) |
          P("+").map(_ => r.oneOrMore(Greediness.Greedy).void) |
          P("??").map(_ => r.optional(Greediness.NonGreedy).void) |
          P("?").map(_ => r.optional(Greediness.Greedy).void) |
          quantifier.map(q => r.quantifyFold(q, ())((_, _) => ())) |
          Pass(r)
      }
    }
193

194
  /**
   * Zero or more [[factor]]s in a row, sequenced into a single regex. An empty input therefore
   * also produces a result (the sequencing of an empty list).
   */
  // TODO can probably do better than toList call. Do we care?
  def term[_: P]: P[RegexC[Unit]] =
    P(factor.rep(0).map(factors => factors.toList.sequence_))
196

197
  /**
   * A parser for a regular expression: a [[term]], optionally followed by `|` and another
   * regular expression to form an alternation. You probably want to use [[regexExpr]] instead,
   * as this parser will succeed even if there are trailing characters after a valid regular
   * expression.
   */
  def regex[_: P]: P[RegexC[Unit]] =
    P(
      term.flatMap { left =>
        ("|" ~/ regex).map(right => left | right) |
          Pass(left)
      }
    )
208

209
  /**
   * A parser for strings that are complete regular expressions, up until the end of the string.
   * On top of [[regex]] it requires end of input, and it turns the `Unit` result into a `String`
   * by joining the elements of the regex's `matched` result with an empty separator.
   */
  def regexExpr[_: P]: P[RegexC[String]] =
    P(regex ~ End).map(parsed => parsed.matched.map(chars => chars.mkString_("")))
213

214
  /**
   * Parses `regex` as a complete regular expression using [[regexExpr]].
   *
   * @param regex the regular expression source text
   * @return `Right` with the parsed regex on success, or `Left` with fastparse's failure message
   *         (produced with verbose failures enabled) when the input is not a valid expression
   */
  def parseRegex(regex: String): Either[String, RegexC[String]] = {
    val outcome = parse(regex, regexExpr(_), verboseFailures = true)
    outcome match {
      case Success(compiled, _) => Right(compiled)
      case failure: Failure => Left(failure.msg)
    }
  }
219
}

Read our documentation on viewing source code.

Loading