1
# -*- coding: utf-8 -*-
2 68
"""
3
This module offers a generic date/time string parser which is able to parse
most known formats to represent a date and/or time.

This module attempts to be forgiving with regards to unlikely input formats,
returning a datetime object even for dates which are ambiguous. If an element
of a date/time stamp is omitted, the following rules are applied:

- If AM or PM is left unspecified, a 24-hour clock is assumed, however, an hour
  on a 12-hour clock (``0 <= hour <= 12``) *must* be specified if AM or PM is
  specified.
- If a time zone is omitted, a timezone-naive datetime is returned.

If any other elements are missing, they are taken from the
:class:`datetime.datetime` object passed to the parameter ``default``. If this
results in a day number exceeding the valid number of days per month, the
value falls back to the end of the month.

Additional resources about date/time string formats can be found below:

- `A summary of the international standard date and time notation
  <https://www.cl.cam.ac.uk/~mgk25/iso-time.html>`_
- `W3C Date and Time Formats <https://www.w3.org/TR/NOTE-datetime>`_
- `Time Formats (Planetary Rings Node) <https://pds-rings.seti.org:443/tools/time_formats.html>`_
- `CPAN ParseDate module
  <https://metacpan.org/pod/release/MUIR/Time-modules-2013.0912/lib/Time/ParseDate.pm>`_
- `Java SimpleDateFormat Class
  <https://docs.oracle.com/javase/6/docs/api/java/text/SimpleDateFormat.html>`_
30
"""
31 68
from __future__ import unicode_literals
32

33 68
import datetime
34 68
import re
35 68
import string
36 68
import time
37 68
import warnings
38

39 68
from calendar import monthrange
40 68
from io import StringIO
41

42 68
import six
43 68
from six import integer_types, text_type
44

45 68
from decimal import Decimal
46

47 68
from warnings import warn
48

49 68
from .. import relativedelta
50 68
from .. import tz
51

52 68
__all__ = ["parse", "parserinfo", "ParserError"]
53

54

55
# TODO: pandas.core.tools.datetimes imports this explicitly.  Might be worth
# making public and/or figuring out if there is something we can
# take off their plate.
58 68
class _timelex(object):
59
    # Fractional seconds are sometimes split by a comma
60 68
    _split_decimal = re.compile("([.,])")
61

62 68
    def __init__(self, instream):
63 68
        if six.PY2:
64
            # In Python 2, we can't duck type properly because unicode has
65
            # a 'decode' function, and we'd be double-decoding
66 11
            if isinstance(instream, (bytes, bytearray)):
67 11
                instream = instream.decode()
68
        else:
69 57
            if getattr(instream, 'decode', None) is not None:
70 57
                instream = instream.decode()
71

72 68
        if isinstance(instream, text_type):
73 68
            instream = StringIO(instream)
74 68
        elif getattr(instream, 'read', None) is None:
75 68
            raise TypeError('Parser must be a string or character stream, not '
76
                            '{itype}'.format(itype=instream.__class__.__name__))
77

78 68
        self.instream = instream
79 68
        self.charstack = []
80 68
        self.tokenstack = []
81 68
        self.eof = False
82

83 68
    def get_token(self):
84
        """
85
        This function breaks the time string into lexical units (tokens), which
86
        can be parsed by the parser. Lexical units are demarcated by changes in
87
        the character set, so any continuous string of letters is considered
88
        one unit, any continuous string of numbers is considered one unit.
89

90
        The main complication arises from the fact that dots ('.') can be used
91
        both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
92
        "4:30:21.447"). As such, it is necessary to read the full context of
93
        any dot-separated strings before breaking it into tokens; as such, this
94
        function maintains a "token stack", for when the ambiguous context
95
        demands that multiple tokens be parsed at once.
96
        """
97 68
        if self.tokenstack:
98 68
            return self.tokenstack.pop(0)
99

100 68
        seenletters = False
101 68
        token = None
102 68
        state = None
103

104 68
        while not self.eof:
105
            # We only realize that we've reached the end of a token when we
106
            # find a character that's not part of the current token - since
107
            # that character may be part of the next token, it's stored in the
108
            # charstack.
109 68
            if self.charstack:
110 68
                nextchar = self.charstack.pop(0)
111
            else:
112 68
                nextchar = self.instream.read(1)
113 68
                while nextchar == '\x00':
114 68
                    nextchar = self.instream.read(1)
115

116 68
            if not nextchar:
117 68
                self.eof = True
118 68
                break
119 68
            elif not state:
120
                # First character of the token - determines if we're starting
121
                # to parse a word, a number or something else.
122 68
                token = nextchar
123 68
                if self.isword(nextchar):
124 68
                    state = 'a'
125 68
                elif self.isnum(nextchar):
126 68
                    state = '0'
127 68
                elif self.isspace(nextchar):
128 68
                    token = ' '
129 68
                    break  # emit token
130
                else:
131 61
                    break  # emit token
132 68
            elif state == 'a':
133
                # If we've already started reading a word, we keep reading
134
                # letters until we find something that's not part of a word.
135 68
                seenletters = True
136 68
                if self.isword(nextchar):
137 68
                    token += nextchar
138 68
                elif nextchar == '.':
139 68
                    token += nextchar
140 68
                    state = 'a.'
141
                else:
142 68
                    self.charstack.append(nextchar)
143 68
                    break  # emit token
144 68
            elif state == '0':
145
                # If we've already started reading a number, we keep reading
146
                # numbers until we find something that doesn't fit.
147 68
                if self.isnum(nextchar):
148 68
                    token += nextchar
149 68
                elif nextchar == '.' or (nextchar == ',' and len(token) >= 2):
150 68
                    token += nextchar
151 68
                    state = '0.'
152
                else:
153 68
                    self.charstack.append(nextchar)
154 68
                    break  # emit token
155 68
            elif state == 'a.':
156
                # If we've seen some letters and a dot separator, continue
157
                # parsing, and the tokens will be broken up later.
158 68
                seenletters = True
159 68
                if nextchar == '.' or self.isword(nextchar):
160 68
                    token += nextchar
161 68
                elif self.isnum(nextchar) and token[-1] == '.':
162 68
                    token += nextchar
163 68
                    state = '0.'
164
                else:
165 68
                    self.charstack.append(nextchar)
166 68
                    break  # emit token
167 68
            elif state == '0.':
168
                # If we've seen at least one dot separator, keep going, we'll
169
                # break up the tokens later.
170 68
                if nextchar == '.' or self.isnum(nextchar):
171 68
                    token += nextchar
172 68
                elif self.isword(nextchar) and token[-1] == '.':
173 68
                    token += nextchar
174 68
                    state = 'a.'
175
                else:
176 68
                    self.charstack.append(nextchar)
177 68
                    break  # emit token
178

179 68
        if (state in ('a.', '0.') and (seenletters or token.count('.') > 1 or
180
                                       token[-1] in '.,')):
181 68
            l = self._split_decimal.split(token)
182 68
            token = l[0]
183 68
            for tok in l[1:]:
184 68
                if tok:
185 68
                    self.tokenstack.append(tok)
186

187 68
        if state == '0.' and token.count('.') == 0:
188 68
            token = token.replace(',', '.')
189

190 68
        return token
191

192 68
    def __iter__(self):
193 68
        return self
194

195 68
    def __next__(self):
196 68
        token = self.get_token()
197 68
        if token is None:
198 68
            raise StopIteration
199

200 68
        return token
201

202 68
    def next(self):
203 11
        return self.__next__()  # Python 2.x support
204

205 68
    @classmethod
206 21
    def split(cls, s):
207 68
        return list(cls(s))
208

209 68
    @classmethod
210 21
    def isword(cls, nextchar):
211
        """ Whether or not the next character is part of a word """
212 68
        return nextchar.isalpha()
213

214 68
    @classmethod
215 21
    def isnum(cls, nextchar):
216
        """ Whether the next character is part of a number """
217 68
        return nextchar.isdigit()
218

219 68
    @classmethod
220 21
    def isspace(cls, nextchar):
221
        """ Whether the next character is whitespace """
222 68
        return nextchar.isspace()
223

224

225 68
class _resultbase(object):
226

227 68
    def __init__(self):
228 68
        for attr in self.__slots__:
229 68
            setattr(self, attr, None)
230

231 68
    def _repr(self, classname):
232 57
        l = []
233 57
        for attr in self.__slots__:
234 57
            value = getattr(self, attr)
235 57
            if value is not None:
236 57
                l.append("%s=%s" % (attr, repr(value)))
237 57
        return "%s(%s)" % (classname, ", ".join(l))
238

239 68
    def __len__(self):
240 68
        return (sum(getattr(self, attr) is not None
241
                    for attr in self.__slots__))
242

243 68
    def __repr__(self):
244 57
        return self._repr(self.__class__.__name__)
245

246

247 68
class parserinfo(object):
    """
    Class which handles what inputs are accepted. Subclass this to customize
    the language and acceptable values for each parameter.

    :param dayfirst:
        Whether to interpret the first value in an ambiguous 3-integer date
        (e.g. 01/05/09) as the day (``True``) or month (``False``). If
        ``yearfirst`` is set to ``True``, this distinguishes between YDM
        and YMD. Default is ``False``.

    :param yearfirst:
        Whether to interpret the first value in an ambiguous 3-integer date
        (e.g. 01/05/09) as the year. If ``True``, the first number is taken
        to be the year, otherwise the last number is taken to be the year.
        Default is ``False``.
    """

    # m from a.m/p.m, t from ISO T separator
    JUMP = [" ", ".", ",", ";", "-", "/", "'",
            "at", "on", "and", "ad", "m", "t", "of",
            "st", "nd", "rd", "th"]

    WEEKDAYS = [("Mon", "Monday"),
                ("Tue", "Tuesday"),     # TODO: "Tues"
                ("Wed", "Wednesday"),
                ("Thu", "Thursday"),    # TODO: "Thurs"
                ("Fri", "Friday"),
                ("Sat", "Saturday"),
                ("Sun", "Sunday")]
    MONTHS = [("Jan", "January"),
              ("Feb", "February"),      # TODO: "Febr"
              ("Mar", "March"),
              ("Apr", "April"),
              ("May", "May"),
              ("Jun", "June"),
              ("Jul", "July"),
              ("Aug", "August"),
              ("Sep", "Sept", "September"),
              ("Oct", "October"),
              ("Nov", "November"),
              ("Dec", "December")]
    HMS = [("h", "hour", "hours"),
           ("m", "minute", "minutes"),
           ("s", "second", "seconds")]
    AMPM = [("am", "a"),
            ("pm", "p")]
    UTCZONE = ["UTC", "GMT", "Z", "z"]
    PERTAIN = ["of"]
    TZOFFSET = {}
    # TODO: ERA = ["AD", "BC", "CE", "BCE", "Stardate",
    #              "Anno Domini", "Year of Our Lord"]

    def __init__(self, dayfirst=False, yearfirst=False):
        # Lower-cased lookup tables built from the class-level word lists
        self._jump = self._convert(self.JUMP)
        self._weekdays = self._convert(self.WEEKDAYS)
        self._months = self._convert(self.MONTHS)
        self._hms = self._convert(self.HMS)
        self._ampm = self._convert(self.AMPM)
        self._utczone = self._convert(self.UTCZONE)
        self._pertain = self._convert(self.PERTAIN)

        self.dayfirst = dayfirst
        self.yearfirst = yearfirst

        # Reference year used by convertyear() to expand two-digit years
        self._year = time.localtime().tm_year
        self._century = self._year // 100 * 100

    def _convert(self, lst):
        """
        Build a ``{lower-cased name: index}`` mapping from ``lst``.

        Each entry of ``lst`` is either a single name or a tuple of aliases;
        all aliases of an entry map to that entry's position in ``lst``.
        """
        dct = {}
        for i, entry in enumerate(lst):
            names = entry if isinstance(entry, tuple) else (entry,)
            for name in names:
                dct[name.lower()] = i
        return dct

    def jump(self, name):
        """ Whether ``name`` is one of the ``JUMP`` words (case-insensitive) """
        return name.lower() in self._jump

    def weekday(self, name):
        """ Index of ``name`` in ``WEEKDAYS`` (Monday is 0), else ``None`` """
        try:
            return self._weekdays[name.lower()]
        except KeyError:
            return None

    def month(self, name):
        """ One-based month number for ``name``, else ``None`` """
        try:
            return self._months[name.lower()] + 1
        except KeyError:
            return None

    def hms(self, name):
        """ Index of ``name`` in ``HMS`` (0=hour, 1=minute, 2=second) or ``None`` """
        try:
            return self._hms[name.lower()]
        except KeyError:
            return None

    def ampm(self, name):
        """ Index of ``name`` in ``AMPM`` (0=am, 1=pm) or ``None`` """
        try:
            return self._ampm[name.lower()]
        except KeyError:
            return None

    def pertain(self, name):
        """ Whether ``name`` is one of the ``PERTAIN`` words (case-insensitive) """
        return name.lower() in self._pertain

    def utczone(self, name):
        """ Whether ``name`` is one of the ``UTCZONE`` names (case-insensitive) """
        return name.lower() in self._utczone

    def tzoffset(self, name):
        """
        Return the UTC offset registered for the time zone ``name``.

        :return:
            ``0`` for any ``UTCZONE`` name, otherwise the ``TZOFFSET`` entry
            for ``name``, or ``None`` if there is none.
        """
        # The keys of _utczone are lower-cased by _convert(), so the lookup
        # has to be case-insensitive; comparing the raw name would never
        # match "UTC", "GMT" or "Z". Names without lower() (e.g. None) fall
        # through to the TZOFFSET lookup.
        lower = getattr(name, "lower", None)
        if lower is not None and lower() in self._utczone:
            return 0

        return self.TZOFFSET.get(name)

    def convertyear(self, year, century_specified=False):
        """
        Converts two-digit years to year within [-50, 49]
        range of self._year (current local time)

        Years of 100 or more, and years flagged with ``century_specified``,
        are returned unchanged.
        """

        # Function contract is that the year is always positive
        assert year >= 0

        if year < 100 and not century_specified:
            # assume current century to start
            year += self._century

            if year >= self._year + 50:  # if too far in future
                year -= 100
            elif year < self._year - 50:  # if too far in past
                year += 100

        return year

    def validate(self, res):
        """
        Normalize a parse result in place and report whether it is valid.

        Expands the year through :meth:`convertyear` and makes ``tzname`` and
        ``tzoffset`` consistent for UTC. Always returns ``True``.
        """
        # move to info
        if res.year is not None:
            res.year = self.convertyear(res.year, res.century_specified)

        if ((res.tzoffset == 0 and not res.tzname) or
             (res.tzname == 'Z' or res.tzname == 'z')):
            res.tzname = "UTC"
            res.tzoffset = 0
        elif res.tzoffset != 0 and res.tzname and self.utczone(res.tzname):
            res.tzoffset = 0
        return True
398

399

400 68
class _ymd(list):
401 68
    def __init__(self, *args, **kwargs):
402 68
        super(self.__class__, self).__init__(*args, **kwargs)
403 68
        self.century_specified = False
404 68
        self.dstridx = None
405 68
        self.mstridx = None
406 68
        self.ystridx = None
407

408 68
    @property
409 21
    def has_year(self):
410 68
        return self.ystridx is not None
411

412 68
    @property
413 21
    def has_month(self):
414 68
        return self.mstridx is not None
415

416 68
    @property
417 21
    def has_day(self):
418 68
        return self.dstridx is not None
419

420 68
    def could_be_day(self, value):
421 68
        if self.has_day:
422 68
            return False
423 68
        elif not self.has_month:
424 68
            return 1 <= value <= 31
425 68
        elif not self.has_year:
426
            # Be permissive, assume leap year
427 68
            month = self[self.mstridx]
428 68
            return 1 <= value <= monthrange(2000, month)[1]
429
        else:
430 68
            month = self[self.mstridx]
431 68
            year = self[self.ystridx]
432 68
            return 1 <= value <= monthrange(year, month)[1]
433

434 68
    def append(self, val, label=None):
435 68
        if hasattr(val, '__len__'):
436 68
            if val.isdigit() and len(val) > 2:
437 68
                self.century_specified = True
438
                if label not in [None, 'Y']:  # pragma: no cover
439
                    raise ValueError(label)
440 68
                label = 'Y'
441 68
        elif val > 100:
442 68
            self.century_specified = True
443
            if label not in [None, 'Y']:  # pragma: no cover
444
                raise ValueError(label)
445 68
            label = 'Y'
446

447 68
        super(self.__class__, self).append(int(val))
448

449 68
        if label == 'M':
450 68
            if self.has_month:
451 0
                raise ValueError('Month is already set')
452 68
            self.mstridx = len(self) - 1
453 68
        elif label == 'D':
454 68
            if self.has_day:
455 0
                raise ValueError('Day is already set')
456 68
            self.dstridx = len(self) - 1
457 68
        elif label == 'Y':
458 68
            if self.has_year:
459 0
                raise ValueError('Year is already set')
460 68
            self.ystridx = len(self) - 1
461

462 68
    def _resolve_from_stridxs(self, strids):
463
        """
464
        Try to resolve the identities of year/month/day elements using
465
        ystridx, mstridx, and dstridx, if enough of these are specified.
466
        """
467 68
        if len(self) == 3 and len(strids) == 2:
468
            # we can back out the remaining stridx value
469 68
            missing = [x for x in range(3) if x not in strids.values()]
470 68
            key = [x for x in ['y', 'm', 'd'] if x not in strids]
471 68
            assert len(missing) == len(key) == 1
472 68
            key = key[0]
473 68
            val = missing[0]
474 68
            strids[key] = val
475

476 68
        assert len(self) == len(strids)  # otherwise this should not be called
477 68
        out = {key: self[strids[key]] for key in strids}
478 68
        return (out.get('y'), out.get('m'), out.get('d'))
479

480 68
    def resolve_ymd(self, yearfirst, dayfirst):
481 68
        len_ymd = len(self)
482 68
        year, month, day = (None, None, None)
483

484 68
        strids = (('y', self.ystridx),
485
                  ('m', self.mstridx),
486
                  ('d', self.dstridx))
487

488 68
        strids = {key: val for key, val in strids if val is not None}
489 68
        if (len(self) == len(strids) > 0 or
490
                (len(self) == 3 and len(strids) == 2)):
491 68
            return self._resolve_from_stridxs(strids)
492

493 68
        mstridx = self.mstridx
494

495 68
        if len_ymd > 3:
496 68
            raise ValueError("More than three YMD values")
497 68
        elif len_ymd == 1 or (mstridx is not None and len_ymd == 2):
498
            # One member, or two members with a month string
499 68
            if mstridx is not None:
500 68
                month = self[mstridx]
501
                # since mstridx is 0 or 1, self[mstridx-1] always
502
                # looks up the other element
503 68
                other = self[mstridx - 1]
504
            else:
505 0
                other = self[0]
506

507 68
            if len_ymd > 1 or mstridx is None:
508 68
                if other > 31:
509 0
                    year = other
510
                else:
511 68
                    day = other
512

513 68
        elif len_ymd == 2:
514
            # Two members with numbers
515 68
            if self[0] > 31:
516
                # 99-01
517 0
                year, month = self
518 68
            elif self[1] > 31:
519
                # 01-99
520 68
                month, year = self
521 0
            elif dayfirst and self[1] <= 12:
522
                # 13-01
523 0
                day, month = self
524
            else:
525
                # 01-13
526 0
                month, day = self
527

528 68
        elif len_ymd == 3:
529
            # Three members
530 68
            if mstridx == 0:
531 68
                if self[1] > 31:
532
                    # Apr-2003-25
533 0
                    month, year, day = self
534
                else:
535 68
                    month, day, year = self
536 68
            elif mstridx == 1:
537 68
                if self[0] > 31 or (yearfirst and self[2] <= 31):
538
                    # 99-Jan-01
539 0
                    year, month, day = self
540
                else:
541
                    # 01-Jan-01
542
                    # Give precedence to day-first, since
543
                    # two-digit years is usually hand-written.
544 68
                    day, month, year = self
545

546 68
            elif mstridx == 2:
547
                # WTF!?
548 68
                if self[1] > 31:
549
                    # 01-99-Jan
550 0
                    day, year, month = self
551
                else:
552
                    # 99-01-Jan
553 68
                    year, day, month = self
554

555
            else:
556 68
                if (self[0] > 31 or
557
                    self.ystridx == 0 or
558
                        (yearfirst and self[1] <= 12 and self[2] <= 31)):
559
                    # 99-01-01
560 68
                    if dayfirst and self[2] <= 12:
561 68
                        year, day, month = self
562
                    else:
563 68
                        year, month, day = self
564 68
                elif self[0] > 12 or (dayfirst and self[1] <= 12):
565
                    # 13-01-01
566 68
                    day, month, year = self
567
                else:
568
                    # 01-13-01
569 68
                    month, day, year = self
570

571 68
        return year, month, day
572

573

574 68
class parser(object):
575 68
    def __init__(self, info=None):
        """
        :param info:
            The :class:`parserinfo` instance to parse with. A default
            ``parserinfo()`` is created when this is omitted (or falsy).
        """
        self.info = info or parserinfo()
577

578 68
    def parse(self, timestr, default=None,
              ignoretz=False, tzinfos=None, **kwargs):
        """
        Parse the date/time string into a :class:`datetime.datetime` object.

        :param timestr:
            Any date/time string using the supported formats.

        :param default:
            The default datetime object, if this is a datetime object and not
            ``None``, elements specified in ``timestr`` replace elements in the
            default object.

        :param ignoretz:
            If set ``True``, time zones in parsed strings are ignored and a
            naive :class:`datetime.datetime` object is returned.

        :param tzinfos:
            Additional time zone names / aliases which may be present in the
            string. This argument maps time zone names (and optionally offsets
            from those time zones) to time zones. This parameter can be a
            dictionary with timezone aliases mapping time zone names to time
            zones or a function taking two parameters (``tzname`` and
            ``tzoffset``) and returning a time zone.

            The timezones to which the names are mapped can be an integer
            offset from UTC in seconds or a :class:`tzinfo` object.

            .. doctest::
               :options: +NORMALIZE_WHITESPACE

                >>> from dateutil.parser import parse
                >>> from dateutil.tz import gettz
                >>> tzinfos = {"BRST": -7200, "CST": gettz("America/Chicago")}
                >>> parse("2012-01-19 17:21:00 BRST", tzinfos=tzinfos)
                datetime.datetime(2012, 1, 19, 17, 21, tzinfo=tzoffset(u'BRST', -7200))
                >>> parse("2012-01-19 17:21:00 CST", tzinfos=tzinfos)
                datetime.datetime(2012, 1, 19, 17, 21,
                                  tzinfo=tzfile('/usr/share/zoneinfo/America/Chicago'))

            This parameter is ignored if ``ignoretz`` is set.

        :param \\*\\*kwargs:
            Keyword arguments as passed to ``_parse()``.

        :return:
            Returns a :class:`datetime.datetime` object or, if the
            ``fuzzy_with_tokens`` option is ``True``, returns a tuple, the
            first element being a :class:`datetime.datetime` object, the second
            a tuple containing the fuzzy tokens.

        :raises ParserError:
            Raised for invalid or unknown string format, if the provided
            :class:`tzinfo` is not in a valid format, or if an invalid date
            would be created.

        :raises TypeError:
            Raised for non-string or character stream input.

        :raises OverflowError:
            Raised if the parsed date exceeds the largest valid C integer on
            your system.
        """

        if default is None:
            # Missing elements default to today at midnight
            default = datetime.datetime.now().replace(hour=0, minute=0,
                                                      second=0, microsecond=0)

        res, skipped_tokens = self._parse(timestr, **kwargs)

        if res is None:
            raise ParserError("Unknown string format: %s", timestr)

        # len(res) counts the fields that were actually populated
        if len(res) == 0:
            raise ParserError("String does not contain a date: %s", timestr)

        try:
            ret = self._build_naive(res, default)
        except ValueError as e:
            # Re-raise as ParserError while keeping the original as the cause
            six.raise_from(ParserError(str(e) + ": %s", timestr), e)

        if not ignoretz:
            ret = self._build_tzaware(ret, res, tzinfos)

        if kwargs.get('fuzzy_with_tokens', False):
            return ret, skipped_tokens
        else:
            return ret
666

667 68
    class _result(_resultbase):
        # Fields a parse can populate; _resultbase.__init__ sets each of them
        # to None.
        __slots__ = ["year", "month", "day", "weekday",
                     "hour", "minute", "second", "microsecond",
                     "tzname", "tzoffset", "ampm", "any_unused_tokens"]
671

672 68
    def _parse(self, timestr, dayfirst=None, yearfirst=None, fuzzy=False,
673
               fuzzy_with_tokens=False):
674
        """
675
        Private method which performs the heavy lifting of parsing, called from
676
        ``parse()``, which passes on its ``kwargs`` to this function.
677

678
        :param timestr:
679
            The string to parse.
680

681
        :param dayfirst:
682
            Whether to interpret the first value in an ambiguous 3-integer date
683
            (e.g. 01/05/09) as the day (``True``) or month (``False``). If
684
            ``yearfirst`` is set to ``True``, this distinguishes between YDM
685
            and YMD. If set to ``None``, this value is retrieved from the
686
            current :class:`parserinfo` object (which itself defaults to
687
            ``False``).
688

689
        :param yearfirst:
690
            Whether to interpret the first value in an ambiguous 3-integer date
691
            (e.g. 01/05/09) as the year. If ``True``, the first number is taken
692
            to be the year, otherwise the last number is taken to be the year.
693
            If this is set to ``None``, the value is retrieved from the current
694
            :class:`parserinfo` object (which itself defaults to ``False``).
695

696
        :param fuzzy:
697
            Whether to allow fuzzy parsing, allowing for string like "Today is
698
            January 1, 2047 at 8:21:00AM".
699

700
        :param fuzzy_with_tokens:
701
            If ``True``, ``fuzzy`` is automatically set to True, and the parser
702
            will return a tuple where the first element is the parsed
703
            :class:`datetime.datetime` datetimestamp and the second element is
704
            a tuple containing the portions of the string which were ignored:
705

706
            .. doctest::
707

708
                >>> from dateutil.parser import parse
709
                >>> parse("Today is January 1, 2047 at 8:21:00AM", fuzzy_with_tokens=True)
710
                (datetime.datetime(2047, 1, 1, 8, 21), (u'Today is ', u' ', u'at '))
711

712
        """
713 68
        if fuzzy_with_tokens:
714 68
            fuzzy = True
715

716 68
        info = self.info
717

718 68
        if dayfirst is None:
719 68
            dayfirst = info.dayfirst
720

721 68
        if yearfirst is None:
722 68
            yearfirst = info.yearfirst
723

724 68
        res = self._result()
725 68
        l = _timelex.split(timestr)         # Splits the timestr into tokens
726

727 68
        skipped_idxs = []
728

729
        # year/month/day list
730 68
        ymd = _ymd()
731

732 68
        len_l = len(l)
733 68
        i = 0
734 68
        try:
735 68
            while i < len_l:
736

737
                # Check if it's a number
738 68
                value_repr = l[i]
739 68
                try:
740 68
                    value = float(value_repr)
741 68
                except ValueError:
742 68
                    value = None
743

744 68
                if value is not None:
745
                    # Numeric token
746 68
                    i = self._parse_numeric_token(l, i, info, ymd, res, fuzzy)
747

748
                # Check weekday
749 68
                elif info.weekday(l[i]) is not None:
750 68
                    value = info.weekday(l[i])
751 68
                    res.weekday = value
752

753
                # Check month name
754 68
                elif info.month(l[i]) is not None:
755 68
                    value = info.month(l[i])
756 68
                    ymd.append(value, 'M')
757

758 68
                    if i + 1 < len_l:
759 68
                        if l[i + 1] in ('-', '/'):
760
                            # Jan-01[-99]
761 68
                            sep = l[i + 1]
762 68
                            ymd.append(l[i + 2])
763

764 68
                            if i + 3 < len_l and l[i + 3] == sep:
765
                                # Jan-01-99
766 68
                                ymd.append(l[i + 4])
767 68
                                i += 2
768

769 68
                            i += 2
770

771 68
                        elif (i + 4 < len_l and l[i + 1] == l[i + 3] == ' ' and
772
                              info.pertain(l[i + 2])):
773
                            # Jan of 01
774
                            # In this case, 01 is clearly year
775 68
                            if l[i + 4].isdigit():
776
                                # Convert it here to become unambiguous
777 68
                                value = int(l[i + 4])
778 68
                                year = str(info.convertyear(value))
779 68
                                ymd.append(year, 'Y')
780
                            else:
781
                                # Wrong guess
782 0
                                pass
783
                                # TODO: not hit in tests
784 68
                            i += 4
785

786
                # Check am/pm
787 68
                elif info.ampm(l[i]) is not None:
788 68
                    value = info.ampm(l[i])
789 68
                    val_is_ampm = self._ampm_valid(res.hour, res.ampm, fuzzy)
790

791 68
                    if val_is_ampm:
792 68
                        res.hour = self._adjust_ampm(res.hour, value)
793 68
                        res.ampm = value
794

795 68
                    elif fuzzy:
796 68
                        skipped_idxs.append(i)
797

798
                # Check for a timezone name
799 68
                elif self._could_be_tzname(res.hour, res.tzname, res.tzoffset, l[i]):
800 68
                    res.tzname = l[i]
801 68
                    res.tzoffset = info.tzoffset(res.tzname)
802

803
                    # Check for something like GMT+3, or BRST+3. Notice
804
                    # that it doesn't mean "I am 3 hours after GMT", but
805
                    # "my time +3 is GMT". If found, we reverse the
806
                    # logic so that timezone parsing code will get it
807
                    # right.
808 68
                    if i + 1 < len_l and l[i + 1] in ('+', '-'):
809 0
                        l[i + 1] = ('+', '-')[l[i + 1] == '+']
810 0
                        res.tzoffset = None
811 0
                        if info.utczone(res.tzname):
812
                            # With something like GMT+3, the timezone
813
                            # is *not* GMT.
814 0
                            res.tzname = None
815

816
                # Check for a numbered timezone
817 68
                elif res.hour is not None and l[i] in ('+', '-'):
818 68
                    signal = (-1, 1)[l[i] == '+']
819 68
                    len_li = len(l[i + 1])
820

821
                    # TODO: check that l[i + 1] is integer?
822 68
                    if len_li == 4:
823
                        # -0300
824 68
                        hour_offset = int(l[i + 1][:2])
825 68
                        min_offset = int(l[i + 1][2:])
826 68
                    elif i + 2 < len_l and l[i + 2] == ':':
827
                        # -03:00
828 68
                        hour_offset = int(l[i + 1])
829 68
                        min_offset = int(l[i + 3])  # TODO: Check that l[i+3] is minute-like?
830 68
                        i += 2
831 0
                    elif len_li <= 2:
832
                        # -[0]3
833 0
                        hour_offset = int(l[i + 1][:2])
834 0
                        min_offset = 0
835
                    else:
836 0
                        raise ValueError(timestr)
837

838 68
                    res.tzoffset = signal * (hour_offset * 3600 + min_offset * 60)
839

840
                    # Look for a timezone name between parenthesis
841 68
                    if (i + 5 < len_l and
842
                            info.jump(l[i + 2]) and l[i + 3] == '(' and
843
                            l[i + 5] == ')' and
844
                            3 <= len(l[i + 4]) and
845
                            self._could_be_tzname(res.hour, res.tzname,
846
                                                  None, l[i + 4])):
847
                        # -0300 (BRST)
848 0
                        res.tzname = l[i + 4]
849 0
                        i += 4
850

851 68
                    i += 1
852

853
                # Check jumps
854 68
                elif not (info.jump(l[i]) or fuzzy):
855 68
                    raise ValueError(timestr)
856

857
                else:
858 68
                    skipped_idxs.append(i)
859 68
                i += 1
860

861
            # Process year/month/day
862 68
            year, month, day = ymd.resolve_ymd(yearfirst, dayfirst)
863

864 68
            res.century_specified = ymd.century_specified
865 68
            res.year = year
866 68
            res.month = month
867 68
            res.day = day
868

869 68
        except (IndexError, ValueError):
870 68
            return None, None
871

872 68
        if not info.validate(res):
873 0
            return None, None
874

875 68
        if fuzzy_with_tokens:
876 68
            skipped_tokens = self._recombine_skipped(l, skipped_idxs)
877 68
            return res, tuple(skipped_tokens)
878
        else:
879 68
            return res, None
880

881 68
    def _parse_numeric_token(self, tokens, idx, info, ymd, res, fuzzy):
882
        # Token is a number
883 68
        value_repr = tokens[idx]
884 68
        try:
885 68
            value = self._to_decimal(value_repr)
886 68
        except Exception as e:
887 68
            six.raise_from(ValueError('Unknown numeric token'), e)
888

889 68
        len_li = len(value_repr)
890

891 68
        len_l = len(tokens)
892

893 68
        if (len(ymd) == 3 and len_li in (2, 4) and
894
            res.hour is None and
895
            (idx + 1 >= len_l or
896
             (tokens[idx + 1] != ':' and
897
              info.hms(tokens[idx + 1]) is None))):
898
            # 19990101T23[59]
899 68
            s = tokens[idx]
900 68
            res.hour = int(s[:2])
901

902 68
            if len_li == 4:
903 68
                res.minute = int(s[2:])
904

905 68
        elif len_li == 6 or (len_li > 6 and tokens[idx].find('.') == 6):
906
            # YYMMDD or HHMMSS[.ss]
907 68
            s = tokens[idx]
908

909 68
            if not ymd and '.' not in tokens[idx]:
910 68
                ymd.append(s[:2])
911 68
                ymd.append(s[2:4])
912 68
                ymd.append(s[4:])
913
            else:
914
                # 19990101T235959[.59]
915

916
                # TODO: Check if res attributes already set.
917 68
                res.hour = int(s[:2])
918 68
                res.minute = int(s[2:4])
919 68
                res.second, res.microsecond = self._parsems(s[4:])
920

921 68
        elif len_li in (8, 12, 14):
922
            # YYYYMMDD
923 68
            s = tokens[idx]
924 68
            ymd.append(s[:4], 'Y')
925 68
            ymd.append(s[4:6])
926 68
            ymd.append(s[6:8])
927

928 68
            if len_li > 8:
929 68
                res.hour = int(s[8:10])
930 68
                res.minute = int(s[10:12])
931

932 68
                if len_li > 12:
933 68
                    res.second = int(s[12:])
934

935 68
        elif self._find_hms_idx(idx, tokens, info, allow_jump=True) is not None:
936
            # HH[ ]h or MM[ ]m or SS[.ss][ ]s
937 68
            hms_idx = self._find_hms_idx(idx, tokens, info, allow_jump=True)
938 68
            (idx, hms) = self._parse_hms(idx, tokens, info, hms_idx)
939 68
            if hms is not None:
940
                # TODO: checking that hour/minute/second are not
941
                # already set?
942 68
                self._assign_hms(res, value_repr, hms)
943

944 68
        elif idx + 2 < len_l and tokens[idx + 1] == ':':
945
            # HH:MM[:SS[.ss]]
946 68
            res.hour = int(value)
947 68
            value = self._to_decimal(tokens[idx + 2])  # TODO: try/except for this?
948 68
            (res.minute, res.second) = self._parse_min_sec(value)
949

950 68
            if idx + 4 < len_l and tokens[idx + 3] == ':':
951 68
                res.second, res.microsecond = self._parsems(tokens[idx + 4])
952

953 68
                idx += 2
954

955 68
            idx += 2
956

957 68
        elif idx + 1 < len_l and tokens[idx + 1] in ('-', '/', '.'):
958 68
            sep = tokens[idx + 1]
959 68
            ymd.append(value_repr)
960

961 68
            if idx + 2 < len_l and not info.jump(tokens[idx + 2]):
962 68
                if tokens[idx + 2].isdigit():
963
                    # 01-01[-01]
964 68
                    ymd.append(tokens[idx + 2])
965
                else:
966
                    # 01-Jan[-01]
967 68
                    value = info.month(tokens[idx + 2])
968

969 68
                    if value is not None:
970 68
                        ymd.append(value, 'M')
971
                    else:
972 0
                        raise ValueError()
973

974 68
                if idx + 3 < len_l and tokens[idx + 3] == sep:
975
                    # We have three members
976 68
                    value = info.month(tokens[idx + 4])
977

978 68
                    if value is not None:
979 68
                        ymd.append(value, 'M')
980
                    else:
981 68
                        ymd.append(tokens[idx + 4])
982 68
                    idx += 2
983

984 68
                idx += 1
985 68
            idx += 1
986

987 68
        elif idx + 1 >= len_l or info.jump(tokens[idx + 1]):
988 68
            if idx + 2 < len_l and info.ampm(tokens[idx + 2]) is not None:
989
                # 12 am
990 68
                hour = int(value)
991 68
                res.hour = self._adjust_ampm(hour, info.ampm(tokens[idx + 2]))
992 68
                idx += 1
993
            else:
994
                # Year, month or day
995 68
                ymd.append(value)
996 68
            idx += 1
997

998 68
        elif info.ampm(tokens[idx + 1]) is not None and (0 <= value < 24):
999
            # 12am
1000 68
            hour = int(value)
1001 68
            res.hour = self._adjust_ampm(hour, info.ampm(tokens[idx + 1]))
1002 68
            idx += 1
1003

1004 68
        elif ymd.could_be_day(value):
1005 68
            ymd.append(value)
1006

1007 68
        elif not fuzzy:
1008 68
            raise ValueError()
1009

1010 68
        return idx
1011

1012 68
    def _find_hms_idx(self, idx, tokens, info, allow_jump):
1013 68
        len_l = len(tokens)
1014

1015 68
        if idx+1 < len_l and info.hms(tokens[idx+1]) is not None:
1016
            # There is an "h", "m", or "s" label following this token.  We take
1017
            # assign the upcoming label to the current token.
1018
            # e.g. the "12" in 12h"
1019 68
            hms_idx = idx + 1
1020

1021 68
        elif (allow_jump and idx+2 < len_l and tokens[idx+1] == ' ' and
1022
              info.hms(tokens[idx+2]) is not None):
1023
            # There is a space and then an "h", "m", or "s" label.
1024
            # e.g. the "12" in "12 h"
1025 68
            hms_idx = idx + 2
1026

1027 68
        elif idx > 0 and info.hms(tokens[idx-1]) is not None:
1028
            # There is a "h", "m", or "s" preceding this token.  Since neither
1029
            # of the previous cases was hit, there is no label following this
1030
            # token, so we use the previous label.
1031
            # e.g. the "04" in "12h04"
1032 68
            hms_idx = idx-1
1033

1034 68
        elif (1 < idx == len_l-1 and tokens[idx-1] == ' ' and
1035
              info.hms(tokens[idx-2]) is not None):
1036
            # If we are looking at the final token, we allow for a
1037
            # backward-looking check to skip over a space.
1038
            # TODO: Are we sure this is the right condition here?
1039 68
            hms_idx = idx - 2
1040

1041
        else:
1042 68
            hms_idx = None
1043

1044 68
        return hms_idx
1045

1046 68
    def _assign_hms(self, res, value_repr, hms):
1047
        # See GH issue #427, fixing float rounding
1048 68
        value = self._to_decimal(value_repr)
1049

1050 68
        if hms == 0:
1051
            # Hour
1052 68
            res.hour = int(value)
1053 68
            if value % 1:
1054 68
                res.minute = int(60*(value % 1))
1055

1056 68
        elif hms == 1:
1057 68
            (res.minute, res.second) = self._parse_min_sec(value)
1058

1059 68
        elif hms == 2:
1060 68
            (res.second, res.microsecond) = self._parsems(value_repr)
1061

1062 68
    def _could_be_tzname(self, hour, tzname, tzoffset, token):
1063 68
        return (hour is not None and
1064
                tzname is None and
1065
                tzoffset is None and
1066
                len(token) <= 5 and
1067
                (all(x in string.ascii_uppercase for x in token)
1068
                 or token in self.info.UTCZONE))
1069

1070 68
    def _ampm_valid(self, hour, ampm, fuzzy):
1071
        """
1072
        For fuzzy parsing, 'a' or 'am' (both valid English words)
1073
        may erroneously trigger the AM/PM flag. Deal with that
1074
        here.
1075
        """
1076