_strptime.py revision 6e4150f36b74622a23ea0fd259bd61bbd753d3fd
1"""Strptime-related classes and functions.
2
3CLASSES:
4    LocaleTime -- Discovers and stores locale-specific time information
5    TimeRE -- Creates regexes for pattern matching a string of text containing
6                time information
7
8FUNCTIONS:
9    _getlang -- Figure out what language is being used for the locale
10    strptime -- Calculates the time struct represented by the passed-in string
11
12"""
13import time
14import locale
15import calendar
16from re import compile as re_compile
17from re import IGNORECASE
18from re import escape as re_escape
19from datetime import (date as datetime_date,
20                      timedelta as datetime_timedelta,
21                      timezone as datetime_timezone)
22try:
23    from _thread import allocate_lock as _thread_allocate_lock
24except ImportError:
25    from _dummy_thread import allocate_lock as _thread_allocate_lock
26
27__all__ = []
28
def _getlang():
    """Return the locale currently in effect for the LC_TIME category.

    The value is passed through unchanged from locale.getlocale().
    """
    lc_time_setting = locale.getlocale(locale.LC_TIME)
    return lc_time_setting
32
class LocaleTime(object):
    """Capture the locale-specific strings needed to parse times.

    ATTRIBUTES:
        f_weekday -- lowercased full weekday names (7-item list)
        a_weekday -- lowercased abbreviated weekday names (7-item list)
        f_month -- lowercased full month names (13-item list; entry [0] is
                    whatever calendar.month_name[0] yields)
        a_month -- lowercased abbreviated month names (13-item list; entry
                    [0] is whatever calendar.month_abbr[0] yields)
        am_pm -- lowercased AM/PM strings (2-item list)
        LC_date_time -- format string reconstructed from %c (string)
        LC_date -- format string reconstructed from %x (string)
        LC_time -- format string reconstructed from %X (string)
        timezone -- lowercased zone names as a 2-item tuple of frozensets:
                    (non-daylight-saving names, daylight-saving names)
        tzname -- value of time.tzname when the instance was built
        daylight -- value of time.daylight when the instance was built
        lang -- result of _getlang() when the instance was built
    """

    def __init__(self):
        """Populate every attribute from the current locale and timezone.

        The helpers run in a fixed order because the date/time format
        reconstruction relies on the weekday, month, AM/PM and timezone
        values computed before it.

        The locale is recorded at the outset and compared again at the end
        so that an instance is never built from a mix of two locales (for
        example when another thread switches the locale half-way through).
        time.tzname and time.daylight get the same treatment.  A mismatch
        raises ValueError.
        """
        self.lang = _getlang()
        self.__calc_weekday()
        self.__calc_month()
        self.__calc_am_pm()
        self.__calc_timezone()
        self.__calc_date_time()

        locale_is_stable = _getlang() == self.lang
        if not locale_is_stable:
            raise ValueError("locale changed during initialization")
        timezone_is_stable = (time.tzname == self.tzname and
                              time.daylight == self.daylight)
        if not timezone_is_stable:
            raise ValueError("timezone changed during initialization")

    def __pad(self, seq, front):
        # Return a list copy of seq with '' added at the front when front is
        # true, otherwise at the back.
        padded = list(seq)
        position = 0 if front else len(padded)
        padded.insert(position, '')
        return padded

    def __calc_weekday(self):
        # Fill in self.a_weekday and self.f_weekday from the calendar module.
        weekday_numbers = range(7)
        self.a_weekday = [calendar.day_abbr[number].lower()
                          for number in weekday_numbers]
        self.f_weekday = [calendar.day_name[number].lower()
                          for number in weekday_numbers]

    def __calc_month(self):
        # Fill in self.a_month and self.f_month from the calendar module.
        month_numbers = range(13)
        self.a_month = [calendar.month_abbr[number].lower()
                        for number in month_numbers]
        self.f_month = [calendar.month_name[number].lower()
                        for number in month_numbers]

    def __calc_am_pm(self):
        # Fill in self.am_pm by formatting one morning hour and one evening
        # hour of the reference date (1999-03-17) with "%p".
        self.am_pm = [
            time.strftime(
                "%p",
                time.struct_time((1999, 3, 17, hour, 44, 55, 2, 76, 0))
            ).lower()
            for hour in (1, 22)
        ]

    def __calc_date_time(self):
        # Fill in self.LC_date_time, self.LC_date and self.LC_time.
        #
        # The reference date 1999-03-17 22:44:55 (a Wednesday, day 76 of the
        # year) is formatted with %c, %x and %X; the known values are then
        # searched for in the output and swapped for the directive that
        # produced them.  The order of the substitutions matters: longer and
        # less ambiguous values are replaced first.
        directives = ('%c', '%x', '%X')
        reference = time.struct_time((1999, 3, 17, 22, 44, 55, 2, 76, 0))
        rendered = [time.strftime(directive, reference).lower()
                    for directive in directives]

        substitutions = [
            ('%', '%%'),
            (self.f_weekday[2], '%A'),
            (self.f_month[3], '%B'),
            (self.a_weekday[2], '%a'),
            (self.a_month[3], '%b'),
            (self.am_pm[1], '%p'),
            ('1999', '%Y'),
            ('99', '%y'),
            ('22', '%H'),
            ('44', '%M'),
            ('55', '%S'),
            ('76', '%j'),
            ('17', '%d'),
            ('03', '%m'),
            # '3' covers a month printed without a leading zero.
            ('3', '%m'),
            ('2', '%w'),
            ('10', '%I'),
        ]
        for zone_names in self.timezone:
            for zone in zone_names:
                substitutions.append((zone, "%Z"))

        # 1999-01-03 is a Sunday that falls before the first Monday of the
        # year, so a Monday-based week number (%W) prints it as week 00
        # while a Sunday-based one (%U) does not.
        week_probe = time.struct_time((1999, 1, 3, 1, 1, 1, 6, 3, 0))

        reconstructed = []
        for directive, text in zip(directives, rendered):
            for literal, replacement in substitutions:
                # Missing locale information shows up as an empty string
                # (e.g. no AM/PM text, or empty timezone names); replacing
                # '' would corrupt the format, so skip it.
                if literal:
                    text = text.replace(literal, replacement)
            if '00' in time.strftime(directive, week_probe):
                week_directive = '%W'
            else:
                week_directive = '%U'
            # The reference date falls in week 11 of the year.
            reconstructed.append(text.replace('11', week_directive))

        self.LC_date_time, self.LC_date, self.LC_time = reconstructed

    def __calc_timezone(self):
        # Fill in self.tzname, self.daylight and self.timezone from the time
        # module.  The case where both names in time.tzname are equal while
        # time.daylight is set is left for strptime to deal with.
        try:
            time.tzset()
        except AttributeError:
            # time.tzset is not available on every platform.
            pass
        self.tzname = time.tzname
        self.daylight = time.daylight
        standard_names = frozenset({"utc", "gmt", self.tzname[0].lower()})
        saving_names = (frozenset({self.tzname[1].lower()})
                        if self.daylight else frozenset())
        self.timezone = (standard_names, saving_names)
180
181
class TimeRE(dict):
    """Map strptime directive letters to the regex fragment matching them."""

    def __init__(self, locale_time=None):
        """Fill the mapping with one regex fragment per directive.

        locale_time supplies the locale-dependent names; when it is omitted
        (or falsy) a fresh LocaleTime() is created.  The composite
        directives (%c, %x, %X) are added last because their patterns are
        assembled from the entries stored before them.
        """
        self.locale_time = locale_time or LocaleTime()
        names = self.locale_time

        table = {}
        # The " [1-9]" alternative lets a space-padded day (as produced by
        # ANSI C's %c) match.
        table['d'] = r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])"
        table['f'] = r"(?P<f>[0-9]{1,6})"
        table['H'] = r"(?P<H>2[0-3]|[0-1]\d|\d)"
        table['I'] = r"(?P<I>1[0-2]|0[1-9]|[1-9])"
        table['G'] = r"(?P<G>\d\d\d\d)"
        table['j'] = r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])"
        table['m'] = r"(?P<m>1[0-2]|0[1-9]|[1-9])"
        table['M'] = r"(?P<M>[0-5]\d|\d)"
        table['S'] = r"(?P<S>6[0-1]|[0-5]\d|\d)"
        table['U'] = r"(?P<U>5[0-3]|[0-4]\d|\d)"
        table['w'] = r"(?P<w>[0-6])"
        table['u'] = r"(?P<u>[1-7])"
        table['V'] = r"(?P<V>5[0-3]|0[1-9]|[1-4]\d|\d)"
        # 'W' is derived from 'U' further down.
        table['y'] = r"(?P<y>\d\d)"
        # XXX: should 'Y' cope with fewer or more than 4 digits?
        table['Y'] = r"(?P<Y>\d\d\d\d)"
        table['z'] = r"(?P<z>[+-]\d\d[0-5]\d)"

        # Directives whose accepted values come from the locale.
        table['A'] = self.__seqToRE(names.f_weekday, 'A')
        table['a'] = self.__seqToRE(names.a_weekday, 'a')
        table['B'] = self.__seqToRE(names.f_month[1:], 'B')
        table['b'] = self.__seqToRE(names.a_month[1:], 'b')
        table['p'] = self.__seqToRE(names.am_pm, 'p')
        table['Z'] = self.__seqToRE(
            (zone for zone_group in names.timezone for zone in zone_group),
            'Z')
        table['%'] = '%'
        super().__init__(table)

        # %W accepts the same numbers as %U; only the group name differs.
        week_regex = super().__getitem__('U').replace('U', 'W')
        super().__setitem__('W', week_regex)
        for letter, locale_format in (('c', names.LC_date_time),
                                      ('x', names.LC_date),
                                      ('X', names.LC_time)):
            super().__setitem__(letter, self.pattern(locale_format))

    def __seqToRE(self, to_convert, directive):
        """Build a named-group alternation out of the given strings.

        Alternatives are ordered from longest to shortest so that a short
        value cannot match where a longer value starting with it should
        have (e.g. 'abc' matching when 'abcdef' was meant).  An empty
        string is returned when there is no non-empty value to match.
        """
        candidates = sorted(to_convert, key=len, reverse=True)
        if not any(candidate != '' for candidate in candidates):
            return ''
        alternation = '|'.join(map(re_escape, candidates))
        return '(?P<%s>%s)' % (directive, alternation)

    def pattern(self, format):
        """Return the regex source corresponding to the format string.

        Characters that carry meaning in a regex are backslash-escaped
        (re.escape cannot be used because the '%' directives must survive),
        every run of whitespace becomes r'\s+', and each '%x' directive is
        replaced by the fragment stored under 'x'.  An unknown directive
        raises KeyError; a '%' with nothing after it raises IndexError.
        """
        escaped = re_compile(r"([\\.^$*+?\(\){}\[\]|])").sub(r"\\\1", format)
        remaining = re_compile(r'\s+').sub(r'\\s+', escaped)
        pieces = []
        while '%' in remaining:
            literal, _, tail = remaining.partition('%')
            pieces.append(literal)
            # tail[0] is the directive letter.
            pieces.append(self[tail[0]])
            remaining = tail[1:]
        pieces.append(remaining)
        return ''.join(pieces)

    def compile(self, format):
        """Return a compiled, case-insensitive regex for the format string."""
        regex_source = self.pattern(format)
        return re_compile(regex_source, IGNORECASE)
276
# Lock that _strptime() holds while it reads or rebuilds the caches below.
_cache_lock = _thread_allocate_lock()
# DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock
# first!
# TimeRE built at import time; _strptime() replaces it when the locale or the
# timezone settings no longer match the ones it was built with.
_TimeRE_cache = TimeRE()
# _strptime() empties _regex_cache once it holds more than this many entries.
_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache
# Maps a format string to the compiled regex produced by _TimeRE_cache.
_regex_cache = {}
283
def _calc_julian_from_U_or_W(year, week_of_year, day_of_week, week_starts_Mon):
    """Return the day of the year for a year, week number and weekday.

    day_of_week counts from Monday == 0.  week_starts_Mon is true when the
    week number is Monday-based (%W) and false when it is Sunday-based (%U).
    The result can be zero or negative when week 0 is combined with a
    weekday that precedes January 1st.
    """
    jan1_weekday = datetime_date(year, 1, 1).weekday()
    if not week_starts_Mon:
        # For Sunday-based weeks, renumber the weekdays so that Sunday is
        # day 0; the rest of the arithmetic is then identical.
        jan1_weekday = (jan1_weekday + 1) % 7
        day_of_week = (day_of_week + 1) % 7

    if week_of_year == 0:
        # Week 0 is the partial week before the first full week; the answer
        # is simply the offset from January 1st.
        return 1 + day_of_week - jan1_weekday

    # Number of days that belong to week 0 (none when the year starts on the
    # first day of a week).
    partial_week_days = (7 - jan1_weekday) % 7
    full_weeks_before = week_of_year - 1
    return 1 + partial_week_days + 7 * full_weeks_before + day_of_week
303
304
def _calc_julian_from_V(iso_year, iso_week, iso_weekday):
    """Calculate the Julian day based on the ISO 8601 year, week, and weekday.

    ISO weeks start on Mondays, with week 01 being the week containing 4 Jan.
    ISO week days range from 1 (Monday) to 7 (Sunday).

    Returns a (year, ordinal) tuple where year is the *calendar* year the
    date falls in and ordinal is the 1-based day within that year.  The
    calendar year differs from iso_year when the ISO week straddles a year
    boundary: early days of week 01 may belong to the previous calendar
    year, and late days of the last week may belong to the next one.
    """
    # 4 Jan is always in ISO week 01, so its weekday tells us how far the
    # Monday of week 01 lies from 1 Jan.
    correction = datetime_date(iso_year, 1, 4).isoweekday() + 3
    ordinal = (iso_week * 7) + iso_weekday - correction
    if ordinal < 1:
        # The date is in the previous calendar year: re-express the ordinal
        # relative to 1 Jan of that year.
        ordinal += datetime_date(iso_year, 1, 1).toordinal()
        iso_year -= 1
        ordinal -= datetime_date(iso_year, 1, 1).toordinal()
    else:
        # The date may spill over into the next calendar year (e.g.
        # 2020-W53-5 is 2021-01-01); normalise so the ordinal never exceeds
        # the length of the returned year.
        days_in_year = 366 if calendar.isleap(iso_year) else 365
        if ordinal > days_in_year:
            ordinal -= days_in_year
            iso_year += 1
    return iso_year, ordinal
319
320
def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
    """Return a 2-tuple consisting of a time struct and an int containing
    the number of microseconds based on the input string and the
    format string.

    The first item is a plain 11-tuple:
        (year, month, day, hour, minute, second, weekday, julian,
         tz, tzname, gmtoff)
    where weekday counts from Monday == 0, julian is the 1-based day of the
    year, tz is -1 unless %Z matched a known zone name (then 0 for a
    non-daylight-saving name and 1 for a daylight-saving name), tzname is
    the text matched by %Z (or None) and gmtoff is the %z offset in seconds
    (or None).  The second item is the %f value scaled to microseconds.

    Raises TypeError when either argument is not a str (the message reports
    the zero-based argument position) and ValueError when the format is
    invalid, the data does not match it, or the directives are combined in
    an unsupported way.
    """

    for index, arg in enumerate([data_string, format]):
        if not isinstance(arg, str):
            msg = "strptime() argument {} must be str, not {}"
            raise TypeError(msg.format(index, type(arg)))

    global _TimeRE_cache, _regex_cache
    with _cache_lock:
        locale_time = _TimeRE_cache.locale_time
        # Rebuild the directive table (and drop every cached regex) if the
        # locale or timezone settings differ from those it was built with.
        if (_getlang() != locale_time.lang or
            time.tzname != locale_time.tzname or
            time.daylight != locale_time.daylight):
            _TimeRE_cache = TimeRE()
            _regex_cache.clear()
            locale_time = _TimeRE_cache.locale_time
        # Crude size bound: start over once the cache has grown too large.
        if len(_regex_cache) > _CACHE_MAX_SIZE:
            _regex_cache.clear()
        format_regex = _regex_cache.get(format)
        if not format_regex:
            try:
                format_regex = _TimeRE_cache.compile(format)
            # KeyError raised when a bad format is found; the key can be a
            # backslash, in which case it was a stray % followed by
            # whitespace (whitespace is rewritten to \s+ before the
            # directives are looked up).
            except KeyError as err:
                bad_directive = err.args[0]
                if bad_directive == "\\":
                    bad_directive = "%"
                del err
                raise ValueError("'%s' is a bad directive in format '%s'" %
                                    (bad_directive, format)) from None
            # IndexError occurs when the format string ends with a lone "%",
            # i.e. there is no directive character after it.
            except IndexError:
                raise ValueError("stray %% in format '%s'" % format) from None
            _regex_cache[format] = format_regex
    found = format_regex.match(data_string)
    if not found:
        raise ValueError("time data %r does not match format %r" %
                         (data_string, format))
    # match() only anchors at the start, so trailing text must be rejected
    # explicitly.
    if len(data_string) != found.end():
        raise ValueError("unconverted data remains: %s" %
                          data_string[found.end():])

    iso_year = year = None
    month = day = 1
    hour = minute = second = fraction = 0
    # tz stays -1 unless %Z matches a known timezone name.
    tz = -1
    tzoffset = None
    # Week numbers stay None unless the matching directive (%U, %W or %V)
    # is present in the format.
    iso_week = week_of_year = None
    week_of_year_start = None
    # weekday and julian defaulted to None so as to signal need to calculate
    # values
    weekday = julian = None
    found_dict = found.groupdict()
    for group_key in found_dict.keys():
        # Directives not explicitly handled below:
        #   c, x, X
        #      handled by making out of other directives
        #   U, W
        #      worthless without day of the week
        if group_key == 'y':
            year = int(found_dict['y'])
            # Open Group specification for strptime() states that a %y
            #value in the range of [00, 68] is in the century 2000, while
            #[69,99] is in the century 1900
            if year <= 68:
                year += 2000
            else:
                year += 1900
        elif group_key == 'Y':
            year = int(found_dict['Y'])
        elif group_key == 'G':
            iso_year = int(found_dict['G'])
        elif group_key == 'm':
            month = int(found_dict['m'])
        elif group_key == 'B':
            # Month lists have 13 entries, so the index is the month number.
            month = locale_time.f_month.index(found_dict['B'].lower())
        elif group_key == 'b':
            month = locale_time.a_month.index(found_dict['b'].lower())
        elif group_key == 'd':
            day = int(found_dict['d'])
        elif group_key == 'H':
            hour = int(found_dict['H'])
        elif group_key == 'I':
            hour = int(found_dict['I'])
            ampm = found_dict.get('p', '').lower()
            # If there was no AM/PM indicator, we'll treat this like AM
            if ampm in ('', locale_time.am_pm[0]):
                # We're in AM so the hour is correct unless we're
                # looking at 12 midnight.
                # 12 midnight == 12 AM == hour 0
                if hour == 12:
                    hour = 0
            elif ampm == locale_time.am_pm[1]:
                # We're in PM so we need to add 12 to the hour unless
                # we're looking at 12 noon.
                # 12 noon == 12 PM == hour 12
                if hour != 12:
                    hour += 12
        elif group_key == 'M':
            minute = int(found_dict['M'])
        elif group_key == 'S':
            second = int(found_dict['S'])
        elif group_key == 'f':
            s = found_dict['f']
            # Pad to always return microseconds.
            s += "0" * (6 - len(s))
            fraction = int(s)
        elif group_key == 'A':
            weekday = locale_time.f_weekday.index(found_dict['A'].lower())
        elif group_key == 'a':
            weekday = locale_time.a_weekday.index(found_dict['a'].lower())
        elif group_key == 'w':
            # %w counts from Sunday == 0; convert to Monday == 0.
            weekday = int(found_dict['w'])
            if weekday == 0:
                weekday = 6
            else:
                weekday -= 1
        elif group_key == 'u':
            # %u counts from Monday == 1; convert to Monday == 0.
            weekday = int(found_dict['u'])
            weekday -= 1
        elif group_key == 'j':
            julian = int(found_dict['j'])
        elif group_key in ('U', 'W'):
            week_of_year = int(found_dict[group_key])
            if group_key == 'U':
                # U starts week on Sunday.
                week_of_year_start = 6
            else:
                # W starts week on Monday.
                week_of_year_start = 0
        elif group_key == 'V':
            iso_week = int(found_dict['V'])
        elif group_key == 'z':
            # The regex guarantees the shape [+-]HHMM; the offset is kept
            # in minutes here and converted to seconds further down.
            z = found_dict['z']
            tzoffset = int(z[1:3]) * 60 + int(z[3:5])
            if z.startswith("-"):
                tzoffset = -tzoffset
        elif group_key == 'Z':
            # Since -1 is default value only need to worry about setting tz if
            # it can be something other than -1.
            found_zone = found_dict['Z'].lower()
            for value, tz_values in enumerate(locale_time.timezone):
                if found_zone in tz_values:
                    # Deal with bad locale setup where timezone names are the
                    # same and yet time.daylight is true; too ambiguous to
                    # be able to tell what timezone has daylight savings
                    if (time.tzname[0] == time.tzname[1] and
                       time.daylight and found_zone not in ("utc", "gmt")):
                        break
                    else:
                        tz = value
                        break
    # Deal with the cases where ambiguities arise
    # don't assume default values for ISO week/year
    if year is None and iso_year is not None:
        if iso_week is None or weekday is None:
            raise ValueError("ISO year directive '%G' must be used with "
                             "the ISO week directive '%V' and a weekday "
                             "directive ('%A', '%a', '%w', or '%u').")
        if julian is not None:
            raise ValueError("Day of the year directive '%j' is not "
                             "compatible with ISO year directive '%G'. "
                             "Use '%Y' instead.")
    elif week_of_year is None and iso_week is not None:
        if weekday is None:
            raise ValueError("ISO week directive '%V' must be used with "
                             "the ISO year directive '%G' and a weekday "
                             "directive ('%A', '%a', '%w', or '%u').")
        else:
            raise ValueError("ISO week directive '%V' is incompatible with "
                             "the year directive '%Y'. Use the ISO year '%G' "
                             "instead.")

    leap_year_fix = False
    if year is None and month == 2 and day == 29:
        # Feb 29 does not exist in the default year 1900, so compute with a
        # leap year and restore 1900 just before returning.
        year = 1904  # 1904 is first leap year of 20th century
        leap_year_fix = True
    elif year is None:
        year = 1900


    # If we know the week of the year and what day of that week, we can figure
    # out the Julian day of the year.
    if julian is None and weekday is not None:
        if week_of_year is not None:
            week_starts_Mon = True if week_of_year_start == 0 else False
            julian = _calc_julian_from_U_or_W(year, week_of_year, weekday,
                                                week_starts_Mon)
        elif iso_year is not None and iso_week is not None:
            year, julian = _calc_julian_from_V(iso_year, iso_week, weekday + 1)
        # A zero or negative day of the year means the date belongs to the
        # previous year; shift it by that year's length.
        if julian is not None and julian <= 0:
            year -= 1
            yday = 366 if calendar.isleap(year) else 365
            julian += yday

    if julian is None:
        # Cannot pre-calculate datetime_date() since can change in Julian
        # calculation and thus could have different value for the day of
        # the week calculation.
        # Need to add 1 to result since first day of the year is 1, not 0.
        julian = datetime_date(year, month, day).toordinal() - \
                  datetime_date(year, 1, 1).toordinal() + 1
    else:  # Assume that if they bothered to include Julian day (or if it was
           # calculated above with year/week/weekday) it will be accurate.
        datetime_result = datetime_date.fromordinal(
                            (julian - 1) +
                            datetime_date(year, 1, 1).toordinal())
        year = datetime_result.year
        month = datetime_result.month
        day = datetime_result.day
    if weekday is None:
        weekday = datetime_date(year, month, day).weekday()
    # Add timezone info
    tzname = found_dict.get("Z")
    if tzoffset is not None:
        # tzoffset is in minutes; gmtoff is reported in seconds.
        gmtoff = tzoffset * 60
    else:
        gmtoff = None

    if leap_year_fix:
        # the caller didn't supply a year but asked for Feb 29th. We couldn't
        # use the default of 1900 for computations. We set it back to ensure
        # that February 29th is smaller than March 1st.
        year = 1900

    return (year, month, day,
            hour, minute, second,
            weekday, julian, tz, tzname, gmtoff), fraction
555
def _strptime_time(data_string, format="%a %b %d %H:%M:%S %Y"):
    """Parse data_string according to format and return a time.struct_time.

    The microsecond part produced by _strptime() is discarded, and the
    parsed tuple is cut down to the number of fields struct_time accepts.
    """
    parsed_fields, _microseconds = _strptime(data_string, format)
    field_count = time._STRUCT_TM_ITEMS
    return time.struct_time(parsed_fields[:field_count])
561
def _strptime_datetime(cls, data_string, format="%a %b %d %H:%M:%S %Y"):
    """Parse data_string according to format and return a cls instance.

    cls is called with year, month, day, hour, minute, second and
    microsecond.  When the parsed data carried a UTC offset, a fixed-offset
    timezone (named after the parsed zone name, if there was one) is passed
    as an additional final argument.
    """
    fields, microseconds = _strptime(data_string, format)
    zone_name, utc_offset_seconds = fields[-2:]
    ctor_args = fields[:6] + (microseconds,)
    if utc_offset_seconds is None:
        return cls(*ctor_args)

    offset = datetime_timedelta(seconds=utc_offset_seconds)
    if zone_name:
        zone = datetime_timezone(offset, zone_name)
    else:
        zone = datetime_timezone(offset)
    return cls(*ctor_args, zone)
577