_strptime.py revision f568218e7e6211fa93d390eb327379776962867e
1"""Strptime-related classes and functions.
2
3CLASSES:
4    LocaleTime -- Discovers and stores locale-specific time information
5    TimeRE -- Creates regexes for pattern matching a string of text containing
6                time information
7
8FUNCTIONS:
9    _getlang -- Figure out what language is being used for the locale
10    strptime -- Calculates the time struct represented by the passed-in string
11
12"""
13import time
14import locale
15import calendar
16from re import compile as re_compile
17from re import IGNORECASE, ASCII
18from re import escape as re_escape
19from datetime import (date as datetime_date,
20                      timedelta as datetime_timedelta,
21                      timezone as datetime_timezone)
22try:
23    from _thread import allocate_lock as _thread_allocate_lock
24except:
25    from _dummy_thread import allocate_lock as _thread_allocate_lock
26
27__all__ = []
28
def _getlang():
    """Return the current LC_TIME locale setting.

    The value is whatever locale.getlocale() reports for the LC_TIME
    category; callers compare it against a previously stored value to
    detect that the locale has changed.
    """
    time_locale = locale.getlocale(locale.LC_TIME)
    return time_locale
32
class LocaleTime(object):
    """Snapshot of the locale-specific names and formats used for time.

    All names are stored lowercased so that look-ups can be done
    case-insensitively.

    ATTRIBUTES:
        f_weekday -- full weekday names (7-item list)
        a_weekday -- abbreviated weekday names (7-item list)
        f_month -- full month names (13-item list; index 0 is a dummy entry
                    as provided by the calendar module)
        a_month -- abbreviated month names (13-item list; index 0 is a dummy
                    entry as provided by the calendar module)
        am_pm -- AM/PM representation (2-item list)
        LC_date_time -- format string for date/time representation (string)
        LC_date -- format string for date representation (string)
        LC_time -- format string for time representation (string)
        timezone -- non-daylight- and daylight-savings timezone names
                    (2-item tuple of frozensets, in that order)
        lang -- value of _getlang() when the instance was built
    """

    def __init__(self):
        """Populate every attribute from the current locale.

        The helpers must run in this order: the date/time format
        calculation relies on the weekday, month, AM/PM and timezone
        attributes already being present.

        The locale is recorded at the outset and checked again at the end.
        If it differs, the attributes may be a mix of two locales (for
        example because another thread changed the locale in the meantime),
        so ValueError is raised rather than keeping inconsistent data.
        Callers should still guard locale changes with their own locking;
        this check is only a safety net.

        Changing the timezone without calling time.tzset() is not detected
        here; that is left to the programmer.
        """
        self.lang = _getlang()
        self.__calc_weekday()
        self.__calc_month()
        self.__calc_am_pm()
        self.__calc_timezone()
        self.__calc_date_time()
        if self.lang != _getlang():
            raise ValueError("locale changed during initialization")

    def __pad(self, seq, front):
        # Return a list copy of seq with '' added at the front when `front`
        # is true, otherwise at the back.
        padded = list(seq)
        padded.insert(0 if front else len(padded), '')
        return padded

    def __calc_weekday(self):
        # Fill self.a_weekday and self.f_weekday from the calendar module,
        # lowercased, indices 0 through 6.
        abbreviated = []
        full = []
        for day_index in range(7):
            abbreviated.append(calendar.day_abbr[day_index].lower())
            full.append(calendar.day_name[day_index].lower())
        self.a_weekday = abbreviated
        self.f_weekday = full

    def __calc_month(self):
        # Fill self.a_month and self.f_month from the calendar module,
        # lowercased, indices 0 through 12.
        abbreviated = []
        full = []
        for month_index in range(13):
            abbreviated.append(calendar.month_abbr[month_index].lower())
            full.append(calendar.month_name[month_index].lower())
        self.a_month = abbreviated
        self.f_month = full

    def __calc_am_pm(self):
        # Fill self.am_pm with what time.strftime("%p") produces for a
        # morning hour (1) and an evening hour (22) of the sample date that
        # is used throughout this class.
        self.am_pm = [
            time.strftime(
                "%p",
                time.struct_time((1999, 3, 17, hour, 44, 55, 2, 76, 0))
            ).lower()
            for hour in (1, 22)
        ]

    def __calc_date_time(self):
        # Derive self.LC_date_time, self.LC_date and self.LC_time by
        # formatting a sample date with %c, %x and %X and then replacing
        # each recognisable piece of the output with the directive that
        # produces it.
        #
        # The sample date is chosen so that few of its numbers coincide.
        # The order of the substitutions is significant: earlier entries
        # win, which resolves what an otherwise ambiguous piece of text
        # stands for.
        sample = time.struct_time((1999, 3, 17, 22, 44, 55, 2, 76, 0))
        # A Sunday early in January: its week number tells Monday-based
        # numbering ("00") apart from Sunday-based numbering.
        week_probe = time.struct_time((1999, 1, 3, 1, 1, 1, 6, 3, 0))
        substitutions = [
            ('%', '%%'),
            (self.f_weekday[2], '%A'),
            (self.f_month[3], '%B'),
            (self.a_weekday[2], '%a'),
            (self.a_month[3], '%b'),
            (self.am_pm[1], '%p'),
            ('1999', '%Y'),
            ('99', '%y'),
            ('22', '%H'),
            ('44', '%M'),
            ('55', '%S'),
            ('76', '%j'),
            ('17', '%d'),
            ('03', '%m'),
            # '3' needed for when no leading zero.
            ('3', '%m'),
            ('2', '%w'),
            ('10', '%I'),
        ]
        for zone_names in self.timezone:
            for zone in zone_names:
                substitutions.append((zone, "%Z"))
        formats = []
        for directive in ('%c', '%x', '%X'):
            fmt = time.strftime(directive, sample).lower()
            for literal, replacement in substitutions:
                # Missing locale information shows up as an empty string
                # (e.g. no AM/PM text, or empty timezone names); replacing
                # '' would corrupt the format, so skip those entries.
                if literal:
                    fmt = fmt.replace(literal, replacement)
            if '00' in time.strftime(directive, week_probe):
                week_directive = '%W'
            else:
                week_directive = '%U'
            # '11' is the week number of the sample date.
            formats.append(fmt.replace('11', week_directive))
        self.LC_date_time, self.LC_date, self.LC_time = formats

    def __calc_timezone(self):
        # Fill self.timezone from time.tzname.  The case where both names
        # are equal while time.daylight is set is not handled here; the
        # parsing code deals with it.
        try:
            time.tzset()
        except AttributeError:
            # time.tzset() is not available on every platform.
            pass
        standard_names = frozenset(["utc", "gmt", time.tzname[0].lower()])
        daylight_names = (frozenset([time.tzname[1].lower()])
                          if time.daylight else frozenset())
        self.timezone = (standard_names, daylight_names)
176
177
class TimeRE(dict):
    """Handle conversion from format directives to regexes.

    The instance is a dict that maps a directive letter (the character
    following '%' in a format string) to the regex source that matches it.
    Most entries wrap their pattern in a named group carrying the directive
    letter, which is how the parsing code finds the matched values.
    """

    def __init__(self, locale_time=None):
        """Create keys/values.

        locale_time is the object supplying the locale-dependent names and
        formats; when it is omitted (or false) a new LocaleTime() is built.

        Order of execution is important for dependency reasons: the
        compound directives (%c, %x, %X) are expanded through pattern(),
        which needs the simple directives to be in place already.

        """
        if locale_time:
            self.locale_time = locale_time
        else:
            self.locale_time = LocaleTime()
        base = super()
        base.__init__({
            # The " \d" part of the regex is to make %c from ANSI C work
            'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
            'f': r"(?P<f>[0-9]{1,6})",
            'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
            'I': r"(?P<I>1[0-2]|0[1-9]|[1-9])",
            'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
            'm': r"(?P<m>1[0-2]|0[1-9]|[1-9])",
            'M': r"(?P<M>[0-5]\d|\d)",
            'S': r"(?P<S>6[0-1]|[0-5]\d|\d)",
            'U': r"(?P<U>5[0-3]|[0-4]\d|\d)",
            'w': r"(?P<w>[0-6])",
            # W is set below by using 'U'
            'y': r"(?P<y>\d\d)",
            #XXX: Does 'Y' need to worry about having less or more than
            #     4 digits?
            'Y': r"(?P<Y>\d\d\d\d)",
            'z': r"(?P<z>[+-]\d\d[0-5]\d)",
            'A': self.__seqToRE(self.locale_time.f_weekday, 'A'),
            'a': self.__seqToRE(self.locale_time.a_weekday, 'a'),
            # Index 0 of the month lists is a dummy entry; skip it.
            'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'),
            'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'),
            'p': self.__seqToRE(self.locale_time.am_pm, 'p'),
            'Z': self.__seqToRE((tz for tz_names in self.locale_time.timezone
                                        for tz in tz_names),
                                'Z'),
            '%': '%'})
        base.__setitem__('W', base.__getitem__('U').replace('U', 'W'))
        base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
        base.__setitem__('x', self.pattern(self.locale_time.LC_date))
        base.__setitem__('X', self.pattern(self.locale_time.LC_time))

    def __seqToRE(self, to_convert, directive):
        """Convert an iterable of strings to a regex for a directive.

        The result is a named group (named after the directive) whose
        alternatives are the escaped values, ordered from longest to
        shortest.  That ordering prevents a value that is a prefix of a
        longer value from matching first (e.g., 'abc' matching when
        'abcdef' should have been the match).

        An empty string is returned when every value is empty (or there
        are no values at all).

        """
        to_convert = sorted(to_convert, key=len, reverse=True)
        for value in to_convert:
            if value != '':
                break
        else:
            return ''
        regex = '|'.join(re_escape(stuff) for stuff in to_convert)
        regex = '(?P<%s>%s' % (directive, regex)
        return '%s)' % regex

    def pattern(self, format):
        """Return regex pattern for the format string.

        Characters that could be interpreted as regex syntax are escaped,
        every run of whitespace is turned into r'\\s+', and each '%x'
        directive is replaced by the regex stored under key 'x'.

        Raises KeyError for an unknown directive and IndexError when the
        format ends with a lone '%'.

        """
        processed_format = ''
        # The sub() call escapes all characters that might be misconstrued
        # as regex syntax.  Cannot use re.escape since we have to deal with
        # format directives (%m, etc.).
        regex_chars = re_compile(r"([\\.^$*+?\(\){}\[\]|])")
        format = regex_chars.sub(r"\\\1", format)
        whitespace_replacement = re_compile(r'\s+')
        # The replacement is a template, so the backslash has to be doubled
        # to emit the two characters '\s'; a bare '\s' is rejected as a bad
        # escape by the re module.
        format = whitespace_replacement.sub(r'\\s+', format)
        while '%' in format:
            directive_index = format.index('%')+1
            processed_format = "%s%s%s" % (processed_format,
                                           format[:directive_index-1],
                                           self[format[directive_index]])
            format = format[directive_index+1:]
        return "%s%s" % (processed_format, format)

    def compile(self, format):
        """Return a compiled, case-insensitive re object for the format
        string."""
        return re_compile(self.pattern(format), IGNORECASE)
269
# Module-level caches shared by every _strptime() call.  _cache_lock is the
# lock _strptime() holds while it reads or refreshes them.
_cache_lock = _thread_allocate_lock()
# DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock
# first!
# TimeRE instance built for the locale in effect at import time; _strptime()
# replaces it when it finds that the locale has changed.
_TimeRE_cache = TimeRE()
# Size threshold for _regex_cache: _strptime() empties the cache once it
# holds more than this many entries.
_CACHE_MAX_SIZE = 5
# Maps a format string to its compiled regex.
_regex_cache = {}
276
def _calc_julian_from_U_or_W(year, week_of_year, day_of_week, week_starts_Mon):
    """Return the day of the year for a week number and a day of the week.

    year -- the calendar year the week number refers to
    week_of_year -- week number as given by %U or %W (0 is the partial week
        before the first full week)
    day_of_week -- day of the week with Monday as 0
    week_starts_Mon -- true when weeks start on Monday (%W), false when
        they start on Sunday (%U)

    January 1st is day 1.  For week 0 the result can be zero or negative
    when the requested day lies before January 1st.
    """
    jan1_position = datetime_date(year, 1, 1).weekday()
    day_position = day_of_week
    if not week_starts_Mon:
        # For Sunday-based weeks, renumber the days so that Sunday is 0;
        # the arithmetic below then works for both conventions.
        jan1_position = (jan1_position + 1) % 7
        day_position = (day_of_week + 1) % 7
    if week_of_year == 0:
        # Week 0 exists only when the year does not begin on the first day
        # of the week; offsets are taken directly from January 1st.
        return 1 + day_position - jan1_position
    # Days of the year that belong to the partial week 0.
    partial_week_days = (7 - jan1_position) % 7
    full_weeks_before = week_of_year - 1
    return 1 + partial_week_days + 7 * full_weeks_before + day_position
296
297
def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
    """Parse data_string according to format.

    Returns a 2-tuple.  The first item is the tuple
    (year, month, day, hour, minute, second, weekday, julian, tz, gmtoff,
    tzname), where weekday counts from Monday as 0, julian is the day of
    the year starting at 1, tz is -1 when unknown, and gmtoff (seconds) and
    tzname are None when the corresponding directive was not used.  The
    second item is an int holding the number of microseconds.

    Raises TypeError when either argument is not a str, and ValueError when
    the format is invalid or the data does not match it.
    """

    for index, arg in enumerate([data_string, format]):
        if not isinstance(arg, str):
            msg = "strptime() argument {} must be str, not {}"
            raise TypeError(msg.format(index, type(arg)))

    global _TimeRE_cache, _regex_cache
    with _cache_lock:

        # A locale change invalidates everything derived from the old one.
        if _getlang() != _TimeRE_cache.locale_time.lang:
            _TimeRE_cache = TimeRE()
            _regex_cache.clear()
        if len(_regex_cache) > _CACHE_MAX_SIZE:
            _regex_cache.clear()
        locale_time = _TimeRE_cache.locale_time
        format_regex = _regex_cache.get(format)
        if not format_regex:
            try:
                format_regex = _TimeRE_cache.compile(format)
            # KeyError raised when a bad format is found; can be specified as
            # \\, in which case it was a stray % but with a space after it
            except KeyError as err:
                bad_directive = err.args[0]
                if bad_directive == "\\":
                    bad_directive = "%"
                del err
                raise ValueError("'%s' is a bad directive in format '%s'" %
                                    (bad_directive, format))
            # IndexError only occurs when the format string is "%"
            except IndexError:
                raise ValueError("stray %% in format '%s'" % format)
            _regex_cache[format] = format_regex
    found = format_regex.match(data_string)
    if not found:
        raise ValueError("time data %r does not match format %r" %
                         (data_string, format))
    if len(data_string) != found.end():
        raise ValueError("unconverted data remains: %s" %
                          data_string[found.end():])

    year = 1900
    month = day = 1
    hour = minute = second = fraction = 0
    tz = -1
    tzoffset = None
    # Default to -1 to signify that values not known; not critical to have,
    # though
    week_of_year = -1
    week_of_year_start = -1
    # weekday defaults to -1 and julian to None so as to signal the need to
    # calculate them.  julian cannot use -1 as its marker because the
    # week-based calculation below can legitimately produce -1.
    weekday = -1
    julian = None
    found_dict = found.groupdict()
    for group_key in found_dict.keys():
        # Directives not explicitly handled below:
        #   c, x, X
        #      handled by making out of other directives
        #   U, W
        #      worthless without day of the week
        if group_key == 'y':
            year = int(found_dict['y'])
            # Open Group specification for strptime() states that a %y
            #value in the range of [00, 68] is in the century 2000, while
            #[69,99] is in the century 1900
            if year <= 68:
                year += 2000
            else:
                year += 1900
        elif group_key == 'Y':
            year = int(found_dict['Y'])
        elif group_key == 'm':
            month = int(found_dict['m'])
        elif group_key == 'B':
            month = locale_time.f_month.index(found_dict['B'].lower())
        elif group_key == 'b':
            month = locale_time.a_month.index(found_dict['b'].lower())
        elif group_key == 'd':
            day = int(found_dict['d'])
        elif group_key == 'H':
            hour = int(found_dict['H'])
        elif group_key == 'I':
            hour = int(found_dict['I'])
            ampm = found_dict.get('p', '').lower()
            # If there was no AM/PM indicator, we'll treat this like AM
            if ampm in ('', locale_time.am_pm[0]):
                # We're in AM so the hour is correct unless we're
                # looking at 12 midnight.
                # 12 midnight == 12 AM == hour 0
                if hour == 12:
                    hour = 0
            elif ampm == locale_time.am_pm[1]:
                # We're in PM so we need to add 12 to the hour unless
                # we're looking at 12 noon.
                # 12 noon == 12 PM == hour 12
                if hour != 12:
                    hour += 12
        elif group_key == 'M':
            minute = int(found_dict['M'])
        elif group_key == 'S':
            second = int(found_dict['S'])
        elif group_key == 'f':
            s = found_dict['f']
            # Pad to always return microseconds.
            s += "0" * (6 - len(s))
            fraction = int(s)
        elif group_key == 'A':
            weekday = locale_time.f_weekday.index(found_dict['A'].lower())
        elif group_key == 'a':
            weekday = locale_time.a_weekday.index(found_dict['a'].lower())
        elif group_key == 'w':
            # %w counts from Sunday as 0; convert to Monday as 0.
            weekday = int(found_dict['w'])
            if weekday == 0:
                weekday = 6
            else:
                weekday -= 1
        elif group_key == 'j':
            julian = int(found_dict['j'])
        elif group_key in ('U', 'W'):
            week_of_year = int(found_dict[group_key])
            if group_key == 'U':
                # U starts week on Sunday.
                week_of_year_start = 6
            else:
                # W starts week on Monday.
                week_of_year_start = 0
        elif group_key == 'z':
            # +HHMM / -HHMM, converted to signed minutes.
            z = found_dict['z']
            tzoffset = int(z[1:3]) * 60 + int(z[3:5])
            if z.startswith("-"):
                tzoffset = -tzoffset
        elif group_key == 'Z':
            # Since -1 is default value only need to worry about setting tz if
            # it can be something other than -1.
            found_zone = found_dict['Z'].lower()
            for value, tz_values in enumerate(locale_time.timezone):
                if found_zone in tz_values:
                    # Deal with bad locale setup where timezone names are the
                    # same and yet time.daylight is true; too ambiguous to
                    # be able to tell what timezone has daylight savings
                    if (time.tzname[0] == time.tzname[1] and
                       time.daylight and found_zone not in ("utc", "gmt")):
                        break
                    else:
                        tz = value
                        break
    # If we know the week of the year and what day of that week, we can figure
    # out the Julian day of the year.
    if julian is None and week_of_year != -1 and weekday != -1:
        week_starts_Mon = week_of_year_start == 0
        julian = _calc_julian_from_U_or_W(year, week_of_year, weekday,
                                            week_starts_Mon)
    # Cannot pre-calculate datetime_date() since can change in Julian
    # calculation and thus could have different value for the day of the week
    # calculation.
    if julian is None:
        # Need to add 1 to result since first day of the year is 1, not 0.
        julian = datetime_date(year, month, day).toordinal() - \
                  datetime_date(year, 1, 1).toordinal() + 1
    else:  # Assume that if they bothered to include Julian day it will
           # be accurate.
        datetime_result = datetime_date.fromordinal(
            (julian - 1) + datetime_date(year, 1, 1).toordinal())
        year = datetime_result.year
        month = datetime_result.month
        day = datetime_result.day
        if julian <= 0:
            # A day in week 0 that falls before January 1st belongs to the
            # previous year; report its day of the year within that year
            # instead of a zero or negative number.
            julian = datetime_result.toordinal() - \
                      datetime_date(year, 1, 1).toordinal() + 1
    if weekday == -1:
        weekday = datetime_date(year, month, day).weekday()
    # Add timezone info
    tzname = found_dict.get("Z")
    if tzoffset is not None:
        # tzoffset is in minutes; gmtoff is reported in seconds.
        gmtoff = tzoffset * 60
    else:
        gmtoff = None

    return (year, month, day,
            hour, minute, second,
            weekday, julian, tz, gmtoff, tzname), fraction
478
def _strptime_time(data_string, format="%a %b %d %H:%M:%S %Y"):
    """Parse data_string according to format and return a time.struct_time.

    Only the first nine fields produced by _strptime() are used; the
    microseconds and the trailing timezone items are dropped.
    """
    fields, _microseconds = _strptime(data_string, format)
    return time.struct_time(fields[:9])
484
def _strptime_datetime(class_, data_string, format="%a %b %d %H:%M:%S %Y"):
    """Parse data_string according to format and return a class_ instance.

    class_ is called with year, month, day, hour, minute, second and
    microsecond.  When the parsed data carried a UTC offset, a
    datetime.timezone built from it (named after the parsed timezone name,
    if there was one) is passed as an additional final argument.
    """
    fields, microseconds = _strptime(data_string, format)
    utc_offset_seconds, zone_name = fields[-2:]
    ctor_args = fields[:6] + (microseconds,)
    if utc_offset_seconds is None:
        return class_(*ctor_args)

    offset = datetime_timedelta(seconds=utc_offset_seconds)
    if zone_name:
        zone = datetime_timezone(offset, zone_name)
    else:
        zone = datetime_timezone(offset)
    return class_(*(ctor_args + (zone,)))
500