# _strptime.py revision 1fdb6335304551b79838523811525d3c59d901ae
"""Strptime-related classes and functions.

CLASSES:
    LocaleTime -- Discovers and/or stores locale-specific time information
    TimeRE -- Creates regexes for pattern matching a string of text containing
                time information as is returned by time.strftime()

FUNCTIONS:
    _getlang -- Figure out what language is being used for the locale
    strptime -- Calculates the time struct represented by the passed-in string
    _insensitiveindex -- Case-insensitive search for a string within a list

Requires Python 2.2.1 or higher (mainly because of the use of property()).
Can be used in Python 2.2 if the following line is added:
    True = 1; False = 0
"""
import calendar
import locale
import time
from datetime import date as datetime_date
from re import IGNORECASE
from re import compile as re_compile
from re import escape as re_escape
22
# Module metadata.
__author__ = "Brett Cannon"
__email__ = "brett@python.org"

# Public API: only strptime() is exported by a star-import of this module.
__all__ = ['strptime']
27
28def _getlang():
29    # Figure out what the current language is set to.
30    current_lang = locale.getlocale(locale.LC_TIME)[0]
31    if current_lang:
32        return current_lang
33    else:
34        current_lang = locale.getdefaultlocale()[0]
35        if current_lang:
36            return current_lang
37        else:
38            return ''
39
class LocaleTime(object):
    """Stores and handles locale-specific information related to time.

    Every value may be supplied to the constructor; any value that is not
    supplied is discovered from the current locale the first time the
    matching attribute is read.

    ATTRIBUTES (all read-only after instance creation! Instance variables that
                store the values have mangled names):
        f_weekday -- full weekday names (7-item list)
        a_weekday -- abbreviated weekday names (7-item list)
        f_month -- full month names (13-item list; dummy value in [0], which
                    is added by code)
        a_month -- abbreviated month names (13-item list, dummy value in
                    [0], which is added by code)
        am_pm -- AM/PM representation (2-item list)
        LC_date_time -- format string for date/time representation (string)
        LC_date -- format string for date representation (string)
        LC_time -- format string for time representation (string)
        timezone -- daylight- and non-daylight-savings timezone representation
                    (3-item list; code tacks on blank item at end for
                    possible lack of timezone such as UTC)
        lang -- Language used by instance (string)
    """

    def __init__(self, f_weekday=None, a_weekday=None, f_month=None,
                 a_month=None, am_pm=None, LC_date_time=None, LC_time=None,
                 LC_date=None, timezone=None, lang=None):
        """Optionally set attributes with passed-in values.

        Weekday and month names are copied into new lists; month names get
        a dummy '' entry at index 0 and timezone names get one at the end.

        Raises TypeError if a supplied sequence has the wrong number of
        items (7 weekdays, 12 months, 2 AM/PM strings, 2 timezone names).
        """
        self.__f_weekday = self.__checked_list(
            f_weekday, 7, "full weekday names must be a 7-item sequence")
        self.__a_weekday = self.__checked_list(
            a_weekday, 7,
            "abbreviated weekday names must be a 7-item  sequence")
        self.__f_month = self.__checked_list(
            f_month, 12, "full month names must be a 12-item sequence",
            True)
        self.__a_month = self.__checked_list(
            a_month, 12,
            "abbreviated month names must be a 12-item sequence", True)
        if am_pm is not None and len(am_pm) != 2:
            raise TypeError("AM/PM representation must be a 2-item sequence")
        # Stored as passed in (not copied).
        self.__am_pm = am_pm
        self.__LC_date_time = LC_date_time
        self.__LC_time = LC_time
        self.__LC_date = LC_date
        # A false value (None or empty) is kept as-is so that the timezone
        # getter computes the real value on first access.
        self.__timezone = timezone
        if timezone:
            if len(timezone) != 2:
                raise TypeError("timezone names must contain 2 items")
            self.__timezone = self.__pad(timezone, False)
        self.__lang = lang

    def __checked_list(self, seq, length, message, pad_front=False):
        # Return None when seq is None; otherwise return a list copy of seq
        # (with '' inserted at index 0 when pad_front is true).  Raise
        # TypeError(message) when seq does not hold exactly `length` items.
        if seq is None:
            return None
        if len(seq) != length:
            raise TypeError(message)
        if pad_front:
            return self.__pad(seq, True)
        return list(seq)

    def __pad(self, seq, front):
        # Return a list copy of seq with '' added to the front (front is
        # true) or to the back (front is false).
        seq = list(seq)
        if front:
            seq.insert(0, '')
        else:
            seq.append('')
        return seq

    def __set_nothing(self, stuff):
        # Setter shared by every property; makes the attributes read-only.
        raise TypeError("attribute does not support assignment")

    def __get_f_weekday(self):
        # Fetch self.f_weekday, computing it on first use.
        if not self.__f_weekday:
            self.__calc_weekday()
        return self.__f_weekday

    def __get_a_weekday(self):
        # Fetch self.a_weekday, computing it on first use.
        if not self.__a_weekday:
            self.__calc_weekday()
        return self.__a_weekday

    f_weekday = property(__get_f_weekday, __set_nothing,
                         doc="Full weekday names")
    a_weekday = property(__get_a_weekday, __set_nothing,
                         doc="Abbreviated weekday names")

    def __get_f_month(self):
        # Fetch self.f_month, computing it on first use.
        if not self.__f_month:
            self.__calc_month()
        return self.__f_month

    def __get_a_month(self):
        # Fetch self.a_month, computing it on first use.
        if not self.__a_month:
            self.__calc_month()
        return self.__a_month

    f_month = property(__get_f_month, __set_nothing,
                       doc="Full month names (dummy value at index 0)")
    a_month = property(__get_a_month, __set_nothing,
                       doc="Abbreviated month names (dummy value at index 0)")

    def __get_am_pm(self):
        # Fetch self.am_pm, computing it on first use.
        if not self.__am_pm:
            self.__calc_am_pm()
        return self.__am_pm

    am_pm = property(__get_am_pm, __set_nothing, doc="AM/PM representation")

    def __get_timezone(self):
        # Fetch self.timezone, computing it on first use.
        if not self.__timezone:
            self.__calc_timezone()
        return self.__timezone

    timezone = property(__get_timezone, __set_nothing,
                        doc="Timezone representation (dummy value at index 2)")

    def __get_LC_date_time(self):
        # Fetch self.LC_date_time, computing it on first use.
        if not self.__LC_date_time:
            self.__calc_date_time()
        return self.__LC_date_time

    def __get_LC_date(self):
        # Fetch self.LC_date, computing it on first use.
        if not self.__LC_date:
            self.__calc_date_time()
        return self.__LC_date

    def __get_LC_time(self):
        # Fetch self.LC_time, computing it on first use.
        if not self.__LC_time:
            self.__calc_date_time()
        return self.__LC_time

    LC_date_time = property(
        __get_LC_date_time, __set_nothing,
        doc=
        "Format string for locale's date/time representation ('%c' format)")
    LC_date = property(__get_LC_date, __set_nothing,
        doc="Format string for locale's date representation ('%x' format)")
    LC_time = property(__get_LC_time, __set_nothing,
        doc="Format string for locale's time representation ('%X' format)")

    def __get_lang(self):
        # Fetch self.lang, computing it on first use.
        if not self.__lang:
            self.__calc_lang()
        return self.__lang

    lang = property(__get_lang, __set_nothing,
                    doc="Language used for instance")

    def __calc_weekday(self):
        # Set self.__a_weekday and self.__f_weekday using the calendar
        # module.  Values supplied to the constructor are left untouched.
        if not self.__a_weekday:
            self.__a_weekday = [calendar.day_abbr[i] for i in range(7)]
        if not self.__f_weekday:
            self.__f_weekday = [calendar.day_name[i] for i in range(7)]

    def __calc_month(self):
        # Set self.__f_month and self.__a_month using the calendar module.
        # Index 0 is the dummy entry, hence 13 items.
        if not self.__a_month:
            self.__a_month = [calendar.month_abbr[i] for i in range(13)]
        if not self.__f_month:
            self.__f_month = [calendar.month_name[i] for i in range(13)]

    def __calc_am_pm(self):
        # Set self.__am_pm by using time.strftime().

        # The magic date (1999,3,17,hour,44,55,2,76,0) is not really that
        # magical; just happened to have used it everywhere else where a
        # static date was needed.  Hour 1 yields the AM string and hour 22
        # the PM string.
        am_pm = []
        for hour in (1, 22):
            time_tuple = time.struct_time((1999,3,17,hour,44,55,2,76,0))
            am_pm.append(time.strftime("%p", time_tuple))
        self.__am_pm = am_pm

    def __calc_date_time(self):
        # Set self.__LC_date_time, self.__LC_date, & self.__LC_time by
        # formatting a known date with time.strftime() and translating each
        # recognizable piece of the output back into its directive.

        # Use (1999,3,17,22,44,55,2,76,0) for magic date because the amount of
        # overloaded numbers is minimized.  The order in which searches for
        # values within the format string is very important; it eliminates
        # possible ambiguity for what something represents.
        time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0))
        # Sunday 1999-01-03 comes before the first Monday of 1999, so a
        # format that uses %W renders it as week '00' while %U gives '01'.
        week_probe = time.struct_time((1999,1,3,1,1,1,6,3,0))
        replacement_pairs = (
            ('%', '%%'), (self.f_weekday[2], '%A'),
            (self.f_month[3], '%B'), (self.a_weekday[2], '%a'),
            (self.a_month[3], '%b'), (self.am_pm[1], '%p'),
            (self.timezone[0], '%Z'), (self.timezone[1], '%Z'),
            ('1999', '%Y'), ('99', '%y'), ('22', '%H'),
            ('44', '%M'), ('55', '%S'), ('76', '%j'),
            ('17', '%d'), ('03', '%m'), ('3', '%m'),
            # '3' needed for when no leading zero.
            ('2', '%w'), ('10', '%I'))
        date_time = []
        for directive in ('%c', '%x', '%X'):
            current_format = time.strftime(directive, time_tuple)
            for old, new in replacement_pairs:
                # Must deal with possible lack of locale info
                # manifesting itself as the empty string (e.g., Swedish's
                # lack of AM/PM info) or a platform returning a tuple of empty
                # strings (e.g., MacOS 9 having timezone as ('','')).
                if old:
                    current_format = current_format.replace(old, new)
            # Membership test, not str.find(): find() returns -1 (true) when
            # the text is missing and 0 (false) when it is found at index 0.
            if '00' in time.strftime(directive, week_probe):
                U_W = '%W'
            else:
                U_W = '%U'
            # '11' is the week number of the magic date.
            date_time.append(current_format.replace('11', U_W))
        if not self.__LC_date_time:
            self.__LC_date_time = date_time[0]
        if not self.__LC_date:
            self.__LC_date = date_time[1]
        if not self.__LC_time:
            self.__LC_time = date_time[2]

    def __calc_timezone(self):
        # Set self.__timezone by using time.tzname.
        #
        # Empty string used for matching when timezone is not used/needed such
        # as with UTC.
        self.__timezone = self.__pad(time.tzname, False)

    def __calc_lang(self):
        # Set self.__lang by using _getlang().
        self.__lang = _getlang()
293
294
295
class TimeRE(dict):
    """Handle conversion from format directives to regexes.

    The dict maps a directive letter (the character following '%') to the
    regex text that matches it.  Locale-independent directives are stored
    up front; locale-dependent ones are built from self.locale_time the
    first time they are looked up and then cached in the dict.
    """

    # Characters of a format string that must be escaped so they are matched
    # literally instead of being read as regex syntax.
    __regex_chars = re_compile(r"([\\.^$*+?\(\){}\[\]|])")
    # Runs of whitespace in a format string.
    __whitespace = re_compile(r'\s+')

    def __init__(self, locale_time=None):
        """Init inst with non-locale regexes and store LocaleTime object.

        locale_time -- object supplying the locale-specific names and
                       formats; when None, a new LocaleTime() is created for
                       this instance.
        """
        #XXX: Does 'Y' need to worry about having less or more than 4 digits?
        base = super(TimeRE, self)
        base.__init__({
            # The " \d" option is to make %c from ANSI C work
            'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
            'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
            'I': r"(?P<I>1[0-2]|0[1-9]|[1-9])",
            'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
            'm': r"(?P<m>1[0-2]|0[1-9]|[1-9])",
            'M': r"(?P<M>[0-5]\d|\d)",
            'S': r"(?P<S>6[0-1]|[0-5]\d|\d)",
            'U': r"(?P<U>5[0-3]|[0-4]\d|\d)",
            'w': r"(?P<w>[0-6])",
            # W is set below by using 'U'
            'y': r"(?P<y>\d\d)",
            'Y': r"(?P<Y>\d\d\d\d)"})
        # Same digits as 'U', but the group needs its own name; otherwise a
        # format using both %U and %W defines the group 'U' twice.
        base.__setitem__('W', base.__getitem__('U').replace('U', 'W'))
        # Created here rather than as a default argument value: a default is
        # evaluated only once, which would make every TimeRE share a single
        # LocaleTime that never notices a change of locale.
        if locale_time is None:
            locale_time = LocaleTime()
        self.locale_time = locale_time

    def __getitem__(self, fetch):
        """Try to fetch regex; if it does not exist, construct it.

        Raises KeyError if fetch is not a supported directive.
        """
        try:
            return super(TimeRE, self).__getitem__(fetch)
        except KeyError:
            constructors = {
                'A': lambda: self.__seqToRE(self.locale_time.f_weekday, fetch),
                'a': lambda: self.__seqToRE(self.locale_time.a_weekday, fetch),
                'B': lambda: self.__seqToRE(self.locale_time.f_month[1:],
                                            fetch),
                'b': lambda: self.__seqToRE(self.locale_time.a_month[1:],
                                            fetch),
                'c': lambda: self.pattern(self.locale_time.LC_date_time),
                'p': lambda: self.__seqToRE(self.locale_time.am_pm, fetch),
                'x': lambda: self.pattern(self.locale_time.LC_date),
                'X': lambda: self.pattern(self.locale_time.LC_time),
                'Z': lambda: self.__seqToRE(self.locale_time.timezone, fetch),
                '%': lambda: '%',
                }
            if fetch not in constructors:
                raise
            # Cache the constructed regex so it is built only once.
            self[fetch] = constructors[fetch]()
            return self[fetch]

    def __seqToRE(self, to_convert, directive):
        """Convert a list to a regex string for matching a directive.

        Returns '' when every value is the empty string (the locale has no
        information for the directive); otherwise returns a named group
        whose alternatives are the values, each matched literally.
        """
        for value in to_convert:
            if value != '':
                break
        else:
            return ''
        # Longest names first, in case names in the locale differ only by a
        # suffix: the name with the suffix has to get the chance to match
        # first.  sorted() leaves the caller's sequence untouched.
        ordered = sorted(to_convert, key=len, reverse=True)
        # Escape so that punctuation in a name (e.g. "a.m.") is matched
        # literally rather than read as regex syntax.
        alternatives = '|'.join([re_escape(value) for value in ordered])
        return '(?P<%s>%s)' % (directive, alternatives)

    def pattern(self, format):
        """Return re pattern for the format string.

        Regex metacharacters in format are matched literally and every run
        of whitespace matches any amount of whitespace (including none).

        Raises KeyError for an unsupported directive and IndexError when
        format ends with a lone '%'.
        """
        # re.escape() cannot be used on the whole format because the
        # directives (%m, etc.) must stay intact; escape only the regex
        # syntax characters.
        format = self.__regex_chars.sub(r"\\\1", format)
        format = self.__whitespace.sub(r'\\s*', format)
        pieces = []
        while '%' in format:
            directive_index = format.index('%') + 1
            pieces.append(format[:directive_index - 1])
            pieces.append(self[format[directive_index]])
            format = format[directive_index + 1:]
        pieces.append(format)
        return ''.join(pieces)

    def compile(self, format):
        """Return a compiled re object for the format string.

        The regex is compiled with IGNORECASE.
        """
        return re_compile(self.pattern(format), IGNORECASE)
391
# Module-wide TimeRE instance reused by every strptime() call; one instance
# is normally all that is ever needed, so it is cached for performance.
_locale_cache = TimeRE()
# Compiled regex objects keyed by format string, cached for the same reason.
_regex_cache = {}
396
def strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
    """Return a time struct based on the input data and the format string.

    data_string -- the text to parse
    format -- strftime()-style format that data_string must follow

    Fields the format does not supply default to 1900-01-01 00:00:00 with a
    DST flag of -1.  The weekday and the day of the year are calculated from
    the date when the format does not supply them; when a day of the year
    (%j) is supplied, the month and day are calculated from it instead.

    Raises ValueError if data_string does not match format or if text is
    left over after the match.  Errors raised while turning format into a
    regex (such as KeyError for an unsupported directive) propagate
    unchanged.
    """
    global _locale_cache
    # If the language changes, caches are invalidated, so rebuild them.  A
    # new LocaleTime is passed in explicitly so that the replacement TimeRE
    # is certain to describe the locale that is now in effect.
    if _locale_cache.locale_time.lang != _getlang():
        _locale_cache = TimeRE(LocaleTime())
        _regex_cache.clear()
    # Looked up only after the check above so that the names used below
    # never come from a discarded LocaleTime.
    locale_time = _locale_cache.locale_time
    format_regex = _regex_cache.get(format)
    if format_regex is None:
        # Limit regex cache size to prevent major bloating of the module;
        # The value 5 is arbitrary
        if len(_regex_cache) > 5:
            _regex_cache.clear()
        format_regex = _locale_cache.compile(format)
        _regex_cache[format] = format_regex
    found = format_regex.match(data_string)
    if not found:
        raise ValueError("time data did not match format")
    # match() anchors only at the start of the string, so text left over
    # after the match has to be rejected explicitly.
    if found.end() != len(data_string):
        raise ValueError("unconverted data remains: %s" %
                         data_string[found.end():])
    year = 1900
    month = day = 1
    hour = minute = second = 0
    tz = -1
    # weekday and julian defaulted to -1 so as to signal need to calculate values
    weekday = julian = -1
    found_dict = found.groupdict()
    for group_key in found_dict:
        if group_key == 'y':
            year = int(found_dict['y'])
            # Open Group specification for strptime() states that a %y
            # value in the range of [00, 68] is in the century 2000, while
            # [69,99] is in the century 1900
            if year <= 68:
                year += 2000
            else:
                year += 1900
        elif group_key == 'Y':
            year = int(found_dict['Y'])
        elif group_key == 'm':
            month = int(found_dict['m'])
        elif group_key == 'B':
            month = _insensitiveindex(locale_time.f_month, found_dict['B'])
        elif group_key == 'b':
            month = _insensitiveindex(locale_time.a_month, found_dict['b'])
        elif group_key == 'd':
            day = int(found_dict['d'])
        elif group_key == 'H':
            hour = int(found_dict['H'])
        elif group_key == 'I':
            hour = int(found_dict['I'])
            ampm = found_dict.get('p', '').lower()
            # If there was no AM/PM indicator, we'll treat this like AM
            if ampm in ('', locale_time.am_pm[0].lower()):
                # We're in AM so the hour is correct unless we're
                # looking at 12 midnight.
                # 12 midnight == 12 AM == hour 0
                if hour == 12:
                    hour = 0
            elif ampm == locale_time.am_pm[1].lower():
                # We're in PM so we need to add 12 to the hour unless
                # we're looking at 12 noon.
                # 12 noon == 12 PM == hour 12
                if hour != 12:
                    hour += 12
        elif group_key == 'M':
            minute = int(found_dict['M'])
        elif group_key == 'S':
            second = int(found_dict['S'])
        elif group_key == 'A':
            weekday = _insensitiveindex(locale_time.f_weekday,
                                        found_dict['A'])
        elif group_key == 'a':
            weekday = _insensitiveindex(locale_time.a_weekday,
                                        found_dict['a'])
        elif group_key == 'w':
            # %w counts from Sunday == 0; the time struct counts from
            # Monday == 0.
            weekday = int(found_dict['w'])
            if weekday == 0:
                weekday = 6
            else:
                weekday -= 1
        elif group_key == 'j':
            julian = int(found_dict['j'])
        elif group_key == 'Z':
            found_zone = found_dict['Z'].lower()
            if locale_time.timezone[0] == locale_time.timezone[1]:
                pass # Deals with bad locale setup where timezone info is
                     # the same; first found on FreeBSD 4.4.
            elif locale_time.timezone[0].lower() == found_zone:
                tz = 0
            elif locale_time.timezone[1].lower() == found_zone:
                tz = 1
            elif locale_time.timezone[2].lower() == found_zone:
                tz = -1
    # Cannot pre-calculate datetime_date() since can change in Julian
    # calculation and thus could have different value for the day of the week
    # calculation
    if julian == -1:
        # Need to add 1 to result since first day of the year is 1, not 0.
        julian = datetime_date(year, month, day).toordinal() - \
                  datetime_date(year, 1, 1).toordinal() + 1
    else:  # Assume that if they bothered to include Julian day it will
           # be accurate
        datetime_result = datetime_date.fromordinal(
            (julian - 1) + datetime_date(year, 1, 1).toordinal())
        year = datetime_result.year
        month = datetime_result.month
        day = datetime_result.day
    if weekday == -1:
        weekday = datetime_date(year, month, day).weekday()
    return time.struct_time((year, month, day,
                             hour, minute, second,
                             weekday, julian, tz))
509
510def _insensitiveindex(lst, findme):
511    # Perform a case-insensitive index search.
512
513    #XXX <bc>: If LocaleTime is not exposed, then consider removing this and
514    #          just lowercase when LocaleTime sets its vars and lowercasing
515    #          search values.
516    findme = findme.lower()
517    for key,item in enumerate(lst):
518        if item.lower() == findme:
519            return key
520    else:
521        raise ValueError("value not in list")
522
523