# _strptime.py revision 8dc25ad6f2e2280249341c2f4e015455269d10c9
"""Strptime-related classes and functions.

CLASSES:
    LocaleTime -- Discovers and stores locale-specific time information
    TimeRE -- Creates regexes for pattern matching a string of text containing
                time information

FUNCTIONS:
    _getlang -- Figure out what language is being used for the locale
    strptime -- Calculates the time struct represented by the passed-in string

"""
import time
import locale
import calendar
from re import compile as re_compile
from re import IGNORECASE
from re import escape as re_escape
from datetime import date as datetime_date
# Only a failed import should trigger a fallback; a bare ``except`` would also
# hide unrelated errors (e.g. KeyboardInterrupt) raised while importing.
try:
    from thread import allocate_lock as _thread_allocate_lock
except ImportError:
    try:
        from dummy_thread import allocate_lock as _thread_allocate_lock
    except ImportError:
        # Neither legacy module is available; threading.Lock is a factory
        # returning the same kind of lock object.
        from threading import Lock as _thread_allocate_lock

__author__ = "Brett Cannon"
__email__ = "brett@python.org"

__all__ = ['strptime']


def _getlang():
    """Return the current LC_TIME locale setting as reported by locale."""
    # Figure out what the current language is set to.
    return locale.getlocale(locale.LC_TIME)


class LocaleTime(object):
    """Stores and handles locale-specific information related to time.

    ATTRIBUTES:
        f_weekday -- full weekday names (7-item list)
        a_weekday -- abbreviated weekday names (7-item list)
        f_month -- full month names (13-item list; dummy value in [0], which
                    is added by code)
        a_month -- abbreviated month names (13-item list, dummy value in
                    [0], which is added by code)
        am_pm -- AM/PM representation (2-item list)
        LC_date_time -- format string for date/time representation (string)
        LC_date -- format string for date representation (string)
        LC_time -- format string for time representation (string)
        timezone -- daylight- and non-daylight-savings timezone representation
                    (2-item list of sets)
        lang -- Language used by instance (2-item tuple)
    """

    def __init__(self):
        """Set all attributes.

        Order of methods called matters for dependency reasons.

        The locale language is set at the outset and then checked again before
        exiting.  This is to make sure that the attributes were not set with a
        mix of information from more than one locale.  This would most likely
        happen when using threads where one thread calls a locale-dependent
        function while another thread changes the locale while the function in
        the other thread is still running.  Proper coding would call for
        locks to prevent changing the locale while locale-dependent code is
        running.  The check here is done in case someone does not think about
        doing this.

        Only other possible issue is if someone changed the timezone and did
        not call time.tzset().  That is an issue for the programmer, though,
        since changing the timezone is worthless without that call.

        Raises ValueError if the locale changed while attributes were being
        computed.

        """
        self.lang = _getlang()
        self.__calc_weekday()
        self.__calc_month()
        self.__calc_am_pm()
        # __calc_date_time() reads self.timezone, so the timezone must be
        # computed first.
        self.__calc_timezone()
        self.__calc_date_time()
        if _getlang() != self.lang:
            raise ValueError("locale changed during initialization")

    def __pad(self, seq, front):
        # Return a list copy of seq with '' added to the front (if front is
        # true), else to the back.
        seq = list(seq)
        if front:
            seq.insert(0, '')
        else:
            seq.append('')
        return seq

    def __calc_weekday(self):
        # Set self.a_weekday and self.f_weekday using the calendar
        # module.
        a_weekday = [calendar.day_abbr[i].lower() for i in range(7)]
        f_weekday = [calendar.day_name[i].lower() for i in range(7)]
        self.a_weekday = a_weekday
        self.f_weekday = f_weekday

    def __calc_month(self):
        # Set self.f_month and self.a_month using the calendar module.
        a_month = [calendar.month_abbr[i].lower() for i in range(13)]
        f_month = [calendar.month_name[i].lower() for i in range(13)]
        self.a_month = a_month
        self.f_month = f_month

    def __calc_am_pm(self):
        # Set self.am_pm by using time.strftime().

        # The magic date (1999,3,17,hour,44,55,2,76,0) is not really that
        # magical; just happened to have used it everywhere else where a
        # static date was needed.
        am_pm = []
        # One hour before noon and one after, to get both representations.
        for hour in (1, 22):
            time_tuple = time.struct_time((1999, 3, 17, hour, 44, 55, 2, 76, 0))
            am_pm.append(time.strftime("%p", time_tuple).lower())
        self.am_pm = am_pm

    def __calc_date_time(self):
        # Set self.LC_date_time, self.LC_date, & self.LC_time by using
        # time.strftime().

        # Use (1999,3,17,22,44,55,2,76,0) for magic date because the amount of
        # overloaded numbers is minimized.  The order in which searches for
        # values within the format string is very important; it eliminates
        # possible ambiguity for what something represents.
        time_tuple = time.struct_time((1999, 3, 17, 22, 44, 55, 2, 76, 0))
        date_time = [None, None, None]
        date_time[0] = time.strftime("%c", time_tuple).lower()
        date_time[1] = time.strftime("%x", time_tuple).lower()
        date_time[2] = time.strftime("%X", time_tuple).lower()
        replacement_pairs = [('%', '%%'), (self.f_weekday[2], '%A'),
                    (self.f_month[3], '%B'), (self.a_weekday[2], '%a'),
                    (self.a_month[3], '%b'), (self.am_pm[1], '%p'),
                    ('1999', '%Y'), ('99', '%y'), ('22', '%H'),
                    ('44', '%M'), ('55', '%S'), ('76', '%j'),
                    ('17', '%d'), ('03', '%m'), ('3', '%m'),
                    # '3' needed for when no leading zero.
                    ('2', '%w'), ('10', '%I')]
        replacement_pairs.extend([(tz, "%Z") for tz_values in self.timezone
                                                for tz in tz_values])
        # 1999-01-03 is a Sunday that precedes the first Monday of the year:
        # a Monday-based week number (%W) renders it as week 00, while a
        # Sunday-based one (%U) renders it as week 01.
        week_probe = time.struct_time((1999, 1, 3, 1, 1, 1, 6, 3, 0))
        for offset, directive in ((0, '%c'), (1, '%x'), (2, '%X')):
            current_format = date_time[offset]
            for old, new in replacement_pairs:
                # Must deal with possible lack of locale info
                # manifesting itself as the empty string (e.g., Swedish's
                # lack of AM/PM info) or a platform returning a tuple of empty
                # strings (e.g., MacOS 9 having timezone as ('','')).
                if old:
                    current_format = current_format.replace(old, new)
            # Use a membership test: str.find() returns -1 (true) when the
            # substring is missing and 0 (false) when it is found at the very
            # start, so it cannot be used as a boolean.
            if '00' in time.strftime(directive, week_probe):
                U_W = '%W'
            else:
                U_W = '%U'
            # The magic date falls in week 11 for both %U and %W.
            date_time[offset] = current_format.replace('11', U_W)
        self.LC_date_time = date_time[0]
        self.LC_date = date_time[1]
        self.LC_time = date_time[2]

    def __calc_timezone(self):
        # Set self.timezone by using time.tzname.
        # Do not worry about possibility of time.tzname[0] == time.tzname[1]
        # and time.daylight; handle that in strptime .
        try:
            time.tzset()
        except AttributeError:
            # time.tzset() is not available on every platform.
            pass
        no_saving = frozenset(["utc", "gmt", time.tzname[0].lower()])
        if time.daylight:
            has_saving = frozenset([time.tzname[1].lower()])
        else:
            has_saving = frozenset()
        self.timezone = (no_saving, has_saving)


class TimeRE(dict):
    """Handle conversion from format directives to regexes.

    Keys are single-character directives (without the '%'); values are regex
    fragments, most of which contain a named group with the directive's name.
    """

    def __init__(self, locale_time=None):
        """Create keys/values.

        Order of execution is important for dependency reasons.

        locale_time -- LocaleTime instance to pull locale data from; a new
                       one is created when omitted.

        """
        if locale_time:
            self.locale_time = locale_time
        else:
            self.locale_time = LocaleTime()
        base = super(TimeRE, self)
        base.__init__({
            # The " \d" part of the regex is to make %c from ANSI C work
            'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
            'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
            'I': r"(?P<I>1[0-2]|0[1-9]|[1-9])",
            'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
            'm': r"(?P<m>1[0-2]|0[1-9]|[1-9])",
            'M': r"(?P<M>[0-5]\d|\d)",
            'S': r"(?P<S>6[0-1]|[0-5]\d|\d)",
            'U': r"(?P<U>5[0-3]|[0-4]\d|\d)",
            'w': r"(?P<w>[0-6])",
            # W is set below by using 'U'
            'y': r"(?P<y>\d\d)",
            #XXX: Does 'Y' need to worry about having less or more than
            #     4 digits?
            'Y': r"(?P<Y>\d\d\d\d)",
            'A': self.__seqToRE(self.locale_time.f_weekday, 'A'),
            'a': self.__seqToRE(self.locale_time.a_weekday, 'a'),
            # Skip the dummy '' entry stored at index 0 of the month lists.
            'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'),
            'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'),
            'p': self.__seqToRE(self.locale_time.am_pm, 'p'),
            'Z': self.__seqToRE((tz for tz_names in self.locale_time.timezone
                                        for tz in tz_names),
                                'Z'),
            '%': '%'})
        base.__setitem__('W', base.__getitem__('U').replace('U', 'W'))
        base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
        base.__setitem__('x', self.pattern(self.locale_time.LC_date))
        base.__setitem__('X', self.pattern(self.locale_time.LC_time))

    def __seqToRE(self, to_convert, directive):
        """Convert a list to a regex string for matching a directive.

        Want possible matching values to be from longest to shortest.  This
        prevents the possibility of a match occurring for a value that is
        also a substring of a larger value that should have matched (e.g.,
        'abc' matching when 'abcdef' should have been the match).

        Returns '' when every value in to_convert is the empty string (or
        there are no values at all), otherwise a named-group alternation.

        """
        to_convert = sorted(to_convert, key=len, reverse=True)
        for value in to_convert:
            if value != '':
                break
        else:
            return ''
        regex = '|'.join(re_escape(stuff) for stuff in to_convert)
        regex = '(?P<%s>%s' % (directive, regex)
        return '%s)' % regex

    def pattern(self, format):
        """Return regex pattern for the format string.

        Need to make sure that any characters that might be interpreted as
        regex syntax are escaped.

        Raises KeyError if a '%' is followed by a character that is not a
        known directive, and IndexError if the format ends with a lone '%'.

        """
        processed_format = ''
        # The sub() call escapes all characters that might be misconstrued
        # as regex syntax.  Cannot use re.escape since we have to deal with
        # format directives (%m, etc.).
        regex_chars = re_compile(r"([\\.^$*+?\(\){}\[\]|])")
        format = regex_chars.sub(r"\\\1", format)
        whitespace_replacement = re_compile(r'\s+')
        # The replacement template must spell the backslash explicitly:
        # a bare '\s' is not a valid template escape.  The text inserted into
        # the pattern is the two characters '\s' followed by '*'.
        format = whitespace_replacement.sub(r'\\s*', format)
        while '%' in format:
            directive_index = format.index('%') + 1
            processed_format = "%s%s%s" % (processed_format,
                                           format[:directive_index - 1],
                                           self[format[directive_index]])
            format = format[directive_index + 1:]
        return "%s%s" % (processed_format, format)

    def compile(self, format):
        """Return a compiled re object for the format string."""
        return re_compile(self.pattern(format), IGNORECASE)


_cache_lock = _thread_allocate_lock()
# DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock
# first!
_TimeRE_cache = TimeRE()
_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache
_regex_cache = {}


def _calc_julian_from_U_or_W(year, week_of_year, day_of_week, week_starts_Mon):
    """Calculate the day of the year from a week number and a weekday.

    year -- the year the week number refers to
    week_of_year -- value of %U or %W (week 0 holds the days before the first
                    Sunday/Monday of the year)
    day_of_week -- weekday with Monday as 0 and Sunday as 6
    week_starts_Mon -- true for %W (weeks start on Monday), false for %U
                       (weeks start on Sunday)

    The result is 1 for January 1.  It may be zero or negative when the
    requested weekday of week 0 falls before January 1.

    """
    first_weekday = datetime_date(year, 1, 1).weekday()
    # If weeks start on Sunday, rotate both weekdays so that 0 means Sunday;
    # the rest of the arithmetic can then treat 0 as "first day of a week"
    # regardless of which directive was used.
    if not week_starts_Mon:
        first_weekday = (first_weekday + 1) % 7
        day_of_week = (day_of_week + 1) % 7
    # Number of days in week 0 (zero if the year begins on a week start).
    week_0_length = (7 - first_weekday) % 7
    if week_of_year == 0:
        return 1 + day_of_week - first_weekday
    days_to_week = week_0_length + (7 * (week_of_year - 1))
    return 1 + days_to_week + day_of_week


def strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
    """Return a time struct based on the input string and the format string.

    data_string -- the text to parse
    format -- strftime-style format describing data_string

    Returns a time.struct_time.  Fields that the format does not determine
    default to year 1900, month 1, day 1, time 00:00:00 and tm_isdst -1;
    weekday and day of the year are calculated from the resulting date when
    they are not given.

    Raises ValueError if data_string does not match format or if text is
    left over after the match.  KeyError or IndexError from TimeRE.pattern()
    propagates for a malformed format string.

    """
    global _TimeRE_cache
    _cache_lock.acquire()
    try:
        if _getlang() != _TimeRE_cache.locale_time.lang:
            # The locale changed since the caches were built: both the
            # directive table and every regex compiled from it are stale.
            _TimeRE_cache = TimeRE()
            _regex_cache.clear()
        if len(_regex_cache) > _CACHE_MAX_SIZE:
            _regex_cache.clear()
        # Read these only after the possible refresh above so that the
        # locale data used for decoding matches the regex used for matching.
        time_re = _TimeRE_cache
        locale_time = time_re.locale_time
        format_regex = _regex_cache.get(format)
        if not format_regex:
            format_regex = time_re.compile(format)
            _regex_cache[format] = format_regex
    finally:
        _cache_lock.release()
    found = format_regex.match(data_string)
    if not found:
        raise ValueError("time data did not match format:  data=%s  fmt=%s" %
                         (data_string, format))
    if len(data_string) != found.end():
        raise ValueError("unconverted data remains: %s" %
                          data_string[found.end():])
    year = 1900
    month = day = 1
    hour = minute = second = 0
    tz = -1
    # Default to -1 to signify that values not known; not critical to have,
    # though
    week_of_year = -1
    week_of_year_start = -1
    # weekday and julian defaulted to -1 so as to signal need to calculate
    # values
    weekday = julian = -1
    found_dict = found.groupdict()
    for group_key in found_dict:
        # Directives not explicitly handled below:
        #   c, x, X
        #      handled by making out of other directives
        #   U, W
        #      worthless without day of the week
        if group_key == 'y':
            year = int(found_dict['y'])
            # Open Group specification for strptime() states that a %y
            #value in the range of [00, 68] is in the century 2000, while
            #[69,99] is in the century 1900
            if year <= 68:
                year += 2000
            else:
                year += 1900
        elif group_key == 'Y':
            year = int(found_dict['Y'])
        elif group_key == 'm':
            month = int(found_dict['m'])
        elif group_key == 'B':
            month = locale_time.f_month.index(found_dict['B'].lower())
        elif group_key == 'b':
            month = locale_time.a_month.index(found_dict['b'].lower())
        elif group_key == 'd':
            day = int(found_dict['d'])
        elif group_key == 'H':
            hour = int(found_dict['H'])
        elif group_key == 'I':
            hour = int(found_dict['I'])
            ampm = found_dict.get('p', '').lower()
            # If there was no AM/PM indicator, we'll treat this like AM
            if ampm in ('', locale_time.am_pm[0]):
                # We're in AM so the hour is correct unless we're
                # looking at 12 midnight.
                # 12 midnight == 12 AM == hour 0
                if hour == 12:
                    hour = 0
            elif ampm == locale_time.am_pm[1]:
                # We're in PM so we need to add 12 to the hour unless
                # we're looking at 12 noon.
                # 12 noon == 12 PM == hour 12
                if hour != 12:
                    hour += 12
        elif group_key == 'M':
            minute = int(found_dict['M'])
        elif group_key == 'S':
            second = int(found_dict['S'])
        elif group_key == 'A':
            weekday = locale_time.f_weekday.index(found_dict['A'].lower())
        elif group_key == 'a':
            weekday = locale_time.a_weekday.index(found_dict['a'].lower())
        elif group_key == 'w':
            # %w counts from Sunday as 0; convert to Monday as 0.
            weekday = int(found_dict['w'])
            if weekday == 0:
                weekday = 6
            else:
                weekday -= 1
        elif group_key == 'j':
            julian = int(found_dict['j'])
        elif group_key in ('U', 'W'):
            week_of_year = int(found_dict[group_key])
            if group_key == 'U':
                # U starts week on Sunday
                week_of_year_start = 6
            else:
                # W starts week on Monday
                week_of_year_start = 0
        elif group_key == 'Z':
            # Since -1 is default value only need to worry about setting tz if
            # it can be something other than -1.
            found_zone = found_dict['Z'].lower()
            for value, tz_values in enumerate(locale_time.timezone):
                if found_zone in tz_values:
                    # Deal with bad locale setup where timezone names are the
                    # same and yet time.daylight is true; too ambiguous to
                    # be able to tell what timezone has daylight savings
                    if (time.tzname[0] == time.tzname[1] and
                       time.daylight and found_zone not in ("utc", "gmt")):
                        break
                    else:
                        tz = value
                        break
    # If we know the week of the year and what day of that week, we can figure
    # out the Julian day of the year.
    if julian == -1 and week_of_year != -1 and weekday != -1:
        julian = _calc_julian_from_U_or_W(year, week_of_year, weekday,
                                          week_of_year_start == 0)
        if julian <= 0:
            # The day lies before January 1 of the given year.  Express it
            # relative to the previous year; this also keeps a computed value
            # of -1 from being mistaken for the "unknown" marker below.
            year -= 1
            if calendar.isleap(year):
                julian += 366
            else:
                julian += 365
    # Cannot pre-calculate datetime_date() since can change in Julian
    #calculation and thus could have different value for the day of the week
    #calculation
    if julian == -1:
        # Need to add 1 to result since first day of the year is 1, not 0.
        julian = datetime_date(year, month, day).toordinal() - \
                  datetime_date(year, 1, 1).toordinal() + 1
    else:  # Assume that if they bothered to include Julian day it will
           #be accurate
        datetime_result = datetime_date.fromordinal(
                (julian - 1) + datetime_date(year, 1, 1).toordinal())
        year = datetime_result.year
        month = datetime_result.month
        day = datetime_result.day
    if weekday == -1:
        weekday = datetime_date(year, month, day).weekday()
    return time.struct_time((year, month, day,
                             hour, minute, second,
                             weekday, julian, tz))