"""
csv.py - read/write/investigate CSV files
"""

import re
from functools import reduce
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 field_size_limit, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__
from _csv import Dialect as _Dialect

try:
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO
    except ImportError:
        # Neither of the Python 2 modules is available; use the io module.
        from io import StringIO

__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "__doc__", "excel", "excel_tab",
            "field_size_limit", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]

class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses are flagged as valid; the base class merely holds
        # placeholder attributes.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        """Check the attributes with _csv.Dialect; raise Error if rejected."""
        try:
            _Dialect(self)
        except TypeError as e:
            # We do this for compatibility with py2.3
            raise Error(str(e))

class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)

class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)


class DictReader:
    """Iterate over the rows of a CSV source as dicts keyed by field name.

    If *fieldnames* is None, the first row read from *f* supplies the keys.
    Values beyond the known field names are stored as a list under
    *restkey*; missing trailing values are filled with *restval*.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily consume the header row the first time the names are needed.
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    # Issue 20004: Because DictReader is a classic class, this setter is
    # ignored.  At this point in 2.7's lifecycle, it is too late to change the
    # base class for fear of breaking working code.  If you want to change
    # fieldnames without overwriting the getter, set _fieldnames directly.
    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        """Return the next non-blank row as a dict.

        Raises StopIteration (from the underlying reader) at end of input.
        """
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d


class DictWriter:
    """Write dicts as CSV rows, ordering the values by *fieldnames*.

    Keys missing from a row dict are written as *restval*.  *extrasaction*
    ("raise" or "ignore", case-insensitive) selects what happens when a row
    dict has keys that are not in *fieldnames*.
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        # Store the normalized form: _dict_to_list compares against the
        # lower-case literal, so "RAISE" must not silently act like "ignore".
        self.extrasaction = extrasaction.lower()
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a row containing the field names themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)

    def _dict_to_list(self, rowdict):
        """Return the values of *rowdict* in fieldnames order.

        Raises ValueError for unknown keys when extrasaction is "raise".
        """
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        rows = [self._dict_to_list(rowdict) for rowdict in rowdicts]
        return self.writer.writerows(rows)

# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float

class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample

        *delimiters*, if given, restricts the candidate delimiter characters.
        Raises Error when no delimiter can be determined.
        """

        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns (quotechar, doublequote, delimiter, skipinitialspace).
        """

        matches = []
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        quotes = {}
        delims = {}
        spaces = 0
        groupindex = regexp.groupindex
        for m in matches:
            # findall() yields tuples of groups (or a plain string for the
            # single-group pattern); groupindex is 1-based, hence the - 1.
            n = groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # most frequent quote character; on a tie the later key wins
        quotechar = reduce(lambda a, b: a if quotes[a] > quotes[b] else b,
                           quotes.keys())

        if delims:
            delim = reduce(lambda a, b: a if delims[a] > delims[b] else b,
                           delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
            r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
            {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)



        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.

        Returns (delimiter, skipinitialspace); ('', 0) if nothing qualifies.
        """

        data = [line for line in data.split('\n') if line]

        ascii_chars = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii_chars:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a if a[1] > b[1] else b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - sum(item[1] for item in items))
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            # The last chunk may be shorter than chunkLength, so never count
            # more rows than have actually been examined.
            total = float(min(chunkLength * iteration, len(data)))
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims)[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims:
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = rdr.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in columnTypes.keys():

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]
4393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # finally, compare results against first row and "vote" 4403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel # on whether it's a header 4413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel hasHeader = 0 4423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel for col, colType in columnTypes.items(): 4433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if type(colType) == type(0): # it's a length 4443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel if len(header[col]) != colType: 4453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel hasHeader += 1 4463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: 4473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel hasHeader -= 1 4483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: # attempt typecast 4493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel try: 4503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel colType(header[col]) 4513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel except (ValueError, TypeError): 4523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel hasHeader += 1 4533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel else: 4543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel hasHeader -= 1 4553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel 4563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel return hasHeader > 0 457