13257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
23257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel"""
33257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanielcsv.py - read/write/investigate CSV files
43257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel"""
53257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
63257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanielimport re
73257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanielfrom functools import reduce
83257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanielfrom _csv import Error, __version__, writer, reader, register_dialect, \
93257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                 unregister_dialect, get_dialect, list_dialects, \
103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                 field_size_limit, \
113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                 __doc__
133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanielfrom _csv import Dialect as _Dialect
143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanieltry:
163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    from cStringIO import StringIO
173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanielexcept ImportError:
183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    from StringIO import StringIO
193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            "Error", "Dialect", "__doc__", "excel", "excel_tab",
223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            "field_size_limit", "reader", "writer",
233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
class Dialect:
    """Base class describing the formatting conventions of a CSV dialect.

    Use it through a subclass (see csv.excel).  The attributes a
    subclass may define are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # Placeholder values; concrete dialects override these.
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only instances of subclasses are flagged as valid; an instance of
        # the bare base class keeps the class-level ``_valid = False``.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        """Check this dialect by passing it to the C-level ``_csv.Dialect``.

        A TypeError raised by that check is re-raised as ``Error`` carrying
        the same message.
        """
        try:
            _Dialect(self)
        except TypeError as exc:
            # We do this for compatibility with py2.3
            raise Error(str(exc))
563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
class excel(Dialect):
    """Dialect for the comma-separated files that Excel normally produces."""
    # How fields and records are separated.
    delimiter = ','
    skipinitialspace = False
    lineterminator = '\r\n'
    # How fields are quoted.
    quoting = QUOTE_MINIMAL
    quotechar = '"'
    doublequote = True
register_dialect("excel", excel)
663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
class excel_tab(excel):
    """Excel's TAB-delimited flavour: `excel` with only the delimiter changed."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanielclass DictReader:
743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                 dialect="excel", *args, **kwds):
763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self._fieldnames = fieldnames   # list of keys for the dict
773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.restkey = restkey          # key to catch long rows
783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.restval = restval          # default value for short rows
793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.reader = reader(f, dialect, *args, **kwds)
803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.dialect = dialect
813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.line_num = 0
823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def __iter__(self):
843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        return self
853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    @property
873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def fieldnames(self):
883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if self._fieldnames is None:
893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            try:
903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                self._fieldnames = self.reader.next()
913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            except StopIteration:
923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                pass
933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.line_num = self.reader.line_num
943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        return self._fieldnames
953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # Issue 20004: Because DictReader is a classic class, this setter is
973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # ignored.  At this point in 2.7's lifecycle, it is too late to change the
983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # base class for fear of breaking working code.  If you want to change
993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    # fieldnames without overwriting the getter, set _fieldnames directly.
1003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    @fieldnames.setter
1013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def fieldnames(self, value):
1023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self._fieldnames = value
1033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def next(self):
1053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if self.line_num == 0:
1063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            # Used only for its side effect.
1073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            self.fieldnames
1083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        row = self.reader.next()
1093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.line_num = self.reader.line_num
1103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # unlike the basic reader, we prefer not to return blanks,
1123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # because we will typically wind up with a dict full of None
1133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # values
1143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        while row == []:
1153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            row = self.reader.next()
1163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        d = dict(zip(self.fieldnames, row))
1173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        lf = len(self.fieldnames)
1183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        lr = len(row)
1193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if lf < lr:
1203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            d[self.restkey] = row[lf:]
1213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        elif lf > lr:
1223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            for key in self.fieldnames[lr:]:
1233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                d[key] = self.restval
1243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        return d
1253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
class DictWriter:
    """Write dicts as CSV rows, ordering the values by ``fieldnames``.

    f, dialect, *args and **kwds are passed straight to ``writer``.

    fieldnames   -- sequence of keys giving the column order.
    restval      -- value written for a field name missing from a row dict.
    extrasaction -- "raise" (default) to raise ValueError when a row dict
                    holds a key that is not in fieldnames, "ignore" to drop
                    such keys silently.  Matching is case-insensitive.
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        # Validation is case-insensitive, so the *normalised* spelling must
        # be the one stored: _dict_to_list compares against lowercase
        # "raise", and storing e.g. "RAISE" verbatim would silently disable
        # the extra-key check.
        action = extrasaction.lower()
        if action not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        self.extrasaction = action
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a row containing the field names themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)

    def _dict_to_list(self, rowdict):
        """Return rowdict's values as a list in fieldnames order.

        Missing fields are filled with restval.  Raises ValueError when
        extrasaction is "raise" and rowdict has keys outside fieldnames.
        """
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        """Write one dict as a row; returns whatever writer.writerow returns."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write every dict in rowdicts; all rows are converted (and
        validated) before the first one is written."""
        rows = [self._dict_to_list(rowdict) for rowdict in rowdicts]
        return self.writer.writerows(rows)
1593257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1603257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel# Guard Sniffer's type checking against builds that exclude complex()
1613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanieltry:
1623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    complex
1633257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanielexcept NameError:
1643257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    complex = float
1653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanielclass Sniffer:
1673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    '''
1683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
1693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    Returns a Dialect object.
1703257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    '''
1713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def __init__(self):
1723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # in case there is more than one possible delimiter
1733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        self.preferred = [',', '\t', ';', ' ', ':']
1743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def sniff(self, sample, delimiters=None):
1773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        """
1783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        Returns a dialect (or None) corresponding to the sample
1793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        """
1803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        quotechar, doublequote, delimiter, skipinitialspace = \
1823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                   self._guess_quote_and_delimiter(sample, delimiters)
1833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if not delimiter:
1843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            delimiter, skipinitialspace = self._guess_delimiter(sample,
1853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                                                                delimiters)
1863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if not delimiter:
1883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            raise Error, "Could not determine delimiter"
1893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        class dialect(Dialect):
1913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            _name = "sniffed"
1923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            lineterminator = '\r\n'
1933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            quoting = QUOTE_MINIMAL
1943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            # escapechar = ''
1953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
1963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        dialect.doublequote = doublequote
1973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        dialect.delimiter = delimiter
1983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # _csv.reader won't accept a quotechar of ''
1993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        dialect.quotechar = quotechar or '"'
2003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        dialect.skipinitialspace = skipinitialspace
2013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
2023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        return dialect
2033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
2043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
2053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def _guess_quote_and_delimiter(self, data, delimiters):
2063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        """
2073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        Looks for text enclosed between two identical quotes
2083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        (the probable quotechar) which are preceded and followed
2093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        by the same character (the probable delimiter).
2103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        For example:
2113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                         ,'some text',
2123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        The quote with the most wins, same with the delimiter.
2133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        If there is no quotechar the delimiter can't be determined
2143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        this way.
2153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        """
2163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
2173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        matches = []
2183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
2193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
2203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                      '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
2213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
2223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
2233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            matches = regexp.findall(data)
2243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if matches:
2253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                break
2263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
2273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if not matches:
2283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            # (quotechar, doublequote, delimiter, skipinitialspace)
2293257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            return ('', False, None, 0)
2303257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        quotes = {}
2313257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        delims = {}
2323257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        spaces = 0
2333257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        for m in matches:
2343257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            n = regexp.groupindex['quote'] - 1
2353257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            key = m[n]
2363257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if key:
2373257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                quotes[key] = quotes.get(key, 0) + 1
2383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            try:
2393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                n = regexp.groupindex['delim'] - 1
2403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                key = m[n]
2413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            except KeyError:
2423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                continue
2433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if key and (delimiters is None or key in delimiters):
2443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                delims[key] = delims.get(key, 0) + 1
2453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            try:
2463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                n = regexp.groupindex['space'] - 1
2473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            except KeyError:
2483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                continue
2493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if m[n]:
2503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                spaces += 1
2513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
2523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        quotechar = reduce(lambda a, b, quotes = quotes:
2533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                           (quotes[a] > quotes[b]) and a or b, quotes.keys())
2543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
2553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if delims:
2563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            delim = reduce(lambda a, b, delims = delims:
2573257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                           (delims[a] > delims[b]) and a or b, delims.keys())
2583257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            skipinitialspace = delims[delim] == spaces
2593257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if delim == '\n': # most likely a file with a single column
2603257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                delim = ''
2613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        else:
2623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            # there is *no* delimiter, it's a single column of quoted data
2633257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            delim = ''
2643257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            skipinitialspace = 0
2653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
2663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # if we see an extra quote between delimiters, we've got a
2673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # double quoted format
2683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        dq_regexp = re.compile(
2693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
2703257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
2713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
2723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
2733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
2743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if dq_regexp.search(data):
2753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            doublequote = True
2763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        else:
2773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            doublequote = False
2783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
2793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        return (quotechar, doublequote, delim, skipinitialspace)
2803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
2813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
2823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def _guess_delimiter(self, data, delimiters):
2833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        """
2843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        The delimiter /should/ occur the same number of times on
2853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        each row. However, due to malformed data, it may not. We don't want
2863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        an all or nothing approach, so we allow for small variations in this
2873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        number.
2883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel          1) build a table of the frequency of each character on every line.
2893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel          2) build a table of frequencies of this frequency (meta-frequency?),
2903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
2913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel             7 times in 2 rows'
2923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel          3) use the mode of the meta-frequency to determine the /expected/
2933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel             frequency for that character
2943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel          4) find out how often the character actually meets that goal
2953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel          5) the character that best meets its goal is the delimiter
2963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        For performance reasons, the data is evaluated in chunks, so it can
2973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        try and evaluate the smallest portion of the data possible, evaluating
2983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        additional chunks as necessary.
2993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        """
3003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
3013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        data = filter(None, data.split('\n'))
3023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
3033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        ascii = [chr(c) for c in range(127)] # 7-bit ASCII
3043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
3053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # build frequency tables
3063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        chunkLength = min(10, len(data))
3073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        iteration = 0
3083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        charFrequency = {}
3093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        modes = {}
3103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        delims = {}
3113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        start, end = 0, min(chunkLength, len(data))
3123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        while start < len(data):
3133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            iteration += 1
3143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            for line in data[start:end]:
3153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                for char in ascii:
3163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    metaFrequency = charFrequency.get(char, {})
3173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    # must count even if frequency is 0
3183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    freq = line.count(char)
3193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    # value is the mode
3203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
3213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    charFrequency[char] = metaFrequency
3223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
3233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            for char in charFrequency.keys():
3243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                items = charFrequency[char].items()
3253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if len(items) == 1 and items[0][0] == 0:
3263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    continue
3273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                # get the mode of the frequencies
3283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if len(items) > 1:
3293257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
3303257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                                         items)
3313257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    # adjust the mode - subtract the sum of all
3323257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    # other frequencies
3333257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    items.remove(modes[char])
3343257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    modes[char] = (modes[char][0], modes[char][1]
3353257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                                   - reduce(lambda a, b: (0, a[1] + b[1]),
3363257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                                            items)[1])
3373257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                else:
3383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    modes[char] = items[0]
3393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
3403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            # build a list of possible delimiters
3413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            modeList = modes.items()
3423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            total = float(chunkLength * iteration)
3433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            # (rows of consistent data) / (number of rows) = 100%
3443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            consistency = 1.0
3453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            # minimum consistency threshold
3463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            threshold = 0.9
3473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            while len(delims) == 0 and consistency >= threshold:
3483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                for k, v in modeList:
3493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if v[0] > 0 and v[1] > 0:
3503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        if ((v[1]/total) >= consistency and
3513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                            (delimiters is None or k in delimiters)):
3523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                            delims[k] = v
3533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                consistency -= 0.01
3543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
3553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if len(delims) == 1:
3563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                delim = delims.keys()[0]
3573257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                skipinitialspace = (data[0].count(delim) ==
3583257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                                    data[0].count("%c " % delim))
3593257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                return (delim, skipinitialspace)
3603257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
3613257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            # analyze another chunkLength lines
3623257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            start = end
3633257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            end += chunkLength
3643257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
3653257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if not delims:
3663257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            return ('', 0)
3673257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
3683257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # if there's more than one, fall back to a 'preferred' list
3693257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        if len(delims) > 1:
3703257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            for d in self.preferred:
3713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if d in delims.keys():
3723257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    skipinitialspace = (data[0].count(d) ==
3733257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                                        data[0].count("%c " % d))
3743257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    return (d, skipinitialspace)
3753257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
3763257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # nothing else indicates a preference, pick the character that
3773257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # dominates(?)
3783257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        items = [(v,k) for (k,v) in delims.items()]
3793257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        items.sort()
3803257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        delim = items[-1][1]
3813257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
3823257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        skipinitialspace = (data[0].count(delim) ==
3833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                            data[0].count("%c " % delim))
3843257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        return (delim, skipinitialspace)
3853257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
3863257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
3873257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel    def has_header(self, sample):
3883257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # Creates a dictionary of types of data in each column. If any
3893257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # column is of a single type (say, integers), *except* for the first
3903257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # row, then the first row is presumed to be labels. If the type
3913257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # can't be determined, it is assumed to be a string in which case
3923257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # the length of the string is the determining factor: if all of the
3933257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # rows except for the first are the same length, it's a header.
3943257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # Finally, a 'vote' is taken at the end for each column, adding or
3953257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # subtracting from the likelihood of the first row being a header.
3963257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
3973257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        rdr = reader(StringIO(sample), self.sniff(sample))
3983257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
3993257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        header = rdr.next() # assume first row is header
4003257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
4013257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        columns = len(header)
4023257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        columnTypes = {}
4033257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        for i in range(columns): columnTypes[i] = None
4043257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
4053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        checked = 0
4063257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        for row in rdr:
4073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            # arbitrary number of rows to check, to keep it sane
4083257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if checked > 20:
4093257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                break
4103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            checked += 1
4113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
4123257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if len(row) != columns:
4133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                continue # skip rows that have irregular number of columns
4143257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
4153257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            for col in columnTypes.keys():
4163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
4173257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                for thisType in [int, long, float, complex]:
4183257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    try:
4193257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        thisType(row[col])
4203257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        break
4213257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    except (ValueError, OverflowError):
4223257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        pass
4233257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                else:
4243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    # fallback to length of string
4253257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    thisType = len(row[col])
4263257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
4273257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                # treat longs as ints
4283257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if thisType == long:
4293257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    thisType = int
4303257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
4313257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if thisType != columnTypes[col]:
4323257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    if columnTypes[col] is None: # add new column type
4333257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        columnTypes[col] = thisType
4343257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    else:
4353257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        # type is inconsistent, remove column from
4363257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        # consideration
4373257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                        del columnTypes[col]
4383257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
4393257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # finally, compare results against first row and "vote"
4403257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        # on whether it's a header
4413257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        hasHeader = 0
4423257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        for col, colType in columnTypes.items():
4433257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            if type(colType) == type(0): # it's a length
4443257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                if len(header[col]) != colType:
4453257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    hasHeader += 1
4463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                else:
4473257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    hasHeader -= 1
4483257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel            else: # attempt typecast
4493257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                try:
4503257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    colType(header[col])
4513257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                except (ValueError, TypeError):
4523257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    hasHeader += 1
4533257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                else:
4543257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel                    hasHeader -= 1
4553257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
4563257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel        return hasHeader > 0
457