14710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
24710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm"""
34710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylmcsv.py - read/write/investigate CSV files
44710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm"""
54710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
64710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylmimport re
74710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylmfrom functools import reduce
84710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylmfrom _csv import Error, __version__, writer, reader, register_dialect, \
94710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                 unregister_dialect, get_dialect, list_dialects, \
104710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                 field_size_limit, \
114710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
124710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                 __doc__
134710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylmfrom _csv import Dialect as _Dialect
144710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
154710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylmtry:
164710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm    from cStringIO import StringIO
174710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylmexcept ImportError:
184710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm    from StringIO import StringIO
194710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
# Public names of this module: the list consulted by ``from csv import *``.
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "__doc__", "excel", "excel_tab",
            "field_size_limit", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
254710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
class Dialect:
    """Describe a CSV dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    Instantiating the class hands the instance to the ``_csv.Dialect``
    constructor for validation; a TypeError raised there is re-raised as
    ``Error``.
    """
    _name = ""
    # Set to True by __init__ for instances of subclasses only.
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # self.__class__ (not type(self)) so the check also works for
        # old-style class instances.
        if self.__class__ is not Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        """Check the dialect attributes; raise Error if they are unusable."""
        try:
            _Dialect(self)
        except TypeError as e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
564710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
class excel(Dialect):
    """Dialect with the settings usually found in CSV files written by Excel."""
    quoting = QUOTE_MINIMAL
    lineterminator = '\r\n'
    skipinitialspace = False
    doublequote = True
    quotechar = '"'
    delimiter = ','


# Register the class under the dialect name "excel".
register_dialect("excel", excel)
664710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
class excel_tab(excel):
    """Variant of the excel dialect whose fields are separated by TABs."""
    delimiter = '\t'


# Register the class under the dialect name "excel-tab".
register_dialect("excel-tab", excel_tab)
714710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
724710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
class DictReader:
    """Iterate over the rows of a CSV source as dicts keyed by field name.

    f, dialect, *args and **kwds are passed straight to reader().
    If fieldnames is None, the first row read from the source is used as
    the list of keys.  Values of a row that is longer than fieldnames are
    stored as a list under restkey; keys missing from a shorter row are
    filled with restval.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        """The list of keys; read lazily from the first row when unset.

        Every access also copies the underlying reader's line_num onto
        this object.
        """
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                # Empty source: leave the field names unset.
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        """Return the next non-blank row as a dict.

        Raises StopIteration when the underlying reader is exhausted.
        """
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
        # Accessing the property here also refreshes line_num after any
        # blank rows were skipped above.
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d

    # Iterator protocol name used by the next() builtin on Python 3.
    __next__ = next
1214710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
1224710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
class DictWriter:
    """Write dicts as CSV rows, with columns ordered by fieldnames.

    f, dialect, *args and **kwds are passed straight to writer().
    restval is written for keys missing from a row dict.  extrasaction
    (case-insensitive) selects what happens when a row dict has keys that
    are not in fieldnames: "raise" raises ValueError, "ignore" drops them.
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        # Store the lower-cased value: the check below is case-insensitive
        # while _dict_to_list compares against the literal "raise", so
        # keeping e.g. "RAISE" verbatim would silently act like "ignore".
        normalized = extrasaction.lower()
        if normalized not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        self.extrasaction = normalized
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a row containing the field names themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        return self.writerow(header)

    def _dict_to_list(self, rowdict):
        """Return rowdict's values as a list ordered by fieldnames.

        Raises ValueError for unknown keys when extrasaction is "raise".
        """
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                # repr() so non-string keys are reported instead of making
                # join() fail with a TypeError.
                raise ValueError("dict contains fields not in fieldnames: " +
                                 ", ".join([repr(k) for k in wrong_fields]))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        """Write one dict as a row; returns what the writer's writerow returns."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write several dicts; all are converted before any row is written."""
        rows = [self._dict_to_list(rowdict) for rowdict in rowdicts]
        return self.writer.writerows(rows)
1554710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
# Guard Sniffer's type checking against builds that exclude complex():
# if the name ``complex`` is not defined, bind it to ``float`` at module
# level so that later references to ``complex`` still resolve.
try:
    complex
except NameError:
    complex = float
1614710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
1624710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylmclass Sniffer:
1634710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm    '''
1644710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
1654710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm    Returns a Dialect object.
1664710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm    '''
1674710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm    def __init__(self):
1684710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        # in case there is more than one possible delimiter
1694710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        self.preferred = [',', '\t', ';', ' ', ':']
1704710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
1714710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
1724710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm    def sniff(self, sample, delimiters=None):
1734710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        """
1744710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        Returns a dialect (or None) corresponding to the sample
1754710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        """
1764710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
1774710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        quotechar, doublequote, delimiter, skipinitialspace = \
1784710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                   self._guess_quote_and_delimiter(sample, delimiters)
1794710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        if not delimiter:
1804710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            delimiter, skipinitialspace = self._guess_delimiter(sample,
1814710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                                                                delimiters)
1824710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
1834710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        if not delimiter:
1844710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            raise Error, "Could not determine delimiter"
1854710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
1864710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        class dialect(Dialect):
1874710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            _name = "sniffed"
1884710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            lineterminator = '\r\n'
1894710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            quoting = QUOTE_MINIMAL
1904710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            # escapechar = ''
1914710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
1924710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        dialect.doublequote = doublequote
1934710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        dialect.delimiter = delimiter
1944710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        # _csv.reader won't accept a quotechar of ''
1954710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        dialect.quotechar = quotechar or '"'
1964710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        dialect.skipinitialspace = skipinitialspace
1974710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
1984710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        return dialect
1994710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
2004710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
2014710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm    def _guess_quote_and_delimiter(self, data, delimiters):
2024710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        """
2034710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        Looks for text enclosed between two identical quotes
2044710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        (the probable quotechar) which are preceded and followed
2054710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        by the same character (the probable delimiter).
2064710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        For example:
2074710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                         ,'some text',
2084710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        The quote with the most wins, same with the delimiter.
2094710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        If there is no quotechar the delimiter can't be determined
2104710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        this way.
2114710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        """
2124710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
2134710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        matches = []
2144710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
2154710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
2164710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                      '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
2174710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
2184710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
2194710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            matches = regexp.findall(data)
2204710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            if matches:
2214710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                break
2224710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
2234710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        if not matches:
2244710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            # (quotechar, doublequote, delimiter, skipinitialspace)
2254710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            return ('', False, None, 0)
2264710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        quotes = {}
2274710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        delims = {}
2284710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        spaces = 0
2294710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        for m in matches:
2304710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            n = regexp.groupindex['quote'] - 1
2314710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            key = m[n]
2324710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            if key:
2334710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                quotes[key] = quotes.get(key, 0) + 1
2344710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            try:
2354710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                n = regexp.groupindex['delim'] - 1
2364710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                key = m[n]
2374710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            except KeyError:
2384710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                continue
2394710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            if key and (delimiters is None or key in delimiters):
2404710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                delims[key] = delims.get(key, 0) + 1
2414710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            try:
2424710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                n = regexp.groupindex['space'] - 1
2434710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            except KeyError:
2444710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                continue
2454710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            if m[n]:
2464710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                spaces += 1
2474710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
2484710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        quotechar = reduce(lambda a, b, quotes = quotes:
2494710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                           (quotes[a] > quotes[b]) and a or b, quotes.keys())
2504710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
2514710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        if delims:
2524710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            delim = reduce(lambda a, b, delims = delims:
2534710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                           (delims[a] > delims[b]) and a or b, delims.keys())
2544710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            skipinitialspace = delims[delim] == spaces
2554710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            if delim == '\n': # most likely a file with a single column
2564710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                delim = ''
2574710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        else:
2584710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            # there is *no* delimiter, it's a single column of quoted data
2594710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            delim = ''
2604710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            skipinitialspace = 0
2614710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
2624710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        # if we see an extra quote between delimiters, we've got a
2634710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        # double quoted format
2644710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        dq_regexp = re.compile(r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
2654710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                               {'delim':delim, 'quote':quotechar}, re.MULTILINE)
2664710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
2674710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
2684710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
2694710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        if dq_regexp.search(data):
2704710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            doublequote = True
2714710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        else:
2724710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            doublequote = False
2734710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
2744710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        return (quotechar, doublequote, delim, skipinitialspace)
2754710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
2764710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
2774710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm    def _guess_delimiter(self, data, delimiters):
2784710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        """
2794710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        The delimiter /should/ occur the same number of times on
2804710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        each row. However, due to malformed data, it may not. We don't want
2814710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        an all or nothing approach, so we allow for small variations in this
2824710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        number.
2834710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm          1) build a table of the frequency of each character on every line.
2844710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm          2) build a table of frequencies of this frequency (meta-frequency?),
2854710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
2864710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm             7 times in 2 rows'
2874710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm          3) use the mode of the meta-frequency to determine the /expected/
2884710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm             frequency for that character
2894710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm          4) find out how often the character actually meets that goal
2904710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm          5) the character that best meets its goal is the delimiter
2914710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        For performance reasons, the data is evaluated in chunks, so it can
2924710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        try and evaluate the smallest portion of the data possible, evaluating
2934710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        additional chunks as necessary.
2944710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        """
2954710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
2964710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        data = filter(None, data.split('\n'))
2974710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
2984710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        ascii = [chr(c) for c in range(127)] # 7-bit ASCII
2994710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
3004710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        # build frequency tables
3014710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        chunkLength = min(10, len(data))
3024710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        iteration = 0
3034710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        charFrequency = {}
3044710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        modes = {}
3054710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        delims = {}
3064710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        start, end = 0, min(chunkLength, len(data))
3074710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        while start < len(data):
3084710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            iteration += 1
3094710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            for line in data[start:end]:
3104710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                for char in ascii:
3114710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                    metaFrequency = charFrequency.get(char, {})
3124710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                    # must count even if frequency is 0
3134710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                    freq = line.count(char)
3144710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                    # value is the mode
3154710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
3164710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                    charFrequency[char] = metaFrequency
3174710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
3184710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            for char in charFrequency.keys():
3194710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                items = charFrequency[char].items()
3204710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                if len(items) == 1 and items[0][0] == 0:
3214710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                    continue
3224710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                # get the mode of the frequencies
3234710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                if len(items) > 1:
3244710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
3254710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                                         items)
3264710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                    # adjust the mode - subtract the sum of all
3274710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                    # other frequencies
3284710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                    items.remove(modes[char])
3294710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                    modes[char] = (modes[char][0], modes[char][1]
3304710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                                   - reduce(lambda a, b: (0, a[1] + b[1]),
3314710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                                            items)[1])
3324710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                else:
3334710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                    modes[char] = items[0]
3344710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
3354710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            # build a list of possible delimiters
3364710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            modeList = modes.items()
3374710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            total = float(chunkLength * iteration)
3384710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            # (rows of consistent data) / (number of rows) = 100%
3394710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            consistency = 1.0
3404710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            # minimum consistency threshold
3414710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            threshold = 0.9
3424710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            while len(delims) == 0 and consistency >= threshold:
3434710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                for k, v in modeList:
3444710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                    if v[0] > 0 and v[1] > 0:
3454710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                        if ((v[1]/total) >= consistency and
3464710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                            (delimiters is None or k in delimiters)):
3474710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                            delims[k] = v
3484710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                consistency -= 0.01
3494710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
3504710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            if len(delims) == 1:
3514710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                delim = delims.keys()[0]
3524710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                skipinitialspace = (data[0].count(delim) ==
3534710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                                    data[0].count("%c " % delim))
3544710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                return (delim, skipinitialspace)
3554710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
3564710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            # analyze another chunkLength lines
3574710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            start = end
3584710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            end += chunkLength
3594710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
3604710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        if not delims:
3614710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            return ('', 0)
3624710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
3634710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        # if there's more than one, fall back to a 'preferred' list
3644710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        if len(delims) > 1:
3654710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm            for d in self.preferred:
3664710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                if d in delims.keys():
3674710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                    skipinitialspace = (data[0].count(d) ==
3684710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                                        data[0].count("%c " % d))
3694710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                    return (d, skipinitialspace)
3704710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
3714710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        # nothing else indicates a preference, pick the character that
3724710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        # dominates(?)
3734710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        items = [(v,k) for (k,v) in delims.items()]
3744710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        items.sort()
3754710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        delim = items[-1][1]
3764710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
3774710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        skipinitialspace = (data[0].count(delim) ==
3784710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm                            data[0].count("%c " % delim))
3794710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm        return (delim, skipinitialspace)
3804710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
3814710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
def has_header(self, sample):
    """Guess whether the first row of *sample* is a header row.

    The sample is parsed with the dialect returned by ``self.sniff(sample)``.
    The first row is set aside as the candidate header, then up to 21 of
    the following rows are inspected column by column:

    * a cell that parses as a number is classified by the first numeric
      type that accepts it (int, then float, then complex);
    * any other cell is classified by the length of its string.

    A column stays in consideration only while every inspected row gives
    it the same classification.  Each surviving column then votes: +1 if
    the header cell does NOT fit the column's classification (it looks
    like a label), -1 if it does (it looks like more data).

    Returns True when the votes sum to more than zero, False otherwise.

    Exceptions raised by ``self.sniff`` propagate unchanged, as does
    StopIteration if the parsed sample yields no rows at all.
    """
    rdr = reader(StringIO(sample), self.sniff(sample))

    # next() instead of rdr.next() so the call works with either
    # iterator protocol.
    header = next(rdr)  # assume first row is header

    columns = len(header)
    columnTypes = dict((i, None) for i in range(columns))

    # (converter, recorded type) pairs, tried in order.  Where a separate
    # `long` type exists it is tried right after int and recorded as int,
    # so longs and ints count as the same column type.
    casts = [(int, int), (float, float), (complex, complex)]
    try:
        casts.insert(1, (long, int))
    except NameError:
        pass  # no distinct long type on this interpreter

    checked = 0
    for row in rdr:
        # arbitrary number of rows to check, to keep it sane
        if checked > 20:
            break
        checked += 1

        if len(row) != columns:
            continue  # skip rows that have irregular number of columns

        # Iterate over a snapshot of the keys: inconsistent columns are
        # deleted from columnTypes inside the loop.
        for col in list(columnTypes):
            cell = row[col]

            for cast, kind in casts:
                try:
                    cast(cell)
                except (ValueError, OverflowError):
                    continue
                thisType = kind
                break
            else:
                # not numeric: fall back to length of string
                thisType = len(cell)

            if thisType != columnTypes[col]:
                if columnTypes[col] is None:  # first classification seen
                    columnTypes[col] = thisType
                else:
                    # type is inconsistent, remove column from
                    # consideration
                    del columnTypes[col]

    # finally, compare results against first row and "vote"
    # on whether it's a header
    hasHeader = 0
    for col, colType in columnTypes.items():
        if isinstance(colType, int):  # it's a length
            if len(header[col]) != colType:
                hasHeader += 1
            else:
                hasHeader -= 1
        else:  # a numeric type (or None if no row was usable): try the cast
            try:
                colType(header[col])
            except (ValueError, TypeError):
                hasHeader += 1
            else:
                hasHeader -= 1

    return hasHeader > 0
452