1
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from functools import reduce
8from _csv import Error, __version__, writer, reader, register_dialect, \
9                 unregister_dialect, get_dialect, list_dialects, \
10                 field_size_limit, \
11                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
12                 __doc__
13from _csv import Dialect as _Dialect
14
15try:
16    from cStringIO import StringIO
17except ImportError:
18    from StringIO import StringIO
19
# Public API re-exported by "from csv import *".
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "__doc__", "excel", "excel_tab",
            "field_size_limit", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
25
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses count as valid; instantiating the base class
        # directly fails validation below since every attribute is None.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        """Raise csv.Error if the dialect attributes are unacceptable."""
        try:
            # Delegate the actual attribute checking to the C extension.
            _Dialect(self)
        except TypeError as e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
56
class excel(Dialect):
    """Dialect describing the CSV conventions of Microsoft Excel."""
    # field layout
    delimiter = ','
    lineterminator = '\r\n'
    skipinitialspace = False
    # quoting behaviour
    quotechar = '"'
    quoting = QUOTE_MINIMAL
    doublequote = True
register_dialect("excel", excel)
66
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'  # identical to excel except for the tab delimiter
register_dialect("excel-tab", excel_tab)
71
72
class DictReader:
    """Read CSV rows as dictionaries keyed by field name.

    f          -- iterable of CSV-formatted lines
    fieldnames -- list of keys for the dicts; if omitted, the first row
                  read from f is used as the header
    restkey    -- key under which extra values from an over-long row
                  are collected (as a list)
    restval    -- value filled in for missing fields of a short row
    dialect    -- dialect name or class, passed through to reader()
    """
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily consume the header row the first time it is needed.
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        """Return the next row as a dict mapping fieldnames to values."""
        if self.line_num == 0:
            # Used only for its side effect of consuming the header row.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            # long row: stash the surplus values under restkey
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # short row: pad the missing fields with restval
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d
121
122
class DictWriter:
    """Write dictionaries out as CSV rows in a fixed field order.

    f            -- file-like object with a write() method
    fieldnames   -- sequence of keys giving the column order
    restval      -- value written for keys missing from a row dict
    extrasaction -- "raise" (default) to error on unexpected keys,
                    or "ignore" to silently drop them
    dialect      -- dialect name or class, passed through to writer()

    Raises ValueError if extrasaction is neither "raise" nor "ignore".
    """
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'" %
                             extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a header row containing the fieldnames themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)

    def _dict_to_list(self, rowdict):
        # Convert a row dict into a list ordered by self.fieldnames,
        # enforcing the extrasaction policy for unexpected keys.
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: " +
                                 ", ".join(wrong_fields))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)
155
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # Fall back to float so has_header's typecasting loop still works.
    complex = float
161
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        # Try the quote-based heuristic first; it also yields the
        # delimiter when the sample contains quoted fields.
        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            # No quoted fields found; fall back to character frequencies.
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns a (quotechar, doublequote, delimiter, skipinitialspace)
        tuple.
        """

        matches = []
        # NOTE: the third pattern previously read (?P<delim>>[...]) --
        # the stray '>' forced a literal '>' before the delimiter and the
        # ,"..."<EOL> case could never match.  Fixed to match upstream.
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            # findall() returned tuples of captured groups; use the
            # pattern's groupindex to locate each named group.
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # this pattern captured no delimiter (quote-only match)
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # most frequently seen quote character wins
        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            # if every delimiter occurrence was followed by a space,
            # assume skipinitialspace
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':delim, 'quote':quotechar}, re.MULTILINE)



        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        # drop empty lines (py2 filter() returns a list here)
        data = filter(None, data.split('\n'))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = charFrequency[char].items()
                # skip characters that never appear on any line
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            # progressively relax the consistency requirement until at
            # least one candidate delimiter survives
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = delims.keys()[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        """Return True if the first row of sample looks like a header."""
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = rdr.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in columnTypes.keys():

                # try successively wider numeric types; the for/else falls
                # through to string-length when none of them parse
                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
452