diff options
author | Skip Montanaro <skip@pobox.com> | 2003-04-25 14:47:16 (GMT) |
---|---|---|
committer | Skip Montanaro <skip@pobox.com> | 2003-04-25 14:47:16 (GMT) |
commit | 1448d4719ca5db5580de461944a8d5da20c30888 (patch) | |
tree | 4460d1a09aa0202e0362220935f9b84ebbe5a99d /Lib/csv.py | |
parent | 48816c6f041ea28079ba54ff6bf2ba5124a528a4 (diff) | |
download | cpython-1448d4719ca5db5580de461944a8d5da20c30888.zip cpython-1448d4719ca5db5580de461944a8d5da20c30888.tar.gz cpython-1448d4719ca5db5580de461944a8d5da20c30888.tar.bz2 |
Rework Sniffer API significantly
Diffstat (limited to 'Lib/csv.py')
-rw-r--r-- | Lib/csv.py | 67 |
1 file changed, 26 insertions, 41 deletions
@@ -9,6 +9,11 @@ from _csv import Error, __version__, writer, reader, register_dialect, \ QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \ __doc__ +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", "Error", "Dialect", "excel", "excel_tab", "reader", "writer", "register_dialect", "get_dialect", "list_dialects", "Sniffer", @@ -147,52 +152,39 @@ class DictWriter: class Sniffer: ''' "Sniffs" the format of a CSV file (i.e. delimiter, quotechar) - Returns a csv.Dialect object. + Returns a Dialect object. ''' - def __init__(self, sample = 16 * 1024): + def __init__(self): # in case there is more than one possible delimiter self.preferred = [',', '\t', ';', ' ', ':'] - # amount of data (in bytes) to sample - self.sample = sample - - def sniff(self, fileobj): + def sniff(self, sample): """ - Takes a file-like object and returns a dialect (or None) + Returns a dialect (or None) corresponding to the sample """ - self.fileobj = fileobj - - data = fileobj.read(self.sample) quotechar, delimiter, skipinitialspace = \ - self._guessQuoteAndDelimiter(data) + self._guess_quote_and_delimiter(sample) if delimiter is None: - delimiter, skipinitialspace = self._guessDelimiter(data) + delimiter, skipinitialspace = self._guess_delimiter(sample) - class SniffedDialect(Dialect): + class dialect(Dialect): _name = "sniffed" lineterminator = '\r\n' quoting = QUOTE_MINIMAL # escapechar = '' doublequote = False - SniffedDialect.delimiter = delimiter - SniffedDialect.quotechar = quotechar - SniffedDialect.skipinitialspace = skipinitialspace - self.dialect = SniffedDialect - return self.dialect + dialect.delimiter = delimiter + # _csv.reader won't accept a quotechar of '' + dialect.quotechar = quotechar or '"' + dialect.skipinitialspace = skipinitialspace + return dialect - def hasHeaders(self): - return self._hasHeaders(self.fileobj, self.dialect) - - def 
register_dialect(self, name='sniffed'): - register_dialect(name, self.dialect) - - - def _guessQuoteAndDelimiter(self, data): + def _guess_quote_and_delimiter(self, data): """ Looks for text enclosed between two identical quotes (the probable quotechar) which are preceded and followed @@ -256,7 +248,7 @@ class Sniffer: return (quotechar, delim, skipinitialspace) - def _guessDelimiter(self, data): + def _guess_delimiter(self, data): """ The delimiter /should/ occur the same number of times on each row. However, due to malformed data, it may not. We don't want @@ -290,12 +282,12 @@ class Sniffer: iteration += 1 for line in data[start:end]: for char in ascii: - metafrequency = charFrequency.get(char, {}) + metaFrequency = charFrequency.get(char, {}) # must count even if frequency is 0 freq = line.strip().count(char) # value is the mode - metafrequency[freq] = metafrequency.get(freq, 0) + 1 - charFrequency[char] = metafrequency + metaFrequency[freq] = metaFrequency.get(freq, 0) + 1 + charFrequency[char] = metaFrequency for char in charFrequency.keys(): items = charFrequency[char].items() @@ -356,7 +348,7 @@ class Sniffer: return (delim, skipinitialspace) - def _hasHeaders(self, fileobj, dialect): + def has_header(self, sample): # Creates a dictionary of types of data in each column. If any # column is of a single type (say, integers), *except* for the first # row, then the first row is presumed to be labels. If the type @@ -373,23 +365,16 @@ class Sniffer: """ return eval(item.replace('(', '').replace(')', '')) - # rewind the fileobj - this might not work for some file-like - # objects... 
- fileobj.seek(0) - - r = csv.reader(fileobj, - delimiter=dialect.delimiter, - quotechar=dialect.quotechar, - skipinitialspace=dialect.skipinitialspace) + rdr = reader(StringIO(sample), self.sniff(sample)) - header = r.next() # assume first row is header + header = rdr.next() # assume first row is header columns = len(header) columnTypes = {} for i in range(columns): columnTypes[i] = None checked = 0 - for row in r: + for row in rdr: # arbitrary number of rows to check, to keep it sane if checked > 20: break |