summary | refs | log | tree | commit | diff | stats
path: root/Lib
diff options
context:
space:
mode:
author: Skip Montanaro <skip@pobox.com> 2003-04-25 14:47:16 (GMT)
committer: Skip Montanaro <skip@pobox.com> 2003-04-25 14:47:16 (GMT)
commit: 1448d4719ca5db5580de461944a8d5da20c30888 (patch)
tree: 4460d1a09aa0202e0362220935f9b84ebbe5a99d /Lib
parent: 48816c6f041ea28079ba54ff6bf2ba5124a528a4 (diff)
download: cpython-1448d4719ca5db5580de461944a8d5da20c30888.zip
cpython-1448d4719ca5db5580de461944a8d5da20c30888.tar.gz
cpython-1448d4719ca5db5580de461944a8d5da20c30888.tar.bz2
rework Sniffer api significantly
Diffstat (limited to 'Lib')
-rw-r--r-- Lib/csv.py 67
1 files changed, 26 insertions, 41 deletions
diff --git a/Lib/csv.py b/Lib/csv.py
index 89d86d6..7e297b6 100644
--- a/Lib/csv.py
+++ b/Lib/csv.py
@@ -9,6 +9,11 @@ from _csv import Error, __version__, writer, reader, register_dialect, \
QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
__doc__
+try:
+ from cStringIO import StringIO
+except ImportError:
+ from StringIO import StringIO
+
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
"Error", "Dialect", "excel", "excel_tab", "reader", "writer",
"register_dialect", "get_dialect", "list_dialects", "Sniffer",
@@ -147,52 +152,39 @@ class DictWriter:
class Sniffer:
'''
"Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
- Returns a csv.Dialect object.
+ Returns a Dialect object.
'''
- def __init__(self, sample = 16 * 1024):
+ def __init__(self):
# in case there is more than one possible delimiter
self.preferred = [',', '\t', ';', ' ', ':']
- # amount of data (in bytes) to sample
- self.sample = sample
-
- def sniff(self, fileobj):
+ def sniff(self, sample):
"""
- Takes a file-like object and returns a dialect (or None)
+ Returns a dialect (or None) corresponding to the sample
"""
- self.fileobj = fileobj
-
- data = fileobj.read(self.sample)
quotechar, delimiter, skipinitialspace = \
- self._guessQuoteAndDelimiter(data)
+ self._guess_quote_and_delimiter(sample)
if delimiter is None:
- delimiter, skipinitialspace = self._guessDelimiter(data)
+ delimiter, skipinitialspace = self._guess_delimiter(sample)
- class SniffedDialect(Dialect):
+ class dialect(Dialect):
_name = "sniffed"
lineterminator = '\r\n'
quoting = QUOTE_MINIMAL
# escapechar = ''
doublequote = False
- SniffedDialect.delimiter = delimiter
- SniffedDialect.quotechar = quotechar
- SniffedDialect.skipinitialspace = skipinitialspace
- self.dialect = SniffedDialect
- return self.dialect
+ dialect.delimiter = delimiter
+ # _csv.reader won't accept a quotechar of ''
+ dialect.quotechar = quotechar or '"'
+ dialect.skipinitialspace = skipinitialspace
+ return dialect
- def hasHeaders(self):
- return self._hasHeaders(self.fileobj, self.dialect)
-
- def register_dialect(self, name='sniffed'):
- register_dialect(name, self.dialect)
-
-
- def _guessQuoteAndDelimiter(self, data):
+ def _guess_quote_and_delimiter(self, data):
"""
Looks for text enclosed between two identical quotes
(the probable quotechar) which are preceded and followed
@@ -256,7 +248,7 @@ class Sniffer:
return (quotechar, delim, skipinitialspace)
- def _guessDelimiter(self, data):
+ def _guess_delimiter(self, data):
"""
The delimiter /should/ occur the same number of times on
each row. However, due to malformed data, it may not. We don't want
@@ -290,12 +282,12 @@ class Sniffer:
iteration += 1
for line in data[start:end]:
for char in ascii:
- metafrequency = charFrequency.get(char, {})
+ metaFrequency = charFrequency.get(char, {})
# must count even if frequency is 0
freq = line.strip().count(char)
# value is the mode
- metafrequency[freq] = metafrequency.get(freq, 0) + 1
- charFrequency[char] = metafrequency
+ metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
+ charFrequency[char] = metaFrequency
for char in charFrequency.keys():
items = charFrequency[char].items()
@@ -356,7 +348,7 @@ class Sniffer:
return (delim, skipinitialspace)
- def _hasHeaders(self, fileobj, dialect):
+ def has_header(self, sample):
# Creates a dictionary of types of data in each column. If any
# column is of a single type (say, integers), *except* for the first
# row, then the first row is presumed to be labels. If the type
@@ -373,23 +365,16 @@ class Sniffer:
"""
return eval(item.replace('(', '').replace(')', ''))
- # rewind the fileobj - this might not work for some file-like
- # objects...
- fileobj.seek(0)
-
- r = csv.reader(fileobj,
- delimiter=dialect.delimiter,
- quotechar=dialect.quotechar,
- skipinitialspace=dialect.skipinitialspace)
+ rdr = reader(StringIO(sample), self.sniff(sample))
- header = r.next() # assume first row is header
+ header = rdr.next() # assume first row is header
columns = len(header)
columnTypes = {}
for i in range(columns): columnTypes[i] = None
checked = 0
- for row in r:
+ for row in rdr:
# arbitrary number of rows to check, to keep it sane
if checked > 20:
break