summary | refs | log | tree | commit | diff | stats
path: root/Lib
diff options
context:
space:
mode:
author: Skip Montanaro <skip@pobox.com> 2005-12-30 05:09:48 (GMT)
committer: Skip Montanaro <skip@pobox.com> 2005-12-30 05:09:48 (GMT)
commit: 39b29be8a6639212402303adab4df29976d8fc7c (patch)
tree: 39ec5c719125381f6898668ba36f1090334359d0 /Lib
parent: 0174dddc65af50900324afca3c5d2400858b75f0 (diff)
download: cpython-39b29be8a6639212402303adab4df29976d8fc7c.zip
download: cpython-39b29be8a6639212402303adab4df29976d8fc7c.tar.gz
download: cpython-39b29be8a6639212402303adab4df29976d8fc7c.tar.bz2
Fix a delimiter detection problem in sniffer. Sniffing "a|b|c\r\n" was
returning 'a' as the delimiter. It now returns '|', but not because I understood better what the code was supposed to do. Would someone who understands the idea behind _guess_delimiter() (see its docstring) look to see whether my fallback choice is better than before, or whether it's just serendipity that I picked the proper delimiter?
Diffstat (limited to 'Lib')
-rw-r--r-- Lib/csv.py 13
-rw-r--r-- Lib/test/test_csv.py 7
2 files changed, 17 insertions, 3 deletions
diff --git a/Lib/csv.py b/Lib/csv.py
index 7516380..f213854 100644
--- a/Lib/csv.py
+++ b/Lib/csv.py
@@ -152,10 +152,13 @@ class Sniffer:
quotechar, delimiter, skipinitialspace = \
self._guess_quote_and_delimiter(sample, delimiters)
- if delimiter is None:
+ if not delimiter:
delimiter, skipinitialspace = self._guess_delimiter(sample,
delimiters)
+ if not delimiter:
+ raise Error, "Could not determine delimiter"
+
class dialect(Dialect):
_name = "sniffed"
lineterminator = '\r\n'
@@ -329,8 +332,12 @@ class Sniffer:
data[0].count("%c " % d))
return (d, skipinitialspace)
- # finally, just return the first damn character in the list
- delim = delims.keys()[0]
+ # nothing else indicates a preference, pick the character that
+ # dominates(?)
+ items = [(v,k) for (k,v) in delims.items()]
+ items.sort()
+ delim = items[-1][1]
+
skipinitialspace = (data[0].count(delim) ==
data[0].count("%c " % delim))
return (delim, skipinitialspace)
diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py
index 0ad77ef..8511a5a 100644
--- a/Lib/test/test_csv.py
+++ b/Lib/test/test_csv.py
@@ -852,6 +852,8 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
'''
sample5 = "aaa\tbbb\r\nAAA\t\r\nBBB\t\r\n"
+ sample6 = "a|b|c\r\nd|e|f\r\n"
+ sample7 = "'a'|'b'|'c'\r\n'd'|e|f\r\n"
def test_has_header(self):
sniffer = csv.Sniffer()
@@ -882,6 +884,11 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
self.assertEqual(dialect.delimiter, ";")
dialect = sniffer.sniff(self.sample5)
self.assertEqual(dialect.delimiter, "\t")
+ dialect = sniffer.sniff(self.sample6)
+ self.assertEqual(dialect.delimiter, "|")
+ dialect = sniffer.sniff(self.sample7)
+ self.assertEqual(dialect.delimiter, "|")
+ self.assertEqual(dialect.quotechar, "'")
if not hasattr(sys, "gettotalrefcount"):
if test_support.verbose: print "*** skipping leakage tests ***"