From 77892373313b7195e16755abe0604bd734df4736 Mon Sep 17 00:00:00 2001 From: Skip Montanaro Date: Mon, 19 May 2003 15:33:36 +0000 Subject: * Correct Sniffer doc to correspond to the implementation. * Add optional delimiters arg to Sniffer.sniff() which restricts the set of candidate field delimiters. --- Doc/lib/libcsv.tex | 14 +++++++------- Lib/csv.py | 16 +++++++++------- Lib/test/test_csv.py | 15 +++++++++++++++ 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/Doc/lib/libcsv.tex b/Doc/lib/libcsv.tex index f30aefe..a287ba8 100644 --- a/Doc/lib/libcsv.tex +++ b/Doc/lib/libcsv.tex @@ -152,17 +152,17 @@ attributes, which are used to define the parameters for a specific \class{reader} or \class{writer} instance. \end{classdesc*} -\begin{classdesc}{Sniffer}{\optional{sample=16384}} -The \class{Sniffer} class is used to deduce the format of a CSV file. The -optional \var{sample} argument to the constructor specifies the number of -bytes to use when determining Dialect parameters. +\begin{classdesc}{Sniffer}{} +The \class{Sniffer} class is used to deduce the format of a CSV file. \end{classdesc} The \class{Sniffer} class provides a single method: -\begin{methoddesc}{sniff}{fileobj} -Analyze the next chunk of \var{fileobj} and return a \class{Dialect} subclass -reflecting the parameters found. +\begin{methoddesc}{sniff}{sample\optional{,delimiters=None}} +Analyze the given \var{sample} and return a \class{Dialect} subclass +reflecting the parameters found. If the optional \var{delimiters} parameter +is given, it is interpreted as a string containing possible valid delimiter +characters. \end{methoddesc} \begin{methoddesc}{has_header}{sample} diff --git a/Lib/csv.py b/Lib/csv.py index 7e297b6..83b8aa4 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -159,15 +159,16 @@ class Sniffer: self.preferred = [',', '\t', ';', ' ', ':'] - def sniff(self, sample): + def sniff(self, sample, delimiters=None): """ Returns a dialect (or None) corresponding to the sample """ quotechar, delimiter, skipinitialspace = \ - self._guess_quote_and_delimiter(sample) + self._guess_quote_and_delimiter(sample, delimiters) if delimiter is None: - delimiter, skipinitialspace = self._guess_delimiter(sample) + delimiter, skipinitialspace = self._guess_delimiter(sample, + delimiters) class dialect(Dialect): _name = "sniffed" @@ -184,7 +185,7 @@ class Sniffer: return dialect - def _guess_quote_and_delimiter(self, data): + def _guess_quote_and_delimiter(self, data, delimiters): """ Looks for text enclosed between two identical quotes (the probable quotechar) which are preceded and followed @@ -222,7 +223,7 @@ class Sniffer: key = m[n] except KeyError: continue - if key: + if key and (delimiters is None or key in delimiters): delims[key] = delims.get(key, 0) + 1 try: n = regexp.groupindex['space'] - 1 @@ -248,7 +249,7 @@ class Sniffer: return (quotechar, delim, skipinitialspace) - def _guess_delimiter(self, data): + def _guess_delimiter(self, data, delimiters): """ The delimiter /should/ occur the same number of times on each row. However, due to malformed data, it may not. We don't want @@ -316,7 +317,8 @@ class Sniffer: while len(delims) == 0 and consistency >= threshold: for k, v in modeList: if v[0] > 0 and v[1] > 0: - if (v[1]/total) >= consistency: + if ((v[1]/total) >= consistency and + (delimiters is None or k in delimiters)): delims[k] = v consistency -= 0.01 diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index c0ad645..00ba8cd 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -551,6 +551,12 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back header = '''\ "venue","city","state","date","performers" ''' + sample3 = '''\ +05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03 +05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03 +05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03 +''' + def test_has_header(self): sniffer = csv.Sniffer() self.assertEqual(sniffer.has_header(self.sample1), False) @@ -568,6 +574,15 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back self.assertEqual(dialect.quotechar, "'") self.assertEqual(dialect.skipinitialspace, False) + def test_delimiters(self): + sniffer = csv.Sniffer() + dialect = sniffer.sniff(self.sample3) + self.assertEqual(dialect.delimiter, "0") + dialect = sniffer.sniff(self.sample3, delimiters="?,") + self.assertEqual(dialect.delimiter, "?") + dialect = sniffer.sniff(self.sample3, delimiters="/,") + self.assertEqual(dialect.delimiter, "/") + if not hasattr(sys, "gettotalrefcount"): if test_support.verbose: print "*** skipping leakage tests ***" else: -- cgit v0.12