forgot Cliff's sniffer

author: Skip Montanaro <skip@pobox.com> 2003-03-20 23:31:24 (GMT)
committer: Skip Montanaro <skip@pobox.com> 2003-03-20 23:31:24 (GMT)
commit: f823f11c36c4acfdfa818168cee0824a58e46551 (patch)
tree: 5e9ad3ed28ee86525058a92738c2766c1654c5df
parent: b4a0417e9112126070316d21cb1f54a7c365a24c (diff)
download: cpython-f823f11c36c4acfdfa818168cee0824a58e46551.zip
cpython-f823f11c36c4acfdfa818168cee0824a58e46551.tar.gz
cpython-f823f11c36c4acfdfa818168cee0824a58e46551.tar.bz2
1 files changed, 289 insertions, 0 deletions
diff --git a/Lib/csv/util/sniffer.py b/Lib/csv/util/sniffer.py
new file mode 100644
index 0000000..7b9b060
--- /dev/null
+++ b/Lib/csv/util/sniffer.py
@@ -0,0 +1,289 @@
+"""
+dialect = Sniffer().sniff(file('csv/easy.csv'))
+print "delimiter", dialect.delimiter
+print "quotechar", dialect.quotechar
+print "skipinitialspace", dialect.skipinitialspace
+"""
+
+from csv import csv
+import re
+
+# ------------------------------------------------------------------------------
+class Sniffer:
+    """
+    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
+    Returns a csv.Dialect object.
+    """
+    def __init__(self, sample = 16 * 1024):
+        # tie-break order used by _guessDelimiter when more than one candidate delimiter remains
+        self.preferred = [',', '\t', ';', ' ', ':']
+
+        # amount of data (in bytes) to sample; sniff() passes this to fileobj.read()
+        self.sample = sample
+
+
+    def sniff(self, fileobj):
+        """
+        Takes a file-like object and returns the sniffed dialect: a csv.Dialect subclass (the class, not an instance)
+        """
+        
+        self.fileobj = fileobj # kept so hasHeaders() can re-read the same file object
+        
+        data = fileobj.read(self.sample)
+
+        quotechar, delimiter, skipinitialspace = self._guessQuoteAndDelimiter(data)
+        if delimiter is None: # no quoted text was found; fall back to character-frequency analysis
+            delimiter, skipinitialspace = self._guessDelimiter(data)
+
+        class Dialect(csv.Dialect): # only delimiter, quotechar and skipinitialspace are sniffed; the rest is fixed
+            _name = "sniffed"
+            lineterminator = '\r\n'
+            quoting = csv.QUOTE_MINIMAL
+            # escapechar = ''
+            doublequote = False
+        Dialect.delimiter = delimiter
+        Dialect.quotechar = quotechar
+        Dialect.skipinitialspace = skipinitialspace
+
+        self.dialect = Dialect # remembered for hasHeaders() and register_dialect()
+        return self.dialect
+
+
+    def hasHeaders(self): # reads self.fileobj and self.dialect, which are only set by sniff()
+        return self._hasHeaders(self.fileobj, self.dialect)
+    
+
+    def register_dialect(self, name = 'sniffed'): # reads self.dialect, which is only set by sniff()
+        csv.register_dialect(name, self.dialect) # registers the sniffed dialect with the csv module under `name`
+    
+
+    def _guessQuoteAndDelimiter(self, data):
+        """
+        Looks for text enclosed between two identical quotes (the
+        probable quotechar) which are preceded and/or followed by the
+        same character (the probable delimiter), for example:
+                         ,'some text',
+        The quote seen most often wins, same with the delimiter.
+        data is the sampled text.  Returns (quotechar, delimiter,
+        skipinitialspace); delimiter is None when no quoted text is found,
+        because without a quotechar the delimiter can't be determined this way.
+        """
+        # patterns are tried in order; the first one that matches anything is used
+        matches = []
+        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
+                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
+                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
+                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
+            regexp = re.compile(restr, re.S | re.M)
+            matches = regexp.findall(data)
+            if matches:
+                break
+        
+        if not matches:
+            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
+
+        quotes = {}
+        delims = {}
+        spaces = 0
+        for m in matches:
+            n = regexp.groupindex['quote'] - 1 # groups are numbered from 1, findall() results from 0
+            key = m[n]
+            if key:
+                quotes[key] = quotes.get(key, 0) + 1
+            try:
+                n = regexp.groupindex['delim'] - 1
+                key = m[n]
+            except KeyError: # the quote-only pattern has no delim group
+                continue
+            if key:
+                delims[key] = delims.get(key, 0) + 1
+            try:
+                n = regexp.groupindex['space'] - 1
+            except KeyError:
+                continue
+            if m[n]:
+                spaces += 1
+
+        quotechar = reduce(lambda a, b, quotes = quotes:
+                           (quotes[a] > quotes[b]) and a or b, quotes.keys())
+
+        if delims:
+            delim = reduce(lambda a, b, delims = delims:
+                           (delims[a] > delims[b]) and a or b, delims.keys())
+            skipinitialspace = delims[delim] == spaces
+            if delim == '\n': # most likely a file with a single column
+                delim = ''
+        else:
+            # there is *no* delimiter, it's a single column of quoted data
+            delim = ''
+            skipinitialspace = 0
+            
+        return (quotechar, delim, skipinitialspace)
+
+
+    def _guessDelimiter(self, data):
+        """
+        Guesses the delimiter from per-line character frequencies.  Returns a
+        (delimiter, skipinitialspace) tuple, or ('', 0) if no character qualifies.
+        The delimiter /should/ occur the same number of times on each row.
+        However, due to malformed data, it may not, so small variations are allowed.
+          1) build a table of the frequency of each character on every line.
+          2) build a table of frequencies of this frequency (meta-frequency?),
+             e.g.  "x occurred 5 times in 10 rows, 6 times in 1000 rows,
+             7 times in 2 rows"
+          3) use the mode of the meta-frequency to determine the /expected/
+             frequency for that character
+          4) find out how often the character actually meets that goal
+          5) the character that best meets its goal is the delimiter
+        For performance reasons, the data is evaluated in chunks, so it can
+        try and evaluate the smallest portion of the data possible, evaluating
+        additional chunks as necessary.
+        """
+        
+        data = filter(None, data.split('\n')) # drop empty lines
+
+        ascii = [chr(c) for c in range(127)] # 7-bit ASCII
+
+        # build frequency tables
+        chunkLength = min(10, len(data)) # number of lines analyzed per pass
+        iteration = 0
+        charFrequency = {} # char -> {occurrences on a line: number of lines with that count}
+        modes = {} # char -> (most common per-line count, adjusted number of lines)
+        delims = {} # candidate delimiters; never cleared between passes
+        start, end = 0, min(chunkLength, len(data))
+        while start < len(data):
+            iteration += 1
+            for line in data[start:end]:
+                for char in ascii:
+                    metafrequency = charFrequency.get(char, {})
+                    freq = line.strip().count(char) # must count even if frequency is 0
+                    metafrequency[freq] = metafrequency.get(freq, 0) + 1 # value is the mode
+                    charFrequency[char] = metafrequency
+
+            for char in charFrequency.keys():
+                items = charFrequency[char].items()
+                if len(items) == 1 and items[0][0] == 0:
+                    continue # character has not occurred on any line so far
+                # get the mode of the frequencies
+                if len(items) > 1:
+                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b, items)
+                    # adjust the mode - subtract the sum of all other frequencies
+                    items.remove(modes[char])
+                    modes[char] = (modes[char][0], modes[char][1]
+                                   - reduce(lambda a, b: (0, a[1] + b[1]), items)[1])
+                else:
+                    modes[char] = items[0]
+
+            # build a list of possible delimiters
+            modeList = modes.items()
+            total = float(chunkLength * iteration)
+            consistency = 1.0 # (rows of consistent data) / (number of rows) = 100%
+            threshold = 0.9  # minimum consistency threshold
+            while len(delims) == 0 and consistency >= threshold:
+                for k, v in modeList:
+                    if v[0] > 0 and v[1] > 0:
+                        if (v[1]/total) >= consistency:
+                            delims[k] = v
+                consistency -= 0.01
+
+            if len(delims) == 1: # unambiguous result: stop without reading further chunks
+                delim = delims.keys()[0]
+                skipinitialspace = data[0].count(delim) == data[0].count("%c " % delim) # delimiter count vs. "delimiter + space" count on the first line
+                return (delim, skipinitialspace)
+
+            # analyze another chunkLength lines
+            start = end
+            end += chunkLength
+
+        if not delims:
+            return ('', 0)
+
+        # if there's more than one, fall back to a 'preferred' list
+        if len(delims) > 1:
+            for d in self.preferred:
+                if d in delims.keys():
+                    skipinitialspace = data[0].count(d) == data[0].count("%c " % d)
+                    return (d, skipinitialspace)
+
+        # finally, just return the first damn character in the list
+        delim = delims.keys()[0]
+        skipinitialspace = data[0].count(delim) == data[0].count("%c " % delim)
+        return (delim, skipinitialspace)
+
+
+    def _hasHeaders(self, fileobj, dialect):
+        # Creates a dictionary of types of data in each column. If any column
+        # is of a single type (say, integers), *except* for the first row, then the first
+        # row is presumed to be labels. If the type can't be determined, it is assumed to
+        # be a string in which case the length of the string is the determining factor: if
+        # all of the rows except for the first are the same length, it's a header.
+        # Finally, a 'vote' is taken at the end for each column, adding or subtracting from
+        # the likelihood of the first row being a header.  Returns true if the vote total is positive.
+
+        def seval(item):
+            """
+            Strips parens from item prior to calling eval in an attempt to make it safer
+            """
+            return eval(item.replace('(', '').replace(')', '')) # NOTE(review): still eval() of text read from the file; unsafe on untrusted input
+
+        fileobj.seek(0) # rewind the fileobj - this might not work for some file-like objects...
+        
+        reader = csv.reader(fileobj,
+                            delimiter = dialect.delimiter,
+                            quotechar = dialect.quotechar,
+                            skipinitialspace = dialect.skipinitialspace)
+
+        header = reader.next() # assume first row is header
+
+        columns = len(header)
+        columnTypes = {} # column index -> type seen so far, or string length when the value can't be eval'ed
+        for i in range(columns): columnTypes[i] = None # None = nothing seen yet for this column
+
+        checked = 0
+        for row in reader:
+            if checked > 20: # arbitrary number of rows to check, to keep it sane
+                break
+            checked += 1
+
+            if len(row) != columns:
+                continue # skip rows that have irregular number of columns
+
+            for col in columnTypes.keys(): # NOTE(review): entries are deleted below; safe only if keys() returns a copy
+                try:
+                    try:
+                        # is it a built-in type (besides string)?
+                        thisType = type(seval(row[col]))
+                    except OverflowError:
+                        # a long int?
+                        thisType = type(seval(row[col] + 'L'))
+                        thisType = type(0) # treat long ints as int
+                except:
+                    # fallback to length of string
+                    thisType = len(row[col])
+
+                if thisType != columnTypes[col]:
+                    if columnTypes[col] is None: # add new column type
+                        columnTypes[col] = thisType
+                    else: # type is inconsistent, remove column from consideration
+                        del columnTypes[col]
+
+        # finally, compare results against first row and "vote" on whether it's a header
+        hasHeader = 0
+        for col, colType in columnTypes.items():
+            if type(colType) == type(0): # it's a length
+                if len(header[col]) != colType:
+                    hasHeader += 1
+                else:
+                    hasHeader -= 1
+            else: # attempt typecast
+                try:
+                    eval("%s(%s)" % (colType.__name__, header[col])) # NOTE(review): eval() of header text read from the file; unsafe on untrusted input
+                except:
+                    hasHeader += 1
+                else:
+                    hasHeader -= 1
+
+        return hasHeader > 0 # more columns voted for a header than against
+
+
+
+
author	Skip Montanaro <skip@pobox.com>	2003-03-20 23:31:24 (GMT)
committer	Skip Montanaro <skip@pobox.com>	2003-03-20 23:31:24 (GMT)
commit	f823f11c36c4acfdfa818168cee0824a58e46551 (patch)
tree	5e9ad3ed28ee86525058a92738c2766c1654c5df
parent	b4a0417e9112126070316d21cb1f54a7c365a24c (diff)
download	cpython-f823f11c36c4acfdfa818168cee0824a58e46551.zip cpython-f823f11c36c4acfdfa818168cee0824a58e46551.tar.gz cpython-f823f11c36c4acfdfa818168cee0824a58e46551.tar.bz2