diff options
Diffstat (limited to 'Doc/lib/libcsv.tex')
-rw-r--r-- | Doc/lib/libcsv.tex | 538 |
1 files changed, 0 insertions, 538 deletions
diff --git a/Doc/lib/libcsv.tex b/Doc/lib/libcsv.tex deleted file mode 100644 index e965e31..0000000 --- a/Doc/lib/libcsv.tex +++ /dev/null @@ -1,538 +0,0 @@ -\section{\module{csv} --- CSV File Reading and Writing} - -\declaremodule{standard}{csv} -\modulesynopsis{Write and read tabular data to and from delimited files.} -\sectionauthor{Skip Montanaro}{skip@pobox.com} - -\versionadded{2.3} -\index{csv} -\indexii{data}{tabular} - -The so-called CSV (Comma Separated Values) format is the most common import -and export format for spreadsheets and databases. There is no ``CSV -standard'', so the format is operationally defined by the many applications -which read and write it. The lack of a standard means that subtle -differences often exist in the data produced and consumed by different -applications. These differences can make it annoying to process CSV files -from multiple sources. Still, while the delimiters and quoting characters -vary, the overall format is similar enough that it is possible to write a -single module which can efficiently manipulate such data, hiding the details -of reading and writing the data from the programmer. - -The \module{csv} module implements classes to read and write tabular data in -CSV format. It allows programmers to say, ``write this data in the format -preferred by Excel,'' or ``read data from this file which was generated by -Excel,'' without knowing the precise details of the CSV format used by -Excel. Programmers can also describe the CSV formats understood by other -applications or define their own special-purpose CSV formats. - -The \module{csv} module's \class{reader} and \class{writer} objects read and -write sequences. Programmers can also read and write data in dictionary -form using the \class{DictReader} and \class{DictWriter} classes. - -\begin{notice} - This version of the \module{csv} module doesn't support Unicode - input. Also, there are currently some issues regarding \ASCII{} NUL - characters. 
Accordingly, all input should be UTF-8 or printable - \ASCII{} to be safe; see the examples in section~\ref{csv-examples}. - These restrictions will be removed in the future. -\end{notice} - -\begin{seealso} -% \seemodule{array}{Arrays of uniformly types numeric values.} - \seepep{305}{CSV File API} - {The Python Enhancement Proposal which proposed this addition - to Python.} -\end{seealso} - - -\subsection{Module Contents \label{csv-contents}} - -The \module{csv} module defines the following functions: - -\begin{funcdesc}{reader}{csvfile\optional{, - dialect=\code{'excel'}}\optional{, fmtparam}} -Return a reader object which will iterate over lines in the given -{}\var{csvfile}. \var{csvfile} can be any object which supports the -iterator protocol and returns a string each time its \method{next} -method is called --- file objects and list objects are both suitable. -If \var{csvfile} is a file object, it must be opened with -the 'b' flag on platforms where that makes a difference. An optional -{}\var{dialect} parameter can be given -which is used to define a set of parameters specific to a particular CSV -dialect. It may be an instance of a subclass of the \class{Dialect} -class or one of the strings returned by the \function{list_dialects} -function. The other optional {}\var{fmtparam} keyword arguments can be -given to override individual formatting parameters in the current -dialect. For full details about the dialect and formatting -parameters, see section~\ref{csv-fmt-params}, ``Dialects and Formatting -Parameters''. - -All data read are returned as strings. No automatic data type -conversion is performed. - -\versionchanged[ -The parser is now stricter with respect to multi-line quoted -fields. Previously, if a line ended within a quoted field without a -terminating newline character, a newline would be inserted into the -returned field. This behavior caused problems when reading files -which contained carriage return characters within fields. 
The -behavior was changed to return the field without inserting newlines. As -a consequence, if newlines embedded within fields are important, the -input should be split into lines in a manner which preserves the newline -characters]{2.5} - -\end{funcdesc} - -\begin{funcdesc}{writer}{csvfile\optional{, - dialect=\code{'excel'}}\optional{, fmtparam}} -Return a writer object responsible for converting the user's data into -delimited strings on the given file-like object. \var{csvfile} can be any -object with a \function{write} method. If \var{csvfile} is a file object, -it must be opened with the 'b' flag on platforms where that makes a -difference. An optional -{}\var{dialect} parameter can be given which is used to define a set of -parameters specific to a particular CSV dialect. It may be an instance -of a subclass of the \class{Dialect} class or one of the strings -returned by the \function{list_dialects} function. The other optional -{}\var{fmtparam} keyword arguments can be given to override individual -formatting parameters in the current dialect. For full details -about the dialect and formatting parameters, see -section~\ref{csv-fmt-params}, ``Dialects and Formatting Parameters''. -To make it as easy as possible to -interface with modules which implement the DB API, the value -\constant{None} is written as the empty string. While this isn't a -reversible transformation, it makes it easier to dump SQL NULL data values -to CSV files without preprocessing the data returned from a -\code{cursor.fetch*()} call. All other non-string data are stringified -with \function{str()} before being written. -\end{funcdesc} - -\begin{funcdesc}{register_dialect}{name\optional{, dialect}\optional{, fmtparam}} -Associate \var{dialect} with \var{name}. \var{name} must be a string -or Unicode object. 
The dialect can be specified either by passing a -sub-class of \class{Dialect}, or by \var{fmtparam} keyword arguments, -or both, with keyword arguments overriding parameters of the dialect. -For full details about the dialect and formatting parameters, see -section~\ref{csv-fmt-params}, ``Dialects and Formatting Parameters''. -\end{funcdesc} - -\begin{funcdesc}{unregister_dialect}{name} -Delete the dialect associated with \var{name} from the dialect registry. An -\exception{Error} is raised if \var{name} is not a registered dialect -name. -\end{funcdesc} - -\begin{funcdesc}{get_dialect}{name} -Return the dialect associated with \var{name}. An \exception{Error} is -raised if \var{name} is not a registered dialect name. -\end{funcdesc} - -\begin{funcdesc}{list_dialects}{} -Return the names of all registered dialects. -\end{funcdesc} - -\begin{funcdesc}{field_size_limit}{\optional{new_limit}} - Returns the current maximum field size allowed by the parser. If - \var{new_limit} is given, this becomes the new limit. - \versionadded{2.5} -\end{funcdesc} - - -The \module{csv} module defines the following classes: - -\begin{classdesc}{DictReader}{csvfile\optional{, - fieldnames=\constant{None}\optional{, - restkey=\constant{None}\optional{, - restval=\constant{None}\optional{, - dialect=\code{'excel'}\optional{, - *args, **kwds}}}}}} -Create an object which operates like a regular reader but maps the -information read into a dict whose keys are given by the optional -{} \var{fieldnames} -parameter. If the \var{fieldnames} parameter is omitted, the values in -the first row of the \var{csvfile} will be used as the fieldnames. -If the row -read has more fields than the fieldnames sequence, the remaining data is -added as a sequence keyed by the value of \var{restkey}. 
If the row read -has fewer fields than the fieldnames sequence, the remaining keys take the -value of the optional \var{restval} parameter. Any other optional or -keyword arguments are passed to the underlying \class{reader} instance. -\end{classdesc} - - -\begin{classdesc}{DictWriter}{csvfile, fieldnames\optional{, - restval=""\optional{, - extrasaction=\code{'raise'}\optional{, - dialect=\code{'excel'}\optional{, - *args, **kwds}}}}} -Create an object which operates like a regular writer but maps dictionaries -onto output rows. The \var{fieldnames} parameter identifies the order in -which values in the dictionary passed to the \method{writerow()} method are -written to the \var{csvfile}. The optional \var{restval} parameter -specifies the value to be written if the dictionary is missing a key in -\var{fieldnames}. If the dictionary passed to the \method{writerow()} -method contains a key not found in \var{fieldnames}, the optional -\var{extrasaction} parameter indicates what action to take. If it is set -to \code{'raise'} a \exception{ValueError} is raised. If it is set to -\code{'ignore'}, extra values in the dictionary are ignored. Any other -optional or keyword arguments are passed to the underlying \class{writer} -instance. - -Note that unlike the \class{DictReader} class, the \var{fieldnames} -parameter of the \class{DictWriter} is not optional. Since Python's -\class{dict} objects are not ordered, there is not enough information -available to deduce the order in which the row should be written to the -\var{csvfile}. - -\end{classdesc} - -\begin{classdesc*}{Dialect}{} -The \class{Dialect} class is a container class relied on primarily for its -attributes, which are used to define the parameters for a specific -\class{reader} or \class{writer} instance. -\end{classdesc*} - -\begin{classdesc}{excel}{} -The \class{excel} class defines the usual properties of an Excel-generated -CSV file. It is registered with the dialect name \code{'excel'}. 
-\end{classdesc} - -\begin{classdesc}{excel_tab}{} -The \class{excel_tab} class defines the usual properties of an -Excel-generated TAB-delimited file. It is registered with the dialect name -\code{'excel-tab'}. -\end{classdesc} - -\begin{classdesc}{Sniffer}{} -The \class{Sniffer} class is used to deduce the format of a CSV file. -\end{classdesc} - -The \class{Sniffer} class provides two methods: - -\begin{methoddesc}{sniff}{sample\optional{,delimiters=None}} -Analyze the given \var{sample} and return a \class{Dialect} subclass -reflecting the parameters found. If the optional \var{delimiters} parameter -is given, it is interpreted as a string containing possible valid delimiter -characters. -\end{methoddesc} - -\begin{methoddesc}{has_header}{sample} -Analyze the sample text (presumed to be in CSV format) and return -\constant{True} if the first row appears to be a series of column -headers. -\end{methoddesc} - - -The \module{csv} module defines the following constants: - -\begin{datadesc}{QUOTE_ALL} -Instructs \class{writer} objects to quote all fields. -\end{datadesc} - -\begin{datadesc}{QUOTE_MINIMAL} -Instructs \class{writer} objects to only quote those fields which contain -special characters such as \var{delimiter}, \var{quotechar} or any of the -characters in \var{lineterminator}. -\end{datadesc} - -\begin{datadesc}{QUOTE_NONNUMERIC} -Instructs \class{writer} objects to quote all non-numeric -fields. - -Instructs the reader to convert all non-quoted fields to type \var{float}. -\end{datadesc} - -\begin{datadesc}{QUOTE_NONE} -Instructs \class{writer} objects to never quote fields. When the current -\var{delimiter} occurs in output data it is preceded by the current -\var{escapechar} character. If \var{escapechar} is not set, the writer -will raise \exception{Error} if any characters that require escaping -are encountered. - -Instructs \class{reader} to perform no special processing of quote characters. 
-\end{datadesc} - - -The \module{csv} module defines the following exception: - -\begin{excdesc}{Error} -Raised by any of the functions when an error is detected. -\end{excdesc} - - -\subsection{Dialects and Formatting Parameters\label{csv-fmt-params}} - -To make it easier to specify the format of input and output records, -specific formatting parameters are grouped together into dialects. A -dialect is a subclass of the \class{Dialect} class having a set of specific -attributes and a single \method{validate()} method. When creating \class{reader} -or \class{writer} objects, the programmer can specify a string or a subclass -of the \class{Dialect} class as the dialect parameter. In addition to, or -instead of, the \var{dialect} parameter, the programmer can also specify -individual formatting parameters, which have the same names as the -attributes defined below for the \class{Dialect} class. - -Dialects support the following attributes: - -\begin{memberdesc}[Dialect]{delimiter} -A one-character string used to separate fields. It defaults to \code{','}. -\end{memberdesc} - -\begin{memberdesc}[Dialect]{doublequote} -Controls how instances of \var{quotechar} appearing inside a field should -themselves be quoted. When \constant{True}, the character is doubled. -When \constant{False}, the \var{escapechar} is used as a prefix to the -\var{quotechar}. It defaults to \constant{True}. - -On output, if \var{doublequote} is \constant{False} and no -\var{escapechar} is set, \exception{Error} is raised if a \var{quotechar} -is found in a field. -\end{memberdesc} - -\begin{memberdesc}[Dialect]{escapechar} -A one-character string used by the writer to escape the \var{delimiter} if -\var{quoting} is set to \constant{QUOTE_NONE} and the \var{quotechar} -if \var{doublequote} is \constant{False}. On reading, the \var{escapechar} -removes any special meaning from the following character. It defaults -to \constant{None}, which disables escaping. 
-\end{memberdesc} - -\begin{memberdesc}[Dialect]{lineterminator} -The string used to terminate lines produced by the \class{writer}. -It defaults to \code{'\e r\e n'}. - -\note{The \class{reader} is hard-coded to recognise either \code{'\e r'} -or \code{'\e n'} as end-of-line, and ignores \var{lineterminator}. This -behavior may change in the future.} -\end{memberdesc} - -\begin{memberdesc}[Dialect]{quotechar} -A one-character string used to quote fields containing special characters, -such as the \var{delimiter} or \var{quotechar}, or which contain new-line -characters. It defaults to \code{'"'}. -\end{memberdesc} - -\begin{memberdesc}[Dialect]{quoting} -Controls when quotes should be generated by the writer and recognised -by the reader. It can take on any of the \constant{QUOTE_*} constants -(see section~\ref{csv-contents}) and defaults to \constant{QUOTE_MINIMAL}. -\end{memberdesc} - -\begin{memberdesc}[Dialect]{skipinitialspace} -When \constant{True}, whitespace immediately following the \var{delimiter} -is ignored. The default is \constant{False}. -\end{memberdesc} - - -\subsection{Reader Objects} - -Reader objects (\class{DictReader} instances and objects returned by -the \function{reader()} function) have the following public methods: - -\begin{methoddesc}[csv reader]{next}{} -Return the next row of the reader's iterable object as a list, parsed -according to the current dialect. -\end{methoddesc} - -Reader objects have the following public attributes: - -\begin{memberdesc}[csv reader]{dialect} -A read-only description of the dialect in use by the parser. -\end{memberdesc} - -\begin{memberdesc}[csv reader]{line_num} - The number of lines read from the source iterator. This is not the same - as the number of records returned, as records can span multiple lines. 
- \versionadded{2.5} -\end{memberdesc} - - -\subsection{Writer Objects} - -\class{Writer} objects (\class{DictWriter} instances and objects returned by -the \function{writer()} function) have the following public methods. A -{}\var{row} must be a sequence of strings or numbers for \class{Writer} -objects and a dictionary mapping fieldnames to strings or numbers (by -passing them through \function{str()} first) for {}\class{DictWriter} -objects. Note that complex numbers are written out surrounded by parens. -This may cause some problems for other programs which read CSV files -(assuming they support complex numbers at all). - -\begin{methoddesc}[csv writer]{writerow}{row} -Write the \var{row} parameter to the writer's file object, formatted -according to the current dialect. -\end{methoddesc} - -\begin{methoddesc}[csv writer]{writerows}{rows} -Write all the \var{rows} parameters (a list of \var{row} objects as -described above) to the writer's file object, formatted -according to the current dialect. -\end{methoddesc} - -Writer objects have the following public attribute: - -\begin{memberdesc}[csv writer]{dialect} -A read-only description of the dialect in use by the writer. 
-\end{memberdesc} - - - -\subsection{Examples\label{csv-examples}} - -The simplest example of reading a CSV file: - -\begin{verbatim} -import csv -reader = csv.reader(open("some.csv", "rb")) -for row in reader: - print row -\end{verbatim} - -Reading a file with an alternate format: - -\begin{verbatim} -import csv -reader = csv.reader(open("passwd", "rb"), delimiter=':', quoting=csv.QUOTE_NONE) -for row in reader: - print row -\end{verbatim} - -The corresponding simplest possible writing example is: - -\begin{verbatim} -import csv -writer = csv.writer(open("some.csv", "wb")) -writer.writerows(someiterable) -\end{verbatim} - -Registering a new dialect: - -\begin{verbatim} -import csv - -csv.register_dialect('unixpwd', delimiter=':', quoting=csv.QUOTE_NONE) - -reader = csv.reader(open("passwd", "rb"), 'unixpwd') -\end{verbatim} - -A slightly more advanced use of the reader --- catching and reporting errors: - -\begin{verbatim} -import csv, sys -filename = "some.csv" -reader = csv.reader(open(filename, "rb")) -try: - for row in reader: - print row -except csv.Error, e: - sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e)) -\end{verbatim} - -And while the module doesn't directly support parsing strings, it can -easily be done: - -\begin{verbatim} -import csv -for row in csv.reader(['one,two,three']): - print row -\end{verbatim} - -The \module{csv} module doesn't directly support reading and writing -Unicode, but it is 8-bit-clean save for some problems with \ASCII{} NUL -characters. So you can write functions or classes that handle the -encoding and decoding for you as long as you avoid encodings like -UTF-16 that use NULs. UTF-8 is recommended. - -\function{unicode_csv_reader} below is a generator that wraps -\class{csv.reader} to handle Unicode CSV data (a list of Unicode -strings). \function{utf_8_encoder} is a generator that encodes the -Unicode strings as UTF-8, one string (or row) at a time. 
The encoded -strings are parsed by the CSV reader, and -\function{unicode_csv_reader} decodes the UTF-8-encoded cells back -into Unicode: - -\begin{verbatim} -import csv - -def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs): - # csv.py doesn't do Unicode; encode temporarily as UTF-8: - csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), - dialect=dialect, **kwargs) - for row in csv_reader: - # decode UTF-8 back to Unicode, cell by cell: - yield [unicode(cell, 'utf-8') for cell in row] - -def utf_8_encoder(unicode_csv_data): - for line in unicode_csv_data: - yield line.encode('utf-8') -\end{verbatim} - -For all other encodings the following \class{UnicodeReader} and -\class{UnicodeWriter} classes can be used. They take an additional -\var{encoding} parameter in their constructor and make sure that the data -passes the real reader or writer encoded as UTF-8: - -\begin{verbatim} -import csv, codecs, cStringIO - -class UTF8Recoder: - """ - Iterator that reads an encoded stream and reencodes the input to UTF-8 - """ - def __init__(self, f, encoding): - self.reader = codecs.getreader(encoding)(f) - - def __iter__(self): - return self - - def next(self): - return self.reader.next().encode("utf-8") - -class UnicodeReader: - """ - A CSV reader which will iterate over lines in the CSV file "f", - which is encoded in the given encoding. - """ - - def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): - f = UTF8Recoder(f, encoding) - self.reader = csv.reader(f, dialect=dialect, **kwds) - - def next(self): - row = self.reader.next() - return [unicode(s, "utf-8") for s in row] - - def __iter__(self): - return self - -class UnicodeWriter: - """ - A CSV writer which will write rows to CSV file "f", - which is encoded in the given encoding. 
- """ - - def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): - # Redirect output to a queue - self.queue = cStringIO.StringIO() - self.writer = csv.writer(self.queue, dialect=dialect, **kwds) - self.stream = f - self.encoder = codecs.getincrementalencoder(encoding)() - - def writerow(self, row): - self.writer.writerow([s.encode("utf-8") for s in row]) - # Fetch UTF-8 output from the queue ... - data = self.queue.getvalue() - data = data.decode("utf-8") - # ... and reencode it into the target encoding - data = self.encoder.encode(data) - # write to the target stream - self.stream.write(data) - # empty queue - self.queue.truncate(0) - - def writerows(self, rows): - for row in rows: - self.writerow(row) -\end{verbatim} |