Implemented posix-mode parsing support in shlex.py, as discussed in

mailing list, and in patch #722686.
author: Gustavo Niemeyer <gustavo@niemeyer.net> 2003-04-17 21:31:33 (GMT)
committer: Gustavo Niemeyer <gustavo@niemeyer.net> 2003-04-17 21:31:33 (GMT)
commit: 68d8cef89a307bafc752da68dce078306bc51352 (patch)
tree: a1a740dce058de6a3810ceb011675ab57dd900c6 /Doc/lib/libshlex.tex
parent: 84c2b1b9aa3a596b597d37e6258c790987e50963 (diff)
download: cpython-68d8cef89a307bafc752da68dce078306bc51352.zip
cpython-68d8cef89a307bafc752da68dce078306bc51352.tar.gz
cpython-68d8cef89a307bafc752da68dce078306bc51352.tar.bz2
1 files changed, 113 insertions, 23 deletions
diff --git a/Doc/lib/libshlex.tex b/Doc/lib/libshlex.tex
index eecded7..c0c4e56 100644
--- a/Doc/lib/libshlex.tex
+++ b/Doc/lib/libshlex.tex
@@ -4,26 +4,16 @@
 \declaremodule{standard}{shlex}
 \modulesynopsis{Simple lexical analysis for \UNIX\ shell-like languages.}
 \moduleauthor{Eric S. Raymond}{esr@snark.thyrsus.com}
+\moduleauthor{Gustavo Niemeyer}{niemeyer@conectiva.com}
 \sectionauthor{Eric S. Raymond}{esr@snark.thyrsus.com}
+\sectionauthor{Gustavo Niemeyer}{niemeyer@conectiva.com}
 
 \versionadded{1.5.2}
 
 The \class{shlex} class makes it easy to write lexical analyzers for
 simple syntaxes resembling that of the \UNIX{} shell.  This will often
-be useful for writing minilanguages, e.g.\ in run control files for
-Python applications.
-
-\begin{classdesc}{shlex}{\optional{stream\optional{, file}}}
-A \class{shlex} instance or subclass instance is a lexical analyzer
-object.  The initialization argument, if present, specifies where to
-read characters from. It must be a file- or stream-like object with
-\method{read()} and \method{readline()} methods.  If no argument is given,
-input will be taken from \code{sys.stdin}.  The second optional 
-argument is a filename string, which sets the initial value of the
-\member{infile} member.  If the stream argument is omitted or
-equal to \code{sys.stdin}, this second argument defaults to ``stdin''.
-\end{classdesc}
-
+be useful for writing minilanguages (e.g.\ in run control files for
+Python applications) or for parsing quoted strings.
 
 \begin{seealso}
   \seemodule{ConfigParser}{Parser for configuration files similar to the
@@ -31,16 +21,50 @@ equal to \code{sys.stdin}, this second argument defaults to ``stdin''.
 \end{seealso}
 
 
+\subsection{Module Contents}
+
+The \module{shlex} module defines the following functions:
+
+\begin{funcdesc}{split}{s\optional{, posix=\code{True}\optional{,
+			spaces=\code{True}}}}
+Split the string \var{s} using shell-like syntax. If \var{posix} is
+\code{True}, operate in posix mode. If \var{spaces} is \code{True}, words
+will only be split on whitespace (setting the
+\member{whitespace_split} member of the \class{shlex} instance).
+\versionadded{2.3}
+\end{funcdesc}
+
+The \module{shlex} module defines the following classes:
+
+\begin{classdesc}{shlex}{\optional{instream=\code{sys.stdin}\optional{,
+			 infile=\code{None}\optional{,
+			 posix=\code{False}}}}}
+A \class{shlex} instance or subclass instance is a lexical analyzer
+object.  The initialization argument, if present, specifies where to
+read characters from. It must be a file-/stream-like object with
+\method{read()} and \method{readline()} methods, or a string (strings
+are accepted since Python 2.3). If no argument is given, input will be
+taken from \code{sys.stdin}.  The second optional argument is a filename
+string, which sets the initial value of the \member{infile} member.  If
+the \var{instream} argument is omitted or equal to \code{sys.stdin},
+this second argument defaults to ``stdin''.  The \var{posix} argument
+was introduced in Python 2.3, and defines the operational mode. When
+\var{posix} is not true (default), the \class{shlex} instance will
+operate in compatibility mode. When operating in posix mode,
+\class{shlex} will try to be as close as possible to the posix shell
+parsing rules. See~\ref{shlex-objects}.
+\end{classdesc}
+
 \subsection{shlex Objects \label{shlex-objects}}
 
 A \class{shlex} instance has the following methods:
 
-
 \begin{methoddesc}{get_token}{}
 Return a token.  If tokens have been stacked using
 \method{push_token()}, pop a token off the stack.  Otherwise, read one
 from the input stream.  If reading encounters an immediate
-end-of-file, an empty string is returned. 
+end-of-file, \member{self.eof} is returned (the empty string (\code{""})
+in non-posix mode, and \code{None} in posix mode).
 \end{methoddesc}
 
 \begin{methoddesc}{push_token}{str}
@@ -132,6 +156,12 @@ bounds tokens.  By default, includes space, tab, linefeed and
 carriage-return.
 \end{memberdesc}
 
+\begin{memberdesc}{escape}
+Characters that will be considered escape characters. This is only used
+in posix mode, and includes just \character{\textbackslash} by default.
+\versionadded{2.3}
+\end{memberdesc}
+
 \begin{memberdesc}{quotes}
 Characters that will be considered string quotes.  The token
 accumulates until the same quote is encountered again (thus, different
@@ -139,6 +169,20 @@ quote types protect each other as in the shell.)  By default, includes
 \ASCII{} single and double quotes.
 \end{memberdesc}
 
+\begin{memberdesc}{escapedquotes}
+Characters in \member{quotes} that will interpret escape characters
+defined in \member{escape}. This is only used in posix mode, and includes
+just \character{"} by default.
+\versionadded{2.3}
+\end{memberdesc}
+
+\begin{memberdesc}{whitespace_split}
+If true, tokens will only be split on whitespace. This is useful, for
+example, for parsing command lines with \class{shlex}, getting tokens
+in a similar way to shell arguments.
+\versionadded{2.3}
+\end{memberdesc}
+
 \begin{memberdesc}{infile}
 The name of the current input file, as initially set at class
 instantiation time or stacked by later source requests.  It may
@@ -168,13 +212,6 @@ need to use this, you can read the module source code to learn the
 details.
 \end{memberdesc}
 
-Note that any character not declared to be a word character,
-whitespace, or a quote will be returned as a single-character token.
-
-Quote and comment characters are not recognized within words.  Thus,
-the bare words \samp{ain't} and \samp{ain\#t} would be returned as single
-tokens by the default parser.
-
 \begin{memberdesc}{lineno}
 Source line number (count of newlines seen so far plus one).
 \end{memberdesc}
@@ -183,3 +220,56 @@ Source line number (count of newlines seen so far plus one).
 The token buffer.  It may be useful to examine this when catching
 exceptions.
 \end{memberdesc}
+
+\begin{memberdesc}{eof}
+Token used to determine end of file. This will be set to the empty
+string (\code{""}) in non-posix mode, and to \code{None} in posix
+mode.
+\versionadded{2.3}
+\end{memberdesc}
+
+\subsection{Parsing Rules\label{shlex-parsing-rules}}
+
+When operating in non-posix mode, \class{shlex} will try to obey the
+following rules.
+
+\begin{itemize}
+\item Quote characters are not recognized within words
+      (\code{Do"Not"Separate} is parsed as the single word
+      \code{Do"Not"Separate});
+\item Escape characters are not recognized;
+\item Enclosing characters in quotes preserve the literal value of
+      all characters within the quotes;
+\item Closing quotes separate words (\code{"Do"Separate} is parsed
+      as \code{"Do"} and \code{Separate});
+\item If \member{whitespace_split} is \code{False}, any character not
+      declared to be a word character, whitespace, or a quote will be
+      returned as a single-character token. If it is \code{True},
+      \class{shlex} will only split words on whitespace;
+\item EOF is signaled with an empty string (\code{""});
+\item It's not possible to parse empty strings, even if quoted.
+\end{itemize}
+
+When operating in posix mode, \class{shlex} will try to obey the
+following parsing rules.
+
+\begin{itemize}
+\item Quotes are stripped out, and do not separate words
+      (\code{"Do"Not"Separate"} is parsed as the single word
+      \code{DoNotSeparate});
+\item Non-quoted escape characters (e.g.\ \character{\textbackslash})
+      preserve the literal value of the next character that follows;
+\item Enclosing characters in quotes which are not part of
+      \member{escapedquotes} (e.g.\ \character{'}) preserve the literal
+      value of all characters within the quotes;
+\item Enclosing characters in quotes which are part of
+      \member{escapedquotes} (e.g.\ \character{"}) preserve the literal
+      value of all characters within the quotes, with the exception of
+      the characters mentioned in \member{escape}. The escape characters
+      retain their special meaning only when followed by the quote in use,
+      or the escape character itself. Otherwise the escape character
+      will be considered a normal character;
+\item EOF is signaled with a \code{None} value;
+\item Quoted empty strings (\code{""}) are allowed.
+\end{itemize}
+
author	Gustavo Niemeyer <gustavo@niemeyer.net>	2003-04-17 21:31:33 (GMT)
committer	Gustavo Niemeyer <gustavo@niemeyer.net>	2003-04-17 21:31:33 (GMT)
commit	68d8cef89a307bafc752da68dce078306bc51352 (patch)
tree	a1a740dce058de6a3810ceb011675ab57dd900c6 /Doc/lib/libshlex.tex
parent	84c2b1b9aa3a596b597d37e6258c790987e50963 (diff)
download	cpython-68d8cef89a307bafc752da68dce078306bc51352.zip cpython-68d8cef89a307bafc752da68dce078306bc51352.tar.gz cpython-68d8cef89a307bafc752da68dce078306bc51352.tar.bz2