summaryrefslogtreecommitdiffstats
path: root/Doc/lib/libmultifile.tex
blob: aa81d4a10600628f4f7ea1fc745cd1a10bcb08d2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
\section{\module{multifile} ---
         Support for files containing distinct parts}

\declaremodule{standard}{multifile}
\modulesynopsis{Support for reading files which contain distinct
                parts, such as some MIME data.}
\sectionauthor{Eric S. Raymond}{esr@snark.thyrsus.com}


The \class{MultiFile} object enables you to treat sections of a text
file as file-like input objects, with \code{''} being returned by
\method{readline()} when a given delimiter pattern is encountered.  The
defaults of this class are designed to make it useful for parsing
MIME multipart messages, but by subclassing it and overriding methods 
it can be easily adapted for more general use.

\begin{classdesc}{MultiFile}{fp\optional{, seekable}}
Create a multi-file.  You must instantiate this class with an input
object argument for the \class{MultiFile} instance to get lines from,
such as a file object returned by \function{open()}.

\class{MultiFile} only ever looks at the input object's
\method{readline()}, \method{seek()} and \method{tell()} methods, and
the latter two are only needed if you want random access to the
individual MIME parts. To use \class{MultiFile} on a non-seekable
stream object, set the optional \var{seekable} argument to false; this
will prevent using the input object's \method{seek()} and
\method{tell()} methods.
\end{classdesc}

It will be useful to know that in \class{MultiFile}'s view of the world, text
is composed of three kinds of lines: data, section-dividers, and
end-markers.  \class{MultiFile} is designed to support parsing of
messages that may have multiple nested message parts, each with its
own pattern for section-divider and end-marker lines.

\begin{seealso}
  \seemodule{email}{Comprehensive email handling package; supersedes
                    the \module{multifile} module.}
\end{seealso}


\subsection{MultiFile Objects \label{MultiFile-objects}}

A \class{MultiFile} instance has the following methods:

\begin{methoddesc}{readline}{}
Read a line.  If the line is data (not a section-divider or end-marker
or real EOF) return it.  If the line matches the most-recently-stacked
boundary, return \code{''} and set \code{self.last} to 1 if the match
is an end-marker and to 0 otherwise.  If the line matches any other
stacked boundary, raise an error.  On encountering end-of-file on the
underlying stream object, the method raises \exception{Error} unless
all boundaries have been popped.
\end{methoddesc}

\begin{methoddesc}{readlines}{}
Return all lines remaining in this part as a list of strings.
\end{methoddesc}

\begin{methoddesc}{read}{}
Read all lines, up to the next section.  Return them as a single
(multiline) string.  Note that this doesn't take a size argument!
\end{methoddesc}

\begin{methoddesc}{seek}{pos\optional{, whence}}
Seek.  Seek indices are relative to the start of the current section.
The \var{pos} and \var{whence} arguments are interpreted as for a file
seek.
\end{methoddesc}

\begin{methoddesc}{tell}{}
Return the file position relative to the start of the current section.
\end{methoddesc}

\begin{methoddesc}{next}{}
Skip lines to the next section (that is, read lines until a
section-divider or end-marker has been consumed).  Return true if
there is such a section, false if an end-marker is seen.  Re-enable
the most-recently-pushed boundary.
\end{methoddesc}

\begin{methoddesc}{is_data}{str}
Return true if \var{str} is data and false if it might be a section
boundary.  As written, it tests for a prefix other than \code{'-}\code{-'} at
start of line (which all MIME boundaries have) but it is declared so
it can be overridden in derived classes.

Note that this test is intended as a fast guard for the real
boundary tests; if it always returns false it will merely slow
processing, not cause it to fail.
\end{methoddesc}

\begin{methoddesc}{push}{str}
Push a boundary string.  When a decorated version of this boundary 
is found as an input line, it will be interpreted as a section-divider 
or end-marker (depending on the decoration, see \rfc{2045}).  All subsequent
reads will return the empty string to indicate end-of-file, until a
call to \method{pop()} removes the boundary or a \method{next()} call
re-enables it.

It is possible to push more than one boundary.  Encountering the
most-recently-pushed boundary will return EOF; encountering any other
boundary will raise an error.
\end{methoddesc}

\begin{methoddesc}{pop}{}
Pop a section boundary.  This boundary will no longer be interpreted
as EOF.
\end{methoddesc}

\begin{methoddesc}{section_divider}{str}
Turn a boundary into a section-divider line.  By default, this
method prepends \code{'-}\code{-'} (which MIME section boundaries have) but
it is declared so it can be overridden in derived classes.  This
method need not append LF or CR-LF, as comparison with the result
ignores trailing whitespace. 
\end{methoddesc}

\begin{methoddesc}{end_marker}{str}
Turn a boundary string into an end-marker line.  By default, this
method prepends \code{'-}\code{-'} and appends \code{'-}\code{-'} (like a
MIME-multipart end-of-message marker) but it is declared so it can be
overridden in derived classes.  This method need not append LF or
CR-LF, as comparison with the result ignores trailing whitespace.
\end{methoddesc}

Finally, \class{MultiFile} instances have two public instance variables:

\begin{memberdesc}{level}
Nesting depth of the current part.
\end{memberdesc}

\begin{memberdesc}{last}
True if the last end-of-file was for an end-of-message marker. 
\end{memberdesc}


\subsection{\class{MultiFile} Example \label{multifile-example}}
\sectionauthor{Skip Montanaro}{skip@mojam.com}

\begin{verbatim}
import mimetools
import multifile
import StringIO

def extract_mime_part_matching(stream, mimetype):
    """Return the first element in a multipart MIME message on stream
    matching mimetype.

    stream is a file-like object positioned at the start of the
    message headers.  The decoded body of the first part whose content
    type equals mimetype is returned as a string.  An empty string is
    returned if the message is not multipart, or if no part with the
    requested type could be decoded."""

    msg = mimetools.Message(stream)

    # Only a real match may set this; a non-matching or undecodable
    # part must never be handed back to the caller.
    result = ""
    if msg.gettype()[:10] == "multipart/":

        parts = multifile.MultiFile(stream)
        parts.push(msg.getparam("boundary"))
        while parts.next():
            submsg = mimetools.Message(parts)
            # Fresh buffer per part, so nothing from an earlier part
            # can leak into this one.
            data = StringIO.StringIO()
            try:
                mimetools.decode(parts, data, submsg.getencoding())
            except ValueError:
                # decode() rejected this part; try the next one.
                continue
            if submsg.gettype() == mimetype:
                result = data.getvalue()
                break
        parts.pop()
    return result
\end{verbatim}