From 66913e221312e38cc542896d4db9b45720a20672 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Fri, 6 Mar 2009 23:40:56 +0000
Subject: Issue #5433: Excessive newline detection optimization in IncrementalNewlineDecoder

---
 Lib/test/test_io.py | 13 +++++++++++++
 Modules/_textio.c   | 42 ++++++++++++++++++++++++++++++------------
 2 files changed, 43 insertions(+), 12 deletions(-)

diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
index 3189f9c..5fc53ea 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -1915,6 +1915,19 @@ class IncrementalNewlineDecoderTest(unittest.TestCase):
         decoder = self.IncrementalNewlineDecoder(decoder, translate=True)
         self.check_newline_decoding_utf8(decoder)
 
+    def test_newline_bytes(self):
+        # Issue 5433: Excessive optimization in IncrementalNewlineDecoder
+        def _check(dec):
+            self.assertEquals(dec.newlines, None)
+            self.assertEquals(dec.decode("\u0D00"), "\u0D00")
+            self.assertEquals(dec.newlines, None)
+            self.assertEquals(dec.decode("\u0A00"), "\u0A00")
+            self.assertEquals(dec.newlines, None)
+        dec = self.IncrementalNewlineDecoder(None, translate=False)
+        _check(dec)
+        dec = self.IncrementalNewlineDecoder(None, translate=True)
+        _check(dec)
+
 class CIncrementalNewlineDecoderTest(IncrementalNewlineDecoderTest):
     pass
 
diff --git a/Modules/_textio.c b/Modules/_textio.c
index 145f8ea..dbed2fd 100644
--- a/Modules/_textio.c
+++ b/Modules/_textio.c
@@ -305,22 +305,40 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self,
            for the \r *byte* with the libc's optimized memchr.
            */
         if (seennl == SEEN_LF || seennl == 0) {
-            int has_cr, has_lf;
-            has_lf = (seennl == SEEN_LF) ||
-                    (memchr(in_str, '\n', len * sizeof(Py_UNICODE)) != NULL);
-            has_cr = (memchr(in_str, '\r', len * sizeof(Py_UNICODE)) != NULL);
-            if (has_lf && !has_cr) {
-                only_lf = 1;
-                seennl = SEEN_LF;
-            }
+            only_lf = !(memchr(in_str, '\r', len * sizeof(Py_UNICODE)) != NULL);
         }
 
-        if (!self->translate) {
+        if (only_lf) {
+            /* If not already seen, quick scan for a possible "\n" character.
+               (there's nothing else to be done, even when in translation mode)
+            */
+            if (seennl == 0 &&
+                memchr(in_str, '\n', len * sizeof(Py_UNICODE)) != NULL) {
+                Py_UNICODE *s, *end;
+                s = in_str;
+                end = in_str + len;
+                for (;;) {
+                    Py_UNICODE c;
+                    /* Fast loop for non-control characters */
+                    while (*s > '\n')
+                        s++;
+                    c = *s++;
+                    if (c == '\n') {
+                        seennl |= SEEN_LF;
+                        break;
+                    }
+                    if (s > end)
+                        break;
+                }
+            }
+            /* Finished: we have scanned for newlines, and none of them
+               need translating */
+        }
+        else if (!self->translate) {
             Py_UNICODE *s, *end;
+            /* We have already seen all newline types, no need to scan again */
             if (seennl == SEEN_ALL)
                 goto endscan;
-            if (only_lf)
-                goto endscan;
             s = in_str;
             end = in_str + len;
             for (;;) {
@@ -347,7 +365,7 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self,
         endscan:
             ;
         }
-        else if (!only_lf) {
+        else {
             PyObject *translated = NULL;
             Py_UNICODE *out_str;
             Py_UNICODE *in, *out, *end;
-- 
cgit v0.12