From 86821b2563915e4f11cde045da8ebe01beffa634 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sun, 7 Jan 2001 21:19:34 +0000 Subject: MS Win32 .readline() speedup, as discussed on Python-Dev. This is a tricky variant that never needs to "search from the right". Also fixed unlikely memory leak in get_line, if string size overflows INTMAX. Also new std test test_bufio to make sure .readline() works. --- Lib/test/output/test_bufio | 1 + Lib/test/test_bufio.py | 60 ++++++++++++++ Objects/fileobject.c | 199 +++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 245 insertions(+), 15 deletions(-) create mode 100644 Lib/test/output/test_bufio create mode 100644 Lib/test/test_bufio.py diff --git a/Lib/test/output/test_bufio b/Lib/test/output/test_bufio new file mode 100644 index 0000000..c153797 --- /dev/null +++ b/Lib/test/output/test_bufio @@ -0,0 +1 @@ +test_bufio diff --git a/Lib/test/test_bufio.py b/Lib/test/test_bufio.py new file mode 100644 index 0000000..1a820d4 --- /dev/null +++ b/Lib/test/test_bufio.py @@ -0,0 +1,60 @@ +from test_support import TestFailed, TESTFN + +# Simple test to ensure that optimizations in fileobject.c deliver +# the expected results. For best testing, run this under a debug-build +# Python too (to exercise asserts in the C code). + +# Repeat string 'pattern' as often as needed (plus a leading slice for +# the remainder) to reach total length 'length'. Then call try_one with +# that string, that string plus "x", and that string minus its last +# character. The main driver feeds this all small sizes and various +# powers of 2, so we exercise all likely stdio buffer sizes, and "off +# by one" errors on both sides. +def drive_one(pattern, length): + q, r = divmod(length, len(pattern)) + teststring = pattern * q + pattern[:r] + assert len(teststring) == length + try_one(teststring) + try_one(teststring + "x") + try_one(teststring[:-1]) + +# Write s + "\n" + s to file, then open it and ensure that successive +# .readline()s deliver what we wrote. 
+def try_one(s): + # Since C doesn't guarantee we can write/read arbitrary bytes in text + # files, use binary mode. + f = open(TESTFN, "wb") + # write once with \n and once without + f.write(s) + f.write("\n") + f.write(s) + f.close() + f = open(TESTFN, "rb") + line = f.readline() + if line != s + "\n": + raise TestFailed("Expected %r got %r" % (s + "\n", line)) + line = f.readline() + if line != s: + raise TestFailed("Expected %r got %r" % (s, line)) + line = f.readline() + if line: + raise TestFailed("Expected EOF but got %r" % line) + f.close() + +# A pattern with prime length, to avoid simple relationships with +# stdio buffer sizes. +primepat = "1234567890\00\01\02\03\04\05\06\07" + +nullpat = "\0" * 1000 + +try: + for size in range(1, 257) + [512, 1000, 1024, 2048, 4096, 8192, 10000, + 16384, 32768, 65536, 1000000]: + drive_one(primepat, size) + drive_one(nullpat, size) +finally: + try: + import os + os.unlink(TESTFN) + except: + pass diff --git a/Objects/fileobject.c b/Objects/fileobject.c index aaa6a90..5dd2460 100644 --- a/Objects/fileobject.c +++ b/Objects/fileobject.c @@ -244,7 +244,7 @@ file_close(PyFileObject *f, PyObject *args) /* a portable fseek() function return 0 on success, non-zero on failure (with errno set) */ int -#if defined(HAVE_LARGEFILE_SUPPORT) && SIZEOF_OFF_T < 8 && SIZEOF_FPOS_T >= 8 +#if defined(HAVE_LARGEFILE_SUPPORT) && SIZEOF_OFF_T < 8 && SIZEOF_FPOS_T >= 8 _portable_fseek(FILE *fp, fpos_t offset, int whence) #else _portable_fseek(FILE *fp, off_t offset, int whence) @@ -256,7 +256,7 @@ _portable_fseek(FILE *fp, off_t offset, int whence) return fseek64(fp, offset, whence); #elif defined(__BEOS__) return _fseek(fp, offset, whence); -#elif defined(HAVE_LARGEFILE_SUPPORT) && SIZEOF_FPOS_T >= 8 +#elif defined(HAVE_LARGEFILE_SUPPORT) && SIZEOF_FPOS_T >= 8 /* lacking a 64-bit capable fseek() (as Win64 does) use a 64-bit capable fsetpos() and tell() to implement fseek()*/ fpos_t pos; @@ -287,7 +287,7 @@ _portable_fseek(FILE *fp, off_t 
offset, int whence) /* a portable ftell() function Return -1 on failure with errno set appropriately, current file position on success */ -#if defined(HAVE_LARGEFILE_SUPPORT) && SIZEOF_OFF_T < 8 && SIZEOF_FPOS_T >= 8 +#if defined(HAVE_LARGEFILE_SUPPORT) && SIZEOF_OFF_T < 8 && SIZEOF_FPOS_T >= 8 fpos_t #else off_t @@ -314,13 +314,13 @@ file_seek(PyFileObject *f, PyObject *args) { int whence; int ret; -#if defined(HAVE_LARGEFILE_SUPPORT) && SIZEOF_OFF_T < 8 && SIZEOF_FPOS_T >= 8 +#if defined(HAVE_LARGEFILE_SUPPORT) && SIZEOF_OFF_T < 8 && SIZEOF_FPOS_T >= 8 fpos_t offset, pos; #else off_t offset; #endif /* !MS_WIN64 */ PyObject *offobj; - + if (f->f_fp == NULL) return err_closed(); whence = 0; @@ -334,7 +334,7 @@ file_seek(PyFileObject *f, PyObject *args) #endif if (PyErr_Occurred()) return NULL; - + Py_BEGIN_ALLOW_THREADS errno = 0; ret = _portable_fseek(f->f_fp, offset, whence); @@ -355,13 +355,13 @@ static PyObject * file_truncate(PyFileObject *f, PyObject *args) { int ret; -#if defined(HAVE_LARGEFILE_SUPPORT) && SIZEOF_OFF_T < 8 && SIZEOF_FPOS_T >= 8 +#if defined(HAVE_LARGEFILE_SUPPORT) && SIZEOF_OFF_T < 8 && SIZEOF_FPOS_T >= 8 fpos_t newsize; #else off_t newsize; #endif PyObject *newsizeobj; - + if (f->f_fp == NULL) return err_closed(); newsizeobj = NULL; @@ -416,7 +416,7 @@ file_truncate(PyFileObject *f, PyObject *args) Py_END_ALLOW_THREADS if (ret != 0) goto onioerror; #endif /* !MS_WIN32 */ - + Py_INCREF(Py_None); return Py_None; @@ -430,7 +430,7 @@ onioerror: static PyObject * file_tell(PyFileObject *f, PyObject *args) { -#if defined(HAVE_LARGEFILE_SUPPORT) && SIZEOF_OFF_T < 8 && SIZEOF_FPOS_T >= 8 +#if defined(HAVE_LARGEFILE_SUPPORT) && SIZEOF_OFF_T < 8 && SIZEOF_FPOS_T >= 8 fpos_t pos; #else off_t pos; @@ -470,7 +470,7 @@ static PyObject * file_flush(PyFileObject *f, PyObject *args) { int res; - + if (f->f_fp == NULL) return err_closed(); if (!PyArg_NoArgs(args)) @@ -559,7 +559,7 @@ file_read(PyFileObject *f, PyObject *args) long bytesrequested = -1; size_t 
bytesread, buffersize, chunksize; PyObject *v; - + if (f->f_fp == NULL) return err_closed(); if (!PyArg_ParseTuple(args, "|l:read", &bytesrequested)) @@ -610,7 +610,7 @@ file_readinto(PyFileObject *f, PyObject *args) { char *ptr; size_t ntodo, ndone, nnow; - + if (f->f_fp == NULL) return err_closed(); if (!PyArg_Parse(args, "w#", &ptr, &ntodo)) @@ -634,6 +634,170 @@ file_readinto(PyFileObject *f, PyObject *args) return PyInt_FromLong((long)ndone); } +/************************************************************************** +Win32 MS routine to get next line. + +Under MSVC 6: + ++ MS threadsafe getc is very slow (multiple layers of function calls + before+after each character, to lock+unlock the stream). ++ The stream-locking functions are MS-internal -- can't access them + from user code. ++ There's nothing Tim could find in the MS C or platform SDK libraries + that can worm around this. ++ MS fgets locks/unlocks only once per line; it's the only hook we have. + +So we use fgets for speed(!), despite that it's painful. + +MS realloc is also slow. + +In the usual case, we have one pleasantly small line already sitting in a +stdio buffer, and we optimize heavily for that case. + +CAUTION: This routine cheats, relying on how MSVC 6 works internally. +They seem to be relatively safe cheats, but we should expect this code +to break someday. 
+**************************************************************************/ + +/* if Win32 and MS's compiler */ +#if defined(MS_WIN32) && defined(_MSC_VER) +#define USE_MS_GETLINE_HACK +#endif + +#ifdef USE_MS_GETLINE_HACK /* ms_getline_hack: read the next line from fp with fgets; returns a new string object (keeping the trailing \n if the line has one, empty if nothing could be read), or NULL on failure */ +static PyObject* +ms_getline_hack(FILE *fp) +{ +#define INITBUFSIZE 100 /* size of the on-stack fast-path buffer; also the initial heap size on the slow path */ +#define INCBUFSIZE 1000 /* how much the heap buffer grows each time the line doesn't fit */ + PyObject* v; /* the string object result */ + size_t total_v_size; /* total # chars in v's buffer */ + char* pvfree; /* address of next free slot */ + char* pvend; /* address one beyond last free slot */ + char* p; /* scratch: fgets result, then memchr hit; in the loop it ends up one beyond the line's last char */ + + if (fp->_cnt > 0) { /* HACK: "_cnt" isn't advertised */ + /* optimize for normal case: something sitting in the + * buffer ready to go; avoid thread fiddling & realloc + * if possible + */ + char msbuf[INITBUFSIZE]; + memset(msbuf, '\n', INITBUFSIZE); /* prefill with \n so a \n written by fgets can be told apart from ours (see below) */ + p = fgets(msbuf, INITBUFSIZE, fp); + /* since we didn't lock the file, there's no guarantee + * anything was still in the buffer + */ + if (p == NULL) { + clearerr(fp); + if (PyErr_CheckSignals()) + return NULL; + v = PyString_FromStringAndSize("", 0); + return v; + } + /* fgets read *something* */ + p = memchr(msbuf, '\n', INITBUFSIZE); + if (p != NULL) { + /* Did the \n come from fgets or from us? + * Since fgets stops at the first \n, and then + * writes \0, if it's from fgets a \0 must be next. + * But if that's so, it could not have come from us, + * since the \n's we filled the buffer with have only + * more \n's to the right. + */ + pvend = msbuf + INITBUFSIZE; + if (p+1 < pvend && *(p+1) == '\0') { + /* it's from fgets: we win! */ + v = PyString_FromStringAndSize(msbuf, + p - msbuf + 1); + return v; + } + /* Must be from us: fgets didn't fill the buffer + * and didn't find a newline, so it must be the + * last and newline-free line of the file. + */ + assert(p > msbuf && *(p-1) == '\0'); + v = PyString_FromStringAndSize(msbuf, p - msbuf - 1); /* length excludes the \0 fgets wrote just before p */ + return v; + } + /* yuck: fgets overwrote all the newlines, i.e. the entire + * buffer. 
So this line isn't over yet, or maybe it is but + * we're exactly at EOF; in either case, carry on in the general loop below. + */ + assert(msbuf[INITBUFSIZE-1] == '\0'); + total_v_size = INITBUFSIZE + INCBUFSIZE; + v = PyString_FromStringAndSize((char*)NULL, + (int)total_v_size); + if (v == NULL) + return v; + /* copy over everything except the last null byte */ + memcpy(BUF(v), msbuf, INITBUFSIZE-1); + pvfree = BUF(v) + INITBUFSIZE - 1; + } + else { + /* The stream isn't ready or isn't buffered. */ + v = PyString_FromStringAndSize((char*)NULL, INITBUFSIZE); + if (v == NULL) + return v; + total_v_size = INITBUFSIZE; + pvfree = BUF(v); + } + + /* Keep reading stuff into v; if it ever ends successfully, break + * after setting p one beyond the end of the line. + */ + for (;;) { + size_t nfree; /* # of unused bytes from pvfree up to pvend */ + + Py_BEGIN_ALLOW_THREADS + pvend = BUF(v) + total_v_size; + nfree = pvend - pvfree; + memset(pvfree, '\n', nfree); + p = fgets(pvfree, nfree, fp); + Py_END_ALLOW_THREADS + + if (p == NULL) { + clearerr(fp); + if (PyErr_CheckSignals()) { + Py_DECREF(v); + return NULL; + } + p = pvfree; /* nothing more was read: the line ends where this chunk would have started */ + break; + } + /* See the "normal case" comments above for details. 
*/ + p = memchr(pvfree, '\n', nfree); + if (p != NULL) { + if (p+1 < pvend && *(p+1) == '\0') { + /* \n came from fgets; step past it so it is kept in the result */ + ++p; + break; + } + /* \n came from us; last line of file, no newline */ + assert(p > pvfree && *(p-1) == '\0'); + --p; /* back up onto the \0 fgets wrote, so it is excluded from the result */ + break; + } + /* no \n anywhere, so fgets filled all the free space: expand buffer and try again */ + assert(*(pvend-1) == '\0'); + total_v_size += INCBUFSIZE; + if (total_v_size > INT_MAX) { + PyErr_SetString(PyExc_OverflowError, + "line is longer than a Python string can hold"); + Py_DECREF(v); + return NULL; + } + if (_PyString_Resize(&v, (int)total_v_size) < 0) /* NOTE(review): no Py_DECREF(v) on this path, presumably because _PyString_Resize disposes of v on failure -- confirm */ + return NULL; + /* resume at the old buffer's last byte, so the next chunk overwrites the trailing null byte */ + pvfree = BUF(v) + (total_v_size - INCBUFSIZE - 1); + } + if (BUF(v) + total_v_size != p) + _PyString_Resize(&v, p - BUF(v)); /* trim v to the line's actual length */ + return v; +#undef INITBUFSIZE +#undef INCBUFSIZE +} +#endif /* ifdef USE_MS_GETLINE_HACK */ /* Internal routine to get a line. Size argument interpretation: @@ -661,6 +825,10 @@ get_line(PyFileObject *f, int n) size_t n1, n2; PyObject *v; +#ifdef USE_MS_GETLINE_HACK + if (n == 0) + return ms_getline_hack(fp); +#endif n2 = n > 0 ? n : 100; v = PyString_FromStringAndSize((char *)NULL, n2); if (v == NULL) @@ -695,6 +863,7 @@ get_line(PyFileObject *f, int n) if (n2 > INT_MAX) { PyErr_SetString(PyExc_OverflowError, "line is longer than a Python string can hold"); + Py_DECREF(v); return NULL; } if (_PyString_Resize(&v, n2) < 0) @@ -999,7 +1168,7 @@ file_writelines(PyFileObject *f, PyObject *args) if (!PyString_Check(v)) { const char *buffer; int len; - if (((f->f_binary && + if (((f->f_binary && PyObject_AsReadBuffer(v, (const void**)&buffer, &len)) || @@ -1255,7 +1424,7 @@ int PyObject_AsFileDescriptor(PyObject *o) Py_DECREF(meth); if (fno == NULL) return -1; - + if (PyInt_Check(fno)) { fd = PyInt_AsLong(fno); Py_DECREF(fno); -- cgit v0.12