summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Tim Peters <tim.peters@gmail.com>, 2001-01-15 10:36:56 (GMT)
committer: Tim Peters <tim.peters@gmail.com>, 2001-01-15 10:36:56 (GMT)
commit: 142297ac9204042743c72cee04ec501236c7986d (patch)
tree: 5ce8f523a247584ee116a7b5f0f6783802fb13de
parent: f29b64d243a5a1fd64923b9fe40f582fc6eb592a (diff)
download: cpython-142297ac9204042743c72cee04ec501236c7986d.zip
cpython-142297ac9204042743c72cee04ec501236c7986d.tar.gz
cpython-142297ac9204042743c72cee04ec501236c7986d.tar.bz2
Speed up getline_via_fgets() by supplying two "fast paths", although one is
faster than the other. This should be faster for Mark Favas's 254-character mail log lines, and *is* 3-4% quicker for my test case with much shorter lines (but they're typical of *my* text files, and I'm tired of optimizing for everyone else at my expense <wink> -- in fact, the only one who loses here is Guido ...).
-rw-r--r-- Objects/fileobject.c 135
1 files changed, 81 insertions, 54 deletions
diff --git a/Objects/fileobject.c b/Objects/fileobject.c
index 40ac43e..69ee860 100644
--- a/Objects/fileobject.c
+++ b/Objects/fileobject.c
@@ -688,77 +688,105 @@ static PyObject*
getline_via_fgets(FILE *fp)
{
/* INITBUFSIZE is the maximum line length that lets us get away with the fast
- * no-realloc path. get_line uses 100 for its initial size, but isn't trying
- * to avoid reallocs. Under MSVC 6, and using files with lines all under 100
- * chars long, dropping this from 200 to 100 bought less than 1% speedup.
- * Since many kinds of log files have lines exceeding 100 chars, the tiny
- * slowdown from using 200 is more than offset by the large speedup for such
- * log files.
- * INCBUFSIZE is the amount by which we grow the buffer, if INITBUFSIZE isn't
- * enough. It doesn't much matter what this set to.
+ * no-realloc, one-fgets()-call path. Boosting it isn't free, because we have
+ * to fill this much of the buffer with a known value in order to figure out
+ * how much of the buffer fgets() overwrites. So if INITBUFSIZE is larger
+ * than "most" lines, we waste time filling unused buffer slots. 100 is
+ * surely adequate for most people's email archives, chewing over source code,
+ * etc -- "regular old text files".
+ * MAXBUFSIZE is the maximum line length that lets us get away with the less
+ * fast (but still zippy) no-realloc, two-fgets()-call path. See above for
+ * cautions about boosting that. 300 was chosen because the worst real-life
+ * text-crunching job reported on Python-Dev was a mail-log crawler where over
+ * half the lines were 254 chars.
+ * INCBUFSIZE is the amount by which we grow the buffer, if MAXBUFSIZE isn't
+ * enough. It doesn't much matter what this is set to: we only get here for
+ * absurdly long lines anyway.
*/
-#define INITBUFSIZE 200
+#define INITBUFSIZE 100
+#define MAXBUFSIZE 300
#define INCBUFSIZE 1000
+ char* p; /* temp */
+ char buf[MAXBUFSIZE];
PyObject* v; /* the string object result */
- size_t total_v_size; /* total # chars in v's buffer */
char* pvfree; /* address of next free slot */
char* pvend; /* address one beyond last free slot */
- char* p; /* temp */
- char buf[INITBUFSIZE];
+ size_t nfree; /* # of free buffer slots; pvend-pvfree */
+ size_t total_v_size; /* total # of slots in buffer */
/* Optimize for normal case: avoid _PyString_Resize if at all
- * possible via first reading into auto buf.
+ * possible via first reading into stack buffer "buf".
*/
- Py_BEGIN_ALLOW_THREADS
- memset(buf, '\n', INITBUFSIZE);
- p = fgets(buf, INITBUFSIZE, fp);
- Py_END_ALLOW_THREADS
+ total_v_size = INITBUFSIZE; /* start small and pray */
+ pvfree = buf;
+ for (;;) {
+ Py_BEGIN_ALLOW_THREADS
+ pvend = buf + total_v_size;
+ nfree = pvend - pvfree;
+ memset(pvfree, '\n', nfree);
+ p = fgets(pvfree, nfree, fp);
+ Py_END_ALLOW_THREADS
- if (p == NULL) {
- clearerr(fp);
- if (PyErr_CheckSignals())
- return NULL;
- v = PyString_FromStringAndSize("", 0);
- return v;
- }
- /* fgets read *something* */
- p = memchr(buf, '\n', INITBUFSIZE);
- if (p != NULL) {
- /* Did the \n come from fgets or from us?
- * Since fgets stops at the first \n, and then writes \0, if
- * it's from fgets a \0 must be next. But if that's so, it
- * could not have come from us, since the \n's we filled the
- * buffer with have only more \n's to the right.
- */
- pvend = buf + INITBUFSIZE;
- if (p+1 < pvend && *(p+1) == '\0') {
- /* It's from fgets: we win! In particular, we
- * haven't done any mallocs yet, and can build the
- * final result on the first try.
+ if (p == NULL) {
+ clearerr(fp);
+ if (PyErr_CheckSignals())
+ return NULL;
+ v = PyString_FromStringAndSize(buf, pvfree - buf);
+ return v;
+ }
+ /* fgets read *something* */
+ p = memchr(pvfree, '\n', nfree);
+ if (p != NULL) {
+ /* Did the \n come from fgets or from us?
+ * Since fgets stops at the first \n, and then writes
+ * \0, if it's from fgets a \0 must be next. But if
+ * that's so, it could not have come from us, since
+ * the \n's we filled the buffer with have only more
+ * \n's to the right.
*/
- v = PyString_FromStringAndSize(buf, p - buf + 1);
+ if (p+1 < pvend && *(p+1) == '\0') {
+ /* It's from fgets: we win! In particular,
+ * we haven't done any mallocs yet, and can
+ * build the final result on the first try.
+ */
+ ++p; /* include \n from fgets */
+ }
+ else {
+ /* Must be from us: fgets didn't fill the
+ * buffer and didn't find a newline, so it
+ * must be the last and newline-free line of
+ * the file.
+ */
+ assert(p > pvfree && *(p-1) == '\0');
+ --p; /* don't include \0 from fgets */
+ }
+ v = PyString_FromStringAndSize(buf, p - buf);
return v;
}
- /* Must be from us: fgets didn't fill the buffer and didn't
- * find a newline, so it must be the last and newline-free
- * line of the file.
+ /* yuck: fgets overwrote all the newlines, i.e. the entire
+ * buffer. So this line isn't over yet, or maybe it is but
+ * we're exactly at EOF. If we haven't already, try using the
+ * rest of the stack buffer.
*/
- assert(p > buf && *(p-1) == '\0');
- v = PyString_FromStringAndSize(buf, p - buf - 1);
- return v;
+ assert(*(pvend-1) == '\0');
+ if (pvfree == buf) {
+ pvfree = pvend - 1; /* overwrite trailing null */
+ total_v_size = MAXBUFSIZE;
+ }
+ else
+ break;
}
- /* yuck: fgets overwrote all the newlines, i.e. the entire buffer.
- * So this line isn't over yet, or maybe it is but we're exactly at
- * EOF; in either case, we're tired <wink>.
+
+ /* The stack buffer isn't big enough; malloc a string object and read
+ * into its buffer.
*/
- assert(buf[INITBUFSIZE-1] == '\0');
- total_v_size = INITBUFSIZE + INCBUFSIZE;
+ total_v_size = MAXBUFSIZE + INCBUFSIZE;
v = PyString_FromStringAndSize((char*)NULL, (int)total_v_size);
if (v == NULL)
return v;
/* copy over everything except the last null byte */
- memcpy(BUF(v), buf, INITBUFSIZE-1);
- pvfree = BUF(v) + INITBUFSIZE - 1;
+ memcpy(BUF(v), buf, MAXBUFSIZE-1);
+ pvfree = BUF(v) + MAXBUFSIZE - 1;
/* Keep reading stuff into v; if it ever ends successfully, break
* after setting p one beyond the end of the line. The code here is
@@ -766,8 +794,6 @@ getline_via_fgets(FILE *fp)
* the code above for detailed comments about the logic.
*/
for (;;) {
- size_t nfree;
-
Py_BEGIN_ALLOW_THREADS
pvend = BUF(v) + total_v_size;
nfree = pvend - pvfree;
@@ -814,6 +840,7 @@ getline_via_fgets(FILE *fp)
_PyString_Resize(&v, p - BUF(v));
return v;
#undef INITBUFSIZE
+#undef MAXBUFSIZE
#undef INCBUFSIZE
}
#endif /* ifdef USE_FGETS_IN_GETLINE */