From ee1a53cbb1302971a65ce6eba9b5e538a3595df0 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sun, 2 Feb 2003 02:57:53 +0000 Subject: cPickle.c: Full support for the new LONG1 and LONG4. Added comments. Assorted code cleanups; e.g., sizeof(char) is 1 by definition, so there's no need to do things like multiply by sizeof(char) in hairy malloc arguments. Fixed an undetected-overflow bug in readline_file(). longobject.c: Fixed a really stupid bug in the new _PyLong_NumBits. pickle.py: Fixed stupid bug in save_long(): When proto is 2, it wrote LONG1 or LONG4, but forgot to return then -- it went on to append the proto 1 LONG opcode too. Fixed equally stupid cancelling bugs in load_long1() and load_long4(): they *returned* the unpickled long instead of pushing it on the stack. The return values were ignored. Tests passed before only because save_long() pickled the long twice. Fixed bugs in encode_long(). Noted that decode_long() is quadratic-time despite our hopes, because long(string, 16) is still quadratic-time in len(string). It's hex() that's linear-time. I don't know a way to make decode_long() linear-time in Python, short of maybe transforming the 256's-complement bytes into marshal's funky internal format, and letting marshal decode that. It would be more valuable to make long(string, 16) linear time. pickletester.py: Added a global "protocols" vector so tests can try all the protocols in a sane way. Changed test_ints() and test_unicode() to do so. Added a new test_long(), but the tail end of it is disabled because it "takes forever" under pickle.py (but runs very quickly under cPickle: cPickle proto 2 for longs is linear-time). --- Lib/pickle.py | 22 ++++-- Lib/test/pickletester.py | 60 +++++++++++++--- Modules/cPickle.c | 182 +++++++++++++++++++++++++++++++++++++++++------ Objects/longobject.c | 4 +- 4 files changed, 227 insertions(+), 41 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 92b4802..ba0e38b 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -553,6 +553,7 @@ class Pickler: self.write(LONG1 + chr(n) + bytes) else: self.write(LONG4 + pack(" 0 ashex = hex(x) - if x >> (nbits - 1) == 0: + njunkchars = 2 + ashex.endswith('L') + newnibbles = len(ashex) - njunkchars + if newnibbles < nibbles: + ashex = "0x" + "0" * (nibbles - newnibbles) + ashex[2:] + if int(ashex[2], 16) < 8: # "looks positive", so need a byte of sign bits - ashex = "0xff" + x[2:] + ashex = "0xff" + ashex[2:] if ashex.endswith('L'): ashex = ashex[2:-1] else: ashex = ashex[2:] - assert len(ashex) & 1 == 0 + assert len(ashex) & 1 == 0, (x, ashex) binary = _binascii.unhexlify(ashex) return binary[::-1] +# XXX OOPS! This is still quadratic-time. While hex(n) is linear-time +# XXX in the # of digits in n, long(s, 16) is still quadratic-time +# XXX in len(s). def decode_long(data): r"""Decode a long from a two's complement little-endian binary string. @@ -1445,7 +1453,7 @@ def decode_long(data): if nbytes == 0: return 0L ashex = _binascii.hexlify(data[::-1]) - n = long(ashex, 16) + n = long(ashex, 16) # quadratic time if data[-1] >= '\x80': n -= 1L << (nbytes * 8) return n diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 8211dcf..6615307 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -1,6 +1,11 @@ import unittest from test.test_support import TestFailed, have_unicode, TESTFN +# Tests that try a number of pickle protocols should have a +# for proto in protocols: +# kind of outer loop. Bump the 3 to 4 if/when protocol 3 is invented. +protocols = range(3) + class C: def __cmp__(self, other): return cmp(self.__dict__, other.__dict__) @@ -28,6 +33,9 @@ class metaclass(type): class use_metaclass(object): __metaclass__ = metaclass +# DATA and BINDATA are the protocol 0 and protocol 1 pickles of the object +# returned by create_data(). + # break into multiple strings to avoid confusing font-lock-mode DATA = """(lp1 I0 @@ -210,20 +218,22 @@ class AbstractPickleTests(unittest.TestCase): def test_unicode(self): endcases = [unicode(''), unicode('<\\u>'), unicode('<\\\u1234>'), unicode('<\n>'), unicode('<\\>')] - for u in endcases: - p = self.dumps(u) - u2 = self.loads(p) - self.assertEqual(u2, u) + for proto in protocols: + for u in endcases: + p = self.dumps(u, proto) + u2 = self.loads(p) + self.assertEqual(u2, u) def test_ints(self): import sys - n = sys.maxint - while n: - for expected in (-n, n): - s = self.dumps(expected) - n2 = self.loads(s) - self.assertEqual(expected, n2) - n = n >> 1 + for proto in protocols: + n = sys.maxint + while n: + for expected in (-n, n): + s = self.dumps(expected, proto) + n2 = self.loads(s) + self.assertEqual(expected, n2) + n = n >> 1 def test_maxint64(self): maxint64 = (1L << 63) - 1 @@ -235,6 +245,34 @@ class AbstractPickleTests(unittest.TestCase): data = 'I' + str(maxint64) + 'JUNK\n.' self.assertRaises(ValueError, self.loads, data) + def test_long(self): + for proto in protocols: + # 256 bytes is where LONG4 begins + for nbits in 1, 8, 8*254, 8*255, 8*256, 8*257: + nbase = 1L << nbits + for npos in nbase-1, nbase, nbase+1: + for n in npos, -npos: + pickle = self.dumps(n, proto) + got = self.loads(pickle) + self.assertEqual(n, got) + # Try a monster. This is quadratic-time in protos 0 & 1, so don't + # bother with those. + # XXX Damn. pickle.py is still quadratic-time here, due to + # XXX long(string, 16). cPickle runs this in an eyeblink, but I + # XXX gave up waiting for pickle.py to get beyond "loading". Giving + # XXX up for now. + return + print "building long" + nbase = long("deadbeeffeedface", 16) + nbase += nbase << 1000000 + for n in nbase, -nbase: + print "dumping" + p = self.dumps(n, 2) + print "loading" + got = self.loads(p) + print "checking" + self.assertEqual(n, got) + def test_reduce(self): pass diff --git a/Modules/cPickle.c b/Modules/cPickle.c index 4364f4d..b59f573 100644 --- a/Modules/cPickle.c +++ b/Modules/cPickle.c @@ -464,7 +464,7 @@ write_other(Picklerobject *self, char *s, int n) static int -read_file(Unpicklerobject *self, char **s, int n) +read_file(Unpicklerobject *self, char **s, int n) { size_t nbytesread; @@ -472,7 +472,7 @@ read_file(Unpicklerobject *self, char **s, int n) int size; size = ((n < 32) ? 32 : n); - if (!( self->buf = (char *)malloc(size * sizeof(char)))) { + if (!( self->buf = (char *)malloc(size))) { PyErr_NoMemory(); return -1; } @@ -480,12 +480,11 @@ read_file(Unpicklerobject *self, char **s, int n) self->buf_size = size; } else if (n > self->buf_size) { - self->buf = (char *)realloc(self->buf, n * sizeof(char)); + self->buf = (char *)realloc(self->buf, n); if (!self->buf) { PyErr_NoMemory(); return -1; } - self->buf_size = n; } @@ -514,16 +513,16 @@ readline_file(Unpicklerobject *self, char **s) int i; if (self->buf_size == 0) { - if (!( self->buf = (char *)malloc(40 * sizeof(char)))) { + if (!( self->buf = (char *)malloc(40))) { PyErr_NoMemory(); return -1; } - self->buf_size = 40; } i = 0; while (1) { + int bigger; for (; i < (self->buf_size - 1); i++) { if (feof(self->fp) || (self->buf[i] = getc(self->fp)) == '\n') { @@ -532,14 +531,17 @@ readline_file(Unpicklerobject *self, char **s) return i + 1; } } - self->buf = (char *)realloc(self->buf, - (self->buf_size * 2) * sizeof(char)); + bigger = self->buf_size << 1; + if (bigger <= 0) { /* overflow */ + PyErr_NoMemory(); + return -1; + } + self->buf = (char *)realloc(self->buf, bigger); if (!self->buf) { PyErr_NoMemory(); return -1; } - - self->buf_size *= 2; + self->buf_size = bigger; } } @@ -620,14 +622,18 @@ readline_other(Unpicklerobject *self, char **s) return str_size; } - +/* Copy the first n bytes from s into newly malloc'ed memory, plus a + * trailing 0 byte. Return a pointer to that, or NULL if out of memory. + * The caller is responsible for free()'ing the return value. + */ static char * -pystrndup(char *s, int l) +pystrndup(char *s, int n) { - char *r; - if (!( r=malloc((l+1)*sizeof(char)))) return (char*)PyErr_NoMemory(); - memcpy(r,s,l); - r[l]=0; + char *r = (char *)malloc(n+1); + if (r == NULL) + return (char*)PyErr_NoMemory(); + memcpy(r, s, n); + r[n] = 0; return r; } @@ -1012,11 +1018,93 @@ save_int(Picklerobject *self, PyObject *args) static int save_long(Picklerobject *self, PyObject *args) { - int size, res = -1; - PyObject *repr = 0; + int size; + int res = -1; + PyObject *repr = NULL; static char l = LONG; + if (self->proto >= 2) { + /* Linear-time pickling. */ + size_t nbits; + size_t nbytes; + unsigned char *pdata; + char c_str[5]; + int i; + int sign = _PyLong_Sign(args); + + if (sign == 0) { + /* It's 0 -- an empty bytestring. */ + c_str[0] = LONG1; + c_str[1] = 0; + i = self->write_func(self, c_str, 2); + if (i < 0) goto finally; + res = 0; + goto finally; + } + nbits = _PyLong_NumBits(args); + if (nbits == (size_t)-1 && PyErr_Occurred()) + goto finally; + /* How many bytes do we need? There are nbits >> 3 full + * bytes of data, and nbits & 7 leftover bits. If there + * are any leftover bits, then we clearly need another + * byte. Wnat's not so obvious is that we *probably* + * need another byte even if there aren't any leftovers: + * the most-significant bit of the most-significant byte + * acts like a sign bit, and it's usually got a sense + * opposite of the one we need. The exception is longs + * of the form -(2**(8*j-1)) for j > 0. Such a long is + * its own 256's-complement, so has the right sign bit + * even without the extra byte. That's a pain to check + * for in advance, though, so we always grab an extra + * byte at the start, and cut it back later if possible. + */ + nbytes = (nbits >> 3) + 1; + if ((int)nbytes < 0 || (size_t)(int)nbytes != nbytes) { + PyErr_SetString(PyExc_OverflowError, "long too large " + "to pickle"); + goto finally; + } + repr = PyString_FromStringAndSize(NULL, (int)nbytes); + if (repr == NULL) goto finally; + pdata = (unsigned char *)PyString_AS_STRING(repr); + i = _PyLong_AsByteArray((PyLongObject *)args, + pdata, nbytes, + 1 /* little endian */, 1 /* signed */); + if (i < 0) goto finally; + /* If the long is negative, this may be a byte more than + * needed. This is so iff the MSB is all redundant sign + * bits. + */ + if (sign < 0 && nbytes > 1 && pdata[nbytes - 1] == 0xff && + (pdata[nbytes - 2] & 0x80) != 0) + --nbytes; + + if (nbytes < 256) { + c_str[0] = LONG1; + c_str[1] = (char)nbytes; + size = 2; + } + else { + c_str[0] = LONG4; + size = (int)nbytes; + for (i = 1; i < 5; i++) { + c_str[i] = (char)(size & 0xff); + size >>= 8; + } + size = 5; + } + i = self->write_func(self, c_str, size); + if (i < 0) goto finally; + i = self->write_func(self, (char *)pdata, (int)nbytes); + if (i < 0) goto finally; + res = 0; + goto finally; + } + + /* proto < 2: write the repr and newline. This is quadratic-time + * (in the number of digits), in both directions. + */ if (!( repr = PyObject_Repr(args))) goto finally; @@ -1038,7 +1126,6 @@ save_long(Picklerobject *self, PyObject *args) finally: Py_XDECREF(repr); - return res; } @@ -2687,9 +2774,13 @@ load_int(Unpicklerobject *self) return res; } - +/* s contains x bytes of a little-endian integer. Return its value as a + * C int. Obscure: when x is 1 or 2, this is an unsigned little-endian + * int, but when x is 4 it's a signed one. This is an historical source + * of x-platform bugs. + */ static long -calc_binint(char *s, int x) +calc_binint(char *s, int x) { unsigned char c; int i; @@ -2786,6 +2877,45 @@ load_long(Unpicklerobject *self) return res; } +/* 'size' bytes contain the # of bytes of little-endian 256's-complement + * data following. + */ +static int +load_counted_long(Unpicklerobject *self, int size) +{ + int i; + char *nbytes; + unsigned char *pdata; + PyObject *along; + + assert(size == 1 || size == 4); + i = self->read_func(self, &nbytes, size); + if (i < 0) return -1; + + size = calc_binint(nbytes, size); + if (size < 0) { + /* Corrupt or hostile pickle -- we never write one like + * this. + */ + PyErr_SetString(PyExc_ValueError, "LONG pickle has negative " + "byte count"); + return -1; + } + + if (size == 0) + along = PyLong_FromLong(0L); + else { + /* Read the raw little-endian bytes & convert. */ + i = self->read_func(self, &(char *)pdata, size); + if (i < 0) return -1; + along = _PyLong_FromByteArray(pdata, (size_t)size, + 1 /* little endian */, 1 /* signed */); + } + if (along == NULL) + return -1; + PDATA_PUSH(self->stack, along, -1); + return 0; +} static int load_float(Unpicklerobject *self) @@ -3784,6 +3914,16 @@ load(Unpicklerobject *self) break; continue; + case LONG1: + if (load_counted_long(self, 1) < 0) + break; + continue; + + case LONG4: + if (load_counted_long(self, 4) < 0) + break; + continue; + case FLOAT: if (load_float(self) < 0) break; diff --git a/Objects/longobject.c b/Objects/longobject.c index 7835784..9d7243f 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -264,13 +264,13 @@ int _PyLong_Sign(PyObject *vv) { PyLongObject *v = (PyLongObject *)vv; - const int ndigits = v->ob_size; + const int ndigits = ABS(v->ob_size); assert(v != NULL); assert(PyLong_Check(v)); assert(ndigits == 0 || v->ob_digit[ndigits - 1] != 0); - return ndigits == 0 ? 0 : (ndigits < 0 ? -1 : 1); + return v->ob_size == 0 ? 0 : (v->ob_size < 0 ? -1 : 1); } size_t -- cgit v0.12