6 files changed, 165 insertions, 55 deletions
diff --git a/Lib/pickle.py b/Lib/pickle.py
index e3c112f..ea8d8b5 100644
--- a/Lib/pickle.py
+++ b/Lib/pickle.py
@@ -42,19 +42,22 @@ __all__ = ["PickleError", "PicklingError", "UnpicklingError", "Pickler",
 bytes_types = (bytes, bytearray, memoryview)
 
 # These are purely informational; no code uses these.
-format_version = "2.0"                  # File format version we write
+format_version = "3.0"                  # File format version we write
 compatible_formats = ["1.0",            # Original protocol 0
                       "1.1",            # Protocol 0 with INST added
                       "1.2",            # Original protocol 1
                       "1.3",            # Protocol 1 with BINFLOAT added
                       "2.0",            # Protocol 2
+                      "3.0",            # Protocol 3
                       ]                 # Old format versions we can read
 
 # This is the highest protocol number we know how to read.
-HIGHEST_PROTOCOL = 2
+HIGHEST_PROTOCOL = 3
 
 # The protocol we write by default.  May be less than HIGHEST_PROTOCOL.
-DEFAULT_PROTOCOL = 2
+# We intentionally write a protocol that Python 2.x cannot read;
+# there are too many issues with that.
+DEFAULT_PROTOCOL = 3
 
 # Why use struct.pack() for pickling but marshal.loads() for
 # unpickling?  struct.pack() is 40% faster than marshal.dumps(), but
@@ -161,6 +164,10 @@ LONG4          = b'\x8b'  # push really big long
 
 _tuplesize2code = [EMPTY_TUPLE, TUPLE1, TUPLE2, TUPLE3]
 
+# Protocol 3 (Python 3.x)
+
+BINBYTES       = b'B'   # push bytes; counted binary string argument
+SHORT_BINBYTES = b'C'   #  "     "   ;    "      "       "      " < 256 bytes
 
 __all__.extend([x for x in dir() if re.match("[A-Z][A-Z0-9_]+$",x)])
 
@@ -494,20 +501,19 @@ class Pickler:
             self.write(FLOAT + repr(obj).encode("ascii") + b'\n')
     dispatch[float] = save_float
 
-    def save_string(self, obj, pack=struct.pack):
-        if self.bin:
-            n = len(obj)
-            if n < 256:
-                self.write(SHORT_BINSTRING + bytes([n]) + bytes(obj))
-            else:
-                self.write(BINSTRING + pack("<i", n) + bytes(obj))
+    def save_bytes(self, obj, pack=struct.pack):
+        if self.proto < 3:
+            self.save_reduce(bytes, (list(obj),))
+            return
+        n = len(obj)
+        if n < 256:
+            self.write(SHORT_BINBYTES + bytes([n]) + bytes(obj))
         else:
-            # Strip leading 'b' due to repr() of bytes() returning b'...'
-            self.write(STRING + repr(obj).lstrip("b").encode("ascii") + b'\n')
+            self.write(BINBYTES + pack("<i", n) + bytes(obj))
         self.memoize(obj)
-    dispatch[bytes] = save_string
+    dispatch[bytes] = save_bytes
 
-    def save_unicode(self, obj, pack=struct.pack):
+    def save_str(self, obj, pack=struct.pack):
         if self.bin:
             encoded = obj.encode('utf-8')
             n = len(encoded)
@@ -518,7 +524,7 @@ class Pickler:
             self.write(UNICODE + bytes(obj.encode('raw-unicode-escape')) +
                        b'\n')
         self.memoize(obj)
-    dispatch[str] = save_unicode
+    dispatch[str] = save_str
 
     def save_tuple(self, obj):
         write = self.write
@@ -775,7 +781,7 @@ def whichmodule(func, funcname):
 
 class Unpickler:
 
-    def __init__(self, file):
+    def __init__(self, file, *, encoding="ASCII", errors="strict"):
         """This takes a binary file for reading a pickle data stream.
 
         The protocol version of the pickle is detected automatically, so no
@@ -787,10 +793,16 @@ class Unpickler:
         Thus file-like object can be a binary file object opened for
         reading, a BytesIO object, or any other custom object that
         meets this interface.
+
+        Optional keyword arguments are encoding and errors, which are
+        used to decode 8-bit string instances pickled by Python 2.x.
+        These default to 'ASCII' and 'strict', respectively.
         """
         self.readline = file.readline
         self.read = file.read
         self.memo = {}
+        self.encoding = encoding
+        self.errors = errors
 
     def load(self):
         """Read a pickled object representation from the open file.
@@ -831,7 +843,7 @@ class Unpickler:
 
     def load_proto(self):
         proto = ord(self.read(1))
-        if not 0 <= proto <= 2:
+        if not 0 <= proto <= HIGHEST_PROTOCOL:
             raise ValueError("unsupported pickle protocol: %d" % proto)
     dispatch[PROTO[0]] = load_proto
 
@@ -924,9 +936,16 @@ class Unpickler:
 
     def load_binstring(self):
         len = mloads(b'i' + self.read(4))
-        self.append(self.read(len))
+        data = self.read(len)
+        value = str(data, self.encoding, self.errors)
+        self.append(value)
     dispatch[BINSTRING[0]] = load_binstring
 
+    def load_binbytes(self):
+        len = mloads(b'i' + self.read(4))
+        self.append(self.read(len))
+    dispatch[BINBYTES[0]] = load_binbytes
+
     def load_unicode(self):
         self.append(str(self.readline()[:-1], 'raw-unicode-escape'))
     dispatch[UNICODE[0]] = load_unicode
@@ -938,9 +957,16 @@ class Unpickler:
 
     def load_short_binstring(self):
         len = ord(self.read(1))
-        self.append(bytes(self.read(len)))
+        data = bytes(self.read(len))
+        value = str(data, self.encoding, self.errors)
+        self.append(value)
     dispatch[SHORT_BINSTRING[0]] = load_short_binstring
 
+    def load_short_binbytes(self):
+        len = ord(self.read(1))
+        self.append(bytes(self.read(len)))
+    dispatch[SHORT_BINBYTES[0]] = load_short_binbytes
+
     def load_tuple(self):
         k = self.marker()
         self.stack[k:] = [tuple(self.stack[k+1:])]
diff --git a/Lib/pickletools.py b/Lib/pickletools.py
index ca09c03..37dad9b 100644
--- a/Lib/pickletools.py
+++ b/Lib/pickletools.py
@@ -746,6 +746,11 @@ pyfloat = StackObject(
               doc="A Python float object.")
 
 pystring = StackObject(
+               name='string',
+               obtype=bytes,
+               doc="A Python (8-bit) string object.")
+
+pybytes = StackObject(
                name='bytes',
                obtype=bytes,
                doc="A Python bytes object.")
@@ -753,7 +758,7 @@ pystring = StackObject(
 pyunicode = StackObject(
                 name='str',
                 obtype=str,
-                doc="A Python string object.")
+                doc="A Python (Unicode) string object.")
 
 pynone = StackObject(
              name="None",
@@ -868,7 +873,7 @@ class OpcodeInfo(object):
             assert isinstance(x, StackObject)
         self.stack_after = stack_after
 
-        assert isinstance(proto, int) and 0 <= proto <= 2
+        assert isinstance(proto, int) and 0 <= proto <= 3
         self.proto = proto
 
         assert isinstance(doc, str)
@@ -995,7 +1000,9 @@ opcodes = [
 
       The argument is a repr-style string, with bracketing quote characters,
       and perhaps embedded escapes.  The argument extends until the next
-      newline character.
+      newline character.  (Actually, they are decoded into a str instance
+      using the encoding given to the Unpickler constructor, or the default,
+      'ASCII'.)
       """),
 
     I(name='BINSTRING',
@@ -1008,7 +1015,9 @@ opcodes = [
 
       There are two arguments:  the first is a 4-byte little-endian signed int
       giving the number of bytes in the string, and the second is that many
-      bytes, which are taken literally as the string content.
+      bytes, which are taken literally as the string content.  (Actually,
+      they are decoded into a str instance using the encoding given to the
+      Unpickler constructor, or the default, 'ASCII'.)
       """),
 
     I(name='SHORT_BINSTRING',
@@ -1021,6 +1030,36 @@ opcodes = [
 
       There are two arguments:  the first is a 1-byte unsigned int giving
       the number of bytes in the string, and the second is that many bytes,
+      which are taken literally as the string content.  (Actually, they
+      are decoded into a str instance using the encoding given to the
+      Unpickler constructor, or the default, 'ASCII'.)
+      """),
+
+    # Bytes (protocol 3 only; older protocols don't support bytes at all)
+
+    I(name='BINBYTES',
+      code='B',
+      arg=string4,
+      stack_before=[],
+      stack_after=[pybytes],
+      proto=3,
+      doc="""Push a Python bytes object.
+
+      There are two arguments:  the first is a 4-byte little-endian signed int
+      giving the number of bytes in the string, and the second is that many
+      bytes, which are taken literally as the bytes content.
+      """),
+
+    I(name='SHORT_BINBYTES',
+      code='C',
+      arg=string1,
+      stack_before=[],
+      stack_after=[pybytes],
+      proto=3,
+      doc="""Push a Python bytes object.
+
+      There are two arguments:  the first is a 1-byte unsigned int giving
+      the number of bytes in the string, and the second is that many bytes,
       which are taken literally as the string content.
       """),
 
@@ -2006,9 +2045,9 @@ class _Example:
 
 _dis_test = r"""
 >>> import pickle
->>> x = [1, 2, (3, 4), {bytes(b'abc'): "def"}]
->>> pkl = pickle.dumps(x, 0)
->>> dis(pkl)
+>>> x = [1, 2, (3, 4), {b'abc': "def"}]
+>>> pkl0 = pickle.dumps(x, 0)
+>>> dis(pkl0)
     0: (    MARK
     1: l        LIST       (MARK at 0)
     2: p    PUT        0
@@ -2025,19 +2064,32 @@ _dis_test = r"""
    25: (    MARK
    26: d        DICT       (MARK at 25)
    27: p    PUT        2
-   30: S    STRING     'abc'
-   37: p    PUT        3
-   40: V    UNICODE    'def'
-   45: p    PUT        4
-   48: s    SETITEM
-   49: a    APPEND
-   50: .    STOP
+   30: c    GLOBAL     'builtins bytes'
+   46: p    PUT        3
+   49: (    MARK
+   50: (        MARK
+   51: l            LIST       (MARK at 50)
+   52: p        PUT        4
+   55: L        LONG       97
+   59: a        APPEND
+   60: L        LONG       98
+   64: a        APPEND
+   65: L        LONG       99
+   69: a        APPEND
+   70: t        TUPLE      (MARK at 49)
+   71: p    PUT        5
+   74: R    REDUCE
+   75: V    UNICODE    'def'
+   80: p    PUT        6
+   83: s    SETITEM
+   84: a    APPEND
+   85: .    STOP
 highest protocol among opcodes = 0
 
 Try again with a "binary" pickle.
 
->>> pkl = pickle.dumps(x, 1)
->>> dis(pkl)
+>>> pkl1 = pickle.dumps(x, 1)
+>>> dis(pkl1)
     0: ]    EMPTY_LIST
     1: q    BINPUT     0
     3: (    MARK
@@ -2050,13 +2102,24 @@ Try again with a "binary" pickle.
    14: q        BINPUT     1
    16: }        EMPTY_DICT
    17: q        BINPUT     2
-   19: U        SHORT_BINSTRING 'abc'
-   24: q        BINPUT     3
-   26: X        BINUNICODE 'def'
-   34: q        BINPUT     4
-   36: s        SETITEM
-   37: e        APPENDS    (MARK at 3)
-   38: .    STOP
+   19: c        GLOBAL     'builtins bytes'
+   35: q        BINPUT     3
+   37: (        MARK
+   38: ]            EMPTY_LIST
+   39: q            BINPUT     4
+   41: (            MARK
+   42: K                BININT1    97
+   44: K                BININT1    98
+   46: K                BININT1    99
+   48: e                APPENDS    (MARK at 41)
+   49: t            TUPLE      (MARK at 37)
+   50: q        BINPUT     5
+   52: R        REDUCE
+   53: X        BINUNICODE 'def'
+   61: q        BINPUT     6
+   63: s        SETITEM
+   64: e        APPENDS    (MARK at 3)
+   65: .    STOP
 highest protocol among opcodes = 1
 
 Exercise the INST/OBJ/BUILD family.
diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py
index 5ac2bdc..0230f3c 100644
--- a/Lib/test/pickletester.py
+++ b/Lib/test/pickletester.py
@@ -490,6 +490,12 @@ class AbstractPickleTests(unittest.TestCase):
                 u2 = self.loads(p)
                 self.assertEqual(u2, u)
 
+    def test_bytes(self):
+        for proto in protocols:
+            for u in b'', b'xyz', b'xyz'*100:
+                p = self.dumps(u, proto)
+                self.assertEqual(self.loads(p), u)
+
     def test_ints(self):
         import sys
         for proto in protocols:
@@ -532,8 +538,8 @@ class AbstractPickleTests(unittest.TestCase):
 
     @run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
     def test_float_format(self):
-        # make sure that floats are formatted locale independent
-        self.assertEqual(self.dumps(1.2)[0:3], b'F1.')
+        # make sure that floats are formatted locale independent with proto 0
+        self.assertEqual(self.dumps(1.2, 0)[0:3], b'F1.')
 
     def test_reduce(self):
         pass
@@ -624,6 +630,12 @@ class AbstractPickleTests(unittest.TestCase):
                            (2, 2): pickle.TUPLE2,
                            (2, 3): pickle.TUPLE3,
                            (2, 4): pickle.TUPLE,
+
+                           (3, 0): pickle.EMPTY_TUPLE,
+                           (3, 1): pickle.TUPLE1,
+                           (3, 2): pickle.TUPLE2,
+                           (3, 3): pickle.TUPLE3,
+                           (3, 4): pickle.TUPLE,
                           }
         a = ()
         b = (1,)
@@ -643,14 +655,17 @@ class AbstractPickleTests(unittest.TestCase):
         expected_opcode = {(0, None): pickle.NONE,
                            (1, None): pickle.NONE,
                            (2, None): pickle.NONE,
+                           (3, None): pickle.NONE,
 
                            (0, True): pickle.INT,
                            (1, True): pickle.INT,
                            (2, True): pickle.NEWTRUE,
+                           (3, True): pickle.NEWTRUE,
 
                            (0, False): pickle.INT,
                            (1, False): pickle.INT,
                            (2, False): pickle.NEWFALSE,
+                           (3, False): pickle.NEWFALSE,
                           }
         for proto in protocols:
             for x in None, False, True:
@@ -955,7 +970,7 @@ class AbstractPickleModuleTests(unittest.TestCase):
 
     def test_highest_protocol(self):
         # Of course this needs to be changed when HIGHEST_PROTOCOL changes.
-        self.assertEqual(self.module.HIGHEST_PROTOCOL, 2)
+        self.assertEqual(self.module.HIGHEST_PROTOCOL, 3)
 
     def test_callapi(self):
         from io import BytesIO
diff --git a/Lib/test/test_pickle.py b/Lib/test/test_pickle.py
index aa09a6a..67d83c7c 100644
--- a/Lib/test/test_pickle.py
+++ b/Lib/test/test_pickle.py
@@ -12,23 +12,19 @@ class PickleTests(AbstractPickleTests, AbstractPickleModuleTests):
     module = pickle
     error = KeyError
 
-    def dumps(self, arg, proto=0, fast=0):
-        # Ignore fast
+    def dumps(self, arg, proto=None):
         return pickle.dumps(arg, proto)
 
     def loads(self, buf):
-        # Ignore fast
         return pickle.loads(buf)
 
 class PicklerTests(AbstractPickleTests):
 
     error = KeyError
 
-    def dumps(self, arg, proto=0, fast=0):
+    def dumps(self, arg, proto=None):
         f = io.BytesIO()
         p = pickle.Pickler(f, proto)
-        if fast:
-            p.fast = fast
         p.dump(arg)
         f.seek(0)
         return bytes(f.read())
@@ -40,14 +36,12 @@ class PicklerTests(AbstractPickleTests):
 
 class PersPicklerTests(AbstractPersistentPicklerTests):
 
-    def dumps(self, arg, proto=0, fast=0):
+    def dumps(self, arg, proto=None):
         class PersPickler(pickle.Pickler):
             def persistent_id(subself, obj):
                 return self.persistent_id(obj)
         f = io.BytesIO()
         p = PersPickler(f, proto)
-        if fast:
-            p.fast = fast
         p.dump(arg)
         f.seek(0)
         return f.read()
diff --git a/Lib/test/test_pickletools.py b/Lib/test/test_pickletools.py
index 3e5b35a..932dcd1 100644
--- a/Lib/test/test_pickletools.py
+++ b/Lib/test/test_pickletools.py
@@ -6,7 +6,7 @@ from test.pickletester import AbstractPickleModuleTests
 
 class OptimizedPickleTests(AbstractPickleTests, AbstractPickleModuleTests):
 
-    def dumps(self, arg, proto=0, fast=0):
+    def dumps(self, arg, proto=None):
         return pickletools.optimize(pickle.dumps(arg, proto))
 
     def loads(self, buf):
diff --git a/Misc/NEWS b/Misc/NEWS
index 65643bf..db686cd 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -23,6 +23,18 @@ Extension Modules
 Library
 -------
 
+- A new pickle protocol (protocol 3) is added with explicit support
+  for bytes.  This is the default protocol.  It intentionally cannot
+  be unpickled by Python 2.x.
+
+- When a pickle written by Python 2.x contains an (8-bit) str
+  instance, this is now decoded to a (Unicode) str instance.  The
+  encoding used to do this defaults to ASCII, but can be overridden
+  via two new keyword arguments to the Unpickler class.  Previously
+  this would create bytes instances, which is usually wrong: str
+  instances are often used to pickle attribute names etc., and text is
+  more common than binary data anyway.
+
 - Default to ASCII as the locale.getpreferredencoding, if the POSIX
   system doesn't support CODESET and LANG isn't set or doesn't
   allow deduction of an encoding.