summaryrefslogtreecommitdiffstats
path: root/Tools
diff options
context:
space:
mode:
Diffstat (limited to 'Tools')
-rw-r--r--Tools/README12
-rw-r--r--Tools/buildbot/external-common.bat5
-rw-r--r--Tools/ccbench/ccbench.py118
-rw-r--r--Tools/gdb/libpython.py95
-rw-r--r--Tools/iobench/iobench.py19
-rw-r--r--Tools/msi/msi.py162
-rw-r--r--Tools/msi/msilib.py18
-rw-r--r--Tools/msi/uuids.py99
-rwxr-xr-xTools/pybench/pybench.py1
-rw-r--r--Tools/scripts/README3
-rwxr-xr-xTools/scripts/findnocoding.py4
-rwxr-xr-xTools/scripts/patchcheck.py53
-rwxr-xr-xTools/scripts/pysetup34
-rwxr-xr-xTools/scripts/pysource.py2
-rwxr-xr-xTools/scripts/reindent.py18
-rwxr-xr-xTools/scripts/run_tests.py47
-rw-r--r--Tools/stringbench/README68
-rwxr-xr-xTools/stringbench/stringbench.py1482
-rw-r--r--Tools/unicode/comparecodecs.py2
-rw-r--r--Tools/unicode/makeunicodedata.py380
-rw-r--r--Tools/unittestgui/unittestgui.py1
21 files changed, 2127 insertions, 466 deletions
diff --git a/Tools/README b/Tools/README
index c1f89ba..0d961de 100644
--- a/Tools/README
+++ b/Tools/README
@@ -3,7 +3,7 @@ while building or extending Python.
buildbot Batchfiles for running on Windows buildslaves.
-ccbench A Python concurrency benchmark.
+ccbench A Python threads-based concurrency benchmark. (*)
demo Several Python programming demos.
@@ -17,13 +17,13 @@ i18n Tools for internationalization. pygettext.py
and msgfmt.py generates a binary message catalog
from a catalog in text format.
-iobench Benchmark for the new Python I/O system.
+iobench Benchmark for the new Python I/O system. (*)
msi Support for packaging Python as an MSI package on Windows.
parser Un-parsing tool to generate code from an AST.
-pybench Comprehensive Python benchmarking suite.
+pybench Low-level benchmarking for the Python evaluation loop. (*)
pynche A Tkinter-based color editor.
@@ -32,6 +32,9 @@ scripts A number of useful single-file programs, e.g. tabnanny.py
tabs and spaces, and 2to3, which converts Python 2 code
to Python 3 code.
+stringbench A suite of micro-benchmarks for various operations on
+ strings (both 8-bit and unicode). (*)
+
test2to3 A demonstration of how to use 2to3 transparently in setup.py.
unicode Tools for generating unicodedata and codecs from unicode.org
@@ -40,3 +43,6 @@ unicode Tools for generating unicodedata and codecs from unicode.org
unittestgui A Tkinter based GUI test runner for unittest, with test
discovery.
+
+
+(*) A generic benchmark suite is maintained separately at http://hg.python.org/benchmarks/
diff --git a/Tools/buildbot/external-common.bat b/Tools/buildbot/external-common.bat
index 1ff282e..244d5f5 100644
--- a/Tools/buildbot/external-common.bat
+++ b/Tools/buildbot/external-common.bat
@@ -41,3 +41,8 @@ if not exist sqlite-3.7.4 (
rd /s/q sqlite-source-3.6.21
svn export http://svn.python.org/projects/external/sqlite-3.7.4
)
+
+@rem lzma
+if not exist xz-5.0.3 (
+ svn export http://svn.python.org/projects/external/xz-5.0.3
+)
diff --git a/Tools/ccbench/ccbench.py b/Tools/ccbench/ccbench.py
index 9f7118f..c705521 100644
--- a/Tools/ccbench/ccbench.py
+++ b/Tools/ccbench/ccbench.py
@@ -435,70 +435,70 @@ def run_bandwidth_client(**kwargs):
def run_bandwidth_test(func, args, nthreads):
# Create a listening socket to receive the packets. We use UDP which should
# be painlessly cross-platform.
- sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
- sock.bind(("127.0.0.1", 0))
- addr = sock.getsockname()
+ with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
+ sock.bind(("127.0.0.1", 0))
+ addr = sock.getsockname()
- duration = BANDWIDTH_DURATION
- packet_size = BANDWIDTH_PACKET_SIZE
-
- results = []
- threads = []
- end_event = []
- start_cond = threading.Condition()
- started = False
- if nthreads > 0:
- # Warm up
- func(*args)
+ duration = BANDWIDTH_DURATION
+ packet_size = BANDWIDTH_PACKET_SIZE
results = []
- loop = TimedLoop(func, args)
- ready = []
- ready_cond = threading.Condition()
-
- def run():
+ threads = []
+ end_event = []
+ start_cond = threading.Condition()
+ started = False
+ if nthreads > 0:
+ # Warm up
+ func(*args)
+
+ results = []
+ loop = TimedLoop(func, args)
+ ready = []
+ ready_cond = threading.Condition()
+
+ def run():
+ with ready_cond:
+ ready.append(None)
+ ready_cond.notify()
+ with start_cond:
+ while not started:
+ start_cond.wait()
+ loop(start_time, duration * 1.5, end_event, do_yield=False)
+
+ for i in range(nthreads):
+ threads.append(threading.Thread(target=run))
+ for t in threads:
+ t.setDaemon(True)
+ t.start()
+ # Wait for threads to be ready
with ready_cond:
- ready.append(None)
- ready_cond.notify()
- with start_cond:
- while not started:
- start_cond.wait()
- loop(start_time, duration * 1.5, end_event, do_yield=False)
-
- for i in range(nthreads):
- threads.append(threading.Thread(target=run))
- for t in threads:
- t.setDaemon(True)
- t.start()
- # Wait for threads to be ready
- with ready_cond:
- while len(ready) < nthreads:
- ready_cond.wait()
-
- # Run the client and wait for the first packet to arrive before
- # unblocking the background threads.
- process = run_bandwidth_client(addr=addr,
- packet_size=packet_size,
- duration=duration)
- _time = time.time
- # This will also wait for the parent to be ready
- s = _recv(sock, packet_size)
- remote_addr = eval(s.partition('#')[0])
-
- with start_cond:
- start_time = _time()
- started = True
- start_cond.notify(nthreads)
-
- n = 0
- first_time = None
- while not end_event and BW_END not in s:
- _sendto(sock, s, remote_addr)
+ while len(ready) < nthreads:
+ ready_cond.wait()
+
+ # Run the client and wait for the first packet to arrive before
+ # unblocking the background threads.
+ process = run_bandwidth_client(addr=addr,
+ packet_size=packet_size,
+ duration=duration)
+ _time = time.time
+ # This will also wait for the parent to be ready
s = _recv(sock, packet_size)
- if first_time is None:
- first_time = _time()
- n += 1
- end_time = _time()
+ remote_addr = eval(s.partition('#')[0])
+
+ with start_cond:
+ start_time = _time()
+ started = True
+ start_cond.notify(nthreads)
+
+ n = 0
+ first_time = None
+ while not end_event and BW_END not in s:
+ _sendto(sock, s, remote_addr)
+ s = _recv(sock, packet_size)
+ if first_time is None:
+ first_time = _time()
+ n += 1
+ end_time = _time()
end_event.append(None)
for t in threads:
diff --git a/Tools/gdb/libpython.py b/Tools/gdb/libpython.py
index 8bbbb10..30347cb 100644
--- a/Tools/gdb/libpython.py
+++ b/Tools/gdb/libpython.py
@@ -49,6 +49,11 @@ import sys
_type_char_ptr = gdb.lookup_type('char').pointer() # char*
_type_unsigned_char_ptr = gdb.lookup_type('unsigned char').pointer() # unsigned char*
_type_void_ptr = gdb.lookup_type('void').pointer() # void*
+_type_unsigned_short_ptr = gdb.lookup_type('unsigned short').pointer()
+_type_unsigned_int_ptr = gdb.lookup_type('unsigned int').pointer()
+
+# value computed later, see PyUnicodeObjectPtr.proxy()
+_is_pep393 = None
SIZEOF_VOID_P = _type_void_ptr.sizeof
@@ -322,7 +327,6 @@ class PyObjectPtr(object):
name_map = {'bool': PyBoolObjectPtr,
'classobj': PyClassObjectPtr,
- 'instance': PyInstanceObjectPtr,
'NoneType': PyNoneStructPtr,
'frame': PyFrameObjectPtr,
'set' : PySetObjectPtr,
@@ -396,7 +400,7 @@ class ProxyAlreadyVisited(object):
def _write_instance_repr(out, visited, name, pyop_attrdict, address):
- '''Shared code for use by old-style and new-style classes:
+ '''Shared code for use by all classes:
write a representation to file-like object "out"'''
out.write('<')
out.write(name)
@@ -479,7 +483,7 @@ class HeapTypeObjectPtr(PyObjectPtr):
def proxyval(self, visited):
'''
- Support for new-style classes.
+ Support for classes.
Currently we just locate the dictionary using a transliteration to
python of _PyObject_GetDictPtr, ignoring descriptors
@@ -496,7 +500,7 @@ class HeapTypeObjectPtr(PyObjectPtr):
attr_dict = {}
tp_name = self.safe_tp_name()
- # New-style class:
+ # Class:
return InstanceProxy(tp_name, attr_dict, long(self._gdbval))
def write_repr(self, out, visited):
@@ -668,44 +672,6 @@ class PyDictObjectPtr(PyObjectPtr):
pyop_value.write_repr(out, visited)
out.write('}')
-class PyInstanceObjectPtr(PyObjectPtr):
- _typename = 'PyInstanceObject'
-
- def proxyval(self, visited):
- # Guard against infinite loops:
- if self.as_address() in visited:
- return ProxyAlreadyVisited('<...>')
- visited.add(self.as_address())
-
- # Get name of class:
- in_class = self.pyop_field('in_class')
- cl_name = in_class.pyop_field('cl_name').proxyval(visited)
-
- # Get dictionary of instance attributes:
- in_dict = self.pyop_field('in_dict').proxyval(visited)
-
- # Old-style class:
- return InstanceProxy(cl_name, in_dict, long(self._gdbval))
-
- def write_repr(self, out, visited):
- # Guard against infinite loops:
- if self.as_address() in visited:
- out.write('<...>')
- return
- visited.add(self.as_address())
-
- # Old-style class:
-
- # Get name of class:
- in_class = self.pyop_field('in_class')
- cl_name = in_class.pyop_field('cl_name').proxyval(visited)
-
- # Get dictionary of instance attributes:
- pyop_in_dict = self.pyop_field('in_dict')
-
- _write_instance_repr(out, visited,
- cl_name, pyop_in_dict, self.as_address())
-
class PyListObjectPtr(PyObjectPtr):
_typename = 'PyListObject'
@@ -1123,15 +1089,46 @@ class PyUnicodeObjectPtr(PyObjectPtr):
return _type_Py_UNICODE.sizeof
def proxyval(self, visited):
- # From unicodeobject.h:
- # Py_ssize_t length; /* Length of raw Unicode data in buffer */
- # Py_UNICODE *str; /* Raw Unicode buffer */
- field_length = long(self.field('length'))
- field_str = self.field('str')
+ global _is_pep393
+ if _is_pep393 is None:
+ fields = gdb.lookup_type('PyUnicodeObject').target().fields()
+ _is_pep393 = 'data' in [f.name for f in fields]
+ if _is_pep393:
+ # Python 3.3 and newer
+ may_have_surrogates = False
+ compact = self.field('_base')
+ ascii = compact['_base']
+ state = ascii['state']
+ is_compact_ascii = (int(state['ascii']) and int(state['compact']))
+ if not int(state['ready']):
+ # string is not ready
+ field_length = long(compact['wstr_length'])
+ may_have_surrogates = True
+ field_str = ascii['wstr']
+ else:
+ field_length = long(ascii['length'])
+ if is_compact_ascii:
+ field_str = ascii.address + 1
+ elif int(state['compact']):
+ field_str = compact.address + 1
+ else:
+ field_str = self.field('data')['any']
+ repr_kind = int(state['kind'])
+ if repr_kind == 1:
+ field_str = field_str.cast(_type_unsigned_char_ptr)
+ elif repr_kind == 2:
+ field_str = field_str.cast(_type_unsigned_short_ptr)
+ elif repr_kind == 4:
+ field_str = field_str.cast(_type_unsigned_int_ptr)
+ else:
+ # Python 3.2 and earlier
+ field_length = long(self.field('length'))
+ field_str = self.field('str')
+ may_have_surrogates = self.char_width() == 2
# Gather a list of ints from the Py_UNICODE array; these are either
- # UCS-2 or UCS-4 code points:
- if self.char_width() > 2:
+ # UCS-1, UCS-2 or UCS-4 code points:
+ if not may_have_surrogates:
Py_UNICODEs = [int(field_str[i]) for i in safe_range(field_length)]
else:
# A more elaborate routine if sizeof(Py_UNICODE) is 2 in the
diff --git a/Tools/iobench/iobench.py b/Tools/iobench/iobench.py
index 5ec6f17..408be7b 100644
--- a/Tools/iobench/iobench.py
+++ b/Tools/iobench/iobench.py
@@ -1,13 +1,14 @@
# -*- coding: utf-8 -*-
# This file should be kept compatible with both Python 2.6 and Python >= 3.0.
-import time
+import functools
+import hashlib
+import itertools
import os
+import platform
import re
import sys
-import hashlib
-import functools
-import itertools
+import time
from optparse import OptionParser
out = sys.stdout
@@ -307,6 +308,16 @@ def run_all_tests(options):
"large": 2,
}
+ print("Python %s" % sys.version)
+ if sys.version_info < (3, 3):
+ if sys.maxunicode > 0xffff:
+ text = "UCS-4 (wide build)"
+ else:
+ text = "UTF-16 (narrow build)"
+ else:
+ text = "PEP 393"
+ print("Unicode: %s" % text)
+ print(platform.platform())
binary_files = list(get_binary_files())
text_files = list(get_text_files())
if "b" in options:
diff --git a/Tools/msi/msi.py b/Tools/msi/msi.py
index 508816d..38f3443 100644
--- a/Tools/msi/msi.py
+++ b/Tools/msi/msi.py
@@ -2,12 +2,11 @@
# (C) 2003 Martin v. Loewis
# See "FOO" in comments refers to MSDN sections with the title FOO.
import msilib, schema, sequence, os, glob, time, re, shutil, zipfile
+import subprocess, tempfile
from msilib import Feature, CAB, Directory, Dialog, Binary, add_data
import uisample
from win32com.client import constants
from distutils.spawn import find_executable
-from uuids import product_codes
-import tempfile
# Settings can be overridden in config.py below
# 0 for official python.org releases
@@ -77,19 +76,16 @@ upgrade_code_64='{6A965A0C-6EE6-4E3A-9983-3263F56311EC}'
if snapshot:
current_version = "%s.%s.%s" % (major, minor, int(time.time()/3600/24))
- product_code = msilib.gen_uuid()
-else:
- product_code = product_codes[current_version]
if full_current_version is None:
full_current_version = current_version
extensions = [
- 'bz2.pyd',
'pyexpat.pyd',
'select.pyd',
'unicodedata.pyd',
'winsound.pyd',
+ '_bz2.pyd',
'_elementtree.pyd',
'_socket.pyd',
'_ssl.pyd',
@@ -100,7 +96,10 @@ extensions = [
'_ctypes_test.pyd',
'_sqlite3.pyd',
'_hashlib.pyd',
- '_multiprocessing.pyd'
+ '_multiprocessing.pyd',
+ '_lzma.pyd',
+ '_decimal.pyd',
+ '_testbuffer.pyd'
]
# Well-known component UUIDs
@@ -119,6 +118,7 @@ pythondll_uuid = {
"30":"{6953bc3b-6768-4291-8410-7914ce6e2ca8}",
"31":"{4afcba0b-13e4-47c3-bebe-477428b46913}",
"32":"{3ff95315-1096-4d31-bd86-601d5438ad5e}",
+ "33":"{f7581ca4-d368-4eea-8f82-d48c64c4f047}",
} [major+minor]
# Compute the name that Sphinx gives to the docfile
@@ -185,12 +185,19 @@ dll_path = os.path.join(srcdir, PCBUILD, dll_file)
msilib.set_arch_from_file(dll_path)
if msilib.pe_type(dll_path) != msilib.pe_type("msisupport.dll"):
raise SystemError("msisupport.dll for incorrect architecture")
+
if msilib.Win64:
upgrade_code = upgrade_code_64
- # Bump the last digit of the code by one, so that 32-bit and 64-bit
- # releases get separate product codes
- digit = hex((int(product_code[-2],16)+1)%16)[-1]
- product_code = product_code[:-2] + digit + '}'
+
+if snapshot:
+ product_code = msilib.gen_uuid()
+else:
+ # official release: generate UUID from the download link that the file will have
+ import uuid
+ product_code = uuid.uuid3(uuid.NAMESPACE_URL,
+ 'http://www.python.org/ftp/python/%s.%s.%s/python-%s%s.msi' %
+ (major, minor, micro, full_current_version, msilib.arch_ext))
+ product_code = '{%s}' % product_code
if testpackage:
ext = 'px'
@@ -904,16 +911,27 @@ class PyDirectory(Directory):
kw['componentflags'] = 2 #msidbComponentAttributesOptional
Directory.__init__(self, *args, **kw)
- def check_unpackaged(self):
- self.unpackaged_files.discard('__pycache__')
- self.unpackaged_files.discard('.svn')
- if self.unpackaged_files:
- print "Warning: Unpackaged files in %s" % self.absolute
- print self.unpackaged_files
+def hgmanifest():
+ # Fetch file list from Mercurial
+ process = subprocess.Popen(['hg', 'manifest'], stdout=subprocess.PIPE)
+ stdout, stderr = process.communicate()
+ # Create nested directories for file tree
+ result = {}
+ for line in stdout.splitlines():
+ components = line.split('/')
+ d = result
+ while len(components) > 1:
+ d1 = d.setdefault(components[0], {})
+ d = d1
+ del components[0]
+ d[components[0]] = None
+ return result
+
# See "File Table", "Component Table", "Directory Table",
# "FeatureComponents Table"
def add_files(db):
+ hgfiles = hgmanifest()
cab = CAB("python")
tmpfiles = []
# Add all executables, icons, text files into the TARGETDIR component
@@ -975,104 +993,40 @@ def add_files(db):
# Add all .py files in Lib, except tkinter, test
dirs = []
- pydirs = [(root,"Lib")]
+ pydirs = [(root, "Lib", hgfiles["Lib"], default_feature)]
while pydirs:
# Commit every now and then, or else installer will complain
db.Commit()
- parent, dir = pydirs.pop()
- if dir == ".svn" or dir == '__pycache__' or dir.startswith("plat-"):
+ parent, dir, files, feature = pydirs.pop()
+ if dir.startswith("plat-"):
continue
- elif dir in ["tkinter", "idlelib", "Icons"]:
+ if dir in ["tkinter", "idlelib", "turtledemo"]:
if not have_tcl:
continue
+ feature = tcltk
tcltk.set_current()
- elif dir in ['test', 'tests', 'data', 'output']:
- # test: Lib, Lib/email, Lib/ctypes, Lib/sqlite3
- # tests: Lib/distutils
- # data: Lib/email/test
- # output: Lib/test
- testsuite.set_current()
+ elif dir in ('test', 'tests'):
+ feature = testsuite
elif not have_ctypes and dir == "ctypes":
continue
- else:
- default_feature.set_current()
+ feature.set_current()
lib = PyDirectory(db, cab, parent, dir, dir, "%s|%s" % (parent.make_short(dir), dir))
- # Add additional files
dirs.append(lib)
- lib.glob("*.txt")
- if dir=='site-packages':
- lib.add_file("README.txt", src="README")
- continue
- files = lib.glob("*.py")
- files += lib.glob("*.pyw")
- if files:
- # Add an entry to the RemoveFile table to remove bytecode files.
- lib.remove_pyc()
- # package READMEs if present
- lib.glob("README")
- if dir=='Lib':
- lib.add_file('wsgiref.egg-info')
- if dir=='test' and parent.physical=='Lib':
- lib.add_file("185test.db")
- lib.add_file("audiotest.au")
- lib.add_file("sgml_input.html")
- lib.add_file("testtar.tar")
- lib.add_file("test_difflib_expect.html")
- lib.add_file("check_soundcard.vbs")
- lib.add_file("empty.vbs")
- lib.add_file("Sine-1000Hz-300ms.aif")
- lib.add_file("mime.types")
- lib.glob("*.uue")
- lib.glob("*.pem")
- lib.glob("*.pck")
- lib.glob("cfgparser.*")
- lib.add_file("zip_cp437_header.zip")
- lib.add_file("zipdir.zip")
- if dir=='capath':
- lib.glob("*.0")
- if dir=='tests' and parent.physical=='distutils':
- lib.add_file("Setup.sample")
- if dir=='decimaltestdata':
- lib.glob("*.decTest")
- if dir=='xmltestdata':
- lib.glob("*.xml")
- lib.add_file("test.xml.out")
- if dir=='output':
- lib.glob("test_*")
- if dir=='sndhdrdata':
- lib.glob("sndhdr.*")
- if dir=='idlelib':
- lib.glob("*.def")
- lib.add_file("idle.bat")
- lib.add_file("ChangeLog")
- if dir=="Icons":
- lib.glob("*.gif")
- lib.add_file("idle.icns")
- if dir=="command" and parent.physical=="distutils":
- lib.glob("wininst*.exe")
- lib.add_file("command_template")
- if dir=="lib2to3":
- lib.removefile("pickle", "*.pickle")
- if dir=="macholib":
- lib.add_file("README.ctypes")
- lib.glob("fetch_macholib*")
- if dir=='turtledemo':
- lib.add_file("turtle.cfg")
- if dir=="pydoc_data":
- lib.add_file("_pydoc.css")
- if dir=="data" and parent.physical=="test" and parent.basedir.physical=="email":
- # This should contain all non-.svn files listed in subversion
- for f in os.listdir(lib.absolute):
- if f.endswith(".txt") or f==".svn":continue
- if f.endswith(".au") or f.endswith(".gif"):
- lib.add_file(f)
+ has_py = False
+ for name, subdir in files.items():
+ if subdir is None:
+ assert os.path.isfile(os.path.join(lib.absolute, name))
+ if name == 'README':
+ lib.add_file("README.txt", src="README")
else:
- print("WARNING: New file %s in email/test/data" % f)
- for f in os.listdir(lib.absolute):
- if os.path.isdir(os.path.join(lib.absolute, f)):
- pydirs.append((lib, f))
- for d in dirs:
- d.check_unpackaged()
+ lib.add_file(name)
+ has_py = has_py or name.endswith(".py") or name.endswith(".pyw")
+ else:
+ assert os.path.isdir(os.path.join(lib.absolute, name))
+ pydirs.append((lib, name, subdir, feature))
+
+ if has_py:
+ lib.remove_pyc()
# Add DLLs
default_feature.set_current()
lib = DLLs
@@ -1159,6 +1113,8 @@ def add_files(db):
lib.add_file("README.txt", src="README")
if f == 'Scripts':
lib.add_file("2to3.py", src="2to3")
+ lib.add_file("pydoc3.py", src="pydoc3")
+ lib.add_file("pysetup3.py", src="pysetup3")
if have_tcl:
lib.start_component("pydocgui.pyw", tcltk, keyfile="pydocgui.pyw")
lib.add_file("pydocgui.pyw")
diff --git a/Tools/msi/msilib.py b/Tools/msi/msilib.py
index 5795d0e..472d9d4 100644
--- a/Tools/msi/msilib.py
+++ b/Tools/msi/msilib.py
@@ -408,7 +408,7 @@ class Directory:
self.physical = physical
self.logical = logical
self.component = None
- self.short_names = sets.Set()
+ self.short_names = {}
self.ids = sets.Set()
self.keyfiles = {}
self.componentflags = componentflags
@@ -456,23 +456,25 @@ class Directory:
[(feature.id, component)])
def make_short(self, file):
+ long = file
file = re.sub(r'[\?|><:/*"+,;=\[\]]', '_', file) # restrictions on short names
- parts = file.split(".")
+ parts = file.split(".", 1)
if len(parts)>1:
- suffix = parts[-1].upper()
+ suffix = parts[1].upper()
else:
- suffix = None
+ suffix = ''
prefix = parts[0].upper()
- if len(prefix) <= 8 and (not suffix or len(suffix)<=3):
+ if len(prefix) <= 8 and '.' not in suffix and len(suffix) <= 3:
if suffix:
file = prefix+"."+suffix
else:
file = prefix
- assert file not in self.short_names
+ assert file not in self.short_names, (file, self.short_names[file])
else:
prefix = prefix[:6]
if suffix:
- suffix = suffix[:3]
+ # last three characters of last suffix
+ suffix = suffix.rsplit('.')[-1][:3]
pos = 1
while 1:
if suffix:
@@ -484,7 +486,7 @@ class Directory:
assert pos < 10000
if pos in (10, 100, 1000):
prefix = prefix[:-1]
- self.short_names.add(file)
+ self.short_names[file] = long
return file
def add_file(self, file, src=None, version=None, language=None):
diff --git a/Tools/msi/uuids.py b/Tools/msi/uuids.py
deleted file mode 100644
index 80d17ad..0000000
--- a/Tools/msi/uuids.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# This should be extended for each Python release.
-# The product code must change whenever the name of the MSI file
-# changes, and when new component codes are issued for existing
-# components. See "Changing the Product Code". As we change the
-# component codes with every build, we need a new product code
-# each time. For intermediate (snapshot) releases, they are automatically
-# generated. For official releases, we record the product codes,
-# so people can refer to them.
-product_codes = {
- '2.5.101': '{bc14ce3e-5e72-4a64-ac1f-bf59a571898c}', # 2.5a1
- '2.5.102': '{5eed51c1-8e9d-4071-94c5-b40de5d49ba5}', # 2.5a2
- '2.5.103': '{73dcd966-ffec-415f-bb39-8342c1f47017}', # 2.5a3
- '2.5.111': '{c797ecf8-a8e6-4fec-bb99-526b65f28626}', # 2.5b1
- '2.5.112': '{32beb774-f625-439d-b587-7187487baf15}', # 2.5b2
- '2.5.113': '{89f23918-11cf-4f08-be13-b9b2e6463fd9}', # 2.5b3
- '2.5.121': '{8e9321bc-6b24-48a3-8fd4-c95f8e531e5f}', # 2.5c1
- '2.5.122': '{a6cd508d-9599-45da-a441-cbffa9f7e070}', # 2.5c2
- '2.5.150': '{0a2c5854-557e-48c8-835a-3b9f074bdcaa}', # 2.5.0
- '2.5.1121':'{0378b43e-6184-4c2f-be1a-4a367781cd54}', # 2.5.1c1
- '2.5.1150':'{31800004-6386-4999-a519-518f2d78d8f0}', # 2.5.1
- '2.5.2150':'{6304a7da-1132-4e91-a343-a296269eab8a}', # 2.5.2c1
- '2.5.2150':'{6b976adf-8ae8-434e-b282-a06c7f624d2f}', # 2.5.2
- '2.6.101': '{0ba82e1b-52fd-4e03-8610-a6c76238e8a8}', # 2.6a1
- '2.6.102': '{3b27e16c-56db-4570-a2d3-e9a26180c60b}', # 2.6a2
- '2.6.103': '{cd06a9c5-bde5-4bd7-9874-48933997122a}', # 2.6a3
- '2.6.104': '{dc6ed634-474a-4a50-a547-8de4b7491e53}', # 2.6a4
- '2.6.111': '{3f82079a-5bee-4c4a-8a41-8292389e24ae}', # 2.6b1
- '2.6.112': '{8a0e5970-f3e6-4737-9a2b-bc5ff0f15fb5}', # 2.6b2
- '2.6.113': '{df4f5c21-6fcc-4540-95de-85feba634e76}', # 2.6b3
- '2.6.121': '{bbd34464-ddeb-4028-99e5-f16c4a8fbdb3}', # 2.6c1
- '2.6.122': '{8f64787e-a023-4c60-bfee-25d3a3f592c6}', # 2.6c2
- '2.6.150': '{110eb5c4-e995-4cfb-ab80-a5f315bea9e8}', # 2.6.0
- '2.6.1150':'{9cc89170-000b-457d-91f1-53691f85b223}', # 2.6.1
- '2.6.2121':'{adac412b-b209-4c15-b6ab-dca1b6e47144}', # 2.6.2c1
- '2.6.2150':'{24aab420-4e30-4496-9739-3e216f3de6ae}', # 2.6.2
- '2.6.3121':'{a73e0254-dcda-4fe4-bf37-c7e1c4f4ebb6}', # 2.6.3c1
- '2.6.3150':'{3d9ac095-e115-4e94-bdef-7f7edf17697d}', # 2.6.3
- '2.6.4121':'{727de605-0359-4606-a94b-c2033652379b}', # 2.6.4c1
- '2.6.4122':'{4f7603c6-6352-4299-a398-150a31b19acc}', # 2.6.4c2
- '2.6.4150':'{e7394a0f-3f80-45b1-87fc-abcd51893246}', # 2.6.4
- '2.6.5121':'{e0e273d7-7598-4701-8325-c90c069fd5ff}', # 2.6.5c1
- '2.6.5122':'{fa227b76-0671-4dc6-b826-c2ff2a70dfd5}', # 2.6.5c2
- '2.6.5150':'{4723f199-fa64-4233-8e6e-9fccc95a18ee}', # 2.6.5
- '2.7.101': '{eca1bbef-432c-49ae-a667-c213cc7bbf22}', # 2.7a1
- '2.7.102': '{21ce16ed-73c4-460d-9b11-522f417b2090}', # 2.7a2
- '2.7.103': '{6e7dbd55-ba4a-48ac-a688-6c75db4d7500}', # 2.7a3
- '2.7.104': '{ee774ba3-74a5-48d9-b425-b35a287260c8}', # 2.7a4
- '2.7.111': '{9cfd9ec7-a9c7-4980-a1c6-054fc6493eb3}', # 2.7b1
- '2.7.112': '{9a72faf6-c304-4165-8595-9291ff30cac6}', # 2.7b2
- '2.7.121': '{f530c94a-dd53-4de9-948e-b632b9cb48d2}', # 2.7c1
- '2.7.122': '{f80905d2-dd8d-4b8e-8a40-c23c93dca07d}', # 2.7c2
- '2.7.150': '{20c31435-2a0a-4580-be8b-ac06fc243ca4}', # 2.7.0
- '3.0.101': '{8554263a-3242-4857-9359-aa87bc2c58c2}', # 3.0a1
- '3.0.102': '{692d6e2c-f0ac-40b8-a133-7191aeeb67f9}', # 3.0a2
- '3.0.103': '{49cb2995-751a-4753-be7a-d0b1bb585e06}', # 3.0a3
- '3.0.104': '{87cb019e-19fd-4238-b1c7-85751437d646}', # 3.0a4
- '3.0.105': '{cf2659af-19ec-43d2-8c35-0f6a09439d42}', # 3.0a5
- '3.0.111': '{36c26f55-837d-45cf-848c-5f5c0fb47a28}', # 3.0b1
- '3.0.112': '{056a0fbc-c8fe-4c61-aade-c4411b70c998}', # 3.0b2
- '3.0.113': '{2b2e89a9-83af-43f9-b7d5-96e80c5a3f26}', # 3.0b3
- '3.0.114': '{e95c31af-69be-4dd7-96e6-e5fc85e660e6}', # 3.0b4
- '3.0.121': '{d0979c5e-cd3c-42ec-be4c-e294da793573}', # 3.0c1
- '3.0.122': '{f707b8e9-a257-4045-818e-4923fc20fbb6}', # 3.0c2
- '3.0.123': '{5e7208f1-8643-4ea2-ab5e-4644887112e3}', # 3.0c3
- '3.0.150': '{e0e56e21-55de-4f77-a109-1baa72348743}', # 3.0.0
- '3.0.1121':'{d35b1ea5-3d70-4872-bf7e-cd066a77a9c9}', # 3.0.1c1
- '3.0.1150':'{de2f2d9c-53e2-40ee-8209-74da63cb060e}', # 3.0.1
- '3.0.2121':'{cef79e7f-9809-49e2-afd2-e24148d7c855}', # 3.0.2c1
- '3.0.2150':'{0cf3b95a-8382-4607-9779-c36407ff362c}', # 3.0.2
- '3.1.101': '{c423eada-c498-4d51-9eb4-bfeae647e0a0}', # 3.1a1
- '3.1.102': '{f6e199bf-dc64-42f3-87d4-1525991a013e}', # 3.1a2
- '3.1.111': '{c3c82893-69b2-4676-8554-1b6ee6c191e9}', # 3.1b1
- '3.1.121': '{da2b5170-12f3-4d99-8a1f-54926cca7acd}', # 3.1c1
- '3.1.122': '{bceb5133-e2ee-4109-951f-ac7e941a1692}', # 3.1c2
- '3.1.150': '{3ad61ee5-81d2-4d7e-adef-da1dd37277d1}', # 3.1.0
- '3.1.1121':'{5782f957-6d49-41d4-bad0-668715dfd638}', # 3.1.1c1
- '3.1.1150':'{7ff90460-89b7-435b-b583-b37b2815ccc7}', # 3.1.1
- '3.1.2121':'{ec45624a-378c-43be-91f3-3f7a59b0d90c}', # 3.1.2c1
- '3.1.2150':'{d40af016-506c-43fb-a738-bd54fa8c1e85}', # 3.1.2
- '3.2.101' :'{b411f168-7a36-4fff-902c-a554d1c78a4f}', # 3.2a1
- '3.2.102' :'{79ff73b7-8359-410f-b9c5-152d2026f8c8}', # 3.2a2
- '3.2.103' :'{e7635c65-c221-4b9b-b70a-5611b8369d77}', # 3.2a3
- '3.2.104' :'{748cd139-75b8-4ca8-98a7-58262298181e}', # 3.2a4
- '3.2.111' :'{20bfc16f-c7cd-4fc0-8f96-9914614a3c50}', # 3.2b1
- '3.2.112' :'{0e350c98-8d73-4993-b686-cfe87160046e}', # 3.2b2
- '3.2.121' :'{2094968d-7583-47f6-a7fd-22304532e09f}', # 3.2rc1
- '3.2.122' :'{4f3edfa6-cf70-469a-825f-e1206aa7f412}', # 3.2rc2
- '3.2.123' :'{90c673d7-8cfd-4969-9816-f7d70bad87f3}', # 3.2rc3
- '3.2.150' :'{b2042d5e-986d-44ec-aee3-afe4108ccc93}', # 3.2.0
- '3.2.1121':'{4f90de4a-83dd-4443-b625-ca130ff361dd}', # 3.2.1rc1
- '3.2.1122':'{dc5eb04d-ff8a-4bed-8f96-23942fd59e5f}', # 3.2.1rc2
- '3.2.1150':'{34b2530c-6349-4292-9dc3-60bda4aed93c}', # 3.2.1
- '3.2.2121':'{DFB29A53-ACC4-44e6-85A6-D0DA26FE8E4E}', # 3.2.2rc1
- '3.2.2150':'{4CDE3168-D060-4b7c-BC74-4D8F9BB01AFD}', # 3.2.2
- '3.2.3121':'{B8E8CFF7-E4C6-4a7c-9F06-BB3A8B75DDA8}', # 3.2.3rc1
- '3.2.3122':'{E8DCD3E0-12B6-4fb7-9DB5-543C2E67372E}', # 3.2.3rc2
- '3.2.3150':'{789C9644-9F82-44d3-B4CA-AC31F46F5882}', # 3.2.3
-
-}
diff --git a/Tools/pybench/pybench.py b/Tools/pybench/pybench.py
index 8eaad63..cc1e55c 100755
--- a/Tools/pybench/pybench.py
+++ b/Tools/pybench/pybench.py
@@ -107,6 +107,7 @@ def get_machine_details():
print('Getting machine details...')
buildno, builddate = platform.python_build()
python = platform.python_version()
+ # XXX this is now always UCS4, maybe replace it with 'PEP393' in 3.3+?
if sys.maxunicode == 65535:
# UCS2 build (standard)
unitype = 'UCS2'
diff --git a/Tools/scripts/README b/Tools/scripts/README
index 8c02529..eb28a9e 100644
--- a/Tools/scripts/README
+++ b/Tools/scripts/README
@@ -15,7 +15,7 @@ db2pickle.py Dump a database file to a pickle
diff.py Print file diffs in context, unified, or ndiff formats
dutree.py Format du(1) output as a tree sorted by size
eptags.py Create Emacs TAGS file for Python modules
-find_recursionlimit.py Find the maximum recursion limit on this machine
+find_recursionlimit.py Find the maximum recursion limit on this machine
finddiv.py A grep-like tool that looks for division operators
findlinksto.py Recursively find symbolic links to a given path prefix
findnocoding.py Find source files which need an encoding declaration
@@ -53,6 +53,7 @@ redemo.py Basic regular expression demonstration facility
reindent.py Change .py files to use 4-space indents
reindent-rst.py Fix-up reStructuredText file whitespace
rgrep.py Reverse grep through a file (useful for big logfiles)
+run_tests.py Run the test suite with more sensible default options
serve.py Small wsgiref-based web server, used in make serve in Doc
suff.py Sort a list of files by suffix
svneol.py Set svn:eol-style on all files in directory
diff --git a/Tools/scripts/findnocoding.py b/Tools/scripts/findnocoding.py
index a494a48..5aa1feb 100755
--- a/Tools/scripts/findnocoding.py
+++ b/Tools/scripts/findnocoding.py
@@ -2,7 +2,7 @@
"""List all those Python files that require a coding directive
-Usage: nocoding.py dir1 [dir2...]
+Usage: findnocoding.py dir1 [dir2...]
"""
__author__ = "Oleg Broytmann, Georg Brandl"
@@ -50,7 +50,7 @@ def has_correct_encoding(text, codec):
def needs_declaration(fullpath):
try:
- infile = open(fullpath, 'rU')
+ infile = open(fullpath)
except IOError: # Oops, the file was removed - ignore it
return None
diff --git a/Tools/scripts/patchcheck.py b/Tools/scripts/patchcheck.py
index 0e18dd9..503c67a 100755
--- a/Tools/scripts/patchcheck.py
+++ b/Tools/scripts/patchcheck.py
@@ -49,29 +49,15 @@ def mq_patches_applied():
@status("Getting the list of files that have been added/changed",
info=lambda x: n_files_str(len(x)))
def changed_files():
- """Get the list of changed or added files from the VCS."""
- if os.path.isdir(os.path.join(SRCDIR, '.hg')):
- vcs = 'hg'
- cmd = 'hg status --added --modified --no-status'
- if mq_patches_applied():
- cmd += ' --rev qparent'
- elif os.path.isdir('.svn'):
- vcs = 'svn'
- cmd = 'svn status --quiet --non-interactive --ignore-externals'
- else:
+ """Get the list of changed or added files from Mercurial."""
+ if not os.path.isdir(os.path.join(SRCDIR, '.hg')):
sys.exit('need a checkout to get modified files')
- st = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
- try:
- st.wait()
- if vcs == 'hg':
- return [x.decode().rstrip() for x in st.stdout]
- else:
- output = (x.decode().rstrip().rsplit(None, 1)[-1]
- for x in st.stdout if x[0] in b'AM')
- return set(path for path in output if os.path.isfile(path))
- finally:
- st.stdout.close()
+ cmd = 'hg status --added --modified --no-status'
+ if mq_patches_applied():
+ cmd += ' --rev qparent'
+ with subprocess.Popen(cmd.split(), stdout=subprocess.PIPE) as st:
+ return [x.decode().rstrip() for x in st.stdout]
def report_modified_files(file_paths):
@@ -89,10 +75,8 @@ def report_modified_files(file_paths):
def normalize_whitespace(file_paths):
"""Make sure that the whitespace for .py files have been normalized."""
reindent.makebackup = False # No need to create backups.
- fixed = []
- for path in (x for x in file_paths if x.endswith('.py')):
- if reindent.check(os.path.join(SRCDIR, path)):
- fixed.append(path)
+ fixed = [path for path in file_paths if path.endswith('.py') and
+ reindent.check(os.path.join(SRCDIR, path))]
return fixed
@@ -148,6 +132,21 @@ def reported_news(file_paths):
"""Check if Misc/NEWS has been changed."""
return 'Misc/NEWS' in file_paths
+@status("configure regenerated", modal=True, info=str)
+def regenerated_configure(file_paths):
+    """Report whether configure was regenerated along with configure.ac.
+
+    Returns "not needed" when configure.ac is not among *file_paths*;
+    otherwise "yes" or "no" depending on whether configure is listed too.
+    """
+    if 'configure.ac' not in file_paths:
+        return "not needed"
+    if 'configure' in file_paths:
+        return "yes"
+    return "no"
+
+@status("pyconfig.h.in regenerated", modal=True, info=str)
+def regenerated_pyconfig_h_in(file_paths):
+    """Report whether pyconfig.h.in was regenerated along with configure.ac.
+
+    Returns "not needed" when configure.ac is not among *file_paths*;
+    otherwise "yes" or "no" depending on whether pyconfig.h.in is listed too.
+    """
+    if 'configure.ac' not in file_paths:
+        return "not needed"
+    if 'pyconfig.h.in' in file_paths:
+        return "yes"
+    return "no"
def main():
file_paths = changed_files()
@@ -167,6 +166,10 @@ def main():
credit_given(special_files)
# Misc/NEWS changed.
reported_news(special_files)
+ # Regenerated configure, if necessary.
+ regenerated_configure(file_paths)
+ # Regenerated pyconfig.h.in, if necessary.
+ regenerated_pyconfig_h_in(file_paths)
# Test suite run and passed.
if python_files or c_files:
diff --git a/Tools/scripts/pysetup3 b/Tools/scripts/pysetup3
new file mode 100755
index 0000000..e6a908d
--- /dev/null
+++ b/Tools/scripts/pysetup3
@@ -0,0 +1,4 @@
+#!/usr/bin/env python3
+# Launcher script: call packaging.run.main() and exit with whatever it returns.
+import sys
+from packaging.run import main
+sys.exit(main())
diff --git a/Tools/scripts/pysource.py b/Tools/scripts/pysource.py
index 048131e..c7dbe60 100755
--- a/Tools/scripts/pysource.py
+++ b/Tools/scripts/pysource.py
@@ -42,7 +42,7 @@ def _open(fullpath):
return None
try:
- return open(fullpath, 'rU')
+ return open(fullpath)
except IOError as err: # Access denied, or a special file - ignore it
print_debug("%s: access denied: %s" % (fullpath, err))
return None
diff --git a/Tools/scripts/reindent.py b/Tools/scripts/reindent.py
index b18993b..4a916ea 100755
--- a/Tools/scripts/reindent.py
+++ b/Tools/scripts/reindent.py
@@ -8,6 +8,8 @@
-r (--recurse) Recurse. Search for all .py files in subdirectories too.
-n (--nobackup) No backup. Does not make a ".bak" file before reindenting.
-v (--verbose) Verbose. Print informative msgs; else no output.
+ (--newline) Newline. Specify the newline character to use (CRLF, LF).
+ Default is the same as the original file.
-h (--help) Help. Print this usage information and exit.
Change Python (.py) files to use 4-space indents and no hard tab characters.
@@ -50,6 +52,8 @@ verbose = False
recurse = False
dryrun = False
makebackup = True
+spec_newline = None
+"""A specified newline to be used in the output (set by --newline option)"""
def usage(msg=None):
@@ -62,13 +66,12 @@ def errprint(*args):
sys.stderr.write(" ".join(str(arg) for arg in args))
sys.stderr.write("\n")
-
def main():
import getopt
- global verbose, recurse, dryrun, makebackup
+ global verbose, recurse, dryrun, makebackup, spec_newline
try:
opts, args = getopt.getopt(sys.argv[1:], "drnvh",
- ["dryrun", "recurse", "nobackup", "verbose", "help"])
+ ["dryrun", "recurse", "nobackup", "verbose", "newline=", "help"])
except getopt.error as msg:
usage(msg)
return
@@ -81,6 +84,11 @@ def main():
makebackup = False
elif o in ('-v', '--verbose'):
verbose = True
+ elif o in ('--newline',):
+ if not a.upper() in ('CRLF', 'LF'):
+ usage()
+ return
+ spec_newline = dict(CRLF='\r\n', LF='\n')[a.upper()]
elif o in ('-h', '--help'):
usage()
return
@@ -118,9 +126,9 @@ def check(file):
errprint("%s: I/O Error: %s" % (file, str(msg)))
return
- newline = r.newlines
+ newline = spec_newline if spec_newline else r.newlines
if isinstance(newline, tuple):
- errprint("%s: mixed newlines detected; cannot process file" % file)
+ errprint("%s: mixed newlines detected; cannot continue without --newline" % file)
return
if r.run():
diff --git a/Tools/scripts/run_tests.py b/Tools/scripts/run_tests.py
new file mode 100755
index 0000000..f750e19
--- /dev/null
+++ b/Tools/scripts/run_tests.py
@@ -0,0 +1,47 @@
+"""Run Python's test suite in a fast, rigorous way.
+
+The defaults are meant to be reasonably thorough, while skipping certain
+tests that can be time-consuming or resource-intensive (e.g. largefile),
+or distracting (e.g. audio and gui). These defaults can be overridden by
+simply passing a -u option to this script.
+
+"""
+
+import os
+import sys
+import test.support
+
+
+def is_multiprocess_flag(arg):
+    """Return True if *arg* starts with regrtest's -j / --multiprocess flag."""
+    prefixes = ('-j', '--multiprocess')
+    return arg.startswith(prefixes)
+
+
+def is_resource_use_flag(arg):
+    """Return True if *arg* starts with regrtest's -u / --use flag."""
+    prefixes = ('-u', '--use')
+    return arg.startswith(prefixes)
+
+
+def main(regrtest_args):
+    """Replace this process with an interpreter running the test suite.
+
+    *regrtest_args* are the extra command-line arguments to pass through to
+    ``-m test``.  Defaults for -j and -u are added only when the caller did
+    not supply a flag of that kind.  The final command line is echoed to
+    stdout before the process is replaced; this function does not return.
+    """
+    args = [sys.executable,
+            '-W', 'default',      # Warnings set to 'default'
+            '-bb',                # Warnings about bytes/bytearray
+            '-E',                 # Ignore environment variables
+            ]
+    # Allow user-specified interpreter options to override our defaults.
+    args.extend(test.support.args_from_interpreter_flags())
+    args.extend(['-m', 'test',    # Run the test suite
+                 '-r',            # Randomize test order
+                 '-w',            # Re-run failed tests in verbose mode
+                 ])
+    if sys.platform == 'win32':
+        args.append('-n')         # Silence alerts under Windows
+    if not any(is_multiprocess_flag(arg) for arg in regrtest_args):
+        args.extend(['-j', '0'])  # Use all CPU cores
+    if not any(is_resource_use_flag(arg) for arg in regrtest_args):
+        args.extend(['-u', 'all,-largefile,-audio,-gui'])
+    args.extend(regrtest_args)
+    print(' '.join(args))
+    # os.execv() replaces the process image without flushing buffered file
+    # objects, so push the echoed command line out first; otherwise it can be
+    # lost when stdout is a pipe or a file.
+    sys.stdout.flush()
+    os.execv(sys.executable, args)
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff --git a/Tools/stringbench/README b/Tools/stringbench/README
new file mode 100644
index 0000000..a271f12
--- /dev/null
+++ b/Tools/stringbench/README
@@ -0,0 +1,68 @@
+stringbench is a set of performance tests comparing byte string
+operations with unicode operations. The two string implementations
+are loosely based on each other and sometimes the algorithm for one is
+faster than the other.
+
+This test set was started at the Need For Speed sprint in Reykjavik
+to identify which string methods could be sped up quickly and to
+identify obvious places for improvement.
+
+Here is an example of a benchmark
+
+
+@bench('"Andrew".startswith("A")', 'startswith single character', 1000)
+def startswith_single(STR):
+ s1 = STR("Andrew")
+ s2 = STR("A")
+ s1_startswith = s1.startswith
+ for x in _RANGE_1000:
+ s1_startswith(s2)
+
+The bench decorator takes three parameters. The first is a short
+description of how the code works. In most cases this is a Python code
+snippet. It is not the code which is actually run because the real
+code is hand-optimized to focus on the method being tested.
+
+The second parameter is a group title. All benchmarks with the same
+group title are listed together. This lets you compare different
+implementations of the same algorithm, such as "t in s"
+vs. "s.find(t)".
+
+The last is a count. Each benchmark loops over the algorithm 10,
+100 or 1000 times, depending on the algorithm performance. The output
+time is the time per benchmark call so the reader needs a way to know
+how to scale the performance.
+
+These parameters become function attributes.
+
+
+Here is an example of the output
+
+
+========== count newlines
+38.54 41.60 92.7 ...text.with.2000.newlines.count("\n") (*100)
+========== early match, single character
+1.14 1.18 96.8 ("A"*1000).find("A") (*1000)
+0.44 0.41 105.6 "A" in "A"*1000 (*1000)
+1.15 1.17 98.1 ("A"*1000).index("A") (*1000)
+
+The first column is the run time in milliseconds for byte strings.
+The second is the run time for unicode strings. The third is the
+ratio byte time / unicode time, expressed as a percentage: values
+above 100 mean unicode was faster than byte strings, values below 100
+mean it was slower.
+
+The last column contains the code snippet and the repeat count for the
+internal benchmark loop.
+
+The times are computed with 'timeit.py' which repeats the test more
+and more times until the total time takes over 0.2 seconds, returning
+the best time for a single iteration.
+
+The final line of the output is the cumulative time for byte and
+unicode strings, and the overall performance of unicode relative to
+bytes. For example
+
+4079.83 5432.25 75.1 TOTAL
+
+However, this has no meaning as it evenly weights every test.
+
diff --git a/Tools/stringbench/stringbench.py b/Tools/stringbench/stringbench.py
new file mode 100755
index 0000000..a0a21fa
--- /dev/null
+++ b/Tools/stringbench/stringbench.py
@@ -0,0 +1,1482 @@
+
+# Various microbenchmarks comparing unicode and byte string performance
+# Please keep this file both 2.x and 3.x compatible!
+
+import timeit
+import itertools
+import operator
+import re
+import sys
+import datetime
+import optparse
+
+VERSION = '2.0'
+
+def p(*args):
+    """Write the str() of each argument to stdout, space-separated, plus a newline."""
+    line = ' '.join(map(str, args))
+    sys.stdout.write(line + '\n')
+
+# Converters from a native str literal to the two string types being
+# compared.  On 3.x the literal is used as-is for unicode and ASCII-encoded
+# for bytes; on 2.x it is used as-is for bytes and ASCII-decoded for unicode.
+if sys.version_info >= (3,):
+ BYTES = bytes_from_str = lambda x: x.encode('ascii')
+ UNICODE = unicode_from_str = lambda x: x
+else:
+ BYTES = bytes_from_str = lambda x: x
+ UNICODE = unicode_from_str = lambda x: x.decode('ascii')
+
+class UnsupportedType(TypeError):
+    """Raised when a benchmark cannot run for the requested string type."""
+
+
+# Banner: tool version, interpreter version and the time of the run.
+p('stringbench v%s' % VERSION)
+p(sys.version)
+p(datetime.datetime.now())
+
+# Only the last uncommented assignment takes effect, so REPEAT is 3 here.
+REPEAT = 1
+REPEAT = 3
+#REPEAT = 7
+
+# Importing this file raises SystemExit; it only works when run as a script.
+if __name__ != "__main__":
+ raise SystemExit("Must run as main program")
+
+# Command-line options.
+parser = optparse.OptionParser()
+parser.add_option("-R", "--skip-re", dest="skip_re",
+ action="store_true",
+ help="skip regular expression tests")
+parser.add_option("-8", "--8-bit", dest="bytes_only",
+ action="store_true",
+ help="only do 8-bit string benchmarks")
+parser.add_option("-u", "--unicode", dest="unicode_only",
+ action="store_true",
+ help="only do Unicode string benchmarks")
+
+
+# Lists built once up front; the benchmark loops iterate over them.
+_RANGE_1000 = list(range(1000))
+_RANGE_100 = list(range(100))
+_RANGE_10 = list(range(10))
+
+# Names of the benchmark functions registered so far (used as a set).
+dups = {}
+def bench(s, group, repeat_count):
+    """Return a decorator that tags a function as a benchmark.
+
+    The decorated function gets the attributes ``comment`` (*s*),
+    ``group`` (*group*), ``repeat_count`` (*repeat_count*) and
+    ``is_bench`` (True), and is returned unchanged otherwise.  An
+    AssertionError is raised if a function with the same name has
+    already been registered.
+    """
+    def decorator(func):
+        name = func.__name__
+        if name in dups:
+            raise AssertionError("Multiple functions with same name: %r" %
+                                 (name,))
+        dups[name] = 1
+        func.comment = s
+        func.is_bench = True
+        func.group = group
+        func.repeat_count = repeat_count
+        return func
+    return decorator
+
+def uses_re(f):
+    """Decorator marking benchmark *f* as one that uses the re module.
+
+    Sets ``f.uses_re = True`` and returns *f*.  Returning the function is
+    essential: a decorator that returns nothing rebinds the decorated name
+    to None, which would silently drop the benchmark.
+    """
+    f.uses_re = True
+    return f
+
+####### 'in' comparisons
+#
+# Each benchmark builds its operands once through STR (the converter for the
+# string type being measured), then evaluates the containment test once per
+# element of a pre-built _RANGE_* list and discards the result.  The count
+# passed to @bench equals the length of the list that is iterated.
+
+@bench('"A" in "A"*1000', "early match, single character", 1000)
+def in_test_quick_match_single_character(STR):
+ s1 = STR("A" * 1000)
+ s2 = STR("A")
+ for x in _RANGE_1000:
+ s2 in s1
+
+@bench('"B" in "A"*1000', "no match, single character", 1000)
+def in_test_no_match_single_character(STR):
+ s1 = STR("A" * 1000)
+ s2 = STR("B")
+ for x in _RANGE_1000:
+ s2 in s1
+
+
+@bench('"AB" in "AB"*1000', "early match, two characters", 1000)
+def in_test_quick_match_two_characters(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("AB")
+ for x in _RANGE_1000:
+ s2 in s1
+
+@bench('"BC" in "AB"*1000', "no match, two characters", 1000)
+def in_test_no_match_two_character(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("BC")
+ for x in _RANGE_1000:
+ s2 in s1
+
+@bench('"BC" in ("AB"*300+"C")', "late match, two characters", 1000)
+def in_test_slow_match_two_characters(STR):
+ s1 = STR("AB" * 300+"C")
+ s2 = STR("BC")
+ for x in _RANGE_1000:
+ s2 in s1
+
+# The needle is 99 characters of "ABC" followed by "E"; the haystack repeats
+# the same 99 characters followed by "D" 300 times before the one real match.
+@bench('s="ABC"*33; (s+"E") in ((s+"D")*300+s+"E")',
+ "late match, 100 characters", 100)
+def in_test_slow_match_100_characters(STR):
+ m = STR("ABC"*33)
+ d = STR("D")
+ e = STR("E")
+ s1 = (m+d)*300 + m+e
+ s2 = m+e
+ for x in _RANGE_100:
+ s2 in s1
+
+# Try with regex
+# The pattern is the same needle (s+"E") used by the 'in' benchmark of the
+# same group; it is compiled once and the bound search method is looked up
+# once, so the loop times only the search itself.
+@uses_re
+@bench('s="ABC"*33; re.compile(s+"E").search((s+"D")*300+s+"E")',
+       "late match, 100 characters", 100)
+def re_test_slow_match_100_characters(STR):
+    m = STR("ABC"*33)
+    d = STR("D")
+    e = STR("E")
+    s1 = (m+d)*300 + m+e
+    s2 = m+e
+    pat = re.compile(s2)
+    search = pat.search
+    for x in _RANGE_100:
+        search(s1)
+
+
+#### same tests as 'in' but use 'find'
+#
+# The bound method is looked up once, before the loop, so each iteration
+# performs only the call.  Return values are discarded.
+
+@bench('("A"*1000).find("A")', "early match, single character", 1000)
+def find_test_quick_match_single_character(STR):
+ s1 = STR("A" * 1000)
+ s2 = STR("A")
+ s1_find = s1.find
+ for x in _RANGE_1000:
+ s1_find(s2)
+
+@bench('("A"*1000).find("B")', "no match, single character", 1000)
+def find_test_no_match_single_character(STR):
+ s1 = STR("A" * 1000)
+ s2 = STR("B")
+ s1_find = s1.find
+ for x in _RANGE_1000:
+ s1_find(s2)
+
+
+@bench('("AB"*1000).find("AB")', "early match, two characters", 1000)
+def find_test_quick_match_two_characters(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("AB")
+ s1_find = s1.find
+ for x in _RANGE_1000:
+ s1_find(s2)
+
+@bench('("AB"*1000).find("BC")', "no match, two characters", 1000)
+def find_test_no_match_two_character(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("BC")
+ s1_find = s1.find
+ for x in _RANGE_1000:
+ s1_find(s2)
+
+# The "_bis" variants use a needle whose mismatching character is in the
+# other position ("CA" instead of "BC", or "E"+s instead of s+"E").
+@bench('("AB"*1000).find("CA")', "no match, two characters", 1000)
+def find_test_no_match_two_character_bis(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("CA")
+ s1_find = s1.find
+ for x in _RANGE_1000:
+ s1_find(s2)
+
+@bench('("AB"*300+"C").find("BC")', "late match, two characters", 1000)
+def find_test_slow_match_two_characters(STR):
+ s1 = STR("AB" * 300+"C")
+ s2 = STR("BC")
+ s1_find = s1.find
+ for x in _RANGE_1000:
+ s1_find(s2)
+
+@bench('("AB"*300+"CA").find("CA")', "late match, two characters", 1000)
+def find_test_slow_match_two_characters_bis(STR):
+ s1 = STR("AB" * 300+"CA")
+ s2 = STR("CA")
+ s1_find = s1.find
+ for x in _RANGE_1000:
+ s1_find(s2)
+
+@bench('s="ABC"*33; ((s+"D")*500+s+"E").find(s+"E")',
+ "late match, 100 characters", 100)
+def find_test_slow_match_100_characters(STR):
+ m = STR("ABC"*33)
+ d = STR("D")
+ e = STR("E")
+ s1 = (m+d)*500 + m+e
+ s2 = m+e
+ s1_find = s1.find
+ for x in _RANGE_100:
+ s1_find(s2)
+
+@bench('s="ABC"*33; ((s+"D")*500+"E"+s).find("E"+s)',
+ "late match, 100 characters", 100)
+def find_test_slow_match_100_characters_bis(STR):
+ m = STR("ABC"*33)
+ d = STR("D")
+ e = STR("E")
+ s1 = (m+d)*500 + e+m
+ s2 = e+m
+ s1_find = s1.find
+ for x in _RANGE_100:
+ s1_find(s2)
+
+
+#### Same tests for 'rfind'
+#
+# Mirror image of the 'find' benchmarks: in the cases labelled "late match"
+# the only occurrence of the needle is at the start of the haystack.
+
+@bench('("A"*1000).rfind("A")', "early match, single character", 1000)
+def rfind_test_quick_match_single_character(STR):
+ s1 = STR("A" * 1000)
+ s2 = STR("A")
+ s1_rfind = s1.rfind
+ for x in _RANGE_1000:
+ s1_rfind(s2)
+
+@bench('("A"*1000).rfind("B")', "no match, single character", 1000)
+def rfind_test_no_match_single_character(STR):
+ s1 = STR("A" * 1000)
+ s2 = STR("B")
+ s1_rfind = s1.rfind
+ for x in _RANGE_1000:
+ s1_rfind(s2)
+
+
+@bench('("AB"*1000).rfind("AB")', "early match, two characters", 1000)
+def rfind_test_quick_match_two_characters(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("AB")
+ s1_rfind = s1.rfind
+ for x in _RANGE_1000:
+ s1_rfind(s2)
+
+@bench('("AB"*1000).rfind("BC")', "no match, two characters", 1000)
+def rfind_test_no_match_two_character(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("BC")
+ s1_rfind = s1.rfind
+ for x in _RANGE_1000:
+ s1_rfind(s2)
+
+@bench('("AB"*1000).rfind("CA")', "no match, two characters", 1000)
+def rfind_test_no_match_two_character_bis(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("CA")
+ s1_rfind = s1.rfind
+ for x in _RANGE_1000:
+ s1_rfind(s2)
+
+@bench('("C"+"AB"*300).rfind("CA")', "late match, two characters", 1000)
+def rfind_test_slow_match_two_characters(STR):
+ s1 = STR("C" + "AB" * 300)
+ s2 = STR("CA")
+ s1_rfind = s1.rfind
+ for x in _RANGE_1000:
+ s1_rfind(s2)
+
+@bench('("BC"+"AB"*300).rfind("BC")', "late match, two characters", 1000)
+def rfind_test_slow_match_two_characters_bis(STR):
+ s1 = STR("BC" + "AB" * 300)
+ s2 = STR("BC")
+ s1_rfind = s1.rfind
+ for x in _RANGE_1000:
+ s1_rfind(s2)
+
+@bench('s="ABC"*33; ("E"+s+("D"+s)*500).rfind("E"+s)',
+ "late match, 100 characters", 100)
+def rfind_test_slow_match_100_characters(STR):
+ m = STR("ABC"*33)
+ d = STR("D")
+ e = STR("E")
+ s1 = e+m + (d+m)*500
+ s2 = e+m
+ s1_rfind = s1.rfind
+ for x in _RANGE_100:
+ s1_rfind(s2)
+
+@bench('s="ABC"*33; (s+"E"+("D"+s)*500).rfind(s+"E")',
+ "late match, 100 characters", 100)
+def rfind_test_slow_match_100_characters_bis(STR):
+ m = STR("ABC"*33)
+ d = STR("D")
+ e = STR("E")
+ s1 = m+e + (d+m)*500
+ s2 = m+e
+ s1_rfind = s1.rfind
+ for x in _RANGE_100:
+ s1_rfind(s2)
+
+
+#### Now with index.
+# Skip the ones which fail because that would include exception overhead.
+# Only the matching cases of the 'find' benchmarks are repeated here, with
+# the same operands and group titles.
+
+@bench('("A"*1000).index("A")', "early match, single character", 1000)
+def index_test_quick_match_single_character(STR):
+ s1 = STR("A" * 1000)
+ s2 = STR("A")
+ s1_index = s1.index
+ for x in _RANGE_1000:
+ s1_index(s2)
+
+@bench('("AB"*1000).index("AB")', "early match, two characters", 1000)
+def index_test_quick_match_two_characters(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("AB")
+ s1_index = s1.index
+ for x in _RANGE_1000:
+ s1_index(s2)
+
+@bench('("AB"*300+"C").index("BC")', "late match, two characters", 1000)
+def index_test_slow_match_two_characters(STR):
+ s1 = STR("AB" * 300+"C")
+ s2 = STR("BC")
+ s1_index = s1.index
+ for x in _RANGE_1000:
+ s1_index(s2)
+
+@bench('s="ABC"*33; ((s+"D")*500+s+"E").index(s+"E")',
+ "late match, 100 characters", 100)
+def index_test_slow_match_100_characters(STR):
+ m = STR("ABC"*33)
+ d = STR("D")
+ e = STR("E")
+ s1 = (m+d)*500 + m+e
+ s2 = m+e
+ s1_index = s1.index
+ for x in _RANGE_100:
+ s1_index(s2)
+
+
+#### Same for rindex
+#
+# Matching cases only, with the needle placed at the start of the haystack
+# in the cases labelled "late match".
+
+@bench('("A"*1000).rindex("A")', "early match, single character", 1000)
+def rindex_test_quick_match_single_character(STR):
+ s1 = STR("A" * 1000)
+ s2 = STR("A")
+ s1_rindex = s1.rindex
+ for x in _RANGE_1000:
+ s1_rindex(s2)
+
+@bench('("AB"*1000).rindex("AB")', "early match, two characters", 1000)
+def rindex_test_quick_match_two_characters(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("AB")
+ s1_rindex = s1.rindex
+ for x in _RANGE_1000:
+ s1_rindex(s2)
+
+@bench('("C"+"AB"*300).rindex("CA")', "late match, two characters", 1000)
+def rindex_test_slow_match_two_characters(STR):
+ s1 = STR("C" + "AB" * 300)
+ s2 = STR("CA")
+ s1_rindex = s1.rindex
+ for x in _RANGE_1000:
+ s1_rindex(s2)
+
+@bench('s="ABC"*33; ("E"+s+("D"+s)*500).rindex("E"+s)',
+ "late match, 100 characters", 100)
+def rindex_test_slow_match_100_characters(STR):
+ m = STR("ABC"*33)
+ d = STR("D")
+ e = STR("E")
+ s1 = e + m + (d+m)*500
+ s2 = e + m
+ s1_rindex = s1.rindex
+ for x in _RANGE_100:
+ s1_rindex(s2)
+
+
+#### Same for partition
+#
+# Same operands and group titles as the 'find' benchmarks, including the
+# no-match cases; the bound method is looked up once before the loop.
+
+@bench('("A"*1000).partition("A")', "early match, single character", 1000)
+def partition_test_quick_match_single_character(STR):
+ s1 = STR("A" * 1000)
+ s2 = STR("A")
+ s1_partition = s1.partition
+ for x in _RANGE_1000:
+ s1_partition(s2)
+
+@bench('("A"*1000).partition("B")', "no match, single character", 1000)
+def partition_test_no_match_single_character(STR):
+ s1 = STR("A" * 1000)
+ s2 = STR("B")
+ s1_partition = s1.partition
+ for x in _RANGE_1000:
+ s1_partition(s2)
+
+
+@bench('("AB"*1000).partition("AB")', "early match, two characters", 1000)
+def partition_test_quick_match_two_characters(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("AB")
+ s1_partition = s1.partition
+ for x in _RANGE_1000:
+ s1_partition(s2)
+
+@bench('("AB"*1000).partition("BC")', "no match, two characters", 1000)
+def partition_test_no_match_two_character(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("BC")
+ s1_partition = s1.partition
+ for x in _RANGE_1000:
+ s1_partition(s2)
+
+@bench('("AB"*300+"C").partition("BC")', "late match, two characters", 1000)
+def partition_test_slow_match_two_characters(STR):
+ s1 = STR("AB" * 300+"C")
+ s2 = STR("BC")
+ s1_partition = s1.partition
+ for x in _RANGE_1000:
+ s1_partition(s2)
+
+@bench('s="ABC"*33; ((s+"D")*500+s+"E").partition(s+"E")',
+ "late match, 100 characters", 100)
+def partition_test_slow_match_100_characters(STR):
+ m = STR("ABC"*33)
+ d = STR("D")
+ e = STR("E")
+ s1 = (m+d)*500 + m+e
+ s2 = m+e
+ s1_partition = s1.partition
+ for x in _RANGE_100:
+ s1_partition(s2)
+
+
+#### Same for rpartition
+#
+# Mirror image of the 'partition' benchmarks: in the cases labelled
+# "late match" the only occurrence of the separator is at the start.
+
+@bench('("A"*1000).rpartition("A")', "early match, single character", 1000)
+def rpartition_test_quick_match_single_character(STR):
+ s1 = STR("A" * 1000)
+ s2 = STR("A")
+ s1_rpartition = s1.rpartition
+ for x in _RANGE_1000:
+ s1_rpartition(s2)
+
+@bench('("A"*1000).rpartition("B")', "no match, single character", 1000)
+def rpartition_test_no_match_single_character(STR):
+ s1 = STR("A" * 1000)
+ s2 = STR("B")
+ s1_rpartition = s1.rpartition
+ for x in _RANGE_1000:
+ s1_rpartition(s2)
+
+
+@bench('("AB"*1000).rpartition("AB")', "early match, two characters", 1000)
+def rpartition_test_quick_match_two_characters(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("AB")
+ s1_rpartition = s1.rpartition
+ for x in _RANGE_1000:
+ s1_rpartition(s2)
+
+@bench('("AB"*1000).rpartition("BC")', "no match, two characters", 1000)
+def rpartition_test_no_match_two_character(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("BC")
+ s1_rpartition = s1.rpartition
+ for x in _RANGE_1000:
+ s1_rpartition(s2)
+
+@bench('("C"+"AB"*300).rpartition("CA")', "late match, two characters", 1000)
+def rpartition_test_slow_match_two_characters(STR):
+ s1 = STR("C" + "AB" * 300)
+ s2 = STR("CA")
+ s1_rpartition = s1.rpartition
+ for x in _RANGE_1000:
+ s1_rpartition(s2)
+
+@bench('s="ABC"*33; ("E"+s+("D"+s)*500).rpartition("E"+s)',
+ "late match, 100 characters", 100)
+def rpartition_test_slow_match_100_characters(STR):
+ m = STR("ABC"*33)
+ d = STR("D")
+ e = STR("E")
+ s1 = e + m + (d+m)*500
+ s2 = e + m
+ s1_rpartition = s1.rpartition
+ for x in _RANGE_100:
+ s1_rpartition(s2)
+
+
+#### Same for split(s, 1)
+#
+# Same operands as the 'partition' benchmarks; every call passes a maximum
+# of 1 split as the second argument.
+
+@bench('("A"*1000).split("A", 1)', "early match, single character", 1000)
+def split_test_quick_match_single_character(STR):
+ s1 = STR("A" * 1000)
+ s2 = STR("A")
+ s1_split = s1.split
+ for x in _RANGE_1000:
+ s1_split(s2, 1)
+
+@bench('("A"*1000).split("B", 1)', "no match, single character", 1000)
+def split_test_no_match_single_character(STR):
+ s1 = STR("A" * 1000)
+ s2 = STR("B")
+ s1_split = s1.split
+ for x in _RANGE_1000:
+ s1_split(s2, 1)
+
+
+@bench('("AB"*1000).split("AB", 1)', "early match, two characters", 1000)
+def split_test_quick_match_two_characters(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("AB")
+ s1_split = s1.split
+ for x in _RANGE_1000:
+ s1_split(s2, 1)
+
+@bench('("AB"*1000).split("BC", 1)', "no match, two characters", 1000)
+def split_test_no_match_two_character(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("BC")
+ s1_split = s1.split
+ for x in _RANGE_1000:
+ s1_split(s2, 1)
+
+@bench('("AB"*300+"C").split("BC", 1)', "late match, two characters", 1000)
+def split_test_slow_match_two_characters(STR):
+ s1 = STR("AB" * 300+"C")
+ s2 = STR("BC")
+ s1_split = s1.split
+ for x in _RANGE_1000:
+ s1_split(s2, 1)
+
+@bench('s="ABC"*33; ((s+"D")*500+s+"E").split(s+"E", 1)',
+ "late match, 100 characters", 100)
+def split_test_slow_match_100_characters(STR):
+ m = STR("ABC"*33)
+ d = STR("D")
+ e = STR("E")
+ s1 = (m+d)*500 + m+e
+ s2 = m+e
+ s1_split = s1.split
+ for x in _RANGE_100:
+ s1_split(s2, 1)
+
+
+#### Same for rsplit(s, 1)
+#
+# Mirror image of the split(s, 1) benchmarks: in the cases labelled
+# "late match" the only occurrence of the separator is at the start.
+
+@bench('("A"*1000).rsplit("A", 1)', "early match, single character", 1000)
+def rsplit_test_quick_match_single_character(STR):
+ s1 = STR("A" * 1000)
+ s2 = STR("A")
+ s1_rsplit = s1.rsplit
+ for x in _RANGE_1000:
+ s1_rsplit(s2, 1)
+
+@bench('("A"*1000).rsplit("B", 1)', "no match, single character", 1000)
+def rsplit_test_no_match_single_character(STR):
+ s1 = STR("A" * 1000)
+ s2 = STR("B")
+ s1_rsplit = s1.rsplit
+ for x in _RANGE_1000:
+ s1_rsplit(s2, 1)
+
+
+@bench('("AB"*1000).rsplit("AB", 1)', "early match, two characters", 1000)
+def rsplit_test_quick_match_two_characters(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("AB")
+ s1_rsplit = s1.rsplit
+ for x in _RANGE_1000:
+ s1_rsplit(s2, 1)
+
+@bench('("AB"*1000).rsplit("BC", 1)', "no match, two characters", 1000)
+def rsplit_test_no_match_two_character(STR):
+ s1 = STR("AB" * 1000)
+ s2 = STR("BC")
+ s1_rsplit = s1.rsplit
+ for x in _RANGE_1000:
+ s1_rsplit(s2, 1)
+
+@bench('("C"+"AB"*300).rsplit("CA", 1)', "late match, two characters", 1000)
+def rsplit_test_slow_match_two_characters(STR):
+ s1 = STR("C" + "AB" * 300)
+ s2 = STR("CA")
+ s1_rsplit = s1.rsplit
+ for x in _RANGE_1000:
+ s1_rsplit(s2, 1)
+
+@bench('s="ABC"*33; ("E"+s+("D"+s)*500).rsplit("E"+s, 1)',
+ "late match, 100 characters", 100)
+def rsplit_test_slow_match_100_characters(STR):
+ m = STR("ABC"*33)
+ d = STR("D")
+ e = STR("E")
+ s1 = e + m + (d+m)*500
+ s2 = e + m
+ s1_rsplit = s1.rsplit
+ for x in _RANGE_100:
+ s1_rsplit(s2, 1)
+
+
+#### Benchmark the operator-based methods
+#
+# The loop bodies are bare expressions (s * n, s1 + s2); the resulting
+# string is built and immediately discarded.
+
+@bench('"A"*10', "repeat 1 character 10 times", 1000)
+def repeat_single_10_times(STR):
+ s = STR("A")
+ for x in _RANGE_1000:
+ s * 10
+
+@bench('"A"*1000', "repeat 1 character 1000 times", 1000)
+def repeat_single_1000_times(STR):
+ s = STR("A")
+ for x in _RANGE_1000:
+ s * 1000
+
+@bench('"ABCDE"*10', "repeat 5 characters 10 times", 1000)
+def repeat_5_10_times(STR):
+ s = STR("ABCDE")
+ for x in _RANGE_1000:
+ s * 10
+
+@bench('"ABCDE"*1000', "repeat 5 characters 1000 times", 1000)
+def repeat_5_1000_times(STR):
+ s = STR("ABCDE")
+ for x in _RANGE_1000:
+ s * 1000
+
+# + for concat
+
+@bench('"Andrew"+"Dalke"', "concat two strings", 1000)
+def concat_two_strings(STR):
+ s1 = STR("Andrew")
+ s2 = STR("Dalke")
+ for x in _RANGE_1000:
+ s1+s2
+
+# Twenty separate operands are converted up front so that the loop body is a
+# single chained + expression.
+@bench('s1+s2+s3+s4+...+s20', "concat 20 strings of words length 4 to 15",
+ 1000)
+def concat_many_strings(STR):
+ s1=STR('TIXSGYNREDCVBHJ')
+ s2=STR('PUMTLXBZVDO')
+ s3=STR('FVZNJ')
+ s4=STR('OGDXUW')
+ s5=STR('WEIMRNCOYVGHKB')
+ s6=STR('FCQTNMXPUZH')
+ s7=STR('TICZJYRLBNVUEAK')
+ s8=STR('REYB')
+ s9=STR('PWUOQ')
+ s10=STR('EQHCMKBS')
+ s11=STR('AEVDFOH')
+ s12=STR('IFHVD')
+ s13=STR('JGTCNLXWOHQ')
+ s14=STR('ITSKEPYLROZAWXF')
+ s15=STR('THEK')
+ s16=STR('GHPZFBUYCKMNJIT')
+ s17=STR('JMUZ')
+ s18=STR('WLZQMTB')
+ s19=STR('KPADCBW')
+ s20=STR('TNJHZQAGBU')
+ for x in _RANGE_1000:
+ (s1 + s2+ s3+ s4+ s5+ s6+ s7+ s8+ s9+s10+
+ s11+s12+s13+s14+s15+s16+s17+s18+s19+s20)
+
+
+#### Benchmark join
+
+def get_bytes_yielding_seq(STR, arg):
+    """Return STR(arg) for use as the iterable argument of a join().
+
+    Raises UnsupportedType when STR is BYTES on Python 3.
+    """
+    running_py3 = sys.version_info >= (3,)
+    if running_py3 and STR is BYTES:
+        raise UnsupportedType
+    return STR(arg)
+
+# The first four benchmarks join over a string obtained from
+# get_bytes_yielding_seq(), which raises UnsupportedType for BYTES on 3.x;
+# the last two join over a list of single-character strings instead.
+@bench('"A".join("")',
+ "join empty string, with 1 character sep", 100)
+def join_empty_single(STR):
+ sep = STR("A")
+ s2 = get_bytes_yielding_seq(STR, "")
+ sep_join = sep.join
+ for x in _RANGE_100:
+ sep_join(s2)
+
+@bench('"ABCDE".join("")',
+ "join empty string, with 5 character sep", 100)
+def join_empty_5(STR):
+ sep = STR("ABCDE")
+ s2 = get_bytes_yielding_seq(STR, "")
+ sep_join = sep.join
+ for x in _RANGE_100:
+ sep_join(s2)
+
+@bench('"A".join("ABC..Z")',
+ "join string with 26 characters, with 1 character sep", 1000)
+def join_alphabet_single(STR):
+ sep = STR("A")
+ s2 = get_bytes_yielding_seq(STR, "ABCDEFGHIJKLMnOPQRSTUVWXYZ")
+ sep_join = sep.join
+ for x in _RANGE_1000:
+ sep_join(s2)
+
+@bench('"ABCDE".join("ABC..Z")',
+ "join string with 26 characters, with 5 character sep", 1000)
+def join_alphabet_5(STR):
+ sep = STR("ABCDE")
+ s2 = get_bytes_yielding_seq(STR, "ABCDEFGHIJKLMnOPQRSTUVWXYZ")
+ sep_join = sep.join
+ for x in _RANGE_1000:
+ sep_join(s2)
+
+@bench('"A".join(list("ABC..Z"))',
+ "join list of 26 characters, with 1 character sep", 1000)
+def join_alphabet_list_single(STR):
+ sep = STR("A")
+ s2 = [STR(x) for x in "ABCDEFGHIJKLMnOPQRSTUVWXYZ"]
+ sep_join = sep.join
+ for x in _RANGE_1000:
+ sep_join(s2)
+
+@bench('"ABCDE".join(list("ABC..Z"))',
+ "join list of 26 characters, with 5 character sep", 1000)
+def join_alphabet_list_five(STR):
+ sep = STR("ABCDE")
+ s2 = [STR(x) for x in "ABCDEFGHIJKLMnOPQRSTUVWXYZ"]
+ sep_join = sep.join
+ for x in _RANGE_1000:
+ sep_join(s2)
+
+# Joins a list holding the same three-letter word 100 times with a
+# one-character separator; the bound join is looked up once before the loop.
+@bench('"A".join(["Bob"]*100)',
+       "join list of 100 words, with 1 character sep", 1000)
+def join_100_words_single(STR):
+    sep = STR("A")
+    s2 = [STR("Bob")]*100
+    sep_join = sep.join
+    for x in _RANGE_1000:
+        sep_join(s2)
+
+# Joins a list holding the same three-letter word 100 times with a
+# five-character separator; the bound join is looked up once before the loop.
+@bench('"ABCDE".join(["Bob"]*100)',
+       "join list of 100 words, with 5 character sep", 1000)
+def join_100_words_5(STR):
+    sep = STR("ABCDE")
+    s2 = [STR("Bob")]*100
+    sep_join = sep.join
+    for x in _RANGE_1000:
+        sep_join(s2)
+
+#### split tests
+#
+# All six benchmarks use the same short sentence repeated twice.  The
+# "split 1 whitespace" group compares split(None, 1) / rsplit(None, 1) with
+# partition(" ") / rpartition(" ").
+
+@bench('("Here are some words. "*2).split()', "split whitespace (small)", 1000)
+def whitespace_split(STR):
+ s = STR("Here are some words. "*2)
+ s_split = s.split
+ for x in _RANGE_1000:
+ s_split()
+
+@bench('("Here are some words. "*2).rsplit()', "split whitespace (small)", 1000)
+def whitespace_rsplit(STR):
+ s = STR("Here are some words. "*2)
+ s_rsplit = s.rsplit
+ for x in _RANGE_1000:
+ s_rsplit()
+
+@bench('("Here are some words. "*2).split(None, 1)',
+ "split 1 whitespace", 1000)
+def whitespace_split_1(STR):
+ s = STR("Here are some words. "*2)
+ s_split = s.split
+ N = None
+ for x in _RANGE_1000:
+ s_split(N, 1)
+
+@bench('("Here are some words. "*2).rsplit(None, 1)',
+ "split 1 whitespace", 1000)
+def whitespace_rsplit_1(STR):
+ s = STR("Here are some words. "*2)
+ s_rsplit = s.rsplit
+ N = None
+ for x in _RANGE_1000:
+ s_rsplit(N, 1)
+
+@bench('("Here are some words. "*2).partition(" ")',
+ "split 1 whitespace", 1000)
+def whitespace_partition(STR):
+ sep = STR(" ")
+ s = STR("Here are some words. "*2)
+ s_partition = s.partition
+ for x in _RANGE_1000:
+ s_partition(sep)
+
+@bench('("Here are some words. "*2).rpartition(" ")',
+ "split 1 whitespace", 1000)
+def whitespace_rpartition(STR):
+ sep = STR(" ")
+ s = STR("Here are some words. "*2)
+ s_rpartition = s.rpartition
+ for x in _RANGE_1000:
+ s_rpartition(sep)
+
+# Sample English prose, repeated 25 times, used as the "huge" input for the
+# whitespace-splitting benchmarks.  Both string-type versions are built once
+# here so the benchmarks do not pay for the conversion.
+human_text = """\
+Python is a dynamic object-oriented programming language that can be
+used for many kinds of software development. It offers strong support
+for integration with other languages and tools, comes with extensive
+standard libraries, and can be learned in a few days. Many Python
+programmers report substantial productivity gains and feel the language
+encourages the development of higher quality, more maintainable code.
+
+Python runs on Windows, Linux/Unix, Mac OS X, OS/2, Amiga, Palm
+Handhelds, and Nokia mobile phones. Python has also been ported to the
+Java and .NET virtual machines.
+
+Python is distributed under an OSI-approved open source license that
+makes it free to use, even for commercial products.
+"""*25
+human_text_bytes = bytes_from_str(human_text)
+human_text_unicode = unicode_from_str(human_text)
+def _get_human_text(STR):
+    """Return the pre-built human_text for the string type selected by STR.
+
+    Raises AssertionError if STR is neither BYTES nor UNICODE.
+    """
+    if STR is BYTES:
+        return human_text_bytes
+    if STR is UNICODE:
+        return human_text_unicode
+    raise AssertionError
+
+# Whitespace splitting of the large pre-built human_text; only 10 iterations
+# per call.
+@bench('human_text.split()', "split whitespace (huge)", 10)
+def whitespace_split_huge(STR):
+ s = _get_human_text(STR)
+ s_split = s.split
+ for x in _RANGE_10:
+ s_split()
+
+@bench('human_text.rsplit()', "split whitespace (huge)", 10)
+def whitespace_rsplit_huge(STR):
+ s = _get_human_text(STR)
+ s_rsplit = s.rsplit
+ for x in _RANGE_10:
+ s_rsplit()
+
+
+
+# Three ways of breaking the same short four-line text into lines, all in
+# the "split newlines" group: split("\n"), rsplit("\n") and splitlines().
+@bench('"this\\nis\\na\\ntest\\n".split("\\n")', "split newlines", 1000)
+def newlines_split(STR):
+ s = STR("this\nis\na\ntest\n")
+ s_split = s.split
+ nl = STR("\n")
+ for x in _RANGE_1000:
+ s_split(nl)
+
+
+@bench('"this\\nis\\na\\ntest\\n".rsplit("\\n")', "split newlines", 1000)
+def newlines_rsplit(STR):
+ s = STR("this\nis\na\ntest\n")
+ s_rsplit = s.rsplit
+ nl = STR("\n")
+ for x in _RANGE_1000:
+ s_rsplit(nl)
+
+@bench('"this\\nis\\na\\ntest\\n".splitlines()', "split newlines", 1000)
+def newlines_splitlines(STR):
+ s = STR("this\nis\na\ntest\n")
+ s_splitlines = s.splitlines
+ for x in _RANGE_1000:
+ s_splitlines()
+
+## split text with 2000 newlines
+
+def _make_2000_lines():
+    """Build a reproducible text of 2000 newline-terminated lines.
+
+    A fixed seed makes the output identical on every call.  Each line is a
+    slice of 5 to 65 characters taken from a pool made of the characters
+    with codes 32..127, in which some positions were overwritten by spaces.
+    """
+    import random
+    rng = random.Random(100)
+    alphabet = [chr(code) for code in range(32, 128)]
+    pos = 0
+    while pos < len(alphabet):
+        alphabet[pos] = " "
+        pos += rng.randrange(9)
+    # Four copies, so a slice starting anywhere in the first copy never runs
+    # off the end.
+    pool = "".join(alphabet) * 4
+    lines = []
+    for _ in range(2000):
+        begin = rng.randrange(96)
+        length = rng.randint(5, 65)
+        lines.append(pool[begin:begin+length])
+    return "\n".join(lines) + "\n"
+
+# The generated text and both string-type versions of it, built once.
+_text_with_2000_lines = _make_2000_lines()
+_text_with_2000_lines_bytes = bytes_from_str(_text_with_2000_lines)
+_text_with_2000_lines_unicode = unicode_from_str(_text_with_2000_lines)
+def _get_2000_lines(STR):
+    """Return the pre-built 2000-line text for the string type selected by STR.
+
+    Raises AssertionError if STR is neither BYTES nor UNICODE.
+    """
+    if STR is BYTES:
+        return _text_with_2000_lines_bytes
+    if STR is UNICODE:
+        return _text_with_2000_lines_unicode
+    raise AssertionError
+
+
+@bench('"...text...".split("\\n")', "split 2000 newlines", 10)
+def newlines_split_2000(STR):
+ s = _get_2000_lines(STR)
+ s_split = s.split
+ nl = STR("\n")
+ for x in _RANGE_10:
+ s_split(nl)
+
+# Times rsplit() on a newline separator over the text returned by
+# _get_2000_lines(STR).
+@bench('"...text...".rsplit("\\n")', "split 2000 newlines", 10)
+def newlines_rsplit_2000(STR):
+ s = _get_2000_lines(STR)
+ s_rsplit = s.rsplit
+ nl = STR("\n")
+ for x in _RANGE_10:
+ s_rsplit(nl)
+
+# Times splitlines() over the text returned by _get_2000_lines(STR).
+@bench('"...text...".splitlines()', "split 2000 newlines", 10)
+def newlines_splitlines_2000(STR):
+ s = _get_2000_lines(STR)
+ s_splitlines = s.splitlines
+ for x in _RANGE_10:
+ s_splitlines()
+
+
+## split text on "--" characters
+# Times split() with the two-character separator "--" on a short string.
+@bench(
+ '"this--is--a--test--of--the--emergency--broadcast--system".split("--")',
+ "split on multicharacter separator (small)", 1000)
+def split_multichar_sep_small(STR):
+ s = STR("this--is--a--test--of--the--emergency--broadcast--system")
+ s_split = s.split
+ pat = STR("--")
+ for x in _RANGE_1000:
+ s_split(pat)
+# Times rsplit() with the two-character separator "--" on a short string.
+@bench(
+ '"this--is--a--test--of--the--emergency--broadcast--system".rsplit("--")',
+ "split on multicharacter separator (small)", 1000)
+def rsplit_multichar_sep_small(STR):
+ s = STR("this--is--a--test--of--the--emergency--broadcast--system")
+ s_rsplit = s.rsplit
+ pat = STR("--")
+ for x in _RANGE_1000:
+ s_rsplit(pat)
+
+## split dna text on "ACTAT" characters
+# Times split() with the five-character separator "ACTAT" over the sequence
+# returned by _get_dna(STR).
+@bench('dna.split("ACTAT")',
+ "split on multicharacter separator (dna)", 10)
+def split_multichar_sep_dna(STR):
+ s = _get_dna(STR)
+ s_split = s.split
+ pat = STR("ACTAT")
+ for x in _RANGE_10:
+ s_split(pat)
+
+# Times rsplit() with the five-character separator "ACTAT" over the sequence
+# returned by _get_dna(STR).
+@bench('dna.rsplit("ACTAT")',
+ "split on multicharacter separator (dna)", 10)
+def rsplit_multichar_sep_dna(STR):
+ s = _get_dna(STR)
+ s_rsplit = s.rsplit
+ pat = STR("ACTAT")
+ for x in _RANGE_10:
+ s_rsplit(pat)
+
+
+
+## split with limits
+
+# Nine fields joined by tabs; the "tab split" benchmarks below split this
+# record with and without a maxsplit of 8.
+GFF3_example = "\t".join([
+ "I", "Genomic_canonical", "region", "357208", "396183", ".", "+", ".",
+ "ID=Sequence:R119;note=Clone R119%3B Genbank AF063007;Name=R119"])
+
+# Times split() on a tab separator over GFF3_example, with no maxsplit.
+@bench('GFF3_example.split("\\t")', "tab split", 1000)
+def tab_split_no_limit(STR):
+ sep = STR("\t")
+ s = STR(GFF3_example)
+ s_split = s.split
+ for x in _RANGE_1000:
+ s_split(sep)
+
+# Times split() on a tab separator over GFF3_example with maxsplit=8.
+@bench('GFF3_example.split("\\t", 8)', "tab split", 1000)
+def tab_split_limit(STR):
+ sep = STR("\t")
+ s = STR(GFF3_example)
+ s_split = s.split
+ for x in _RANGE_1000:
+ s_split(sep, 8)
+
+# Times rsplit() on a tab separator over GFF3_example, with no maxsplit.
+@bench('GFF3_example.rsplit("\\t")', "tab split", 1000)
+def tab_rsplit_no_limit(STR):
+ sep = STR("\t")
+ s = STR(GFF3_example)
+ s_rsplit = s.rsplit
+ for x in _RANGE_1000:
+ s_rsplit(sep)
+
+# Times rsplit() on a tab separator over GFF3_example with maxsplit=8.
+@bench('GFF3_example.rsplit("\\t", 8)', "tab split", 1000)
+def tab_rsplit_limit(STR):
+ sep = STR("\t")
+ s = STR(GFF3_example)
+ s_rsplit = s.rsplit
+ for x in _RANGE_1000:
+ s_rsplit(sep, 8)
+
+#### Count characters
+
+# Times count() of a single newline character over the text returned by
+# _get_2000_lines(STR).
+@bench('...text.with.2000.newlines.count("\\n")',
+ "count newlines", 10)
+def count_newlines(STR):
+ s = _get_2000_lines(STR)
+ s_count = s.count
+ nl = STR("\n")
+ for x in _RANGE_10:
+ s_count(nl)
+
+# Orchid sequences concatenated, from Biopython
+_dna = """
+CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATCACATAATAATTGATCGGGTT
+AATCTGGAGGATCTGTTTACTTTGGTCACCCATGAGCATTTGCTGTTGAAGTGACCTAGAATTGCCATCG
+AGCCTCCTTGGGAGCTTTCTTGTTGGCGAGATCTAAACCCTTGCCCGGCGCAGTTTTGCTCCAAGTCGTT
+TGACACATAATTGGTGAAGGGGGTGGCATCCTTCCCTGACCCTCCCCCAACTATTTTTTTAACAACTCTC
+AGCAACGGAGACTCAGTCTTCGGCAAATGCGATAAATGGTGTGAATTGCAGAATCCCGTGCACCATCGAG
+TCTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCCAAGGGCACGCCTGCCTGGGCATTGCGAGTCATAT
+CTCTCCCTTAACGAGGCTGTCCATACATACTGTTCAGCCGGTGCGGATGTGAGTTTGGCCCCTTGTTCTT
+TGGTACGGGGGGTCTAAGAGCTGCATGGGCTTTTGATGGTCCTAAATACGGCAAGAGGTGGACGAACTAT
+GCTACAACAAAATTGTTGTGCAGAGGCCCCGGGTTGTCGTATTAGATGGGCCACCGTAATCTGAAGACCC
+TTTTGAACCCCATTGGAGGCCCATCAACCCATGATCAGTTGATGGCCATTTGGTTGCGACCCCAGGTCAG
+GTGAGCAACAGCTGTCGTAACAAGGTTTCCGTAGGGTGAACTGCGGAAGGATCATTGTTGAGATCACATA
+ATAATTGATCGAGTTAATCTGGAGGATCTGTTTACTTGGGTCACCCATGGGCATTTGCTGTTGAAGTGAC
+CTAGATTTGCCATCGAGCCTCCTTGGGAGCATCCTTGTTGGCGATATCTAAACCCTCAATTTTTCCCCCA
+ATCAAATTACACAAAATTGGTGGAGGGGGTGGCATTCTTCCCTTACCCTCCCCCAAATATTTTTTTAACA
+ACTCTCAGCAACGGATATCTCAGCTCTTGCATCGATGAAGAACCCACCGAAATGCGATAAATGGTGTGAA
+TTGCAGAATCCCGTGAACCATCGAGTCTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCCAAGGGCACG
+CCTGCCTGGGCATTGCGAGTCATATCTCTCCCTTAACGAGGCTGTCCATACATACTGTTCAGCCGGTGCG
+GATGTGAGTTTGGCCCCTTGTTCTTTGGTACGGGGGGTCTAAGAGATGCATGGGCTTTTGATGGTCCTAA
+ATACGGCAAGAGGTGGACGAACTATGCTACAACAAAATTGTTGTGCAAAGGCCCCGGGTTGTCGTATAAG
+ATGGGCCACCGATATCTGAAGACCCTTTTGGACCCCATTGGAGCCCATCAACCCATGTCAGTTGATGGCC
+ATTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATCACATAATAATTGATCGA
+GTTAATCTGGAGGATCTGTTTACTTGGGTCACCCATGGGCATTTGCTGTTGAAGTGACCTAGATTTGCCA
+TCGAGCCTCCTTGGGAGCTTTCTTGTTGGCGATATCTAAACCCTTGCCCGGCAGAGTTTTGGGAATCCCG
+TGAACCATCGAGTCTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCCAAGGGCACGCCTGCCTGGGCAT
+TGCGAGTCATATCTCTCCCTTAACGAGGCTGTCCATACACACCTGTTCAGCCGGTGCGGATGTGAGTTTG
+GCCCCTTGTTCTTTGGTACGGGGGGTCTAAGAGCTGCATGGGCTTTTGATGGTCCTAAATACGGCAAGAG
+GTGGACGAACTATGCTACAACAAAATTGTTGTGCAAAGGCCCCGGGTTGTCGTATTAGATGGGCCACCAT
+AATCTGAAGACCCTTTTGAACCCCATTGGAGGCCCATCAACCCATGATCAGTTGATGGCCATTTGGTTGC
+GACCCAGTCAGGTGAGGGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATCACATAATAATTGATCGAG
+TTAATCTGGAGGATCTGTTTACTTTGGTCACCCATGGGCATTTGCTGTTGAAGTGACCTAGATTTGCCAT
+CGAGCCTCCTTGGGAGCTTTCTTGTTGGCGAGATCTAAACCCTTGCCCGGCGGAGTTTGGCGCCAAGTCA
+TATGACACATAATTGGTGAAGGGGGTGGCATCCTGCCCTGACCCTCCCCAAATTATTTTTTTAACAACTC
+TCAGCAACGGATATCTCGGCTCTTGCATCGATGAAGAACGCAGCGAAATGCGATAAATGGTGTGAATTGC
+AGAATCCCGTGAACCATCGAGTCTTTGGAACGCAAGTTGCGCCCGAGGCCATCAGGCCAAGGGCACGCCT
+GCCTGGGCATTGGGAATCATATCTCTCCCCTAACGAGGCTATCCAAACATACTGTTCATCCGGTGCGGAT
+GTGAGTTTGGCCCCTTGTTCTTTGGTACCGGGGGTCTAAGAGCTGCATGGGCATTTGATGGTCCTCAAAA
+CGGCAAGAGGTGGACGAACTATGCCACAACAAAATTGTTGTCCCAAGGCCCCGGGTTGTCGTATTAGATG
+GGCCACCGTAACCTGAAGACCCTTTTGAACCCCATTGGAGGCCCATCAACCCATGATCAGTTGATGACCA
+TTTGTTGCGACCCCAGTCAGCTGAGCAACCCGCTGAGTGGAAGGTCATTGCCGATATCACATAATAATTG
+ATCGAGTTAATCTGGAGGATCTGTTTACTTGGTCACCCATGAGCATTTGCTGTTGAAGTGACCTAGATTT
+GCCATCGAGCCTCCTTGGGAGTTTTCTTGTTGGCGAGATCTAAACCCTTGCCCGGCGGAGTTGTGCGCCA
+AGTCATATGACACATAATTGGTGAAGGGGGTGGCATCCTGCCCTGACCCTCCCCAAATTATTTTTTTAAC
+AACTCTCAGCAACGGATATCTCGGCTCTTGCATCGATGAAGAACGCAGCGAAATGCGATAAATGGTGTGA
+ATTGCAGAATCCCGTGAACCATCGAGTCTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCCAAGGGCAC
+GCCTGCCTGGGCATTGCGAGTCATATCTCTCCCTTAACGAGGCTGTCCATACATACTGTTCATCCGGTGC
+GGATGTGAGTTTGGCCCCTTGTTCTTTGGTACGGGGGGTCTAAGAGCTGCATGGGCATTTGATGGTCCTC
+AAAACGGCAAGAGGTGGACGAACTATGCTACAACCAAATTGTTGTCCCAAGGCCCCGGGTTGTCGTATTA
+GATGGGCCACCGTAACCTGAAGACCCTTTTGAACCCCATTGGAGGCCCATCAACCCATGATCAGTTGATG
+ACCATGTGTTGCGACCCCAGTCAGCTGAGCAACGCGCTGAGCGTAACAAGGTTTCCGTAGGTGGACCTCC
+GGGAGGATCATTGTTGAGATCACATAATAATTGATCGAGGTAATCTGGAGGATCTGCATATTTTGGTCAC
+"""
+# Drop the line breaks of the literal above, repeat the sequence 25 times,
+# and convert it once per string type.
+_dna = "".join(_dna.splitlines()) * 25
+_dna_bytes = bytes_from_str(_dna)
+_dna_unicode = unicode_from_str(_dna)
+
+def _get_dna(STR):
+    """Return the DNA sample matching the string type STR.
+
+    Raises AssertionError when STR is neither UNICODE nor BYTES.
+    """
+    if STR is not UNICODE and STR is not BYTES:
+        raise AssertionError
+    if STR is UNICODE:
+        return _dna_unicode
+    return _dna_bytes
+
+# Times count() of the four-character substring "AACT" over the sequence
+# returned by _get_dna(STR).
+@bench('dna.count("AACT")', "count AACT substrings in DNA example", 10)
+def count_aact(STR):
+ seq = _get_dna(STR)
+ seq_count = seq.count
+ needle = STR("AACT")
+ for x in _RANGE_10:
+ seq_count(needle)
+
+##### startswith and endswith
+
+# Times startswith() with a one-character prefix that matches.
+@bench('"Andrew".startswith("A")', 'startswith single character', 1000)
+def startswith_single(STR):
+ s1 = STR("Andrew")
+ s2 = STR("A")
+ s1_startswith = s1.startswith
+ for x in _RANGE_1000:
+ s1_startswith(s2)
+
+# Times startswith() with a prefix equal to the whole string (a match).
+@bench('"Andrew".startswith("Andrew")', 'startswith multiple characters',
+ 1000)
+def startswith_multiple(STR):
+ s1 = STR("Andrew")
+ s2 = STR("Andrew")
+ s1_startswith = s1.startswith
+ for x in _RANGE_1000:
+ s1_startswith(s2)
+
+# Times startswith() with a same-length prefix that does not match
+# ("Anders" differs from "Andrew" after the first three characters).
+@bench('"Andrew".startswith("Anders")',
+ 'startswith multiple characters - not!', 1000)
+def startswith_multiple_not(STR):
+ s1 = STR("Andrew")
+ s2 = STR("Anders")
+ s1_startswith = s1.startswith
+ for x in _RANGE_1000:
+ s1_startswith(s2)
+
+
+# endswith
+
+# Times endswith() with a one-character suffix that matches.
+@bench('"Andrew".endswith("w")', 'endswith single character', 1000)
+def endswith_single(STR):
+ s1 = STR("Andrew")
+ s2 = STR("w")
+ s1_endswith = s1.endswith
+ for x in _RANGE_1000:
+ s1_endswith(s2)
+
+# Times endswith() with a suffix equal to the whole string (a match).
+@bench('"Andrew".endswith("Andrew")', 'endswith multiple characters', 1000)
+def endswith_multiple(STR):
+ s1 = STR("Andrew")
+ s2 = STR("Andrew")
+ s1_endswith = s1.endswith
+ for x in _RANGE_1000:
+ s1_endswith(s2)
+
+# Times endswith() with a same-length suffix that does not match.
+@bench('"Andrew".endswith("Anders")',
+ 'endswith multiple characters - not!', 1000)
+def endswith_multiple_not(STR):
+ s1 = STR("Andrew")
+ s2 = STR("Anders")
+ s1_endswith = s1.endswith
+ for x in _RANGE_1000:
+ s1_endswith(s2)
+
+#### Strip
+
+# Times strip() on a short string whose only whitespace is a trailing newline.
+@bench('"Hello!\\n".strip()', 'strip terminal newline', 1000)
+def terminal_newline_strip_right(STR):
+ s = STR("Hello!\n")
+ s_strip = s.strip
+ for x in _RANGE_1000:
+ s_strip()
+
+# Times rstrip() on a short string ending in a single newline.
+@bench('"Hello!\\n".rstrip()', 'strip terminal newline', 1000)
+def terminal_newline_rstrip(STR):
+ s = STR("Hello!\n")
+ s_rstrip = s.rstrip
+ for x in _RANGE_1000:
+ s_rstrip()
+
+# Times strip() on a short string whose only whitespace is a leading newline.
+@bench('"\\nHello!".strip()', 'strip terminal newline', 1000)
+def terminal_newline_strip_left(STR):
+ s = STR("\nHello!")
+ s_strip = s.strip
+ for x in _RANGE_1000:
+ s_strip()
+
+# Times strip() on a short string with a newline at both ends.
+@bench('"\\nHello!\\n".strip()', 'strip terminal newline', 1000)
+def terminal_newline_strip_both(STR):
+ s = STR("\nHello!\n")
+ s_strip = s.strip
+ for x in _RANGE_1000:
+ s_strip()
+
+# Times lstrip() on a short string starting with a single newline.  The
+# label names lstrip() so the report matches the method that is timed.
+@bench('"\\nHello!".lstrip()', 'strip terminal newline', 1000)
+def terminal_newline_lstrip(STR):
+    s = STR("\nHello!")
+    # The bound method is looked up once, outside the loop.
+    s_lstrip = s.lstrip
+    for x in _RANGE_1000:
+        s_lstrip()
+
+# Times the hand-written alternative to rstrip(): drop the last character
+# only when it is a newline.
+@bench('s="Hello!\\n"; s[:-1] if s[-1:]=="\\n" else s',
+       'strip terminal newline', 1000)
+def terminal_newline_if_else(STR):
+    s = STR("Hello!\n")
+    NL = STR("\n")
+    for x in _RANGE_1000:
+        # Compare a one-element slice, not s[-1]: indexing a bytes object on
+        # Python 3 yields an int, which never equals NL, so a bytes run would
+        # always take the "else" branch and time something different from
+        # the unicode run.
+        s[:-1] if (s[-1:] == NL) else s
+
+
+# Strip multiple spaces or tabs
+
+# Times strip() on a short string that ends in a run of tabs and spaces.
+# The whitespace must be at the very end (after the "!") for strip() to have
+# anything to remove, matching the rstrip()/lstrip() siblings in this group.
+@bench('"Hello\\t \\t".strip()', 'strip terminal spaces and tabs', 1000)
+def terminal_space_strip(STR):
+    s = STR("Hello!\t \t")
+    # The bound method is looked up once, outside the loop.
+    s_strip = s.strip
+    for x in _RANGE_1000:
+        s_strip()
+
+# Times rstrip() on a short string that ends in a run of tabs and spaces.
+@bench('"Hello\\t \\t".rstrip()', 'strip terminal spaces and tabs', 1000)
+def terminal_space_rstrip(STR):
+ s = STR("Hello!\t \t")
+ s_rstrip = s.rstrip
+ for x in _RANGE_1000:
+ s_rstrip()
+
+# Times lstrip() on a short string that starts with a run of tabs and
+# spaces.  The label names lstrip() so the report matches the timed method.
+@bench('"\\t \\tHello".lstrip()', 'strip terminal spaces and tabs', 1000)
+def terminal_space_lstrip(STR):
+    s = STR("\t \tHello!")
+    # The bound method is looked up once, outside the loop.
+    s_lstrip = s.lstrip
+    for x in _RANGE_1000:
+        s_lstrip()
+
+
+#### replace
+# Times replace() of every space with a tab in a short sentence.
+@bench('"This is a test".replace(" ", "\\t")', 'replace single character',
+ 1000)
+def replace_single_character(STR):
+ s = STR("This is a test!")
+ from_str = STR(" ")
+ to_str = STR("\t")
+ s_replace = s.replace
+ for x in _RANGE_1000:
+ s_replace(from_str, to_str)
+
+# Regular-expression counterpart of replace_single_character: the pattern is
+# compiled once and only the sub() call is timed.  The label now closes the
+# re.sub( call it displays.
+@uses_re
+@bench('re.sub(" ", "\\t", "This is a test")', 'replace single character',
+       1000)
+def replace_single_character_re(STR):
+    s = STR("This is a test!")
+    pat = re.compile(STR(" "))
+    to_str = STR("\t")
+    # The bound method is looked up once, outside the loop.
+    pat_sub = pat.sub
+    for x in _RANGE_1000:
+        pat_sub(to_str, s)
+
+# Times replace() of every newline with a space over the text returned by
+# _get_2000_lines(STR).  The label is written as a well-formed expression,
+# in the same form as the re.sub() sibling's label.
+@bench('"...text.with.2000.lines...".replace("\\n", " ")',
+       'replace single character, big string', 10)
+def replace_single_character_big(STR):
+    s = _get_2000_lines(STR)
+    from_str = STR("\n")
+    to_str = STR(" ")
+    # The bound method is looked up once, outside the loop.
+    s_replace = s.replace
+    for x in _RANGE_10:
+        s_replace(from_str, to_str)
+
+# Regular-expression counterpart of replace_single_character_big: the
+# pattern is compiled once and only the sub() call is inside the loop.
+@uses_re
+@bench('re.sub("\\n", " ", "...text.with.2000.lines...")',
+ 'replace single character, big string', 10)
+def replace_single_character_big_re(STR):
+ s = _get_2000_lines(STR)
+ pat = re.compile(STR("\n"))
+ to_str = STR(" ")
+ pat_sub = pat.sub
+ for x in _RANGE_10:
+ pat_sub(to_str, s)
+
+
+# Times replace() of a three-character substring with another of the same
+# length over the sequence returned by _get_dna(STR).
+@bench('dna.replace("ATC", "ATT")',
+ 'replace multiple characters, dna', 10)
+def replace_multiple_characters_dna(STR):
+ seq = _get_dna(STR)
+ from_str = STR("ATC")
+ to_str = STR("ATT")
+ seq_replace = seq.replace
+ for x in _RANGE_10:
+ seq_replace(from_str, to_str)
+
+# This increases the character count
+# Times replace() of each newline with "\r\n" over the text returned by
+# _get_2000_lines(STR), so the result is longer than the input.  The label
+# is written as a well-formed expression.
+@bench('"...text.with.2000.newlines...".replace("\\n", "\\r\\n")',
+       'replace and expand multiple characters, big string', 10)
+def replace_multiple_character_big(STR):
+    s = _get_2000_lines(STR)
+    from_str = STR("\n")
+    to_str = STR("\r\n")
+    # The bound method is looked up once, outside the loop.
+    s_replace = s.replace
+    for x in _RANGE_10:
+        s_replace(from_str, to_str)
+
+
+# This decreases the character count
+# Times replace() of "ee" with the empty string, so the result is shorter
+# than the input.
+@bench('"When shall we three meet again?".replace("ee", "")',
+ 'replace/remove multiple characters', 1000)
+def replace_multiple_character_remove(STR):
+ s = STR("When shall we three meet again?")
+ from_str = STR("ee")
+ to_str = STR("")
+ s_replace = s.replace
+ for x in _RANGE_1000:
+ s_replace(from_str, to_str)
+
+
+# One "A" followed by 128*1024 "Z" characters, converted once per string type.
+big_s = "A" + "Z" * (128 * 1024)
+big_s_bytes = bytes_from_str(big_s)
+big_s_unicode = unicode_from_str(big_s)
+def _get_big_s(STR):
+    """Return the big "AZZZ..." sample matching the string type STR.
+
+    Raises AssertionError when STR is neither UNICODE nor BYTES.
+    """
+    if STR is UNICODE:
+        return big_s_unicode
+    elif STR is BYTES:
+        return big_s_bytes
+    else:
+        raise AssertionError
+
+# The older replace implementation counted all matches in
+# the string even when it only needed to make one replacement.
+# Times replace() limited to one replacement (count=1) where the only "A" is
+# the first character of the string returned by _get_big_s(STR).
+@bench('("A" + ("Z"*128*1024)).replace("A", "BB", 1)',
+ 'quick replace single character match', 10)
+def quick_replace_single_match(STR):
+ s = _get_big_s(STR)
+ from_str = STR("A")
+ to_str = STR("BB")
+ s_replace = s.replace
+ for x in _RANGE_10:
+ s_replace(from_str, to_str, 1)
+
+# Times replace() limited to one replacement (count=1) of the three-character
+# substring "AZZ", which occurs at the start of the string returned by
+# _get_big_s(STR).
+@bench('("A" + ("Z"*128*1024)).replace("AZZ", "BBZZ", 1)',
+ 'quick replace multiple character match', 10)
+def quick_replace_multiple_match(STR):
+ s = _get_big_s(STR)
+ from_str = STR("AZZ")
+ to_str = STR("BBZZ")
+ s_replace = s.replace
+ for x in _RANGE_10:
+ s_replace(from_str, to_str, 1)
+
+
+####
+
+# CCP does a lot of this, for internationalisation of ingame messages.
+# %-style template with three named fields, and the mapping that fills them.
+_format = "The %(thing)s is %(place)s the %(location)s."
+_format_dict = { "thing":"THING", "place":"PLACE", "location":"LOCATION", }
+# Template converted once per string type.
+_format_bytes = bytes_from_str(_format)
+_format_unicode = unicode_from_str(_format)
+# Mapping converted once per string type (both keys and values).
+_format_dict_bytes = dict((bytes_from_str(k), bytes_from_str(v)) for (k,v) in _format_dict.items())
+_format_dict_unicode = dict((unicode_from_str(k), unicode_from_str(v)) for (k,v) in _format_dict.items())
+
+def _get_format(STR):
+    """Return the %-format template matching the string type STR.
+
+    Raises UnsupportedType for BYTES on Python 3 and later, and
+    AssertionError when STR is neither UNICODE nor BYTES.
+    """
+    if STR is UNICODE:
+        return _format_unicode
+    if STR is not BYTES:
+        raise AssertionError
+    if sys.version_info >= (3,):
+        raise UnsupportedType
+    return _format_bytes
+
+def _get_format_dict(STR):
+    """Return the formatting mapping matching the string type STR.
+
+    Raises UnsupportedType for BYTES on Python 3 and later, and
+    AssertionError when STR is neither UNICODE nor BYTES.
+    """
+    if STR is UNICODE:
+        return _format_dict_unicode
+    if STR is not BYTES:
+        raise AssertionError
+    if sys.version_info >= (3,):
+        raise UnsupportedType
+    return _format_dict_bytes
+
+# Formatting.
+# Times the % operator applied to a three-field template and a mapping.
+# _get_format/_get_format_dict raise UnsupportedType for BYTES on Python 3.
+@bench('"The %(k1)s is %(k2)s the %(k3)s."%{"k1":"x","k2":"y","k3":"z",}',
+ 'formatting a string type with a dict', 1000)
+def format_with_dict(STR):
+ s = _get_format(STR)
+ d = _get_format_dict(STR)
+ for x in _RANGE_1000:
+ s % d
+
+
+#### Upper- and lower- case conversion
+
+# Times lower() on a mostly-lowercase sentence repeated ten times, so only a
+# few characters need converting ("rare").
+@bench('("Where in the world is Carmen San Deigo?"*10).lower()',
+ "case conversion -- rare", 1000)
+def lower_conversion_rare(STR):
+ s = STR("Where in the world is Carmen San Deigo?"*10)
+ s_lower = s.lower
+ for x in _RANGE_1000:
+ s_lower()
+
+# Times lower() on an all-uppercase sentence repeated ten times, so every
+# letter needs converting ("dense").
+@bench('("WHERE IN THE WORLD IS CARMEN SAN DEIGO?"*10).lower()',
+ "case conversion -- dense", 1000)
+def lower_conversion_dense(STR):
+ s = STR("WHERE IN THE WORLD IS CARMEN SAN DEIGO?"*10)
+ s_lower = s.lower
+ for x in _RANGE_1000:
+ s_lower()
+
+
+# Times upper() on a mostly-uppercase sentence repeated ten times, so only a
+# few characters need converting ("rare").  The input is the string shown in
+# the label; a mostly-lowercase input would measure the dense case instead.
+@bench('("wHERE IN THE WORLD IS cARMEN sAN dEIGO?"*10).upper()',
+       "case conversion -- rare", 1000)
+def upper_conversion_rare(STR):
+    s = STR("wHERE IN THE WORLD IS cARMEN sAN dEIGO?"*10)
+    # The bound method is looked up once, outside the loop.
+    s_upper = s.upper
+    for x in _RANGE_1000:
+        s_upper()
+
+# Times upper() on an all-lowercase sentence repeated ten times, so every
+# letter needs converting ("dense").
+@bench('("where in the world is carmen san deigo?"*10).upper()',
+ "case conversion -- dense", 1000)
+def upper_conversion_dense(STR):
+ s = STR("where in the world is carmen san deigo?"*10)
+ s_upper = s.upper
+ for x in _RANGE_1000:
+ s_upper()
+
+
+# end of benchmarks
+
+#################
+
+class BenchTimer(timeit.Timer):
+    """timeit.Timer that picks its own loop count and reports the best run."""
+
+    def best(self, repeat=1):
+        """Return the fastest observed time per single execution.
+
+        The loop count grows by powers of ten (10, 100, ... up to 10**9)
+        until one timing run takes longer than 0.02 seconds.  That run is
+        kept as the first sample, and ``repeat - 1`` further runs are timed
+        with the same loop count.
+        """
+        number = 1
+        for _ in range(9):
+            number *= 10
+            elapsed = self.timeit(number)
+            if elapsed > 0.02:
+                break
+        fastest = elapsed
+        for _ in range(repeat - 1):
+            fastest = min(fastest, self.timeit(number))
+        return fastest / number
+
+# Command-line entry point: select the benchmark functions defined in this
+# module, time each one for BYTES and for UNICODE, and print a tab-separated
+# table with one row per benchmark followed by a TOTAL row.
+def main():
+ (options, test_names) = parser.parse_args()
+ if options.bytes_only and options.unicode_only:
+ raise SystemExit("Only one of --8-bit and --unicode are allowed")
+
+ # Collect every module-level object that carries an "is_bench" attribute.
+ bench_functions = []
+ for (k,v) in globals().items():
+ if hasattr(v, "is_bench"):
+ if test_names:
+ # Keep the benchmark only when one of the names given on the
+ # command line occurs in its group; the for/else branch runs
+ # when no name matched.
+ for name in test_names:
+ if name in v.group:
+ break
+ else:
+ # Not selected, ignore
+ continue
+ if options.skip_re and hasattr(v, "uses_re"):
+ continue
+
+ bench_functions.append( (v.group, k, v) )
+ # Sorting by (group, name, ...) makes each group contiguous, which
+ # itertools.groupby below relies on.
+ bench_functions.sort()
+
+ p("bytes\tunicode")
+ p("(in ms)\t(in ms)\t%\tcomment")
+
+ bytes_total = uni_total = 0.0
+
+ for title, group in itertools.groupby(bench_functions,
+ operator.itemgetter(0)):
+ # Flush buffer before each group
+ sys.stdout.flush()
+ p("="*10, title)
+ for (_, k, v) in group:
+ if hasattr(v, "is_bench"):
+ # " - " is shown when a string type was not requested, "N/A" when
+ # the benchmark raised UnsupportedType for it.
+ bytes_time = 0.0
+ bytes_time_s = " - "
+ if not options.unicode_only:
+ try:
+ bytes_time = BenchTimer("__main__.%s(__main__.BYTES)" % (k,),
+ "import __main__").best(REPEAT)
+ bytes_time_s = "%.2f" % (1000 * bytes_time)
+ bytes_total += bytes_time
+ except UnsupportedType:
+ bytes_time_s = "N/A"
+ uni_time = 0.0
+ uni_time_s = " - "
+ if not options.bytes_only:
+ try:
+ uni_time = BenchTimer("__main__.%s(__main__.UNICODE)" % (k,),
+ "import __main__").best(REPEAT)
+ uni_time_s = "%.2f" % (1000 * uni_time)
+ uni_total += uni_time
+ except UnsupportedType:
+ uni_time_s = "N/A"
+ # Ratio of bytes time to unicode time, printed as a percentage;
+ # 0.0 when the unicode time is zero.
+ try:
+ average = bytes_time/uni_time
+ except (TypeError, ZeroDivisionError):
+ average = 0.0
+ p("%s\t%s\t%.1f\t%s (*%d)" % (
+ bytes_time_s, uni_time_s, 100.*average,
+ v.comment, v.repeat_count))
+
+ if bytes_total == uni_total == 0.0:
+ p("That was zippy!")
+ else:
+ # Same ratio as the per-row percentage, computed over the totals.
+ try:
+ ratio = bytes_total/uni_total
+ except ZeroDivisionError:
+ ratio = 0.0
+ p("%.2f\t%.2f\t%.1f\t%s" % (
+ 1000*bytes_total, 1000*uni_total, 100.*ratio,
+ "TOTAL"))
+
+if __name__ == "__main__":
+ main()
diff --git a/Tools/unicode/comparecodecs.py b/Tools/unicode/comparecodecs.py
index 0f5c1e2..7de14fd 100644
--- a/Tools/unicode/comparecodecs.py
+++ b/Tools/unicode/comparecodecs.py
@@ -14,7 +14,7 @@ def compare_codecs(encoding1, encoding2):
print('Comparing encoding/decoding of %r and %r' % (encoding1, encoding2))
mismatch = 0
# Check encoding
- for i in range(sys.maxunicode):
+ for i in range(sys.maxunicode+1):
u = chr(i)
try:
c1 = u.encode(encoding1)
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index d503190..db0f8ec 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -21,17 +21,24 @@
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
+# 2011-10-21 ezio add support for name aliases and named sequences
+# 2012-01 benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
-import sys, os, zipfile
+import os
+import sys
+import zipfile
+
+from textwrap import dedent
+from operator import itemgetter
SCRIPT = sys.argv[0]
VERSION = "3.2"
# The Unicode Database
-UNIDATA_VERSION = "6.0.0"
+UNIDATA_VERSION = "6.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
@@ -39,6 +46,19 @@ UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
+NAME_ALIASES = "NameAliases%s.txt"
+NAMED_SEQUENCES = "NamedSequences%s.txt"
+SPECIAL_CASING = "SpecialCasing%s.txt"
+CASE_FOLDING = "CaseFolding%s.txt"
+
+# Private Use Areas -- in planes 0 (BMP), 15, 16
+PUA_1 = range(0xE000, 0xF900)
+PUA_15 = range(0xF0000, 0xFFFFE)
+PUA_16 = range(0x100000, 0x10FFFE)
+
+# we use these ranges of PUA_15 to store name aliases and named sequences
+NAME_ALIASES_START = 0xF0000
+NAMED_SEQUENCES_START = 0xF0200
old_versions = ["3.2.0"]
@@ -67,13 +87,15 @@ UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
-NODELTA_MASK = 0x800
-NUMERIC_MASK = 0x1000
+NUMERIC_MASK = 0x800
+CASE_IGNORABLE_MASK = 0x1000
+CASED_MASK = 0x2000
+EXTENDED_CASE_MASK = 0x4000
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
('3400', '4DB5'),
- ('4E00', '9FCB'),
+ ('4E00', '9FCC'),
('20000', '2A6D6'),
('2A700', '2B734'),
('2B740', '2B81D')
@@ -367,6 +389,7 @@ def makeunicodetype(unicode, trace):
numeric = {}
spaces = []
linebreaks = []
+ extra_casing = []
for char in unicode.chars:
record = unicode.table[char]
@@ -379,7 +402,7 @@ def makeunicodetype(unicode, trace):
delta = True
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
flags |= ALPHA_MASK
- if category == "Ll":
+ if "Lowercase" in properties:
flags |= LOWER_MASK
if 'Line_Break' in properties or bidirectional == "B":
flags |= LINEBREAK_MASK
@@ -389,7 +412,7 @@ def makeunicodetype(unicode, trace):
spaces.append(char)
if category == "Lt":
flags |= TITLE_MASK
- if category == "Lu":
+ if "Uppercase" in properties:
flags |= UPPER_MASK
if char == ord(" ") or category[0] not in ("C", "Z"):
flags |= PRINTABLE_MASK
@@ -397,7 +420,12 @@ def makeunicodetype(unicode, trace):
flags |= XID_START_MASK
if "XID_Continue" in properties:
flags |= XID_CONTINUE_MASK
- # use delta predictor for upper/lower/title if it fits
+ if "Cased" in properties:
+ flags |= CASED_MASK
+ if "Case_Ignorable" in properties:
+ flags |= CASE_IGNORABLE_MASK
+ sc = unicode.special_casing.get(char)
+ cf = unicode.case_folding.get(char, [char])
if record[12]:
upper = int(record[12], 16)
else:
@@ -409,23 +437,39 @@ def makeunicodetype(unicode, trace):
if record[14]:
title = int(record[14], 16)
else:
- # UCD.html says that a missing title char means that
- # it defaults to the uppercase character, not to the
- # character itself. Apparently, in the current UCD (5.x)
- # this feature is never used
title = upper
- upper_d = upper - char
- lower_d = lower - char
- title_d = title - char
- if -32768 <= upper_d <= 32767 and \
- -32768 <= lower_d <= 32767 and \
- -32768 <= title_d <= 32767:
- # use deltas
- upper = upper_d & 0xffff
- lower = lower_d & 0xffff
- title = title_d & 0xffff
+ if sc is None and cf != [lower]:
+ sc = ([lower], [title], [upper])
+ if sc is None:
+ if upper == lower == title:
+ upper = lower = title = 0
+ else:
+ upper = upper - char
+ lower = lower - char
+ title = title - char
+ assert (abs(upper) <= 2147483647 and
+ abs(lower) <= 2147483647 and
+ abs(title) <= 2147483647)
else:
- flags |= NODELTA_MASK
+ # This happens either when some character maps to more than one
+ # character in uppercase, lowercase, or titlecase or the
+ # casefolded version of the character is different from the
+ # lowercase. The extra characters are stored in a different
+ # array.
+ flags |= EXTENDED_CASE_MASK
+ lower = len(extra_casing) | (len(sc[0]) << 24)
+ extra_casing.extend(sc[0])
+ if cf != sc[0]:
+ lower |= len(cf) << 20
+ extra_casing.extend(cf)
+ upper = len(extra_casing) | (len(sc[2]) << 24)
+ extra_casing.extend(sc[2])
+ # Title is probably equal to upper.
+ if sc[1] == sc[2]:
+ title = upper
+ else:
+ title = len(extra_casing) | (len(sc[1]) << 24)
+ extra_casing.extend(sc[1])
# decimal digit, integer digit
decimal = 0
if record[6]:
@@ -452,6 +496,7 @@ def makeunicodetype(unicode, trace):
print(sum(map(len, numeric.values())), "numeric code points")
print(len(spaces), "whitespace code points")
print(len(linebreaks), "linebreak code points")
+ print(len(extra_casing), "extended case array")
print("--- Writing", FILE, "...")
@@ -465,6 +510,14 @@ def makeunicodetype(unicode, trace):
print("};", file=fp)
print(file=fp)
+ print("/* extended case mappings */", file=fp)
+ print(file=fp)
+ print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
+ for c in extra_casing:
+ print(" %d," % c, file=fp)
+ print("};", file=fp)
+ print(file=fp)
+
# split decomposition index table
index1, index2, shift = splitbins(index, trace)
@@ -692,6 +745,39 @@ def makeunicodename(unicode, trace):
print("/* name->code dictionary */", file=fp)
codehash.dump(fp, trace)
+ print(file=fp)
+ print('static const unsigned int aliases_start = %#x;' %
+ NAME_ALIASES_START, file=fp)
+ print('static const unsigned int aliases_end = %#x;' %
+ (NAME_ALIASES_START + len(unicode.aliases)), file=fp)
+
+ print('static const unsigned int name_aliases[] = {', file=fp)
+ for name, codepoint in unicode.aliases:
+ print(' 0x%04X,' % codepoint, file=fp)
+ print('};', file=fp)
+
+ # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
+ # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
+ # sequences or sequences with non-BMP chars are added.
+ # unicodedata_lookup should be adapted too.
+ print(dedent("""
+ typedef struct NamedSequence {
+ int seqlen;
+ Py_UCS2 seq[4];
+ } named_sequence;
+ """), file=fp)
+
+ print('static const unsigned int named_sequences_start = %#x;' %
+ NAMED_SEQUENCES_START, file=fp)
+ print('static const unsigned int named_sequences_end = %#x;' %
+ (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)
+
+ print('static const named_sequence named_sequences[] = {', file=fp)
+ for name, sequence in unicode.named_sequences:
+ seq_str = ', '.join('0x%04X' % cp for cp in sequence)
+ print(' {%d, {%s}},' % (len(sequence), seq_str), file=fp)
+ print('};', file=fp)
+
fp.close()
@@ -726,7 +812,11 @@ def merge_old_version(version, new, old):
for k in range(len(old.table[i])):
if old.table[i][k] != new.table[i][k]:
value = old.table[i][k]
- if k == 2:
+ if k == 1 and i in PUA_15:
+ # the name is not set in the old.table, but in the
+ # new.table we are using it for aliases and named seq
+ assert value == ''
+ elif k == 2:
#print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
category_changes[i] = CATEGORY_NAMES.index(value)
elif k == 4:
@@ -816,15 +906,15 @@ class UnicodeData:
expand=1,
cjk_check=True):
self.changed = []
- file = open_data(UNICODE_DATA, version)
table = [None] * 0x110000
- while 1:
- s = file.readline()
- if not s:
- break
- s = s.strip().split(";")
- char = int(s[0], 16)
- table[char] = s
+ with open_data(UNICODE_DATA, version) as file:
+ while 1:
+ s = file.readline()
+ if not s:
+ break
+ s = s.strip().split(";")
+ char = int(s[0], 16)
+ table[char] = s
cjk_ranges_found = []
@@ -855,32 +945,79 @@ class UnicodeData:
self.table = table
self.chars = list(range(0x110000)) # unicode 3.2
- file = open_data(COMPOSITION_EXCLUSIONS, version)
+ # check for name aliases and named sequences, see #12753
+ # aliases and named sequences are not in 3.2.0
+ if version != '3.2.0':
+ self.aliases = []
+ # store aliases in the Private Use Area 15, in range U+F0000..U+F01FF,
+ # in order to take advantage of the compression and lookup
+ # algorithms used for the other characters
+ pua_index = NAME_ALIASES_START
+ with open_data(NAME_ALIASES, version) as file:
+ for s in file:
+ s = s.strip()
+ if not s or s.startswith('#'):
+ continue
+ char, name, abbrev = s.split(';')
+ char = int(char, 16)
+ self.aliases.append((name, char))
+ # also store the name in the PUA 15
+ self.table[pua_index][1] = name
+ pua_index += 1
+ assert pua_index - NAME_ALIASES_START == len(self.aliases)
+
+ self.named_sequences = []
+ # store named sequences in the PUA 15, in range U+F0200..,
+ # in order to take advantage of the compression and lookup
+ # algorithms used for the other characters.
+
+ assert pua_index < NAMED_SEQUENCES_START
+ pua_index = NAMED_SEQUENCES_START
+ with open_data(NAMED_SEQUENCES, version) as file:
+ for s in file:
+ s = s.strip()
+ if not s or s.startswith('#'):
+ continue
+ name, chars = s.split(';')
+ chars = tuple(int(char, 16) for char in chars.split())
+ # check that the structure defined in makeunicodename is OK
+ assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
+ assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
+ "the NamedSequence struct and in unicodedata_lookup")
+ self.named_sequences.append((name, chars))
+ # also store these in the PUA 15
+ self.table[pua_index][1] = name
+ pua_index += 1
+ assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
+
self.exclusions = {}
- for s in file:
- s = s.strip()
- if not s:
- continue
- if s[0] == '#':
- continue
- char = int(s.split()[0],16)
- self.exclusions[char] = 1
+ with open_data(COMPOSITION_EXCLUSIONS, version) as file:
+ for s in file:
+ s = s.strip()
+ if not s:
+ continue
+ if s[0] == '#':
+ continue
+ char = int(s.split()[0],16)
+ self.exclusions[char] = 1
widths = [None] * 0x110000
- for s in open_data(EASTASIAN_WIDTH, version):
- s = s.strip()
- if not s:
- continue
- if s[0] == '#':
- continue
- s = s.split()[0].split(';')
- if '..' in s[0]:
- first, last = [int(c, 16) for c in s[0].split('..')]
- chars = list(range(first, last+1))
- else:
- chars = [int(s[0], 16)]
- for char in chars:
- widths[char] = s[1]
+ with open_data(EASTASIAN_WIDTH, version) as file:
+ for s in file:
+ s = s.strip()
+ if not s:
+ continue
+ if s[0] == '#':
+ continue
+ s = s.split()[0].split(';')
+ if '..' in s[0]:
+ first, last = [int(c, 16) for c in s[0].split('..')]
+ chars = list(range(first, last+1))
+ else:
+ chars = [int(s[0], 16)]
+ for char in chars:
+ widths[char] = s[1]
+
for i in range(0, 0x110000):
if table[i] is not None:
table[i].append(widths[i])
@@ -888,36 +1025,39 @@ class UnicodeData:
for i in range(0, 0x110000):
if table[i] is not None:
table[i].append(set())
- for s in open_data(DERIVED_CORE_PROPERTIES, version):
- s = s.split('#', 1)[0].strip()
- if not s:
- continue
- r, p = s.split(";")
- r = r.strip()
- p = p.strip()
- if ".." in r:
- first, last = [int(c, 16) for c in r.split('..')]
- chars = list(range(first, last+1))
- else:
- chars = [int(r, 16)]
- for char in chars:
- if table[char]:
- # Some properties (e.g. Default_Ignorable_Code_Point)
- # apply to unassigned code points; ignore them
- table[char][-1].add(p)
-
- for s in open_data(LINE_BREAK, version):
- s = s.partition('#')[0]
- s = [i.strip() for i in s.split(';')]
- if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
- continue
- if '..' not in s[0]:
- first = last = int(s[0], 16)
- else:
- first, last = [int(c, 16) for c in s[0].split('..')]
- for char in range(first, last+1):
- table[char][-1].add('Line_Break')
+ with open_data(DERIVED_CORE_PROPERTIES, version) as file:
+ for s in file:
+ s = s.split('#', 1)[0].strip()
+ if not s:
+ continue
+
+ r, p = s.split(";")
+ r = r.strip()
+ p = p.strip()
+ if ".." in r:
+ first, last = [int(c, 16) for c in r.split('..')]
+ chars = list(range(first, last+1))
+ else:
+ chars = [int(r, 16)]
+ for char in chars:
+ if table[char]:
+ # Some properties (e.g. Default_Ignorable_Code_Point)
+ # apply to unassigned code points; ignore them
+ table[char][-1].add(p)
+
+ with open_data(LINE_BREAK, version) as file:
+ for s in file:
+ s = s.partition('#')[0]
+ s = [i.strip() for i in s.split(';')]
+ if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+ continue
+ if '..' not in s[0]:
+ first = last = int(s[0], 16)
+ else:
+ first, last = [int(c, 16) for c in s[0].split('..')]
+ for char in range(first, last+1):
+ table[char][-1].add('Line_Break')
# We only want the quickcheck properties
# Format: NF?_QC; Y(es)/N(o)/M(aybe)
@@ -928,31 +1068,33 @@ class UnicodeData:
# for older versions, and no delta records will be created.
quickchecks = [0] * 0x110000
qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
- for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
- if '#' in s:
- s = s[:s.index('#')]
- s = [i.strip() for i in s.split(';')]
- if len(s) < 2 or s[1] not in qc_order:
- continue
- quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
- quickcheck_shift = qc_order.index(s[1])*2
- quickcheck <<= quickcheck_shift
- if '..' not in s[0]:
- first = last = int(s[0], 16)
- else:
- first, last = [int(c, 16) for c in s[0].split('..')]
- for char in range(first, last+1):
- assert not (quickchecks[char]>>quickcheck_shift)&3
- quickchecks[char] |= quickcheck
+ with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
+ for s in file:
+ if '#' in s:
+ s = s[:s.index('#')]
+ s = [i.strip() for i in s.split(';')]
+ if len(s) < 2 or s[1] not in qc_order:
+ continue
+ quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+ quickcheck_shift = qc_order.index(s[1])*2
+ quickcheck <<= quickcheck_shift
+ if '..' not in s[0]:
+ first = last = int(s[0], 16)
+ else:
+ first, last = [int(c, 16) for c in s[0].split('..')]
+ for char in range(first, last+1):
+ assert not (quickchecks[char]>>quickcheck_shift)&3
+ quickchecks[char] |= quickcheck
for i in range(0, 0x110000):
if table[i] is not None:
table[i].append(quickchecks[i])
- zip = zipfile.ZipFile(open_data(UNIHAN, version))
- if version == '3.2.0':
- data = zip.open('Unihan-3.2.0.txt').read()
- else:
- data = zip.open('Unihan_NumericValues.txt').read()
+ with open_data(UNIHAN, version) as file:
+ zip = zipfile.ZipFile(file)
+ if version == '3.2.0':
+ data = zip.open('Unihan-3.2.0.txt').read()
+ else:
+ data = zip.open('Unihan_NumericValues.txt').read()
for line in data.decode("utf-8").splitlines():
if not line.startswith('U+'):
continue
@@ -965,6 +1107,34 @@ class UnicodeData:
# Patch the numeric field
if table[i] is not None:
table[i][8] = value
+ sc = self.special_casing = {}
+ with open_data(SPECIAL_CASING, version) as file:
+ for s in file:
+ s = s[:-1].split('#', 1)[0]
+ if not s:
+ continue
+ data = s.split("; ")
+ if data[4]:
+ # We ignore all conditionals (since they depend on
+ # languages) except for one, which is hardcoded. See
+ # handle_capital_sigma in unicodeobject.c.
+ continue
+ c = int(data[0], 16)
+ lower = [int(char, 16) for char in data[1].split()]
+ title = [int(char, 16) for char in data[2].split()]
+ upper = [int(char, 16) for char in data[3].split()]
+ sc[c] = (lower, title, upper)
+ cf = self.case_folding = {}
+ if version != '3.2.0':
+ with open_data(CASE_FOLDING, version) as file:
+ for s in file:
+ s = s[:-1].split('#', 1)[0]
+ if not s:
+ continue
+ data = s.split("; ")
+ if data[1] in "CF":
+ c = int(data[0], 16)
+ cf[c] = [int(char, 16) for char in data[2].split()]
def uselatin1(self):
# restrict character range to ISO Latin 1
diff --git a/Tools/unittestgui/unittestgui.py b/Tools/unittestgui/unittestgui.py
index b526646..09a20e2 100644
--- a/Tools/unittestgui/unittestgui.py
+++ b/Tools/unittestgui/unittestgui.py
@@ -28,7 +28,6 @@ SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
"""
__author__ = "Steve Purcell (stephen_purcell@yahoo.com)"
-__version__ = "$Revision: 1.7 $"[11:-2]
import sys
import traceback