summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHye-Shik Chang <hyeshik@gmail.com>2004-08-04 06:33:51 (GMT)
committerHye-Shik Chang <hyeshik@gmail.com>2004-08-04 06:33:51 (GMT)
commitb5047fd01948ab108edcc1b3c2c901d915814cfd (patch)
tree57da79f0a53a1886bd1cdc431d86c0c18aa5a413
parent6db15d7307b831766617f6a9700ecc4c75a16081 (diff)
downloadcpython-b5047fd01948ab108edcc1b3c2c901d915814cfd.zip
cpython-b5047fd01948ab108edcc1b3c2c901d915814cfd.tar.gz
cpython-b5047fd01948ab108edcc1b3c2c901d915814cfd.tar.bz2
Add a workaround for a problem where UTF-8 strings can be corrupted
or broken by the basic ctype functions on 4.4BSD descendants. This will be fixed in their future development branches, but they will keep the POSIX incompatibility for backward compatibility in the near future.
-rw-r--r--Include/pyport.h33
-rw-r--r--Lib/test/test_locale.py35
-rw-r--r--Misc/NEWS3
3 files changed, 71 insertions, 0 deletions
diff --git a/Include/pyport.h b/Include/pyport.h
index 0ee42f0..b20bc15 100644
--- a/Include/pyport.h
+++ b/Include/pyport.h
@@ -411,6 +411,39 @@ extern int fdatasync(int);
extern double hypot(double, double);
#endif
+
+/*******************************************************************
+On 4.4BSD descendants, the ctype functions serve the whole range of
+the wchar_t character set rather than single-byte code points only.
+This characteristic can break some operations of string objects,
+including str.upper() and str.split(), in UTF-8 locales. This
+workaround was provided by Tim Robbins of the FreeBSD project. He said
+the incompatibility will be fixed in FreeBSD 6.
+********************************************************************/
+
+#ifdef __FreeBSD__ /* the overrides below are compiled on FreeBSD builds only */
+#include <osreldate.h> /* needed for __FreeBSD_version, tested on the next line */
+#if __FreeBSD_version > 500039 /* limit the workaround to versions newer than 500039 */
+#include <ctype.h> /* presumably included first so a later <ctype.h> cannot redefine the names - confirm */
+#include <wctype.h> /* source of the isw*() and tow*() replacements used below */
+#undef isalnum /* each #undef/#define pair drops the existing name and reroutes it */
+#define isalnum(c) iswalnum(btowc(c)) /* NOTE(review): btowc() is specified in <wchar.h>; confirm it is declared via the headers above */
+#undef isalpha
+#define isalpha(c) iswalpha(btowc(c)) /* classify the byte after converting it with btowc() */
+#undef islower
+#define islower(c) iswlower(btowc(c))
+#undef isspace
+#define isspace(c) iswspace(btowc(c))
+#undef isupper
+#define isupper(c) iswupper(btowc(c))
+#undef tolower
+#define tolower(c) towlower(btowc(c)) /* NOTE(review): yields the btowc() result, not c, when btowc() returns WEOF - confirm callers guard with isupper() */
+#undef toupper
+#define toupper(c) towupper(btowc(c)) /* NOTE(review): same WEOF caveat as tolower - confirm callers guard with islower() */
+#endif /* __FreeBSD_version > 500039 */
+#endif /* __FreeBSD__ */
+
+
/* Declarations for symbol visibility.
PyAPI_FUNC(type): Declares a public Python API function and return type
diff --git a/Lib/test/test_locale.py b/Lib/test/test_locale.py
index 9187c9e..d8f7925 100644
--- a/Lib/test/test_locale.py
+++ b/Lib/test/test_locale.py
@@ -47,3 +47,38 @@ try:
locale.getpreferredencoding()
finally:
locale.setlocale(locale.LC_NUMERIC, oldlocale)
+
+
+# Test BSD Rune locale's bug for isctype functions.
+def teststrop(s, method, output):  # call the no-argument method named `method` on s and report whether it equals `output`
+ if verbose:  # `verbose` is not defined in this hunk; it comes from elsewhere in the test module
+ print "%s.%s() =? %s ..." % (repr(s), method, repr(output)),  # trailing comma keeps the verdict on the same line
+ result = getattr(s, method)()  # look the method up by name so one helper covers every str method tested
+ if result != output:  # mismatch is only reported by printing; nothing is raised here
+ if verbose:
+ print "no"
+ print "%s.%s() == %s != %s" % (repr(s), method, repr(result),  # actual vs. expected; NOTE(review): indentation is lost in this view, presumably printed even when not verbose - confirm
+ repr(output))
+ elif verbose:  # match: only verbose runs print anything
+ print "yes"
+
+try:  # set-up phase: remember the current LC_CTYPE and try to switch to a UTF-8 locale
+ oldlocale = locale.setlocale(locale.LC_CTYPE)  # called without a locale argument: queries the current setting
+ locale.setlocale(locale.LC_CTYPE, 'en_US.UTF-8')
+except locale.Error:  # the locale could not be set: skip the checks silently
+ pass
+else:  # runs only when the UTF-8 locale was set successfully
+ try:
+ teststrop('\x20', 'isspace', True)  # ASCII space must still count as whitespace
+ teststrop('\xa0', 'isspace', False)  # lone high bytes are not characters on their own in UTF-8 ...
+ teststrop('\xa1', 'isspace', False)  # ... so they must not be classified as whitespace
+ teststrop('\xc0', 'isalpha', False)  # likewise 0xc0 alone must not be classified as a letter,
+ teststrop('\xc0', 'isalnum', False)  # as alphanumeric,
+ teststrop('\xc0', 'isupper', False)  # as upper case,
+ teststrop('\xc0', 'islower', False)  # or as lower case
+ teststrop('\xec\xa0\xbc', 'split', ['\xec\xa0\xbc'])  # 0xa0 inside a multi-byte sequence must not act as a separator
+ teststrop('\xed\x95\xa0', 'strip', '\xed\x95\xa0')  # a trailing 0xa0 byte must not be stripped
+ teststrop('\xcc\x85', 'lower', '\xcc\x85')  # case conversion must leave multi-byte sequences untouched
+ teststrop('\xed\x95\xa0', 'upper', '\xed\x95\xa0')
+ finally:  # always put the saved LC_CTYPE back, even if a check raised
+ locale.setlocale(locale.LC_CTYPE, oldlocale)
diff --git a/Misc/NEWS b/Misc/NEWS
index c450f63..40c58b3 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -64,6 +64,9 @@ Core and builtins
- Implemented bind_textdomain_codeset() in locale module.
+- Added a workaround for proper string operations on BSDs. str.split
+ and the str.is* methods now work correctly in UTF-8 locales.
+
Extension modules
-----------------