diff options
author | Hye-Shik Chang <hyeshik@gmail.com> | 2004-08-04 06:33:51 (GMT) |
---|---|---|
committer | Hye-Shik Chang <hyeshik@gmail.com> | 2004-08-04 06:33:51 (GMT) |
commit | b5047fd01948ab108edcc1b3c2c901d915814cfd (patch) | |
tree | 57da79f0a53a1886bd1cdc431d86c0c18aa5a413 | |
parent | 6db15d7307b831766617f6a9700ecc4c75a16081 (diff) | |
download | cpython-b5047fd01948ab108edcc1b3c2c901d915814cfd.zip cpython-b5047fd01948ab108edcc1b3c2c901d915814cfd.tar.gz cpython-b5047fd01948ab108edcc1b3c2c901d915814cfd.tar.bz2 |
Add a workaround for a problem where UTF-8 strings can be corrupted
or broken by the basic ctype functions on 4.4BSD descendants. This
will be fixed in their future development branches, but they will keep
the POSIX incompatibility for backward compatibility in the near
future.
-rw-r--r-- | Include/pyport.h | 33 | ||||
-rw-r--r-- | Lib/test/test_locale.py | 35 | ||||
-rw-r--r-- | Misc/NEWS | 3 |
3 files changed, 71 insertions, 0 deletions
diff --git a/Include/pyport.h b/Include/pyport.h index 0ee42f0..b20bc15 100644 --- a/Include/pyport.h +++ b/Include/pyport.h @@ -411,6 +411,39 @@ extern int fdatasync(int); extern double hypot(double, double); #endif + +/******************************************************************* +On 4.4BSD-descendants, ctype functions serves the whole range of +wchar_t character set rather than single byte code points only. +This characteristic can break some operations of string object +including str.upper() and str.split() on UTF-8 locales. This +workaround was provided by Tim Robbins of FreeBSD project. He said +the incompatibility will be fixed in FreeBSD 6. +********************************************************************/ + +#ifdef __FreeBSD__ +#include <osreldate.h> +#if __FreeBSD_version > 500039 +#include <ctype.h> +#include <wctype.h> +#undef isalnum +#define isalnum(c) iswalnum(btowc(c)) +#undef isalpha +#define isalpha(c) iswalpha(btowc(c)) +#undef islower +#define islower(c) iswlower(btowc(c)) +#undef isspace +#define isspace(c) iswspace(btowc(c)) +#undef isupper +#define isupper(c) iswupper(btowc(c)) +#undef tolower +#define tolower(c) towlower(btowc(c)) +#undef toupper +#define toupper(c) towupper(btowc(c)) +#endif +#endif + + /* Declarations for symbol visibility. PyAPI_FUNC(type): Declares a public Python API function and return type diff --git a/Lib/test/test_locale.py b/Lib/test/test_locale.py index 9187c9e..d8f7925 100644 --- a/Lib/test/test_locale.py +++ b/Lib/test/test_locale.py @@ -47,3 +47,38 @@ try: locale.getpreferredencoding() finally: locale.setlocale(locale.LC_NUMERIC, oldlocale) + + +# Test BSD Rune locale's bug for isctype functions. +def teststrop(s, method, output): + if verbose: + print "%s.%s() =? %s ..." 
% (repr(s), method, repr(output)), + result = getattr(s, method)() + if result != output: + if verbose: + print "no" + print "%s.%s() == %s != %s" % (repr(s), method, repr(result), + repr(output)) + elif verbose: + print "yes" + +try: + oldlocale = locale.setlocale(locale.LC_CTYPE) + locale.setlocale(locale.LC_CTYPE, 'en_US.UTF-8') +except locale.Error: + pass +else: + try: + teststrop('\x20', 'isspace', True) + teststrop('\xa0', 'isspace', False) + teststrop('\xa1', 'isspace', False) + teststrop('\xc0', 'isalpha', False) + teststrop('\xc0', 'isalnum', False) + teststrop('\xc0', 'isupper', False) + teststrop('\xc0', 'islower', False) + teststrop('\xec\xa0\xbc', 'split', ['\xec\xa0\xbc']) + teststrop('\xed\x95\xa0', 'strip', '\xed\x95\xa0') + teststrop('\xcc\x85', 'lower', '\xcc\x85') + teststrop('\xed\x95\xa0', 'upper', '\xed\x95\xa0') + finally: + locale.setlocale(locale.LC_CTYPE, oldlocale) @@ -64,6 +64,9 @@ Core and builtins - Implemented bind_textdomain_codeset() in locale module. +- Added a workaround for proper string operations in BSDs. str.split + and str.is* methods can now work correctly with UTF-8 locales. + Extension modules ----------------- |