summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Barry Warsaw <barry@python.org>, 2000-03-20 16:36:48 (GMT)
committer: Barry Warsaw <barry@python.org>, 2000-03-20 16:36:48 (GMT)
commit: 51ac58039f62ef9d605974dae32a6ada9c26039b (patch)
tree: c8aee44da7330978efe15671b2c9e98bc898eea3
parent: abc411bac883c1706a9dcc8b1bea85a0b940cbfb (diff)
download: cpython-51ac58039f62ef9d605974dae32a6ada9c26039b.zip
cpython-51ac58039f62ef9d605974dae32a6ada9c26039b.tar.gz
cpython-51ac58039f62ef9d605974dae32a6ada9c26039b.tar.bz2
On 17-Mar-2000, Marc-Andre Lemburg said:
Attached you find an update of the Unicode implementation. The patch is against the current CVS version. I would appreciate it if someone with CVS checkin permissions could check the changes in. The patch contains all bug fixes and patches sent this week and also fixes a leak in the codecs code and a bug in the free list code for Unicode objects (which only shows up when compiling Python with Py_DEBUG; thanks to MarkH for spotting this one).
-rw-r--r-- Include/unicodeobject.h 16
-rw-r--r-- Lib/encodings/__init__.py 6
-rw-r--r-- Lib/test/test_string.py 1
-rw-r--r-- Lib/test/test_unicode.py 1
-rw-r--r-- Misc/unicode.txt 11
-rw-r--r-- Modules/stropmodule.c 2
-rw-r--r-- Objects/stringobject.c 4
-rw-r--r-- Objects/unicodeobject.c 45
-rw-r--r-- Python/codecs.c 14
9 files changed, 61 insertions, 39 deletions
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 37f2b0d..770ecab 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1,8 +1,5 @@
#ifndef Py_UNICODEOBJECT_H
#define Py_UNICODEOBJECT_H
-#ifdef __cplusplus
-extern "C" {
-#endif
/*
@@ -109,8 +106,9 @@ typedef unsigned short Py_UNICODE;
/* --- Internal Unicode Operations ---------------------------------------- */
/* If you want Python to use the compiler's wctype.h functions instead
- of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS.
- This reduces the interpreter's code size. */
+ of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
+ configure Python using --with-ctype-functions. This reduces the
+ interpreter's code size. */
#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
@@ -169,6 +167,10 @@ typedef unsigned short Py_UNICODE;
(!memcmp((string)->str + (offset), (substring)->str,\
(substring)->length*sizeof(Py_UNICODE)))
+#ifdef __cplusplus
+extern "C" {
+#endif
+
/* --- Unicode Type ------------------------------------------------------- */
typedef struct {
@@ -647,7 +649,7 @@ extern DL_IMPORT(int) PyUnicode_Find(
int direction /* Find direction: +1 forward, -1 backward */
);
-/* Count the number of occurances of substr in str[start:end]. */
+/* Count the number of occurrences of substr in str[start:end]. */
extern DL_IMPORT(int) PyUnicode_Count(
PyObject *str, /* String */
@@ -656,7 +658,7 @@ extern DL_IMPORT(int) PyUnicode_Count(
int end /* Stop index */
);
-/* Replace at most maxcount occurances of substr in str with replstr
+/* Replace at most maxcount occurrences of substr in str with replstr
and return the resulting Unicode object. */
extern DL_IMPORT(PyObject *) PyUnicode_Replace(
diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py
index c33b822..cd5876e 100644
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -30,13 +30,13 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
import string,codecs,aliases
_cache = {}
-_unkown = '--unkown--'
+_unknown = '--unknown--'
def search_function(encoding):
# Cache lookup
- entry = _cache.get(encoding,_unkown)
- if entry is not _unkown:
+ entry = _cache.get(encoding,_unknown)
+ if entry is not _unknown:
return entry
# Import the module
diff --git a/Lib/test/test_string.py b/Lib/test/test_string.py
index bb6d035..4a3e474 100644
--- a/Lib/test/test_string.py
+++ b/Lib/test/test_string.py
@@ -143,6 +143,7 @@ test('translate', 'abc', 'Abc', table)
test('translate', 'xyz', 'xyz', table)
test('replace', 'one!two!three!', 'one@two!three!', '!', '@', 1)
+test('replace', 'one!two!three!', 'onetwothree', '!', '')
test('replace', 'one!two!three!', 'one@two@three!', '!', '@', 2)
test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 3)
test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 4)
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 64b8ff8..69d4273 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -108,6 +108,7 @@ if 0:
test('translate', u'xyz', u'xyz', table)
test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
+test('replace', u'one!two!three!', u'onetwothree', '!', '')
test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)
diff --git a/Misc/unicode.txt b/Misc/unicode.txt
index b31beef..9a4832a 100644
--- a/Misc/unicode.txt
+++ b/Misc/unicode.txt
@@ -743,8 +743,9 @@ For explicit handling of files using Unicode, the standard
stream codecs as available through the codecs module should
be used.
-XXX There should be a short-cut open(filename,mode,encoding) available which
- also assures that mode contains the 'b' character when needed.
+The codecs module should provide a short-cut open(filename,mode,encoding)
+available which also assures that mode contains the 'b' character when
+needed.
File/Stream Input:
@@ -810,6 +811,10 @@ Unicode-Mappings:
Introduction to Unicode (a little outdated by still nice to read):
http://www.nada.kth.se/i18n/ucs/unicode-iso10646-oview.html
+For comparison:
+ Introducing Unicode to ECMAScript --
+ http://www-4.ibm.com/software/developer/library/internationalization-support.html
+
Encodings:
Overview:
@@ -832,7 +837,7 @@ Encodings:
History of this Proposal:
-------------------------
-1.2:
+1.2: Removed POD about codecs.open()
1.1: Added note about comparisons and hash values. Added note about
case mapping algorithms. Changed stream codecs .read() and
.write() method to match the standard file-like object methods
diff --git a/Modules/stropmodule.c b/Modules/stropmodule.c
index a0d8b9a..4c9ee76 100644
--- a/Modules/stropmodule.c
+++ b/Modules/stropmodule.c
@@ -1054,7 +1054,7 @@ strop_translate(self, args)
strstr replacement for arbitrary blocks of memory.
- Locates the first occurance in the memory pointed to by MEM of the
+ Locates the first occurrence in the memory pointed to by MEM of the
contents of memory pointed to by PAT. Returns the index into MEM if
found, or -1 if not found. If len of PAT is greater than length of
MEM, the function returns -1.
diff --git a/Objects/stringobject.c b/Objects/stringobject.c
index 2d404b9..10257f7 100644
--- a/Objects/stringobject.c
+++ b/Objects/stringobject.c
@@ -1395,7 +1395,7 @@ string_translate(self, args)
strstr replacement for arbitrary blocks of memory.
- Locates the first occurance in the memory pointed to by MEM of the
+ Locates the first occurrence in the memory pointed to by MEM of the
contents of memory pointed to by PAT. Returns the index into MEM if
found, or -1 if not found. If len of PAT is greater than length of
MEM, the function returns -1.
@@ -1578,7 +1578,7 @@ string_replace(self, args)
return NULL;
if (sub_len <= 0) {
- PyErr_SetString(PyExc_ValueError, "empty replacement string");
+ PyErr_SetString(PyExc_ValueError, "empty pattern string");
return NULL;
}
new_s = mymemreplace(str,len,sub,sub_len,repl,repl_len,count,&out_len);
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index da12da2..d63165e 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -83,7 +83,7 @@ Unicode Integration Proposal (see file Misc/unicode.txt).
all objects on the free list having a size less than this
limit. This reduces malloc() overhead for small Unicode objects.
- At worse this will result in MAX_UNICODE_FREELIST_SIZE *
+ At worst this will result in MAX_UNICODE_FREELIST_SIZE *
(sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
malloc()-overhead) bytes of unused garbage.
@@ -180,7 +180,7 @@ PyUnicodeObject *_PyUnicode_New(int length)
unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
unicode_freelist_size--;
unicode->ob_type = &PyUnicode_Type;
- _Py_NewReference(unicode);
+ _Py_NewReference((PyObject *)unicode);
if (unicode->str) {
if (unicode->length < length &&
_PyUnicode_Resize(unicode, length)) {
@@ -199,16 +199,19 @@ PyUnicodeObject *_PyUnicode_New(int length)
unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
}
- if (!unicode->str) {
- PyMem_DEL(unicode);
- PyErr_NoMemory();
- return NULL;
- }
+ if (!unicode->str)
+ goto onError;
unicode->str[length] = 0;
unicode->length = length;
unicode->hash = -1;
unicode->utf8str = NULL;
return unicode;
+
+ onError:
+ _Py_ForgetReference((PyObject *)unicode);
+ PyMem_DEL(unicode);
+ PyErr_NoMemory();
+ return NULL;
}
static
@@ -224,7 +227,6 @@ void _PyUnicode_Free(register PyUnicodeObject *unicode)
*(PyUnicodeObject **)unicode = unicode_freelist;
unicode_freelist = unicode;
unicode_freelist_size++;
- _Py_ForgetReference(unicode);
}
else {
free(unicode->str);
@@ -489,7 +491,7 @@ int utf8_decoding_error(const char **source,
}
else {
PyErr_Format(PyExc_ValueError,
- "UTF-8 decoding error; unkown error handling code: %s",
+ "UTF-8 decoding error; unknown error handling code: %s",
errors);
return -1;
}
@@ -611,7 +613,7 @@ int utf8_encoding_error(const Py_UNICODE **source,
else {
PyErr_Format(PyExc_ValueError,
"UTF-8 encoding error; "
- "unkown error handling code: %s",
+ "unknown error handling code: %s",
errors);
return -1;
}
@@ -733,7 +735,7 @@ int utf16_decoding_error(const Py_UNICODE **source,
}
else {
PyErr_Format(PyExc_ValueError,
- "UTF-16 decoding error; unkown error handling code: %s",
+ "UTF-16 decoding error; unknown error handling code: %s",
errors);
return -1;
}
@@ -921,7 +923,7 @@ int unicodeescape_decoding_error(const char **source,
else {
PyErr_Format(PyExc_ValueError,
"Unicode-Escape decoding error; "
- "unkown error handling code: %s",
+ "unknown error handling code: %s",
errors);
return -1;
}
@@ -1051,6 +1053,10 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
*/
+static const Py_UNICODE *findchar(const Py_UNICODE *s,
+ int size,
+ Py_UNICODE ch);
+
static
PyObject *unicodeescape_string(const Py_UNICODE *s,
int size,
@@ -1069,9 +1075,6 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
p = q = PyString_AS_STRING(repr);
if (quotes) {
- static const Py_UNICODE *findchar(const Py_UNICODE *s,
- int size,
- Py_UNICODE ch);
*p++ = 'u';
*p++ = (findchar(s, size, '\'') &&
!findchar(s, size, '"')) ? '"' : '\'';
@@ -1298,7 +1301,7 @@ int latin1_encoding_error(const Py_UNICODE **source,
else {
PyErr_Format(PyExc_ValueError,
"Latin-1 encoding error; "
- "unkown error handling code: %s",
+ "unknown error handling code: %s",
errors);
return -1;
}
@@ -1369,7 +1372,7 @@ int ascii_decoding_error(const char **source,
else {
PyErr_Format(PyExc_ValueError,
"ASCII decoding error; "
- "unkown error handling code: %s",
+ "unknown error handling code: %s",
errors);
return -1;
}
@@ -1431,7 +1434,7 @@ int ascii_encoding_error(const Py_UNICODE **source,
else {
PyErr_Format(PyExc_ValueError,
"ASCII encoding error; "
- "unkown error handling code: %s",
+ "unknown error handling code: %s",
errors);
return -1;
}
@@ -1502,7 +1505,7 @@ int charmap_decoding_error(const char **source,
else {
PyErr_Format(PyExc_ValueError,
"charmap decoding error; "
- "unkown error handling code: %s",
+ "unknown error handling code: %s",
errors);
return -1;
}
@@ -1618,7 +1621,7 @@ int charmap_encoding_error(const Py_UNICODE **source,
else {
PyErr_Format(PyExc_ValueError,
"charmap encoding error; "
- "unkown error handling code: %s",
+ "unknown error handling code: %s",
errors);
return -1;
}
@@ -1750,7 +1753,7 @@ int translate_error(const Py_UNICODE **source,
else {
PyErr_Format(PyExc_ValueError,
"translate error; "
- "unkown error handling code: %s",
+ "unknown error handling code: %s",
errors);
return -1;
}
diff --git a/Python/codecs.c b/Python/codecs.c
index 5075a20..2d49377 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -93,9 +93,14 @@ PyObject *lowercasestring(const char *string)
PyObject *_PyCodec_Lookup(const char *encoding)
{
- PyObject *result, *args = NULL, *v;
+ PyObject *result, *args = NULL, *v = NULL;
int i, len;
+ if (_PyCodec_SearchCache == NULL || _PyCodec_SearchPath == NULL) {
+ PyErr_SetString(PyExc_SystemError,
+ "codec module not properly initialized");
+ goto onError;
+ }
if (!import_encodings_called)
import_encodings();
@@ -109,6 +114,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
result = PyDict_GetItem(_PyCodec_SearchCache, v);
if (result != NULL) {
Py_INCREF(result);
+ Py_DECREF(v);
return result;
}
@@ -121,6 +127,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
if (args == NULL)
goto onError;
PyTuple_SET_ITEM(args,0,v);
+ v = NULL;
for (i = 0; i < len; i++) {
PyObject *func;
@@ -146,7 +153,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
if (i == len) {
/* XXX Perhaps we should cache misses too ? */
PyErr_SetString(PyExc_LookupError,
- "unkown encoding");
+ "unknown encoding");
goto onError;
}
@@ -156,6 +163,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
return result;
onError:
+ Py_XDECREF(v);
Py_XDECREF(args);
return NULL;
}
@@ -378,5 +386,7 @@ void _PyCodecRegistry_Init()
void _PyCodecRegistry_Fini()
{
Py_XDECREF(_PyCodec_SearchPath);
+ _PyCodec_SearchPath = NULL;
Py_XDECREF(_PyCodec_SearchCache);
+ _PyCodec_SearchCache = NULL;
}