diff options
Diffstat (limited to 'Modules/python.c')
-rw-r--r-- | Modules/python.c | 113 |
1 files changed, 89 insertions, 24 deletions
diff --git a/Modules/python.c b/Modules/python.c index f6da86f..4c0a55b 100644 --- a/Modules/python.c +++ b/Modules/python.c @@ -14,6 +14,93 @@ wmain(int argc, wchar_t **argv) return Py_Main(argc, argv); } #else +static wchar_t* +char2wchar(char* arg) +{ + wchar_t *res; +#ifdef HAVE_BROKEN_MBSTOWCS + /* Some platforms have a broken implementation of + * mbstowcs which does not count the characters that + * would result from conversion. Use an upper bound. + */ + size_t argsize = strlen(arg); +#else + size_t argsize = mbstowcs(NULL, arg, 0); +#endif + size_t count; + unsigned char *in; + wchar_t *out; +#ifdef HAVE_MBRTOWC + mbstate_t mbs; +#endif + if (argsize != (size_t)-1) { + res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t)); + if (!res) + goto oom; + count = mbstowcs(res, arg, argsize+1); + if (count != (size_t)-1) + return res; + PyMem_Free(res); + } + /* Conversion failed. Fall back to escaping with utf8b. */ +#ifdef HAVE_MBRTOWC + /* Try conversion with mbrtwoc (C99), and escape non-decodable bytes. */ + + /* Overallocate; as multi-byte characters are in the argument, the + actual output could use less memory. */ + argsize = strlen(arg) + 1; + res = PyMem_Malloc(argsize*sizeof(wchar_t)); + if (!res) goto oom; + in = (unsigned char*)arg; + out = res; + memset(&mbs, 0, sizeof mbs); + while (argsize) { + size_t converted = mbrtowc(out, (char*)in, argsize, &mbs); + if (converted == 0) + /* Reached end of string; null char stored. */ + break; + if (converted == (size_t)-2) { + /* Incomplete character. This should never happen, + since we provide everything that we have - + unless there is a bug in the C library, or I + misunderstood how mbrtowc works. */ + fprintf(stderr, "unexpected mbrtowc result -2\n"); + return NULL; + } + if (converted == (size_t)-1) { + /* Conversion error. Escape as UTF-8b, and start over + in the initial shift state. */ + *out++ = 0xdc00 + *in++; + argsize--; + memset(&mbs, 0, sizeof mbs); + continue; + } + /* successfully converted some bytes */ + in += converted; + argsize -= converted; + out++; + } +#else + /* Cannot use C locale for escaping; manually escape as if charset + is ASCII (i.e. escape all bytes > 128. This will still roundtrip + correctly in the locale's charset, which must be an ASCII superset. */ + res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t)); + if (!res) goto oom; + in = (unsigned char*)arg; + out = res; + while(*in) + if(*in < 128) + *out++ = *in++; + else + *out++ = 0xdc00 + *in++; + *out = 0; +#endif + return res; +oom: + fprintf(stderr, "out of memory\n"); + return NULL; +} + int main(int argc, char **argv) { @@ -40,31 +127,9 @@ main(int argc, char **argv) oldloc = strdup(setlocale(LC_ALL, NULL)); setlocale(LC_ALL, ""); for (i = 0; i < argc; i++) { -#ifdef HAVE_BROKEN_MBSTOWCS - /* Some platforms have a broken implementation of - * mbstowcs which does not count the characters that - * would result from conversion. Use an upper bound. - */ - size_t argsize = strlen(argv[i]); -#else - size_t argsize = mbstowcs(NULL, argv[i], 0); -#endif - size_t count; - if (argsize == (size_t)-1) { - fprintf(stderr, "Could not convert argument %d to string\n", i); + argv_copy2[i] = argv_copy[i] = char2wchar(argv[i]); + if (!argv_copy[i]) return 1; - } - argv_copy[i] = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t)); - argv_copy2[i] = argv_copy[i]; - if (!argv_copy[i]) { - fprintf(stderr, "out of memory\n"); - return 1; - } - count = mbstowcs(argv_copy[i], argv[i], argsize+1); - if (count == (size_t)-1) { - fprintf(stderr, "Could not convert argument %d to string\n", i); - return 1; - } } setlocale(LC_ALL, oldloc); free(oldloc); |