diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2013-02-09 20:38:12 (GMT) |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2013-02-09 20:38:12 (GMT) |
commit | 859cd4723f07e2b1da7387f5be0f2ce0a195974d (patch) | |
tree | 500f0eddeedff2c61e7da96b603a64d851138d65 /Tools | |
parent | 7451a72e2ba8939215324387e36285725632e637 (diff) | |
parent | b6ed17344b456f397df800cc553fef94f5b1e58b (diff) | |
download | cpython-859cd4723f07e2b1da7387f5be0f2ce0a195974d.zip cpython-859cd4723f07e2b1da7387f5be0f2ce0a195974d.tar.gz cpython-859cd4723f07e2b1da7387f5be0f2ce0a195974d.tar.bz2 |
Issue #17156: pygettext.py now uses the encoding of the source file and correctly
writes and escapes non-ASCII characters.
Diffstat (limited to 'Tools')
-rwxr-xr-x | Tools/i18n/pygettext.py | 66 |
1 files changed, 33 insertions, 33 deletions
diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index 93b2b79..79d976b 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -188,8 +188,8 @@ msgstr "" "Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n" "Language-Team: LANGUAGE <LL@li.org>\\n" "MIME-Version: 1.0\\n" -"Content-Type: text/plain; charset=CHARSET\\n" -"Content-Transfer-Encoding: ENCODING\\n" +"Content-Type: text/plain; charset=%(charset)s\\n" +"Content-Transfer-Encoding: %(encoding)s\\n" "Generated-By: pygettext.py %(version)s\\n" ''') @@ -203,35 +203,32 @@ def usage(code, msg=''): -escapes = [] - -def make_escapes(pass_iso8859): - global escapes - if pass_iso8859: - # Allow iso-8859 characters to pass through so that e.g. 'msgid +def make_escapes(pass_nonascii): + global escapes, escape + if pass_nonascii: + # Allow non-ascii characters to pass through so that e.g. 'msgid # "Höhe"' would result not result in 'msgid "H\366he"'. Otherwise we # escape any character outside the 32..126 range. mod = 128 + escape = escape_ascii else: mod = 256 - for i in range(256): - if 32 <= (i % mod) <= 126: - escapes.append(chr(i)) - else: - escapes.append("\\%03o" % i) - escapes[ord('\\')] = '\\\\' - escapes[ord('\t')] = '\\t' - escapes[ord('\r')] = '\\r' - escapes[ord('\n')] = '\\n' - escapes[ord('\"')] = '\\"' + escape = escape_nonascii + escapes = [r"\%03o" % i for i in range(mod)] + for i in range(32, 127): + escapes[i] = chr(i) + escapes[ord('\\')] = r'\\' + escapes[ord('\t')] = r'\t' + escapes[ord('\r')] = r'\r' + escapes[ord('\n')] = r'\n' + escapes[ord('\"')] = r'\"' + +def escape_ascii(s, encoding): + return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s) -def escape(s): - global escapes - s = list(s) - for i in range(len(s)): - s[i] = escapes[ord(s[i])] - return EMPTYSTRING.join(s) +def escape_nonascii(s, encoding): + return ''.join(escapes[b] for b in s.encode(encoding)) def safe_eval(s): @@ -239,18 +236,18 @@ def safe_eval(s): return eval(s, {'__builtins__':{}}, {}) 
-def normalize(s): +def normalize(s, encoding): # This converts the various Python string types into a format that is # appropriate for .po files, namely much closer to C style. lines = s.split('\n') if len(lines) == 1: - s = '"' + escape(s) + '"' + s = '"' + escape(s, encoding) + '"' else: if not lines[-1]: del lines[-1] lines[-1] = lines[-1] + '\n' for i in range(len(lines)): - lines[i] = escape(lines[i]) + lines[i] = escape(lines[i], encoding) lineterm = '\\n"\n"' s = '""\n"' + lineterm.join(lines) + '"' return s @@ -447,7 +444,10 @@ class TokenEater: timestamp = time.strftime('%Y-%m-%d %H:%M+%Z') # The time stamp in the header doesn't have the same format as that # generated by xgettext... - print(pot_header % {'time': timestamp, 'version': __version__}, file=fp) + encoding = fp.encoding if fp.encoding else 'UTF-8' + print(pot_header % {'time': timestamp, 'version': __version__, + 'charset': encoding, + 'encoding': '8bit'}, file=fp) # Sort the entries. First sort each particular entry's keys, then # sort all the entries by their first item. reverse = {} @@ -491,7 +491,7 @@ class TokenEater: print(locline, file=fp) if isdocstring: print('#, docstring', file=fp) - print('msgid', normalize(k), file=fp) + print('msgid', normalize(k, encoding), file=fp) print('msgstr ""\n', file=fp) @@ -587,7 +587,7 @@ def main(): fp.close() # calculate escapes - make_escapes(options.escape) + make_escapes(not options.escape) # calculate all keywords options.keywords.extend(default_keywords) @@ -620,17 +620,17 @@ def main(): if filename == '-': if options.verbose: print(_('Reading standard input')) - fp = sys.stdin + fp = sys.stdin.buffer closep = 0 else: if options.verbose: print(_('Working on %s') % filename) - fp = open(filename) + fp = open(filename, 'rb') closep = 1 try: eater.set_filename(filename) try: - tokens = tokenize.generate_tokens(fp.readline) + tokens = tokenize.tokenize(fp.readline) for _token in tokens: eater(*_token) except tokenize.TokenError as e: |