diff options
-rw-r--r-- | Misc/NEWS | 3 | ||||
-rwxr-xr-x | Tools/i18n/pygettext.py | 66 |
2 files changed, 36 insertions, 33 deletions
@@ -169,6 +169,9 @@ Core and Builtins
 Library
 -------

+- Issue #17156: pygettext.py now uses an encoding of source file and correctly
+  writes and escapes non-ascii characters.
+
 - Issue #16564: Fixed regression relative to Python2 in the operation of
   email.encoders.encode_noop when used with binary data.

diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py
index 93b2b79..79d976b 100755
--- a/Tools/i18n/pygettext.py
+++ b/Tools/i18n/pygettext.py
@@ -188,8 +188,8 @@ msgstr ""
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
 "Language-Team: LANGUAGE <LL@li.org>\\n"
 "MIME-Version: 1.0\\n"
-"Content-Type: text/plain; charset=CHARSET\\n"
-"Content-Transfer-Encoding: ENCODING\\n"
+"Content-Type: text/plain; charset=%(charset)s\\n"
+"Content-Transfer-Encoding: %(encoding)s\\n"
 "Generated-By: pygettext.py %(version)s\\n"

 ''')
@@ -203,35 +203,32 @@ def usage(code, msg=''):



-escapes = []
-
-def make_escapes(pass_iso8859):
-    global escapes
-    if pass_iso8859:
-        # Allow iso-8859 characters to pass through so that e.g. 'msgid
+def make_escapes(pass_nonascii):
+    global escapes, escape
+    if pass_nonascii:
+        # Allow non-ascii characters to pass through so that e.g. 'msgid
         # "Höhe"' would result not result in 'msgid "H\366he"'. Otherwise we
         # escape any character outside the 32..126 range.
         mod = 128
+        escape = escape_ascii
     else:
         mod = 256
-    for i in range(256):
-        if 32 <= (i % mod) <= 126:
-            escapes.append(chr(i))
-        else:
-            escapes.append("\\%03o" % i)
-    escapes[ord('\\')] = '\\\\'
-    escapes[ord('\t')] = '\\t'
-    escapes[ord('\r')] = '\\r'
-    escapes[ord('\n')] = '\\n'
-    escapes[ord('\"')] = '\\"'
+        escape = escape_nonascii
+    escapes = [r"\%03o" % i for i in range(mod)]
+    for i in range(32, 127):
+        escapes[i] = chr(i)
+    escapes[ord('\\')] = r'\\'
+    escapes[ord('\t')] = r'\t'
+    escapes[ord('\r')] = r'\r'
+    escapes[ord('\n')] = r'\n'
+    escapes[ord('\"')] = r'\"'
+
+def escape_ascii(s, encoding):
+    return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s)


-def escape(s):
-    global escapes
-    s = list(s)
-    for i in range(len(s)):
-        s[i] = escapes[ord(s[i])]
-    return EMPTYSTRING.join(s)
+def escape_nonascii(s, encoding):
+    return ''.join(escapes[b] for b in s.encode(encoding))


 def safe_eval(s):
@@ -239,18 +236,18 @@ def safe_eval(s):
     return eval(s, {'__builtins__':{}}, {})


-def normalize(s):
+def normalize(s, encoding):
     # This converts the various Python string types into a format that is
     # appropriate for .po files, namely much closer to C style.
     lines = s.split('\n')
     if len(lines) == 1:
-        s = '"' + escape(s) + '"'
+        s = '"' + escape(s, encoding) + '"'
     else:
         if not lines[-1]:
             del lines[-1]
             lines[-1] = lines[-1] + '\n'
         for i in range(len(lines)):
-            lines[i] = escape(lines[i])
+            lines[i] = escape(lines[i], encoding)
         lineterm = '\\n"\n"'
         s = '""\n"' + lineterm.join(lines) + '"'
     return s
@@ -447,7 +444,10 @@ class TokenEater:
         timestamp = time.strftime('%Y-%m-%d %H:%M+%Z')
         # The time stamp in the header doesn't have the same format as that
         # generated by xgettext...
-        print(pot_header % {'time': timestamp, 'version': __version__}, file=fp)
+        encoding = fp.encoding if fp.encoding else 'UTF-8'
+        print(pot_header % {'time': timestamp, 'version': __version__,
+                            'charset': encoding,
+                            'encoding': '8bit'}, file=fp)
         # Sort the entries. First sort each particular entry's keys, then
         # sort all the entries by their first item.
         reverse = {}
@@ -491,7 +491,7 @@ class TokenEater:
                         print(locline, file=fp)
                 if isdocstring:
                     print('#, docstring', file=fp)
-                print('msgid', normalize(k), file=fp)
+                print('msgid', normalize(k, encoding), file=fp)
                 print('msgstr ""\n', file=fp)


@@ -587,7 +587,7 @@ def main():
             fp.close()

     # calculate escapes
-    make_escapes(options.escape)
+    make_escapes(not options.escape)

     # calculate all keywords
     options.keywords.extend(default_keywords)
@@ -620,17 +620,17 @@ def main():
         if filename == '-':
             if options.verbose:
                 print(_('Reading standard input'))
-            fp = sys.stdin
+            fp = sys.stdin.buffer
             closep = 0
         else:
             if options.verbose:
                 print(_('Working on %s') % filename)
-            fp = open(filename)
+            fp = open(filename, 'rb')
             closep = 1
         try:
             eater.set_filename(filename)
             try:
-                tokens = tokenize.generate_tokens(fp.readline)
+                tokens = tokenize.tokenize(fp.readline)
                 for _token in tokens:
                     eater(*_token)
             except tokenize.TokenError as e: