summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Misc/NEWS3
-rwxr-xr-xTools/i18n/pygettext.py66
2 files changed, 36 insertions, 33 deletions
diff --git a/Misc/NEWS b/Misc/NEWS
index 5b56d0c..7d3fc94 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -169,6 +169,9 @@ Core and Builtins
Library
-------
+- Issue #17156: pygettext.py now uses the encoding of the source file and
+ correctly writes and escapes non-ASCII characters.
+
- Issue #16564: Fixed regression relative to Python2 in the operation of
email.encoders.encode_noop when used with binary data.
diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py
index 93b2b79..79d976b 100755
--- a/Tools/i18n/pygettext.py
+++ b/Tools/i18n/pygettext.py
@@ -188,8 +188,8 @@ msgstr ""
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
-"Content-Type: text/plain; charset=CHARSET\\n"
-"Content-Transfer-Encoding: ENCODING\\n"
+"Content-Type: text/plain; charset=%(charset)s\\n"
+"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"
''')
@@ -203,35 +203,32 @@ def usage(code, msg=''):
-escapes = []
-
-def make_escapes(pass_iso8859):
- global escapes
- if pass_iso8859:
- # Allow iso-8859 characters to pass through so that e.g. 'msgid
+def make_escapes(pass_nonascii):
+ global escapes, escape
+ if pass_nonascii:
+ # Allow non-ascii characters to pass through so that e.g. 'msgid
# "Höhe"' would result not result in 'msgid "H\366he"'. Otherwise we
# escape any character outside the 32..126 range.
mod = 128
+ escape = escape_ascii
else:
mod = 256
- for i in range(256):
- if 32 <= (i % mod) <= 126:
- escapes.append(chr(i))
- else:
- escapes.append("\\%03o" % i)
- escapes[ord('\\')] = '\\\\'
- escapes[ord('\t')] = '\\t'
- escapes[ord('\r')] = '\\r'
- escapes[ord('\n')] = '\\n'
- escapes[ord('\"')] = '\\"'
+ escape = escape_nonascii
+ escapes = [r"\%03o" % i for i in range(mod)]
+ for i in range(32, 127):
+ escapes[i] = chr(i)
+ escapes[ord('\\')] = r'\\'
+ escapes[ord('\t')] = r'\t'
+ escapes[ord('\r')] = r'\r'
+ escapes[ord('\n')] = r'\n'
+ escapes[ord('\"')] = r'\"'
+
+def escape_ascii(s, encoding):
+ return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s)
-def escape(s):
- global escapes
- s = list(s)
- for i in range(len(s)):
- s[i] = escapes[ord(s[i])]
- return EMPTYSTRING.join(s)
+def escape_nonascii(s, encoding):
+ return ''.join(escapes[b] for b in s.encode(encoding))
def safe_eval(s):
@@ -239,18 +236,18 @@ def safe_eval(s):
return eval(s, {'__builtins__':{}}, {})
-def normalize(s):
+def normalize(s, encoding):
# This converts the various Python string types into a format that is
# appropriate for .po files, namely much closer to C style.
lines = s.split('\n')
if len(lines) == 1:
- s = '"' + escape(s) + '"'
+ s = '"' + escape(s, encoding) + '"'
else:
if not lines[-1]:
del lines[-1]
lines[-1] = lines[-1] + '\n'
for i in range(len(lines)):
- lines[i] = escape(lines[i])
+ lines[i] = escape(lines[i], encoding)
lineterm = '\\n"\n"'
s = '""\n"' + lineterm.join(lines) + '"'
return s
@@ -447,7 +444,10 @@ class TokenEater:
timestamp = time.strftime('%Y-%m-%d %H:%M+%Z')
# The time stamp in the header doesn't have the same format as that
# generated by xgettext...
- print(pot_header % {'time': timestamp, 'version': __version__}, file=fp)
+ encoding = fp.encoding if fp.encoding else 'UTF-8'
+ print(pot_header % {'time': timestamp, 'version': __version__,
+ 'charset': encoding,
+ 'encoding': '8bit'}, file=fp)
# Sort the entries. First sort each particular entry's keys, then
# sort all the entries by their first item.
reverse = {}
@@ -491,7 +491,7 @@ class TokenEater:
print(locline, file=fp)
if isdocstring:
print('#, docstring', file=fp)
- print('msgid', normalize(k), file=fp)
+ print('msgid', normalize(k, encoding), file=fp)
print('msgstr ""\n', file=fp)
@@ -587,7 +587,7 @@ def main():
fp.close()
# calculate escapes
- make_escapes(options.escape)
+ make_escapes(not options.escape)
# calculate all keywords
options.keywords.extend(default_keywords)
@@ -620,17 +620,17 @@ def main():
if filename == '-':
if options.verbose:
print(_('Reading standard input'))
- fp = sys.stdin
+ fp = sys.stdin.buffer
closep = 0
else:
if options.verbose:
print(_('Working on %s') % filename)
- fp = open(filename)
+ fp = open(filename, 'rb')
closep = 1
try:
eater.set_filename(filename)
try:
- tokens = tokenize.generate_tokens(fp.readline)
+ tokens = tokenize.tokenize(fp.readline)
for _token in tokens:
eater(*_token)
except tokenize.TokenError as e: