Issue #17156: pygettext.py now uses the encoding of the source file and correctly

writes and escapes non-ASCII characters.
author: Serhiy Storchaka <storchaka@gmail.com> 2013-02-09 20:38:12 (GMT)
committer: Serhiy Storchaka <storchaka@gmail.com> 2013-02-09 20:38:12 (GMT)
commit: 859cd4723f07e2b1da7387f5be0f2ce0a195974d (patch)
tree: 500f0eddeedff2c61e7da96b603a64d851138d65 /Tools
parent: 7451a72e2ba8939215324387e36285725632e637 (diff)
parent: b6ed17344b456f397df800cc553fef94f5b1e58b (diff)
download: cpython-859cd4723f07e2b1da7387f5be0f2ce0a195974d.zip
cpython-859cd4723f07e2b1da7387f5be0f2ce0a195974d.tar.gz
cpython-859cd4723f07e2b1da7387f5be0f2ce0a195974d.tar.bz2
1 files changed, 33 insertions, 33 deletions
diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py
index 93b2b79..79d976b 100755
--- a/Tools/i18n/pygettext.py
+++ b/Tools/i18n/pygettext.py
@@ -188,8 +188,8 @@ msgstr ""
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
 "Language-Team: LANGUAGE <LL@li.org>\\n"
 "MIME-Version: 1.0\\n"
-"Content-Type: text/plain; charset=CHARSET\\n"
-"Content-Transfer-Encoding: ENCODING\\n"
+"Content-Type: text/plain; charset=%(charset)s\\n"
+"Content-Transfer-Encoding: %(encoding)s\\n"
 "Generated-By: pygettext.py %(version)s\\n"
 
 ''')
@@ -203,35 +203,32 @@ def usage(code, msg=''):
 
 
 
-escapes = []
-
-def make_escapes(pass_iso8859):
-    global escapes
-    if pass_iso8859:
-        # Allow iso-8859 characters to pass through so that e.g. 'msgid
+def make_escapes(pass_nonascii):
+    global escapes, escape
+    if pass_nonascii:
+        # Allow non-ascii characters to pass through so that e.g. 'msgid
         # "Höhe"' would result not result in 'msgid "H\366he"'.  Otherwise we
         # escape any character outside the 32..126 range.
         mod = 128
+        escape = escape_ascii
     else:
         mod = 256
-    for i in range(256):
-        if 32 <= (i % mod) <= 126:
-            escapes.append(chr(i))
-        else:
-            escapes.append("\\%03o" % i)
-    escapes[ord('\\')] = '\\\\'
-    escapes[ord('\t')] = '\\t'
-    escapes[ord('\r')] = '\\r'
-    escapes[ord('\n')] = '\\n'
-    escapes[ord('\"')] = '\\"'
+        escape = escape_nonascii
+    escapes = [r"\%03o" % i for i in range(mod)]
+    for i in range(32, 127):
+        escapes[i] = chr(i)
+    escapes[ord('\\')] = r'\\'
+    escapes[ord('\t')] = r'\t'
+    escapes[ord('\r')] = r'\r'
+    escapes[ord('\n')] = r'\n'
+    escapes[ord('\"')] = r'\"'
+
 
+def escape_ascii(s, encoding):
+    return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s)
 
-def escape(s):
-    global escapes
-    s = list(s)
-    for i in range(len(s)):
-        s[i] = escapes[ord(s[i])]
-    return EMPTYSTRING.join(s)
+def escape_nonascii(s, encoding):
+    return ''.join(escapes[b] for b in s.encode(encoding))
 
 
 def safe_eval(s):
@@ -239,18 +236,18 @@ def safe_eval(s):
     return eval(s, {'__builtins__':{}}, {})
 
 
-def normalize(s):
+def normalize(s, encoding):
     # This converts the various Python string types into a format that is
     # appropriate for .po files, namely much closer to C style.
     lines = s.split('\n')
     if len(lines) == 1:
-        s = '"' + escape(s) + '"'
+        s = '"' + escape(s, encoding) + '"'
     else:
         if not lines[-1]:
             del lines[-1]
             lines[-1] = lines[-1] + '\n'
         for i in range(len(lines)):
-            lines[i] = escape(lines[i])
+            lines[i] = escape(lines[i], encoding)
         lineterm = '\\n"\n"'
         s = '""\n"' + lineterm.join(lines) + '"'
     return s
@@ -447,7 +444,10 @@ class TokenEater:
         timestamp = time.strftime('%Y-%m-%d %H:%M+%Z')
         # The time stamp in the header doesn't have the same format as that
         # generated by xgettext...
-        print(pot_header % {'time': timestamp, 'version': __version__}, file=fp)
+        encoding = fp.encoding if fp.encoding else 'UTF-8'
+        print(pot_header % {'time': timestamp, 'version': __version__,
+                            'charset': encoding,
+                            'encoding': '8bit'}, file=fp)
         # Sort the entries.  First sort each particular entry's keys, then
         # sort all the entries by their first item.
         reverse = {}
@@ -491,7 +491,7 @@ class TokenEater:
                         print(locline, file=fp)
                 if isdocstring:
                     print('#, docstring', file=fp)
-                print('msgid', normalize(k), file=fp)
+                print('msgid', normalize(k, encoding), file=fp)
                 print('msgstr ""\n', file=fp)
 
 
@@ -587,7 +587,7 @@ def main():
                 fp.close()
 
     # calculate escapes
-    make_escapes(options.escape)
+    make_escapes(not options.escape)
 
     # calculate all keywords
     options.keywords.extend(default_keywords)
@@ -620,17 +620,17 @@ def main():
         if filename == '-':
             if options.verbose:
                 print(_('Reading standard input'))
-            fp = sys.stdin
+            fp = sys.stdin.buffer
             closep = 0
         else:
             if options.verbose:
                 print(_('Working on %s') % filename)
-            fp = open(filename)
+            fp = open(filename, 'rb')
             closep = 1
         try:
             eater.set_filename(filename)
             try:
-                tokens = tokenize.generate_tokens(fp.readline)
+                tokens = tokenize.tokenize(fp.readline)
                 for _token in tokens:
                     eater(*_token)
             except tokenize.TokenError as e:
author	Serhiy Storchaka <storchaka@gmail.com>	2013-02-09 20:38:12 (GMT)
committer	Serhiy Storchaka <storchaka@gmail.com>	2013-02-09 20:38:12 (GMT)
commit	859cd4723f07e2b1da7387f5be0f2ce0a195974d (patch)
tree	500f0eddeedff2c61e7da96b603a64d851138d65 /Tools
parent	7451a72e2ba8939215324387e36285725632e637 (diff)
parent	b6ed17344b456f397df800cc553fef94f5b1e58b (diff)
download	cpython-859cd4723f07e2b1da7387f5be0f2ce0a195974d.zip cpython-859cd4723f07e2b1da7387f5be0f2ce0a195974d.tar.gz cpython-859cd4723f07e2b1da7387f5be0f2ce0a195974d.tar.bz2