summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBarry Warsaw <barry@python.org>1999-08-13 20:59:48 (GMT)
committerBarry Warsaw <barry@python.org>1999-08-13 20:59:48 (GMT)
commite27db5a3938b64a1e72c853285aa02c8c60c45d2 (patch)
treeba6c377440c21ecbd730873743c15b7268438eee
parent31e5e37196260c58a57f780a688b27b25ccae1bb (diff)
downloadcpython-e27db5a3938b64a1e72c853285aa02c8c60c45d2.zip
cpython-e27db5a3938b64a1e72c853285aa02c8c60c45d2.tar.gz
cpython-e27db5a3938b64a1e72c853285aa02c8c60c45d2.tar.bz2
Initial implementation
-rwxr-xr-xTools/i18n/pygettext.py268
1 files changed, 268 insertions, 0 deletions
diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py
index ee3ecd2..3542f3f 100755
--- a/Tools/i18n/pygettext.py
+++ b/Tools/i18n/pygettext.py
@@ -1 +1,269 @@
#! /usr/bin/env python
+
+"""pygettext -- Python equivalent of xgettext(1)
+
+Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
+internationalization of C programs. Most of these tools are independent of
+the programming language and can be used from within Python programs. Martin
+von Loewis' work[1] helps considerably in this regard.
+
+There's one hole though; xgettext is the program that scans source code
+looking for message strings, but it groks only C (or C++). Python introduces
+a few wrinkles, such as dual quoting characters, triple quoted strings, and
+raw strings. xgettext understands none of this.
+
+Enter pygettext, which uses Python's standard tokenize module to scan Python
+source code, generating .pot files identical to what GNU xgettext[2] generates
+for C and C++ code. From there, the standard GNU tools can be used.
+
+A word about marking Python strings as candidates for translation. GNU
+xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
+gettext_noop. But those can be a lot of text to include all over your code.
+C and C++ have a trick: they use the C preprocessor. Most internationalized C
+source includes a #define for gettext() to _() so that what has to be written
+in the source is much less. Thus these are both translatable strings:
+
+ gettext("Translatable String")
+ _("Translatable String")
+
+Python of course has no preprocessor so this doesn't work so well. Thus,
+pygettext searches only for _() by default, but see the -k/--keyword flag
+below for how to augment this.
+
+ [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
+ [2] http://www.gnu.org/software/gettext/gettext.html
+
+
+NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
+wherever possible.
+
+Usage: pygettext [options] filename ...
+
+Options:
+
+ -a
+ --extract-all
+ Extract all strings
+
+ -d default-domain
+ --default-domain=default-domain
+ Rename the default output file from messages.pot to default-domain.pot
+
+ -k [word]
+ --keyword[=word]
+ Additional keywords to look for. If `word' is omitted, the default
+ keywords are not used. The default keywords, which are always looked
+ for unless explicitly disabled: _
+
+ The default keyword list differs from that of GNU xgettext. You can
+ have multiple -k flags on the command line.
+
+ --no-location
+ Do not write filename/lineno location comments
+
+ -n [style]
+ --add-location[=style]
+ Write filename/lineno location comments indicating where each
+ extracted string is found in the source. These lines appear before
+ each msgid. Two styles are supported:
+
+ Solaris # File: filename, line: line-number
+ Gnu #: filename:line
+
+ If style is omitted, Gnu is used. The style name is case
+ insensitive. By default, locations are included.
+
+ --help
+ -h
+ print this help message and exit
+
+"""
+
+import os
+import sys
+import string
+import time
+import getopt
+import tokenize
+
+__version__ = '0.1'
+
+
+
+def usage(code, msg=''):
+    # Print the module docstring (the help text at the top of this file,
+    # %-interpolated against the module globals), then msg if one was
+    # given, and finally exit the process with status `code'.
+    output = [__doc__ % globals()]
+    if msg:
+        output.append(msg)
+    for text in output:
+        print text
+    sys.exit(code)
+
+
+
+def normalize(s):
+    # Convert the source text of one Python string literal (quotes and any
+    # prefix included, e.g. a tokenize STRING token) into a double-quoted,
+    # C-style string that is appropriate for .po files.
+    #
+    # Returns the escaped text wrapped in double quotes.
+    #
+    # Unwrap the quotes by evaluating the literal.  The builtins are
+    # emptied out so the expression cannot call anything.
+    # NOTE(review): eval() is only acceptable here because the caller hands
+    # in string-literal tokens; never pass arbitrary text to this function.
+    s = eval(s, {'__builtins__':{}}, {})
+    # After eval() every character in s is literal, so a backslash never
+    # "protects" the quote that follows it.  Escape each special character
+    # on its own: backslash first in the table for clarity, then the double
+    # quote that would otherwise end the .po string, then the control
+    # characters that may not appear raw inside a one-line .po string.
+    escapes = {'\\': '\\\\',
+               '"' : '\\"',
+               '\n': '\\n',
+               '\t': '\\t',
+               '\r': '\\r',
+               }
+    parts = []
+    for ch in s:
+        parts.append(escapes.get(ch, ch))
+    return '"' + string.join(parts, '') + '"'
+
+
+
+class TokenEater:
+    # Token consumer for tokenize.tokenize().  It is a small state machine
+    # that collects the string literals found between the parentheses that
+    # follow a keyword (e.g. _("...")) and remembers where each message was
+    # seen.  write() dumps everything collected so far as .pot entries.
+    #
+    # `options' must provide `keywords' (the names to look for), `location'
+    # and the GNU/SOLARIS constants that `location' is compared against.
+    def __init__(self, options):
+        self.__options = options
+        # maps message text -> list of (filename, lineno) entries
+        self.__messages = {}
+        # handler for the next token; always one of the state methods below
+        self.__state = self.__waiting
+        # normalized string pieces of the call currently being scanned
+        self.__data = []
+        self.__lineno = -1
+        # Normally set through set_filename().  Defined here as well so
+        # that feeding tokens before set_filename() records None instead of
+        # raising AttributeError.
+        self.__curfile = None
+
+    def __call__(self, ttype, tstring, stup, etup, line):
+        # Called once per token: dispatch the token type, its text and the
+        # first element of its start position to the current state handler.
+        self.__state(ttype, tstring, stup[0])
+
+    def __waiting(self, ttype, tstring, lineno):
+        # Idle state: look for a name that is one of the keywords.
+        if ttype == tokenize.NAME and tstring in self.__options.keywords:
+            self.__state = self.__keywordseen
+
+    def __keywordseen(self, ttype, tstring, lineno):
+        # A keyword was just seen; it only counts when an open parenthesis
+        # follows immediately, otherwise go back to waiting.
+        if ttype == tokenize.OP and tstring == '(':
+            self.__data = []
+            self.__lineno = lineno
+            self.__state = self.__openseen
+        else:
+            self.__state = self.__waiting
+
+    def __openseen(self, ttype, tstring, lineno):
+        if ttype == tokenize.OP and tstring == ')':
+            # We've seen the last of the translatable strings.  Record the
+            # line number of the first line of the strings and update the
+            # list of messages seen.  Reset state for the next batch.  If
+            # there were no strings inside _(), then just ignore this entry.
+            if self.__data:
+                msg = string.join(self.__data, '')
+                entry = (self.__curfile, self.__lineno)
+                linenos = self.__messages.get(msg)
+                if linenos is None:
+                    self.__messages[msg] = [entry]
+                else:
+                    linenos.append(entry)
+            self.__state = self.__waiting
+        elif ttype == tokenize.STRING:
+            self.__data.append(normalize(tstring))
+        # TBD: should we warn if we seen anything else?
+
+    def set_filename(self, filename):
+        # Name recorded in the location entry of messages found from now on.
+        self.__curfile = filename
+
+    def write(self, fp):
+        # Write a header followed by one entry per collected message to the
+        # file-like object fp.  Output goes through fp.write() directly, so
+        # sys.stdout is never touched and a redirection set up by the
+        # caller survives this call.
+        options = self.__options
+        timestamp = time.ctime(time.time())
+        emit = fp.write
+        # common header
+        emit('# POT file generated by pygettext.py %s\n' % __version__)
+        emit('# %s\n' % timestamp)
+        emit('#\n')
+        for k, v in self.__messages.items():
+            for filename, lineno in v:
+                # location comments are different b/w Solaris and GNU; any
+                # other value of options.location writes no comment at all
+                if options.location == options.SOLARIS:
+                    emit('# File: %s, line: %d\n' % (filename, lineno))
+                elif options.location == options.GNU:
+                    emit('#: %s:%d\n' % (filename, lineno))
+            # TBD: sorting, normalizing
+            emit('msgid %s\n' % k)
+            # an untranslated entry still needs an (empty) quoted string
+            emit('msgstr ""\n')
+            emit('\n')
+
+
+def main():
+    # Command line entry point: parse the options, run every file named on
+    # the command line through a single TokenEater, then write the messages
+    # it collected to the output .pot file.
+    default_keywords = ['_']
+    try:
+        # A trailing ':' (short) or '=' (long) marks an option that takes a
+        # value; without it getopt rejects e.g. --default-domain=foo.
+        # NOTE(review): -a/--extract-all from the help text is not accepted
+        # here and is reported as an error by getopt.
+        opts, args = getopt.getopt(
+            sys.argv[1:],
+            'k:d:n:h',
+            ['keyword=', 'default-domain=', 'help',
+             'add-location=', 'no-location'])
+    except getopt.error, msg:
+        usage(1, msg)
+
+    # for holding option values
+    class Options:
+        # constants
+        GNU = 1
+        SOLARIS = 2
+        # defaults
+        keywords = []
+        outfile = 'messages.pot'
+        location = GNU
+
+    options = Options()
+    locations = {'gnu' : options.GNU,
+                 'solaris' : options.SOLARIS,
+                 }
+
+    # parse options
+    for opt, arg in opts:
+        if opt in ('-h', '--help'):
+            usage(0)
+        elif opt in ('-k', '--keyword'):
+            # getopt hands over '' (never None) when the value is empty,
+            # e.g. --keyword= ; that form switches the default keywords off
+            # instead of registering an empty keyword.
+            if not arg:
+                default_keywords = []
+            else:
+                options.keywords.append(arg)
+        elif opt in ('-d', '--default-domain'):
+            options.outfile = arg + '.pot'
+        elif opt in ('-n', '--add-location'):
+            # an empty style falls back to Gnu
+            if not arg:
+                arg = 'gnu'
+            try:
+                options.location = locations[string.lower(arg)]
+            except KeyError:
+                usage(1, 'Invalid value for --add-location: ' + arg)
+        elif opt in ('--no-location',):
+            # neither GNU nor SOLARIS, so no location comments are written
+            options.location = 0
+
+    # calculate all keywords
+    options.keywords.extend(default_keywords)
+
+    # slurp through all the files
+    eater = TokenEater(options)
+    for filename in args:
+        fp = open(filename)
+        try:
+            eater.set_filename(filename)
+            tokenize.tokenize(fp.readline, eater)
+        finally:
+            # close the input even when tokenizing raises
+            fp.close()
+
+    fp = open(options.outfile, 'w')
+    try:
+        eater.write(fp)
+    finally:
+        fp.close()
+
+
+
+# Run the extractor only when this file is executed as a script, not when
+# it is imported as a module.
+if __name__ == '__main__':
+ main()