Diffstat (limited to 'Tools/i18n/pygettext.py')
-rwxr-xr-x | Tools/i18n/pygettext.py | 268
1 file changed, 268 insertions(+), 0 deletions(-)
diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py
index ee3ecd2..3542f3f 100755
--- a/Tools/i18n/pygettext.py
+++ b/Tools/i18n/pygettext.py
@@ -1 +1,269 @@
 #! /usr/bin/env python
+
+"""pygettext -- Python equivalent of xgettext(1)
+
+Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
+internationalization of C programs. Most of these tools are independent of
+the programming language and can be used from within Python programs. Martin
+von Loewis' work[1] helps considerably in this regard.
+
+There's one hole though; xgettext is the program that scans source code
+looking for message strings, but it groks only C (or C++). Python introduces
+a few wrinkles, such as dual quoting characters, triple quoted strings, and
+raw strings. xgettext understands none of this.
+
+Enter pygettext, which uses Python's standard tokenize module to scan Python
+source code, generating .pot files identical to what GNU xgettext[2] generates
+for C and C++ code. From there, the standard GNU tools can be used.
+
+A word about marking Python strings as candidates for translation. GNU
+xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
+gettext_noop. But those can be a lot of text to include all over your code.
+C and C++ have a trick: they use the C preprocessor. Most internationalized C
+source includes a #define for gettext() to _() so that what has to be written
+in the source is much less. Thus these are both translatable strings:
+
+    gettext("Translatable String")
+    _("Translatable String")
+
+Python of course has no preprocessor so this doesn't work so well. Thus,
+pygettext searches only for _() by default, but see the -k/--keyword flag
+below for how to augment this.
+
+    [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
+    [2] http://www.gnu.org/software/gettext/gettext.html
+
+
+NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
+where ever possible.
+
+Usage: pygettext [options] filename ...
+
+Options:
+
+    -a
+    --extract-all
+        Extract all strings
+
+    -d default-domain
+    --default-domain=default-domain
+        Rename the default output file from messages.pot to default-domain.pot
+
+    -k [word]
+    --keyword[=word]
+        Additional keywords to look for. Without `word' means not to use the
+        default keywords. The default keywords, which are always looked for
+        if not explicitly disabled: _
+
+        The default keyword list is different than GNU xgettext. You can have
+        multiple -k flags on the command line.
+
+    --no-location
+        Do not write filename/lineno location comments
+
+    -n [style]
+    --add-location[=style]
+        Write filename/lineno location comments indicating where each
+        extracted string is found in the source. These lines appear before
+        each msgid. Two styles are supported:
+
+        Solaris  # File: filename, line: line-number
+        Gnu      #: filename:line
+
+        If style is omitted, Gnu is used. The style name is case
+        insensitive. By default, locations are included.
+
+    --help
+    -h
+        print this help message and exit
+
+"""
+
+import os
+import sys
+import string
+import time
+import getopt
+import tokenize
+
+__version__ = '0.1'
+
+
+
+def usage(code, msg=''):
+    print __doc__ % globals()
+    if msg:
+        print msg
+    sys.exit(code)
+
+
+
+def normalize(s):
+    # This converts the various Python string types into a format that is
+    # appropriate for .po files, namely much closer to C style.
+    #
+    # unwrap quotes, safely
+    s = eval(s, {'__builtins__':{}}, {})
+    # now escape any embedded double quotes
+    parts = []
+    last = 0
+    i = string.find(s, '"')
+    while i >= 0:
+        # find the number of preceding backslashes
+        j = i
+        n = 0
+        while j >= 0 and s[i] == '\\':
+            j = j - 1
+            n = n + 1
+        if (n % 2) == 0:
+            parts.append(s[last:j])
+            parts.append('\\')
+            parts.append(s[j:i])
+        else:
+            parts.append(s[last:i])
+        last = i
+        i = string.find(s, '"', i+1)
+    else:
+        parts.append(s[last:])
+    if parts:
+        return '"' + string.join(parts, '') + '"'
+    else:
+        return '"' + s + '"'
+
+
+
+class TokenEater:
+    def __init__(self, options):
+        self.__options = options
+        self.__messages = {}
+        self.__state = self.__waiting
+        self.__data = []
+        self.__lineno = -1
+
+    def __call__(self, ttype, tstring, stup, etup, line):
+        # dispatch
+        self.__state(ttype, tstring, stup[0])
+
+    def __waiting(self, ttype, tstring, lineno):
+        if ttype == tokenize.NAME and tstring in self.__options.keywords:
+            self.__state = self.__keywordseen
+
+    def __keywordseen(self, ttype, tstring, lineno):
+        if ttype == tokenize.OP and tstring == '(':
+            self.__data = []
+            self.__lineno = lineno
+            self.__state = self.__openseen
+        else:
+            self.__state = self.__waiting
+
+    def __openseen(self, ttype, tstring, lineno):
+        if ttype == tokenize.OP and tstring == ')':
+            # We've seen the last of the translatable strings. Record the
+            # line number of the first line of the strings and update the list
+            # of messages seen. Reset state for the next batch. If there
+            # were no strings inside _(), then just ignore this entry.
+            if self.__data:
+                msg = string.join(self.__data, '')
+                entry = (self.__curfile, self.__lineno)
+                linenos = self.__messages.get(msg)
+                if linenos is None:
+                    self.__messages[msg] = [entry]
+                else:
+                    linenos.append(entry)
+            self.__state = self.__waiting
+        elif ttype == tokenize.STRING:
+            self.__data.append(normalize(tstring))
+        # TBD: should we warn if we seen anything else?
+
+    def set_filename(self, filename):
+        self.__curfile = filename
+
+    def write(self, fp):
+        options = self.__options
+        timestamp = time.ctime(time.time())
+        # common header
+        try:
+            sys.stdout = fp
+            print '# POT file generated by pygettext.py', __version__
+            print '#', timestamp
+            print '#'
+            for k, v in self.__messages.items():
+                for filename, lineno in v:
+                    # location comments are different b/w Solaris and GNU
+                    if options.location == options.SOLARIS:
+                        print '# File: %s,' % filename, 'line: %d' % lineno
+                    elif options.location == options.GNU:
+                        print '#: %s:%d' % (filename, lineno)
+                # TBD: sorting, normalizing
+                print 'msgid', k
+                print 'msgstr '
+                print
+        finally:
+            sys.stdout = sys.__stdout__
+
+
+def main():
+    default_keywords = ['_']
+    try:
+        opts, args = getopt.getopt(
+            sys.argv[1:],
+            'k:d:n:h',
+            ['keyword', 'default-domain', 'help',
+             'add-location=', 'no-location'])
+    except getopt.error, msg:
+        usage(1, msg)
+
+    # for holding option values
+    class Options:
+        # constants
+        GNU = 1
+        SOLARIS = 2
+        # defaults
+        keywords = []
+        outfile = 'messages.pot'
+        location = GNU
+
+    options = Options()
+    locations = {'gnu' : options.GNU,
+                 'solaris' : options.SOLARIS,
+                 }
+
+    # parse options
+    for opt, arg in opts:
+        if opt in ('-h', '--help'):
+            usage(0)
+        elif opt in ('-k', '--keyword'):
+            if arg is None:
+                default_keywords = []
+            options.keywords.append(arg)
+        elif opt in ('-d', '--default-domain'):
+            options.outfile = arg + '.pot'
+        elif opt in ('-n', '--add-location'):
+            if arg is None:
+                arg = 'gnu'
+            try:
+                options.location = locations[string.lower(arg)]
+            except KeyError:
+                usage(1, 'Invalid value for --add-location: ' + arg)
+        elif opt in ('--no-location',):
+            options.location = 0
+
+    # calculate all keywords
+    options.keywords.extend(default_keywords)
+
+    # slurp through all the files
+    eater = TokenEater(options)
+    for filename in args:
+        fp = open(filename)
+        eater.set_filename(filename)
+        tokenize.tokenize(fp.readline, eater)
+        fp.close()
+
+    fp = open(options.outfile, 'w')
+    eater.write(fp)
+    fp.close()
+
+
+
+if __name__ == '__main__':
+    main()
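
For context, the workflow the docstring describes looks roughly like this. The module below is a made-up example (the file name example.py and the function greet are illustrative, not part of this commit); per the options above, running "pygettext.py -d example example.py" would write example.pot with one msgid per _() literal, and the standard library gettext module supplies the _() binding at runtime:

    # example.py -- a hypothetical module marked up for translation
    import gettext

    # gettext.install() binds _ into builtins; without a compiled catalog it
    # falls back to returning the original string unchanged.
    gettext.install('example')

    def greet(name):
        # pygettext extracts only the literal inside _(); formatting happens
        # after translation.
        return _("Hello, %s!") % name

    if __name__ == '__main__':
        print(greet("world"))

(The sketch uses modern Python 3; the script added in this commit targets the Python 1.5 era, hence the print statements and the string module.)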
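
TokenEater is a callback for the old tokenize.tokenize(readline, tokeneater) interface: each token drives a small state machine (waiting, keyword seen, open paren) that collects the string literals found inside _(...). Below is a rough modern-Python sketch of the same idea, offered only as an illustration: the helper name find_messages and the inline sample source are invented here, and unlike TokenEater it neither joins adjacent string fragments nor tracks filenames.

    import ast
    import io
    import tokenize

    def find_messages(source, keywords=('_',)):
        messages = []          # list of (lineno, text)
        state = 'waiting'      # waiting -> keyword -> open
        lineno = None
        for tok in tokenize.generate_tokens(io.StringIO(source).readline):
            if state == 'waiting':
                if tok.type == tokenize.NAME and tok.string in keywords:
                    state = 'keyword'
            elif state == 'keyword':
                if tok.type == tokenize.OP and tok.string == '(':
                    state = 'open'
                    lineno = tok.start[0]
                else:
                    state = 'waiting'
            elif state == 'open':
                if tok.type == tokenize.STRING:
                    # literal_eval unwraps the quoting, much as normalize()'s
                    # restricted eval() does above
                    messages.append((lineno, ast.literal_eval(tok.string)))
                elif tok.type == tokenize.OP and tok.string == ')':
                    state = 'waiting'
        return messages

    print(find_messages('x = _("Translatable String")\n'))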
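
normalize() takes each literal exactly as the tokenizer saw it, eval()s it to recover the actual text, and re-emits it as a C-style double-quoted string for the .pot file. The condensed sketch below illustrates that round trip under simpler assumptions: it escapes backslashes and quotes wholesale instead of reproducing normalize()'s backslash counting, and po_string is a made-up name.

    import ast

    def po_string(source_literal):
        # unwrap whatever Python quoting style the source used
        text = ast.literal_eval(source_literal)
        # re-quote in the C-like form .po files expect
        escaped = text.replace('\\', '\\\\').replace('"', '\\"')
        return '"%s"' % escaped

    print(po_string("'He said \"hi\"'"))   # -> "He said \"hi\""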