diff options
author | Tomas R. <tomas.roun8@gmail.com> | 2024-11-22 14:52:16 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-11-22 14:52:16 (GMT) |
commit | 0a1944cda8504ba0478a51075eba540576570336 (patch) | |
tree | ba65890fded973497dc94915c277e56a001fef2b | |
parent | f83ca6962af973fff6a3124f4bd3d45fea4dd5b8 (diff) | |
download | cpython-0a1944cda8504ba0478a51075eba540576570336.zip cpython-0a1944cda8504ba0478a51075eba540576570336.tar.gz cpython-0a1944cda8504ba0478a51075eba540576570336.tar.bz2 |
gh-126700: pygettext: Support more gettext functions (GH-126912)
Support multi-argument gettext functions: ngettext(), pgettext(), dgettext(), etc.
-rw-r--r-- | Lib/test/test_tools/i18n_data/messages.pot | 46 | ||||
-rw-r--r-- | Lib/test/test_tools/i18n_data/messages.py | 52 | ||||
-rw-r--r-- | Lib/test/test_tools/test_i18n.py | 4 | ||||
-rw-r--r-- | Lib/test/translationdata/argparse/msgids.txt | 2 | ||||
-rw-r--r-- | Lib/test/translationdata/optparse/msgids.txt | 1 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Tools-Demos/2024-11-16-20-47-20.gh-issue-126700.ayrHv4.rst | 1 | ||||
-rwxr-xr-x | Tools/i18n/pygettext.py | 244 |
7 files changed, 260 insertions, 90 deletions
diff --git a/Lib/test/test_tools/i18n_data/messages.pot b/Lib/test/test_tools/i18n_data/messages.pot index ddfbd18..8d66fbc 100644 --- a/Lib/test/test_tools/i18n_data/messages.pot +++ b/Lib/test/test_tools/i18n_data/messages.pot @@ -15,53 +15,75 @@ msgstr "" "Generated-By: pygettext.py 1.5\n" -#: messages.py:5 +#: messages.py:16 msgid "" msgstr "" -#: messages.py:8 messages.py:9 +#: messages.py:19 messages.py:20 msgid "parentheses" msgstr "" -#: messages.py:12 +#: messages.py:23 msgid "Hello, world!" msgstr "" -#: messages.py:15 +#: messages.py:26 msgid "" "Hello,\n" " multiline!\n" msgstr "" -#: messages.py:29 +#: messages.py:46 messages.py:89 messages.py:90 messages.py:93 messages.py:94 +#: messages.py:99 +msgid "foo" +msgid_plural "foos" +msgstr[0] "" +msgstr[1] "" + +#: messages.py:47 +msgid "something" +msgstr "" + +#: messages.py:50 msgid "Hello, {}!" msgstr "" -#: messages.py:33 +#: messages.py:54 msgid "1" msgstr "" -#: messages.py:33 +#: messages.py:54 msgid "2" msgstr "" -#: messages.py:34 messages.py:35 +#: messages.py:55 messages.py:56 msgid "A" msgstr "" -#: messages.py:34 messages.py:35 +#: messages.py:55 messages.py:56 msgid "B" msgstr "" -#: messages.py:36 +#: messages.py:57 msgid "set" msgstr "" -#: messages.py:42 +#: messages.py:63 msgid "nested string" msgstr "" -#: messages.py:47 +#: messages.py:68 msgid "baz" msgstr "" +#: messages.py:91 messages.py:92 messages.py:95 messages.py:96 +msgctxt "context" +msgid "foo" +msgid_plural "foos" +msgstr[0] "" +msgstr[1] "" + +#: messages.py:100 +msgid "domain foo" +msgstr "" + diff --git a/Lib/test/test_tools/i18n_data/messages.py b/Lib/test/test_tools/i18n_data/messages.py index f220294..1e03f4e 100644 --- a/Lib/test/test_tools/i18n_data/messages.py +++ b/Lib/test/test_tools/i18n_data/messages.py @@ -1,5 +1,16 @@ # Test message extraction -from gettext import gettext as _ +from gettext import ( + gettext, + ngettext, + pgettext, + npgettext, + dgettext, + dngettext, + dpgettext, + dnpgettext +) + +_ = gettext # Empty string _("") @@ -21,13 +32,23 @@ _() _(None) _(1) _(False) -_(x="kwargs are not allowed") +_(("invalid")) +_(["invalid"]) +_({"invalid"}) +_("string"[3]) +_("string"[:3]) +_({"string": "foo"}) + +# pygettext does not allow keyword arguments, but both xgettext and pybabel do +_(x="kwargs work!") + +# Unusual, but valid arguments _("foo", "bar") _("something", x="something else") # .format() _("Hello, {}!").format("world") # valid -_("Hello, {}!".format("world")) # invalid +_("Hello, {}!".format("world")) # invalid, but xgettext and pybabel extract the first string # Nested structures _("1"), _("2") @@ -62,3 +83,28 @@ def _(x): def _(x="don't extract me"): pass + + +# Other gettext functions +gettext("foo") +ngettext("foo", "foos", 1) +pgettext("context", "foo") +npgettext("context", "foo", "foos", 1) +dgettext("domain", "foo") +dngettext("domain", "foo", "foos", 1) +dpgettext("domain", "context", "foo") +dnpgettext("domain", "context", "foo", "foos", 1) + +# Complex arguments +ngettext("foo", "foos", 42 + (10 - 20)) +dgettext(["some", {"complex"}, ("argument",)], "domain foo") + +# Invalid calls which are not extracted +gettext() +ngettext('foo') +pgettext('context') +npgettext('context', 'foo') +dgettext('domain') +dngettext('domain', 'foo') +dpgettext('domain', 'context') +dnpgettext('domain', 'context', 'foo') diff --git a/Lib/test/test_tools/test_i18n.py b/Lib/test/test_tools/test_i18n.py index 6f71f09..29c3423 100644 --- a/Lib/test/test_tools/test_i18n.py +++ b/Lib/test/test_tools/test_i18n.py @@ -332,14 +332,14 @@ class Test_pygettext(unittest.TestCase): msgids = self.extract_docstrings_from_str(dedent('''\ f"{_('foo', 'bar')}" ''')) - self.assertNotIn('foo', msgids) + self.assertIn('foo', msgids) self.assertNotIn('bar', msgids) def test_calls_in_fstring_with_keyword_args(self): msgids = self.extract_docstrings_from_str(dedent('''\ f"{_('foo', bar='baz')}" ''')) - self.assertNotIn('foo', msgids) + self.assertIn('foo', msgids) self.assertNotIn('bar', msgids) self.assertNotIn('baz', msgids) diff --git a/Lib/test/translationdata/argparse/msgids.txt b/Lib/test/translationdata/argparse/msgids.txt index 2b01290..ae89ac7 100644 --- a/Lib/test/translationdata/argparse/msgids.txt +++ b/Lib/test/translationdata/argparse/msgids.txt @@ -8,6 +8,8 @@ argument %(argument_name)s: %(message)s argument '%(argument_name)s' is deprecated can't open '%(filename)s': %(error)s command '%(parser_name)s' is deprecated +conflicting option string: %s +expected %s argument expected at least one argument expected at most one argument expected one argument diff --git a/Lib/test/translationdata/optparse/msgids.txt b/Lib/test/translationdata/optparse/msgids.txt index ac5317c..8f405a2 100644 --- a/Lib/test/translationdata/optparse/msgids.txt +++ b/Lib/test/translationdata/optparse/msgids.txt @@ -1,3 +1,4 @@ +%(option)s option requires %(number)d argument %prog [options] %s option does not take a value Options diff --git a/Misc/NEWS.d/next/Tools-Demos/2024-11-16-20-47-20.gh-issue-126700.ayrHv4.rst b/Misc/NEWS.d/next/Tools-Demos/2024-11-16-20-47-20.gh-issue-126700.ayrHv4.rst new file mode 100644 index 0000000..c08ad9d --- /dev/null +++ b/Misc/NEWS.d/next/Tools-Demos/2024-11-16-20-47-20.gh-issue-126700.ayrHv4.rst @@ -0,0 +1 @@ +Add support for multi-argument :mod:`gettext` functions in :program:`pygettext.py`. diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index 0d16e8f..f78ff16 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -163,16 +163,13 @@ import glob import time import getopt import ast -import token import tokenize +from collections import defaultdict +from dataclasses import dataclass, field +from operator import itemgetter __version__ = '1.5' -default_keywords = ['_'] -DEFAULTKEYWORDS = ', '.join(default_keywords) - -EMPTYSTRING = '' - # The normal pot-file header. msgmerge and Emacs's po-mode work better if it's # there. @@ -306,12 +303,64 @@ def getFilesForName(name): return [] +# Key is the function name, value is a dictionary mapping argument positions to the +# type of the argument. The type is one of 'msgid', 'msgid_plural', or 'msgctxt'. +DEFAULTKEYWORDS = { + '_': {0: 'msgid'}, + 'gettext': {0: 'msgid'}, + 'ngettext': {0: 'msgid', 1: 'msgid_plural'}, + 'pgettext': {0: 'msgctxt', 1: 'msgid'}, + 'npgettext': {0: 'msgctxt', 1: 'msgid', 2: 'msgid_plural'}, + 'dgettext': {1: 'msgid'}, + 'dngettext': {1: 'msgid', 2: 'msgid_plural'}, + 'dpgettext': {1: 'msgctxt', 2: 'msgid'}, + 'dnpgettext': {1: 'msgctxt', 2: 'msgid', 3: 'msgid_plural'}, +} + + +def matches_spec(message, spec): + """Check if a message has all the keys defined by the keyword spec.""" + return all(key in message for key in spec.values()) + + +@dataclass(frozen=True) +class Location: + filename: str + lineno: int + + def __lt__(self, other): + return (self.filename, self.lineno) < (other.filename, other.lineno) + + +@dataclass +class Message: + msgid: str + msgid_plural: str | None + msgctxt: str | None + locations: set[Location] = field(default_factory=set) + is_docstring: bool = False + + def add_location(self, filename, lineno, msgid_plural=None, *, is_docstring=False): + if self.msgid_plural is None: + self.msgid_plural = msgid_plural + self.locations.add(Location(filename, lineno)) + self.is_docstring |= is_docstring + + +def key_for(msgid, msgctxt=None): + if msgctxt is not None: + return (msgctxt, msgid) + return msgid + + class TokenEater: def __init__(self, options): self.__options = options self.__messages = {} self.__state = self.__waiting - self.__data = [] + self.__data = defaultdict(str) + self.__curr_arg = 0 + self.__curr_keyword = None self.__lineno = -1 self.__freshmodule = 1 self.__curfile = None @@ -331,7 +380,7 @@ class TokenEater: # module docstring? if self.__freshmodule: if ttype == tokenize.STRING and is_literal_string(tstring): - self.__addentry(safe_eval(tstring), lineno, isdocstring=1) + self.__addentry({'msgid': safe_eval(tstring)}, lineno, is_docstring=True) self.__freshmodule = 0 return if ttype in (tokenize.COMMENT, tokenize.NL, tokenize.ENCODING): @@ -346,6 +395,7 @@ class TokenEater: return if ttype == tokenize.NAME and tstring in opts.keywords: self.__state = self.__keywordseen + self.__curr_keyword = tstring return if ttype == tokenize.STRING: maybe_fstring = ast.parse(tstring, mode='eval').body @@ -397,7 +447,8 @@ class TokenEater: }, file=sys.stderr) continue if isinstance(arg.value, str): - self.__addentry(arg.value, lineno) + self.__curr_keyword = func_name + self.__addentry({'msgid': arg.value}, lineno) def __suiteseen(self, ttype, tstring, lineno): # skip over any enclosure pairs until we see the colon @@ -413,7 +464,7 @@ class TokenEater: def __suitedocstring(self, ttype, tstring, lineno): # ignore any intervening noise if ttype == tokenize.STRING and is_literal_string(tstring): - self.__addentry(safe_eval(tstring), lineno, isdocstring=1) + self.__addentry({'msgid': safe_eval(tstring)}, lineno, is_docstring=True) self.__state = self.__waiting elif ttype not in (tokenize.NEWLINE, tokenize.INDENT, tokenize.COMMENT): @@ -422,44 +473,90 @@ class TokenEater: def __keywordseen(self, ttype, tstring, lineno): if ttype == tokenize.OP and tstring == '(': - self.__data = [] + self.__data.clear() + self.__curr_arg = 0 + self.__enclosurecount = 0 self.__lineno = lineno self.__state = self.__openseen else: self.__state = self.__waiting def __openseen(self, ttype, tstring, lineno): - if ttype == tokenize.OP and tstring == ')': - # We've seen the last of the translatable strings. Record the - # line number of the first line of the strings and update the list - # of messages seen. Reset state for the next batch. If there - # were no strings inside _(), then just ignore this entry. - if self.__data: - self.__addentry(EMPTYSTRING.join(self.__data)) - self.__state = self.__waiting - elif ttype == tokenize.STRING and is_literal_string(tstring): - self.__data.append(safe_eval(tstring)) - elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT, - token.NEWLINE, tokenize.NL]: - # warn if we see anything else than STRING or whitespace - print(_( - '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"' - ) % { - 'token': tstring, - 'file': self.__curfile, - 'lineno': self.__lineno - }, file=sys.stderr) - self.__state = self.__waiting + spec = self.__options.keywords[self.__curr_keyword] + arg_type = spec.get(self.__curr_arg) + expect_string_literal = arg_type is not None + + if ttype == tokenize.OP and self.__enclosurecount == 0: + if tstring == ')': + # We've seen the last of the translatable strings. Record the + # line number of the first line of the strings and update the list + # of messages seen. Reset state for the next batch. If there + # were no strings inside _(), then just ignore this entry. + if self.__data: + self.__addentry(self.__data) + self.__state = self.__waiting + return + elif tstring == ',': + # Advance to the next argument + self.__curr_arg += 1 + return + + if expect_string_literal: + if ttype == tokenize.STRING and is_literal_string(tstring): + self.__data[arg_type] += safe_eval(tstring) + elif ttype not in (tokenize.COMMENT, tokenize.INDENT, tokenize.DEDENT, + tokenize.NEWLINE, tokenize.NL): + # We are inside an argument which is a translatable string and + # we encountered a token that is not a string. This is an error. + self.warn_unexpected_token(tstring) + self.__enclosurecount = 0 + self.__state = self.__waiting + elif ttype == tokenize.OP: + if tstring in '([{': + self.__enclosurecount += 1 + elif tstring in ')]}': + self.__enclosurecount -= 1 def __ignorenext(self, ttype, tstring, lineno): self.__state = self.__waiting - def __addentry(self, msg, lineno=None, isdocstring=0): + def __addentry(self, msg, lineno=None, *, is_docstring=False): + msgid = msg.get('msgid') + if msgid in self.__options.toexclude: + return + if not is_docstring: + spec = self.__options.keywords[self.__curr_keyword] + if not matches_spec(msg, spec): + return if lineno is None: lineno = self.__lineno - if not msg in self.__options.toexclude: - entry = (self.__curfile, lineno) - self.__messages.setdefault(msg, {})[entry] = isdocstring + msgctxt = msg.get('msgctxt') + msgid_plural = msg.get('msgid_plural') + key = key_for(msgid, msgctxt) + if key in self.__messages: + self.__messages[key].add_location( + self.__curfile, + lineno, + msgid_plural, + is_docstring=is_docstring, + ) + else: + self.__messages[key] = Message( + msgid=msgid, + msgid_plural=msgid_plural, + msgctxt=msgctxt, + locations={Location(self.__curfile, lineno)}, + is_docstring=is_docstring, + ) + + def warn_unexpected_token(self, token): + print(_( + '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"' + ) % { + 'token': token, + 'file': self.__curfile, + 'lineno': self.__lineno + }, file=sys.stderr) def set_filename(self, filename): self.__curfile = filename @@ -472,55 +569,54 @@ class TokenEater: print(pot_header % {'time': timestamp, 'version': __version__, 'charset': encoding, 'encoding': '8bit'}, file=fp) - # Sort the entries. First sort each particular entry's keys, then - # sort all the entries by their first item. - reverse = {} - for k, v in self.__messages.items(): - keys = sorted(v.keys()) - reverse.setdefault(tuple(keys), []).append((k, v)) - rkeys = sorted(reverse.keys()) - for rkey in rkeys: - rentries = reverse[rkey] - rentries.sort() - for k, v in rentries: - # If the entry was gleaned out of a docstring, then add a - # comment stating so. This is to aid translators who may wish - # to skip translating some unimportant docstrings. - isdocstring = any(v.values()) - # k is the message string, v is a dictionary-set of (filename, - # lineno) tuples. We want to sort the entries in v first by - # file name and then by line number. - v = sorted(v.keys()) - if not options.writelocations: - pass + + # Sort locations within each message by filename and lineno + sorted_keys = [ + (key, sorted(msg.locations)) + for key, msg in self.__messages.items() + ] + # Sort messages by locations + # For example, a message with locations [('test.py', 1), ('test.py', 2)] will + # appear before a message with locations [('test.py', 1), ('test.py', 3)] + sorted_keys.sort(key=itemgetter(1)) + + for key, locations in sorted_keys: + msg = self.__messages[key] + if options.writelocations: # location comments are different b/w Solaris and GNU: - elif options.locationstyle == options.SOLARIS: - for filename, lineno in v: - d = {'filename': filename, 'lineno': lineno} - print(_( - '# File: %(filename)s, line: %(lineno)d') % d, file=fp) + if options.locationstyle == options.SOLARIS: + for location in locations: + print(f'# File: {location.filename}, line: {location.lineno}', file=fp) elif options.locationstyle == options.GNU: # fit as many locations on one line, as long as the # resulting line length doesn't exceed 'options.width' locline = '#:' - for filename, lineno in v: - d = {'filename': filename, 'lineno': lineno} - s = _(' %(filename)s:%(lineno)d') % d + for location in locations: + s = f' {location.filename}:{location.lineno}' if len(locline) + len(s) <= options.width: locline = locline + s else: print(locline, file=fp) - locline = "#:" + s + locline = f'#:{s}' if len(locline) > 2: print(locline, file=fp) - if isdocstring: - print('#, docstring', file=fp) - print('msgid', normalize(k, encoding), file=fp) + if msg.is_docstring: + # If the entry was gleaned out of a docstring, then add a + # comment stating so. This is to aid translators who may wish + # to skip translating some unimportant docstrings. + print('#, docstring', file=fp) + if msg.msgctxt is not None: + print('msgctxt', normalize(msg.msgctxt, encoding), file=fp) + print('msgid', normalize(msg.msgid, encoding), file=fp) + if msg.msgid_plural is not None: + print('msgid_plural', normalize(msg.msgid_plural, encoding), file=fp) + print('msgstr[0] ""', file=fp) + print('msgstr[1] ""\n', file=fp) + else: print('msgstr ""\n', file=fp) def main(): - global default_keywords try: opts, args = getopt.getopt( sys.argv[1:], @@ -557,7 +653,7 @@ def main(): locations = {'gnu' : options.GNU, 'solaris' : options.SOLARIS, } - + no_default_keywords = False # parse options for opt, arg in opts: if opt in ('-h', '--help'): @@ -573,7 +669,7 @@ def main(): elif opt in ('-k', '--keyword'): options.keywords.append(arg) elif opt in ('-K', '--no-default-keywords'): - default_keywords = [] + no_default_keywords = True elif opt in ('-n', '--add-location'): options.writelocations = 1 elif opt in ('--no-location',): @@ -613,7 +709,9 @@ def main(): make_escapes(not options.escape) # calculate all keywords - options.keywords.extend(default_keywords) + options.keywords = {kw: {0: 'msgid'} for kw in options.keywords} + if not no_default_keywords: + options.keywords |= DEFAULTKEYWORDS # initialize list of strings to exclude if options.excludefilename: |