1 files changed, 140 insertions, 173 deletions
diff --git a/Lib/gettext.py b/Lib/gettext.py
index c216089..647fe56 100644
--- a/Lib/gettext.py
+++ b/Lib/gettext.py
@@ -6,83 +6,7 @@ message catalog library.
 
 I18N refers to the operation by which a program is made aware of multiple
 languages.  L10N refers to the adaptation of your program, once
-internationalized, to the local language and cultural habits.  In order to
-provide multilingual messages for your Python programs, you need to take the
-following steps:
-
-    - prepare your program by specially marking translatable strings
-    - run a suite of tools over your marked program files to generate raw
-      messages catalogs
-    - create language specific translations of the message catalogs
-    - use this module so that message strings are properly translated
-
-In order to prepare your program for I18N, you need to look at all the strings
-in your program.  Any string that needs to be translated should be marked by
-wrapping it in _('...') -- i.e. a call to the function `_'.  For example:
-
-    filename = 'mylog.txt'
-    message = _('writing a log message')
-    fp = open(filename, 'w')
-    fp.write(message)
-    fp.close()
-
-In this example, the string `writing a log message' is marked as a candidate
-for translation, while the strings `mylog.txt' and `w' are not.
-
-The GNU gettext package provides a tool, called xgettext, that scans C and C++
-source code looking for these specially marked strings.  xgettext generates
-what are called `.pot' files, essentially structured human readable files
-which contain every marked string in the source code.  These .pot files are
-copied and handed over to translators who write language-specific versions for
-every supported language.
-
-For I18N Python programs however, xgettext won't work; it doesn't understand
-the myriad of string types support by Python.  The standard Python
-distribution provides a tool called pygettext that does though (found in the
-Tools/i18n directory).  This is a command line script that supports a similar
-interface as xgettext; see its documentation for details.  Once you've used
-pygettext to create your .pot files, you can use the standard GNU gettext
-tools to generate your machine-readable .mo files, which are what's used by
-this module.
-
-In the simple case, to use this module then, you need only add the following
-bit of code to the main driver file of your application:
-
-    import gettext
-    gettext.install()
-
-This sets everything up so that your _('...') function calls Just Work.  In
-other words, it installs `_' in the builtins namespace for convenience.  You
-can skip this step and do it manually by the equivalent code:
-
-    import gettext
-    import __builtin__
-    __builtin__['_'] = gettext.gettext
-
-Once you've done this, you probably want to call bindtextdomain() and
-textdomain() to get the domain set up properly.  Again, for convenience, you
-can pass the domain and localedir to install to set everything up in one fell
-swoop:
-
-    import gettext
-    gettext.install('mydomain', '/my/locale/dir')
-
-If your program needs to support many languages at the same time, you will
-want to create Translation objects explicitly, like so:
-
-    import gettext
-    gettext.install()
-
-    lang1 = gettext.Translations(open('/path/to/my/lang1/messages.mo'))
-    lang2 = gettext.Translations(open('/path/to/my/lang2/messages.mo'))
-    lang3 = gettext.Translations(open('/path/to/my/lang3/messages.mo'))
-
-    gettext.set(lang1)
-    # all _() will now translate to language 1
-    gettext.set(lang2)
-    # all _() will now translate to language 2
-
-Currently, only GNU gettext format binary .mo files are supported.
+internationalized, to the local language and cultural habits.
 
 """
 
@@ -104,21 +28,27 @@ Currently, only GNU gettext format binary .mo files are supported.
 #
 # Barry Warsaw integrated these modules, wrote the .install() API and code,
 # and conformed all C and Python code to Python's coding standards.
+#
+# Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
+# module.
+#
+# TODO:
+# - Lazy loading of .mo files.  Currently the entire catalog is loaded into
+#   memory, but that's probably bad for large translated programs.  Instead,
+#   the lexical sort of original strings in GNU .mo files should be exploited
+#   to do binary searches and lazy initializations.  Or you might want to use
+#   the undocumented double-hash algorithm for .mo files with hash tables, but
+#   you'll need to study the GNU gettext code to do this.
+#
+# - Support Solaris .mo file formats.  Unfortunately, we've been unable to
+#   find this format documented anywhere.
 
 import os
 import sys
 import struct
-from UserDict import UserDict
-
-
-
-# globals
-_translations = {}
-_current_translation = None
-_current_domain = 'messages'
+from errno import ENOENT
 
-# Domain to directory mapping, for use by bindtextdomain()
-_localedirs = {}
+_default_localedir = os.path.join(sys.prefix, 'share', 'locale')
 
 
 
@@ -165,16 +95,37 @@ def _expand_lang(locale):
 
 
 
-class GNUTranslations(UserDict):
-    # Magic number of .mo files
-    MAGIC = 0x950412de
+class NullTranslations:  # base class: returns messages untranslated
+    def __init__(self, fp=None):  # fp: optional open catalog file
+        self._info = {}  # catalog metadata, exposed through info()
+        self._charset = None  # catalog encoding, exposed through charset()
+        if fp:  # nothing to parse when no file is given
+            self._parse(fp)
 
-    def __init__(self, fp):
-        if fp is None:
-            d = {}
-        else:
-            d = self._parse(fp)
-        UserDict.__init__(self, d)
+    def _parse(self, fp):  # no-op hook; subclasses load the catalog here
+        pass
+
+    def gettext(self, message):  # identity lookup: no catalog to consult
+        return message
+
+    def ugettext(self, message):  # as gettext(), but returns unicode
+        return unicode(message)
+
+    def info(self):
+        return self._info
+
+    def charset(self):
+        return self._charset
+
+    def install(self, unicode=0):  # publish this catalog's lookup as _()
+        import __builtin__  # true unicode -> ugettext, otherwise gettext
+        __builtin__.__dict__['_'] = unicode and self.ugettext or self.gettext
+
+
+class GNUTranslations(NullTranslations):
+    # Magic number of .mo files
+    LE_MAGIC = 0x950412de
+    BE_MAGIC = struct.unpack('>i', struct.pack('<i', LE_MAGIC))[0]
 
     def _parse(self, fp):
         """Override this method to support alternative .mo formats."""
@@ -182,51 +133,62 @@ class GNUTranslations(UserDict):
         filename = getattr(fp, 'name', '')
         # Parse the .mo file header, which consists of 5 little endian 32
         # bit words.
-        catalog = {}
+        self._catalog = catalog = {}
         buf = fp.read()
-        magic, version, msgcount, masteridx, transidx = unpack(
-            '<5i', buf[:20])
-        if magic <> self.MAGIC:
+        # Are we big endian or little endian?
+        magic = unpack('<i', buf[:4])[0]
+        if magic == self.LE_MAGIC:
+            version, msgcount, masteridx, transidx = unpack('<4i', buf[4:20])
+            ii = '<ii'
+        elif magic == self.BE_MAGIC:
+            version, msgcount, masteridx, transidx = unpack('>4i', buf[4:20])
+            ii = '>ii'
+        else:
             raise IOError(0, 'Bad magic number', filename)
         #
         # Now put all messages from the .mo file buffer into the catalog
         # dictionary.
         for i in xrange(0, msgcount):
-            mstart = unpack('<i', buf[masteridx+4:masteridx+8])[0]
-            mend = mstart + unpack('<i', buf[masteridx:masteridx+4])[0]
-            tstart = unpack('<i', buf[transidx+4:transidx+8])[0]
-            tend = tstart + unpack('<i', buf[transidx:transidx+4])[0]
+            mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
+            mend = moff + mlen
+            tlen, toff = unpack(ii, buf[transidx:transidx+8])
+            tend = toff + tlen
             if mend < len(buf) and tend < len(buf):
-                catalog[buf[mstart:mend]] = buf[tstart:tend]
+                tmsg = buf[toff:tend]
+                catalog[buf[moff:mend]] = tmsg
             else:
                 raise IOError(0, 'File is corrupt', filename)
-            #
+            # See if we're looking at GNU .mo conventions for metadata
+            if mlen == 0 and tmsg.lower().startswith('project-id-version:'):
+                # Catalog description
+                for item in tmsg.split('\n'):
+                    item = item.strip()
+                    if not item:
+                        continue
+                    k, v = item.split(':', 1)
+                    k = k.strip().lower()
+                    v = v.strip()
+                    self._info[k] = v
+                    if k == 'content-type':
+                        self._charset = v.split('charset=')[1]
             # advance to next entry in the seek tables
             masteridx += 8
             transidx += 8
-        return catalog
 
+    def gettext(self, message):
+        return self._catalog.get(message, message)
+
+    def ugettext(self, message):
+        tmsg = self._catalog.get(message, message)
+        return unicode(tmsg, self._charset)
 
-
-# By default, use GNU gettext format .mo files
-Translations = GNUTranslations
 
+
 # Locate a .mo file using the gettext strategy
-def _find(localedir=None, languages=None, domain=None):
-    global _current_domain
-    global _localedirs
+def find(domain, localedir=None, languages=None):
     # Get some reasonable defaults for arguments that were not supplied
-    if domain is None:
-        domain = _current_domain
     if localedir is None:
-        localedir = _localedirs.get(
-            domain,
-            # TBD: The default localedir is actually system dependent.  I
-            # don't know of a good platform-consistent and portable way to
-            # default it, so instead, we'll just use sys.prefix.  Most
-            # programs should be calling bindtextdomain() or such explicitly
-            # anyway.
-            os.path.join(sys.prefix, 'share', 'locale'))
+        localedir = _default_localedir
     if languages is None:
         languages = []
         for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
@@ -247,72 +209,77 @@ def _find(localedir=None, languages=None, domain=None):
         if lang == 'C':
             break
         mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
-        # see if it's in the cache
-        mo = _translations.get(mofile)
-        if mo:
-            return mo
-        fp = None
-        try:
-            try:
-                fp = open(mofile, 'rb')
-                t = Translations(fp)
-                _translations[mofile] = t
-                return t
-            except IOError:
-                pass
-        finally:
-            if fp:
-                fp.close()
-    return {}
+        if os.path.exists(mofile):
+            return mofile
+    return None
 
 
 
-def bindtextdomain(domain=None, localedir=None):
-    """Bind domain to a file in the specified directory."""
-    global _localedirs
-    if domain is None:
-        return None
-    if localedir is None:
-        return _localedirs.get(domain, _localedirs.get('C'))
-    _localedirs[domain] = localedir
-    return localedir
+# a mapping between absolute .mo file path and Translation object
+_translations = {}
+
+def translation(domain, localedir=None, languages=None, class_=None):
+    if class_ is None:
+        class_ = GNUTranslations  # default catalog format
+    mofile = find(domain, localedir, languages)  # .mo path or None
+    if mofile is None:
+        raise IOError(ENOENT, 'No translation file found for domain', domain)
+    key = os.path.abspath(mofile)
+    if not _translations.has_key(key):  # parse only on a cache miss
+        _translations[key] = class_(open(mofile, 'rb'))  # TBD: fp unclosed
+    return _translations[key]
+
+
+
+def install(domain, localedir=None, unicode=0):  # install _() for domain
+    translation(domain, localedir).install(unicode)  # may raise IOError
+
+
+
+# a mapping b/w domains and locale directories
+_localedirs = {}
+# current global domain, `messages' used for compatibility w/ GNU gettext
+_current_domain = 'messages'
 
 
 def textdomain(domain=None):
-    """Change or query the current global domain."""
     global _current_domain
-    if domain is None:
-        return _current_domain
-    else:
+    if domain is not None:  # None only queries; anything else switches
         _current_domain = domain
-        return domain
+    return _current_domain  # always the domain now in effect
 
 
-def gettext(message):
-    """Return localized version of a message."""
-    return _find().get(message, message)
+def bindtextdomain(domain, localedir=None):  # bind or query a locale dir
+    global _localedirs
+    if localedir is not None:  # None only queries the current binding
+        _localedirs[domain] = localedir
+    return _localedirs.get(domain, _default_localedir)  # default if unbound
 
 
 def dgettext(domain, message):
-    """Like gettext(), but look up message in specified domain."""
-    return _find(domain=domain).get(message, message)
+    try:  # look message up in domain's catalog, if one can be loaded
+        t = translation(domain, _localedirs.get(domain))
+    except IOError:  # no usable catalog: hand the message back unchanged
+        return message
+    return t.gettext(message)
+
 
+def gettext(message):  # translate message in the current global domain
+    return dgettext(_current_domain, message)
 
-
-# A higher level API
-def set(translation):
-    global _current_translation
-    _current_translation = translation
 
+# dcgettext() has been deemed unnecessary and is not implemented.
 
-def get():
-    global _current_translation
-    return _current_translation
+# James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
+# was:
+#
+#    import gettext
+#    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
+#    _ = cat.gettext
+#    print _('Hello World')
 
+# The resulting catalog object currently doesn't support access through a
+# dictionary API, which was supported (but apparently unused) in GNOME
+# gettext.
 
-def install(domain=None, localedir=None):
-    import __builtin__
-    __builtin__.__dict__['_'] = gettext
-    if domain is not None:
-        bindtextdomain(domain, localedir)
-        textdomain(domain)
+Catalog = translation