diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2013-12-19 19:21:25 (GMT) |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2013-12-19 19:21:25 (GMT) |
commit | c8cc42edf489706c71384d9bfcf5fe457f8a3692 (patch) | |
tree | 7089fc169684a98fb7a214870e9a1eea7ebc3b13 /Lib | |
parent | 1e81a399a25edd23d76601c0c421bdad46b5c19c (diff) | |
download | cpython-c8cc42edf489706c71384d9bfcf5fe457f8a3692.zip cpython-c8cc42edf489706c71384d9bfcf5fe457f8a3692.tar.gz cpython-c8cc42edf489706c71384d9bfcf5fe457f8a3692.tar.bz2 |
Issue #5815: Fixed support for locales with modifiers. Fixed support for
locale encodings with hyphens.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/locale.py | 118 |
-rw-r--r-- | Lib/test/test_locale.py | 58 |
2 files changed, 133 insertions, 43 deletions
diff --git a/Lib/locale.py b/Lib/locale.py index d2a885d..80de2c1 100644 --- a/Lib/locale.py +++ b/Lib/locale.py @@ -336,6 +336,22 @@ def _test(): # overridden below) _setlocale = setlocale +def _replace_encoding(code, encoding): + if '.' in code: + langname = code[:code.index('.')] + else: + langname = code + # Convert the encoding to a C lib compatible encoding string + norm_encoding = encodings.normalize_encoding(encoding) + #print('norm encoding: %r' % norm_encoding) + norm_encoding = encodings.aliases.aliases.get(norm_encoding, + norm_encoding) + #print('aliased encoding: %r' % norm_encoding) + encoding = locale_encoding_alias.get(norm_encoding, + norm_encoding) + #print('found encoding %r' % encoding) + return langname + '.' + encoding + def normalize(localename): """ Returns a normalized locale code for the given locale @@ -352,55 +368,71 @@ def normalize(localename): does. """ - # Normalize the locale name and extract the encoding - fullname = localename.lower() - if ':' in fullname: + # Normalize the locale name and extract the encoding and modifier + code = localename.lower() + if ':' in code: # ':' is sometimes used as encoding delimiter. - fullname = fullname.replace(':', '.') - if '.' in fullname: - langname, encoding = fullname.split('.')[:2] - fullname = langname + '.' + encoding + code = code.replace(':', '.') + if '@' in code: + code, modifier = code.split('@', 1) else: - langname = fullname + modifier = '' + if '.' in code: + langname, encoding = code.split('.')[:2] + else: + langname = code encoding = '' - # First lookup: fullname (possibly with encoding) - norm_encoding = encoding.replace('-', '') - norm_encoding = norm_encoding.replace('_', '') - lookup_name = langname + '.' + encoding + # First lookup: fullname (possibly with encoding and modifier) + lang_enc = langname + if encoding: + norm_encoding = encoding.replace('-', '') + norm_encoding = norm_encoding.replace('_', '') + lang_enc += '.' 
+ norm_encoding + lookup_name = lang_enc + if modifier: + lookup_name += '@' + modifier code = locale_alias.get(lookup_name, None) if code is not None: return code - #print 'first lookup failed' - - # Second try: langname (without encoding) - code = locale_alias.get(langname, None) - if code is not None: - #print 'langname lookup succeeded' - if '.' in code: - langname, defenc = code.split('.') - else: - langname = code - defenc = '' - if encoding: - # Convert the encoding to a C lib compatible encoding string - norm_encoding = encodings.normalize_encoding(encoding) - #print 'norm encoding: %r' % norm_encoding - norm_encoding = encodings.aliases.aliases.get(norm_encoding, - norm_encoding) - #print 'aliased encoding: %r' % norm_encoding - encoding = locale_encoding_alias.get(norm_encoding, - norm_encoding) - else: - encoding = defenc - #print 'found encoding %r' % encoding - if encoding: - return langname + '.' + encoding - else: - return langname - - else: - return localename + #print('first lookup failed') + + if modifier: + # Second try: fullname without modifier (possibly with encoding) + code = locale_alias.get(lang_enc, None) + if code is not None: + #print('lookup without modifier succeeded') + if '@' not in code: + return code + '@' + modifier + if code.split('@', 1)[1].lower() == modifier: + return code + #print('second lookup failed') + + if encoding: + # Third try: langname (without encoding, possibly with modifier) + lookup_name = langname + if modifier: + lookup_name += '@' + modifier + code = locale_alias.get(lookup_name, None) + if code is not None: + #print('lookup without encoding succeeded') + if '@' not in code: + return _replace_encoding(code, encoding) + code, modifier = code.split('@', 1) + return _replace_encoding(code, encoding) + '@' + modifier + + if modifier: + # Fourth try: langname (without encoding and modifier) + code = locale_alias.get(langname, None) + if code is not None: + #print('lookup without modifier and encoding succeeded') + 
if '@' not in code: + return _replace_encoding(code, encoding) + '@' + modifier + code, defmod = code.split('@', 1) + if defmod.lower() == modifier: + return _replace_encoding(code, encoding) + '@' + defmod + + return localename def _parse_localename(localename): @@ -419,7 +451,7 @@ def _parse_localename(localename): code = normalize(localename) if '@' in code: # Deal with locale modifiers - code, modifier = code.split('@') + code, modifier = code.split('@', 1) if modifier == 'euro' and '.' not in code: # Assume Latin-9 for @euro locales. This is bogus, # since some systems may use other encodings for these diff --git a/Lib/test/test_locale.py b/Lib/test/test_locale.py index 48bf36d..76202df 100644 --- a/Lib/test/test_locale.py +++ b/Lib/test/test_locale.py @@ -365,6 +365,64 @@ class TestEnUSCollation(BaseLocalizedTest, TestCollation): self.assertLess(locale.strxfrm('à'), locale.strxfrm('b')) +class NormalizeTest(unittest.TestCase): + def check(self, localename, expected): + self.assertEqual(locale.normalize(localename), expected, msg=localename) + + def test_locale_alias(self): + for localename, alias in locale.locale_alias.items(): + with self.subTest(locale=(localename, alias)): + self.check(localename, alias) + + def test_empty(self): + self.check('', '') + + def test_c(self): + self.check('c', 'C') + self.check('posix', 'C') + + def test_english(self): + self.check('en', 'en_US.ISO8859-1') + self.check('EN', 'en_US.ISO8859-1') + self.check('en_US', 'en_US.ISO8859-1') + self.check('en_us', 'en_US.ISO8859-1') + self.check('en_GB', 'en_GB.ISO8859-1') + self.check('en_US.UTF-8', 'en_US.UTF-8') + self.check('en_US.utf8', 'en_US.UTF-8') + self.check('en_US:UTF-8', 'en_US.UTF-8') + self.check('en_US.ISO8859-1', 'en_US.ISO8859-1') + self.check('en_US.US-ASCII', 'en_US.ISO8859-1') + self.check('english', 'en_EN.ISO8859-1') + + def test_hyphenated_encoding(self): + self.check('az_AZ.iso88599e', 'az_AZ.ISO8859-9E') + self.check('az_AZ.ISO8859-9E', 'az_AZ.ISO8859-9E') + 
self.check('tt_RU.koi8c', 'tt_RU.KOI8-C') + self.check('tt_RU.KOI8-C', 'tt_RU.KOI8-C') + self.check('lo_LA.cp1133', 'lo_LA.IBM-CP1133') + self.check('lo_LA.ibmcp1133', 'lo_LA.IBM-CP1133') + self.check('lo_LA.IBM-CP1133', 'lo_LA.IBM-CP1133') + self.check('uk_ua.microsoftcp1251', 'uk_UA.CP1251') + self.check('uk_ua.microsoft-cp1251', 'uk_UA.CP1251') + self.check('ka_ge.georgianacademy', 'ka_GE.GEORGIAN-ACADEMY') + self.check('ka_GE.GEORGIAN-ACADEMY', 'ka_GE.GEORGIAN-ACADEMY') + self.check('cs_CZ.iso88592', 'cs_CZ.ISO8859-2') + self.check('cs_CZ.ISO8859-2', 'cs_CZ.ISO8859-2') + + def test_euro_modifier(self): + self.check('de_DE@euro', 'de_DE.ISO8859-15') + self.check('en_US.ISO8859-15@euro', 'en_US.ISO8859-15') + + def test_latin_modifier(self): + self.check('be_BY.UTF-8@latin', 'be_BY.UTF-8@latin') + self.check('sr_RS.UTF-8@latin', 'sr_RS.UTF-8@latin') + + def test_valencia_modifier(self): + self.check('ca_ES.UTF-8@valencia', 'ca_ES.UTF-8@valencia') + self.check('ca_ES@valencia', 'ca_ES.ISO8859-1@valencia') + self.check('ca@valencia', 'ca_ES.ISO8859-1@valencia') + + class TestMiscellaneous(unittest.TestCase): def test_getpreferredencoding(self): # Invoke getpreferredencoding to make sure it does not cause exceptions. |