summaryrefslogtreecommitdiffstats
path: root/Lib/test/test_ucn.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/test/test_ucn.py')
-rw-r--r--Lib/test/test_ucn.py88
1 files changed, 81 insertions, 7 deletions
diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py
index fd620f0..68a3219 100644
--- a/Lib/test/test_ucn.py
+++ b/Lib/test/test_ucn.py
@@ -8,8 +8,11 @@ Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
"""#"
import unittest
+import unicodedata
from test import support
+from http.client import HTTPException
+from test.test_normalization import check_version
class UnicodeNamesTest(unittest.TestCase):
@@ -59,8 +62,6 @@ class UnicodeNamesTest(unittest.TestCase):
)
def test_ascii_letters(self):
- import unicodedata
-
for char in "".join(map(chr, range(ord("a"), ord("z")))):
name = "LATIN SMALL LETTER %s" % char.upper()
code = unicodedata.lookup(name)
@@ -81,7 +82,6 @@ class UnicodeNamesTest(unittest.TestCase):
self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")
- import unicodedata
self.assertRaises(ValueError, unicodedata.name, "\ud7a4")
def test_cjk_unified_ideographs(self):
@@ -97,14 +97,11 @@ class UnicodeNamesTest(unittest.TestCase):
self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
def test_bmp_characters(self):
- import unicodedata
- count = 0
for code in range(0x10000):
char = chr(code)
name = unicodedata.name(char, None)
if name is not None:
self.assertEqual(unicodedata.lookup(name), char)
- count += 1
def test_misc_symbols(self):
self.checkletter("PILCROW SIGN", "\u00b6")
@@ -112,8 +109,85 @@ class UnicodeNamesTest(unittest.TestCase):
self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")
+ def test_aliases(self):
+ # Check that the aliases defined in the NameAliases.txt file work.
+ # This should be updated when new aliases are added or the file
+ # should be downloaded and parsed instead. See #12753.
+ aliases = [
+ ('LATIN CAPITAL LETTER GHA', 0x01A2),
+ ('LATIN SMALL LETTER GHA', 0x01A3),
+ ('KANNADA LETTER LLLA', 0x0CDE),
+ ('LAO LETTER FO FON', 0x0E9D),
+ ('LAO LETTER FO FAY', 0x0E9F),
+ ('LAO LETTER RO', 0x0EA3),
+ ('LAO LETTER LO', 0x0EA5),
+ ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
+ ('YI SYLLABLE ITERATION MARK', 0xA015),
+ ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
+ ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
+ ]
+ for alias, codepoint in aliases:
+ self.checkletter(alias, chr(codepoint))
+ name = unicodedata.name(chr(codepoint))
+ self.assertNotEqual(name, alias)
+ self.assertEqual(unicodedata.lookup(alias),
+ unicodedata.lookup(name))
+ with self.assertRaises(KeyError):
+ unicodedata.ucd_3_2_0.lookup(alias)
+
+ def test_aliases_names_in_pua_range(self):
+ # We are storing aliases in the PUA 15, but their names shouldn't leak
+ for cp in range(0xf0000, 0xf0100):
+ with self.assertRaises(ValueError) as cm:
+ unicodedata.name(chr(cp))
+ self.assertEqual(str(cm.exception), 'no such name')
+
+ def test_named_sequences_names_in_pua_range(self):
+ # We are storing named seq in the PUA 15, but their names shouldn't leak
+ for cp in range(0xf0100, 0xf0fff):
+ with self.assertRaises(ValueError) as cm:
+ unicodedata.name(chr(cp))
+ self.assertEqual(str(cm.exception), 'no such name')
+
+ def test_named_sequences_sample(self):
+ # Check a few named sequences. See #12753.
+ sequences = [
+ ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
+ ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
+ ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
+ ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
+ ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
+ ]
+ for seqname, codepoints in sequences:
+ self.assertEqual(unicodedata.lookup(seqname), codepoints)
+ with self.assertRaises(SyntaxError):
+ self.checkletter(seqname, None)
+ with self.assertRaises(KeyError):
+ unicodedata.ucd_3_2_0.lookup(seqname)
+
+ def test_named_sequences_full(self):
+ # Check all the named sequences
+ url = ("http://www.unicode.org/Public/%s/ucd/NamedSequences.txt" %
+ unicodedata.unidata_version)
+ try:
+ testdata = support.open_urlresource(url, encoding="utf-8",
+ check=check_version)
+ except (IOError, HTTPException):
+ self.skipTest("Could not retrieve " + url)
+ self.addCleanup(testdata.close)
+ for line in testdata:
+ line = line.strip()
+ if not line or line.startswith('#'):
+ continue
+ seqname, codepoints = line.split(';')
+ codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
+ self.assertEqual(unicodedata.lookup(seqname), codepoints)
+ with self.assertRaises(SyntaxError):
+ self.checkletter(seqname, None)
+ with self.assertRaises(KeyError):
+ unicodedata.ucd_3_2_0.lookup(seqname)
+
def test_errors(self):
- import unicodedata
self.assertRaises(TypeError, unicodedata.name)
self.assertRaises(TypeError, unicodedata.name, 'xx')
self.assertRaises(TypeError, unicodedata.lookup)