summaryrefslogtreecommitdiffstats
path: root/Lib/test/test_tools
diff options
context:
space:
mode:
authorCF Bolz-Tereick <cfbolz@gmx.de>2023-11-04 14:56:58 (GMT)
committerGitHub <noreply@github.com>2023-11-04 14:56:58 (GMT)
commit9573d142157d8432f2772a109c304dafeaa454a5 (patch)
tree3ad678dac4e01bbc4825d9d28fa38595addbb608 /Lib/test/test_tools
parent0e9c364f4ac18a2237bdbac702b96bcf8ef9cb09 (diff)
downloadcpython-9573d142157d8432f2772a109c304dafeaa454a5.zip
cpython-9573d142157d8432f2772a109c304dafeaa454a5.tar.gz
cpython-9573d142157d8432f2772a109c304dafeaa454a5.tar.bz2
gh-96954: use a directed acyclic word graph for storing the unicodedata codepoint names (#97906)
Co-authored-by: Łukasz Langa <lukasz@langa.pl> Co-authored-by: Pieter Eendebak <pieter.eendebak@gmail.com> Co-authored-by: Dennis Sweeney <36520290+sweeneyde@users.noreply.github.com>
Diffstat (limited to 'Lib/test/test_tools')
-rw-r--r--Lib/test/test_tools/test_makeunicodedata.py121
1 files changed, 121 insertions, 0 deletions
diff --git a/Lib/test/test_tools/test_makeunicodedata.py b/Lib/test/test_tools/test_makeunicodedata.py
new file mode 100644
index 0000000..eee6867
--- /dev/null
+++ b/Lib/test/test_tools/test_makeunicodedata.py
@@ -0,0 +1,121 @@
+import unittest
+from test.test_tools import toolsdir, imports_under_tool
+from test import support
+from test.support.hypothesis_helper import hypothesis
+
+st = hypothesis.strategies  # short alias for the hypothesis strategies namespace
+given = hypothesis.given  # alias: decorator applied to the property-based test below
+example = hypothesis.example  # alias: decorator that pins explicit example inputs
+
+
+with imports_under_tool("unicode"):  # NOTE(review): presumably makes the "unicode" tool directory importable -- confirm in test.test_tools
+ from dawg import Dawg, build_compression_dawg, lookup, inverse_lookup
+
+
+@st.composite
+def char_name_db(draw, min_length=1, max_length=30):
+ m = draw(st.integers(min_value=min_length, max_value=max_length))
+ names = draw(
+ st.sets(st.text("abcd", min_size=1, max_size=10), min_size=m, max_size=m)
+ )
+ characters = draw(st.sets(st.characters(), min_size=m, max_size=m))
+ return list(zip(names, characters))
+
+
+class TestDawg(unittest.TestCase):
+ """Tests for the directed acyclic word graph data structure that is used
+ to store the unicode character names in unicodedata. Tests ported from PyPy
+ """
+
+ def test_dawg_direct_simple(self):
+ dawg = Dawg()
+ dawg.insert("a", -4)
+ dawg.insert("c", -2)
+ dawg.insert("cat", -1)
+ dawg.insert("catarr", 0)
+ dawg.insert("catnip", 1)
+ dawg.insert("zcatnip", 5)
+ packed, data, inverse = dawg.finish()
+
+ self.assertEqual(lookup(packed, data, b"a"), -4)
+ self.assertEqual(lookup(packed, data, b"c"), -2)
+ self.assertEqual(lookup(packed, data, b"cat"), -1)
+ self.assertEqual(lookup(packed, data, b"catarr"), 0)
+ self.assertEqual(lookup(packed, data, b"catnip"), 1)
+ self.assertEqual(lookup(packed, data, b"zcatnip"), 5)
+ self.assertRaises(KeyError, lookup, packed, data, b"b")
+ self.assertRaises(KeyError, lookup, packed, data, b"catni")
+ self.assertRaises(KeyError, lookup, packed, data, b"catnipp")
+
+ self.assertEqual(inverse_lookup(packed, inverse, -4), b"a")
+ self.assertEqual(inverse_lookup(packed, inverse, -2), b"c")
+ self.assertEqual(inverse_lookup(packed, inverse, -1), b"cat")
+ self.assertEqual(inverse_lookup(packed, inverse, 0), b"catarr")
+ self.assertEqual(inverse_lookup(packed, inverse, 1), b"catnip")
+ self.assertEqual(inverse_lookup(packed, inverse, 5), b"zcatnip")
+ self.assertRaises(KeyError, inverse_lookup, packed, inverse, 12)
+
+ def test_forbid_empty_dawg(self):
+ dawg = Dawg()
+ self.assertRaises(ValueError, dawg.finish)
+
+ @given(char_name_db())
+ @example([("abc", "a"), ("abd", "b")])
+ @example(
+ [
+ ("bab", "1"),
+ ("a", ":"),
+ ("ad", "@"),
+ ("b", "<"),
+ ("aacc", "?"),
+ ("dab", "D"),
+ ("aa", "0"),
+ ("ab", "F"),
+ ("aaa", "7"),
+ ("cbd", "="),
+ ("abad", ";"),
+ ("ac", "B"),
+ ("abb", "4"),
+ ("bb", "2"),
+ ("aab", "9"),
+ ("caaaaba", "E"),
+ ("ca", ">"),
+ ("bbaaa", "5"),
+ ("d", "3"),
+ ("baac", "8"),
+ ("c", "6"),
+ ("ba", "A"),
+ ]
+ )
+ @example(
+ [
+ ("bcdac", "9"),
+ ("acc", "g"),
+ ("d", "d"),
+ ("daabdda", "0"),
+ ("aba", ";"),
+ ("c", "6"),
+ ("aa", "7"),
+ ("abbd", "c"),
+ ("badbd", "?"),
+ ("bbd", "f"),
+ ("cc", "@"),
+ ("bb", "8"),
+ ("daca", ">"),
+ ("ba", ":"),
+ ("baac", "3"),
+ ("dbdddac", "a"),
+ ("a", "2"),
+ ("cabd", "b"),
+ ("b", "="),
+ ("abd", "4"),
+ ("adcbd", "5"),
+ ("abc", "e"),
+ ("ab", "1"),
+ ]
+ )
+ def test_dawg(self, data):
+ # suppress debug prints
+ with support.captured_stdout() as output:
+ # it's enough to build it, building will also check the result
+ build_compression_dawg(data)