gh-96954: use a directed acyclic word graph for storing the unicodedata codepoint names (#97906)

Co-authored-by: Łukasz Langa <lukasz@langa.pl>
Co-authored-by: Pieter Eendebak <pieter.eendebak@gmail.com>
Co-authored-by: Dennis Sweeney <36520290+sweeneyde@users.noreply.github.com>
author: CF Bolz-Tereick <cfbolz@gmx.de> 2023-11-04 14:56:58 (GMT)
committer: GitHub <noreply@github.com> 2023-11-04 14:56:58 (GMT)
commit: 9573d142157d8432f2772a109c304dafeaa454a5 (patch)
tree: 3ad678dac4e01bbc4825d9d28fa38595addbb608 /Lib/test/test_tools
parent: 0e9c364f4ac18a2237bdbac702b96bcf8ef9cb09 (diff)
download: cpython-9573d142157d8432f2772a109c304dafeaa454a5.zip
cpython-9573d142157d8432f2772a109c304dafeaa454a5.tar.gz
cpython-9573d142157d8432f2772a109c304dafeaa454a5.tar.bz2
1 files changed, 121 insertions, 0 deletions
diff --git a/Lib/test/test_tools/test_makeunicodedata.py b/Lib/test/test_tools/test_makeunicodedata.py
new file mode 100644
index 0000000..eee6867
--- /dev/null
+++ b/Lib/test/test_tools/test_makeunicodedata.py
@@ -0,0 +1,121 @@
+import unittest
+from test.test_tools import toolsdir, imports_under_tool
+from test import support
+from test.support.hypothesis_helper import hypothesis
+
+st = hypothesis.strategies  # short aliases for the Hypothesis names used by the tests below
+given = hypothesis.given
+example = hypothesis.example
+
+
+with imports_under_tool("unicode"):
+    from dawg import Dawg, build_compression_dawg, lookup, inverse_lookup
+
+
+@st.composite
+def char_name_db(draw, min_length=1, max_length=30):  # strategy yielding a list of (name, character) pairs
+    m = draw(st.integers(min_value=min_length, max_value=max_length))  # number of entries to generate
+    names = draw(
+        st.sets(st.text("abcd", min_size=1, max_size=10), min_size=m, max_size=m)
+    )  # exactly m distinct names, each 1-10 characters drawn from "abcd"
+    characters = draw(st.sets(st.characters(), min_size=m, max_size=m))  # exactly m distinct characters
+    return list(zip(names, characters))  # both are sets, so which name pairs with which character is arbitrary
+
+
+class TestDawg(unittest.TestCase):
+    """Tests for the directed acyclic word graph data structure that is used
+    to store the Unicode character names in unicodedata. Tests ported from PyPy.
+    """
+
+    def test_dawg_direct_simple(self):  # hand-built DAWG, checked in both lookup directions
+        dawg = Dawg()
+        dawg.insert("a", -4)
+        dawg.insert("c", -2)
+        dawg.insert("cat", -1)
+        dawg.insert("catarr", 0)
+        dawg.insert("catnip", 1)
+        dawg.insert("zcatnip", 5)
+        packed, data, inverse = dawg.finish()  # the three results are consumed by lookup/inverse_lookup below
+
+        self.assertEqual(lookup(packed, data, b"a"), -4)
+        self.assertEqual(lookup(packed, data, b"c"), -2)
+        self.assertEqual(lookup(packed, data, b"cat"), -1)
+        self.assertEqual(lookup(packed, data, b"catarr"), 0)
+        self.assertEqual(lookup(packed, data, b"catnip"), 1)
+        self.assertEqual(lookup(packed, data, b"zcatnip"), 5)
+        self.assertRaises(KeyError, lookup, packed, data, b"b")  # key that was never inserted
+        self.assertRaises(KeyError, lookup, packed, data, b"catni")  # proper prefix of an inserted key
+        self.assertRaises(KeyError, lookup, packed, data, b"catnipp")  # inserted key plus one extra character
+
+        self.assertEqual(inverse_lookup(packed, inverse, -4), b"a")
+        self.assertEqual(inverse_lookup(packed, inverse, -2), b"c")
+        self.assertEqual(inverse_lookup(packed, inverse, -1), b"cat")
+        self.assertEqual(inverse_lookup(packed, inverse, 0), b"catarr")
+        self.assertEqual(inverse_lookup(packed, inverse, 1), b"catnip")
+        self.assertEqual(inverse_lookup(packed, inverse, 5), b"zcatnip")
+        self.assertRaises(KeyError, inverse_lookup, packed, inverse, 12)  # no key was inserted with value 12
+
+    def test_forbid_empty_dawg(self):
+        dawg = Dawg()
+        self.assertRaises(ValueError, dawg.finish)  # finish() is called before anything was inserted
+
+    @given(char_name_db())
+    @example([("abc", "a"), ("abd", "b")])  # small explicit case: two names sharing the prefix "ab"
+    @example(
+        [
+            ("bab", "1"),
+            ("a", ":"),
+            ("ad", "@"),
+            ("b", "<"),
+            ("aacc", "?"),
+            ("dab", "D"),
+            ("aa", "0"),
+            ("ab", "F"),
+            ("aaa", "7"),
+            ("cbd", "="),
+            ("abad", ";"),
+            ("ac", "B"),
+            ("abb", "4"),
+            ("bb", "2"),
+            ("aab", "9"),
+            ("caaaaba", "E"),
+            ("ca", ">"),
+            ("bbaaa", "5"),
+            ("d", "3"),
+            ("baac", "8"),
+            ("c", "6"),
+            ("ba", "A"),
+        ]
+    )
+    @example(
+        [
+            ("bcdac", "9"),
+            ("acc", "g"),
+            ("d", "d"),
+            ("daabdda", "0"),
+            ("aba", ";"),
+            ("c", "6"),
+            ("aa", "7"),
+            ("abbd", "c"),
+            ("badbd", "?"),
+            ("bbd", "f"),
+            ("cc", "@"),
+            ("bb", "8"),
+            ("daca", ">"),
+            ("ba", ":"),
+            ("baac", "3"),
+            ("dbdddac", "a"),
+            ("a", "2"),
+            ("cabd", "b"),
+            ("b", "="),
+            ("abd", "4"),
+            ("adcbd", "5"),
+            ("abc", "e"),
+            ("ab", "1"),
+        ]
+    )
+    def test_dawg(self, data):
+        # suppress debug prints
+        with support.captured_stdout() as output:  # the captured text in `output` is not inspected
+            # it's enough to build it, building will also check the result
+            build_compression_dawg(data)
author	CF Bolz-Tereick <cfbolz@gmx.de>	2023-11-04 14:56:58 (GMT)
committer	GitHub <noreply@github.com>	2023-11-04 14:56:58 (GMT)
commit	9573d142157d8432f2772a109c304dafeaa454a5 (patch)
tree	3ad678dac4e01bbc4825d9d28fa38595addbb608 /Lib/test/test_tools
parent	0e9c364f4ac18a2237bdbac702b96bcf8ef9cb09 (diff)
download	cpython-9573d142157d8432f2772a109c304dafeaa454a5.zip cpython-9573d142157d8432f2772a109c304dafeaa454a5.tar.gz cpython-9573d142157d8432f2772a109c304dafeaa454a5.tar.bz2