[3.12] bpo-43950: handle wide unicode characters in tracebacks (GH-28150) (#111346)

author: Pablo Galindo Salgado <Pablogsal@gmail.com> 2023-10-26 21:43:38 (GMT)
committer: GitHub <noreply@github.com> 2023-10-26 21:43:38 (GMT)
commit: c81ebf5b3d9aa080d2039f1225f6fed91c29f5a8 (patch)
tree: 31d0a5052319b1480704f513aba52057ca829478 /Parser
parent: e25d8b40cd70744513e190b1ca153087382b6b09 (diff)
download: cpython-c81ebf5b3d9aa080d2039f1225f6fed91c29f5a8.zip
cpython-c81ebf5b3d9aa080d2039f1225f6fed91c29f5a8.tar.gz
cpython-c81ebf5b3d9aa080d2039f1225f6fed91c29f5a8.tar.bz2
2 files changed, 56 insertions, 0 deletions
diff --git a/Parser/pegen.c b/Parser/pegen.c
index b9894dd..ff02e88 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -38,6 +38,61 @@ _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
     return size;
 }
 
+// Compute the number of columns that the first `character_offset`
+// characters of `line` take up when displayed on a fixed width output
+// device. Characters whose East Asian Width is "W" or "F" (wide unicode
+// characters and emojis) count as two columns, all others as one.
+// Returns the display width, or -1 if any of the calls made here fails.
+Py_ssize_t
+_PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset)
+{
+    PyObject *segment = PyUnicode_Substring(line, 0, character_offset);
+    if (!segment) {
+        return -1;
+    }
+
+    Py_ssize_t result = -1;
+    Py_ssize_t width = 0;
+    Py_ssize_t len = PyUnicode_GET_LENGTH(segment);
+    PyObject *width_fn = NULL;
+    // Fast track for ascii strings: each character is one column wide.
+    if (PyUnicode_IS_ASCII(segment)) {
+        result = character_offset;
+        goto done;
+    }
+
+    width_fn = _PyImport_GetModuleAttrString("unicodedata", "east_asian_width");
+    if (!width_fn) {
+        goto done;  // must still release `segment`
+    }
+
+    for (Py_ssize_t i = 0; i < len; i++) {
+        PyObject *chr = PyUnicode_Substring(segment, i, i + 1);
+        if (!chr) {
+            goto done;
+        }
+        PyObject *width_specifier = PyObject_CallOneArg(width_fn, chr);
+        Py_DECREF(chr);
+        if (!width_specifier) {
+            goto done;
+        }
+
+        if (_PyUnicode_EqualToASCIIString(width_specifier, "W") ||
+            _PyUnicode_EqualToASCIIString(width_specifier, "F")) {
+            width += 2;
+        }
+        else {
+            width += 1;
+        }
+        Py_DECREF(width_specifier);
+    }
+    result = width;
+done:
+    Py_XDECREF(width_fn);
+    Py_DECREF(segment);
+    return result;
+}
+
 // Here, mark is the start of the node, while p->mark is the end.
 // If node==NULL, they should be the same.
 int
diff --git a/Parser/pegen.h b/Parser/pegen.h
index a8bfa78..268f380 100644
--- a/Parser/pegen.h
+++ b/Parser/pegen.h
@@ -151,6 +151,7 @@ expr_ty _PyPegen_name_token(Parser *p);
 expr_ty _PyPegen_number_token(Parser *p);
 void *_PyPegen_string_token(Parser *p);
 Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
+Py_ssize_t _PyPegen_calculate_display_width(PyObject *segment, Py_ssize_t character_offset);
 
 // Error handling functions and APIs
 typedef enum {
author	Pablo Galindo Salgado <Pablogsal@gmail.com>	2023-10-26 21:43:38 (GMT)
committer	GitHub <noreply@github.com>	2023-10-26 21:43:38 (GMT)
commit	c81ebf5b3d9aa080d2039f1225f6fed91c29f5a8 (patch)
tree	31d0a5052319b1480704f513aba52057ca829478 /Parser
parent	e25d8b40cd70744513e190b1ca153087382b6b09 (diff)
download	cpython-c81ebf5b3d9aa080d2039f1225f6fed91c29f5a8.zip cpython-c81ebf5b3d9aa080d2039f1225f6fed91c29f5a8.tar.gz cpython-c81ebf5b3d9aa080d2039f1225f6fed91c29f5a8.tar.bz2