summaryrefslogtreecommitdiffstats
path: root/Parser
diff options
context:
space:
mode:
authorPablo Galindo Salgado <Pablogsal@gmail.com>2023-10-26 21:43:38 (GMT)
committerGitHub <noreply@github.com>2023-10-26 21:43:38 (GMT)
commitc81ebf5b3d9aa080d2039f1225f6fed91c29f5a8 (patch)
tree31d0a5052319b1480704f513aba52057ca829478 /Parser
parente25d8b40cd70744513e190b1ca153087382b6b09 (diff)
downloadcpython-c81ebf5b3d9aa080d2039f1225f6fed91c29f5a8.zip
cpython-c81ebf5b3d9aa080d2039f1225f6fed91c29f5a8.tar.gz
cpython-c81ebf5b3d9aa080d2039f1225f6fed91c29f5a8.tar.bz2
[3.12] bpo-43950: handle wide unicode characters in tracebacks (GH-28150) (#111346)
Diffstat (limited to 'Parser')
-rw-r--r--Parser/pegen.c55
-rw-r--r--Parser/pegen.h1
2 files changed, 56 insertions, 0 deletions
diff --git a/Parser/pegen.c b/Parser/pegen.c
index b9894dd..ff02e88 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -38,6 +38,61 @@ _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
return size;
}
+// Calculate the display width, in columns, of the first `character_offset`
+// characters of `line` on a fixed-width output device. Characters whose
+// unicodedata.east_asian_width() is "W" or "F" count as two columns, all
+// others as one. Returns -1 if any of the underlying Python calls fails.
+Py_ssize_t
+_PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset)
+{
+    PyObject *segment = PyUnicode_Substring(line, 0, character_offset);
+    if (!segment) {
+        return -1;
+    }
+
+    // Fast track for ascii strings: every character is one column wide,
+    // so the offset is returned unchanged.
+    if (PyUnicode_IS_ASCII(segment)) {
+        Py_DECREF(segment);
+        return character_offset;
+    }
+
+    PyObject *width_fn = _PyImport_GetModuleAttrString("unicodedata", "east_asian_width");
+    if (!width_fn) {
+        // The substring is still owned here; release it before bailing out.
+        Py_DECREF(segment);
+        return -1;
+    }
+
+    Py_ssize_t width = 0;
+    Py_ssize_t len = PyUnicode_GET_LENGTH(segment);
+    for (Py_ssize_t i = 0; i < len; i++) {
+        // east_asian_width() is called with a one-character string.
+        PyObject *chr = PyUnicode_Substring(segment, i, i + 1);
+        if (!chr) {
+            Py_DECREF(segment);
+            Py_DECREF(width_fn);
+            return -1;
+        }
+        PyObject *width_specifier = PyObject_CallOneArg(width_fn, chr);
+        Py_DECREF(chr);
+        if (!width_specifier) {
+            Py_DECREF(segment);
+            Py_DECREF(width_fn);
+            return -1;
+        }
+        if (_PyUnicode_EqualToASCIIString(width_specifier, "W") ||
+            _PyUnicode_EqualToASCIIString(width_specifier, "F")) {
+            width += 2;
+        }
+        else {
+            width += 1;
+        }
+        Py_DECREF(width_specifier);
+    }
+    Py_DECREF(segment);
+    Py_DECREF(width_fn);
+    return width;
+}
+
// Here, mark is the start of the node, while p->mark is the end.
// If node==NULL, they should be the same.
int
diff --git a/Parser/pegen.h b/Parser/pegen.h
index a8bfa78..268f380 100644
--- a/Parser/pegen.h
+++ b/Parser/pegen.h
@@ -151,6 +151,7 @@ expr_ty _PyPegen_name_token(Parser *p);
expr_ty _PyPegen_number_token(Parser *p);
void *_PyPegen_string_token(Parser *p);
Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
+Py_ssize_t _PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset);
// Error handling functions and APIs
typedef enum {