summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Lib/test/test_tokenize.py10
-rw-r--r--Misc/NEWS.d/next/Core and Builtins/2023-12-11-00-50-00.gh-issue-112943.RHNZie.rst2
-rw-r--r--Parser/pegen.c16
-rw-r--r--Parser/pegen.h1
-rw-r--r--Python/Python-tokenize.c2
5 files changed, 25 insertions, 6 deletions
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index bbbc337..2886bce 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -622,6 +622,16 @@ f'__{
FSTRING_END "'" (3, 3) (3, 4)
""")
+ self.check_tokenize("""\
+ '''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli
+ aktualni pracownicy, obecni pracownicy'''
+""", """\
+ INDENT ' ' (1, 0) (1, 4)
+ STRING "'''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli\\n aktualni pracownicy, obecni pracownicy'''" (1, 4) (2, 45)
+ NEWLINE '\\n' (2, 45) (2, 46)
+ DEDENT '' (3, 0) (3, 0)
+ """)
+
def test_function(self):
self.check_tokenize("def d22(a, b, c=2, d=2, *k): pass", """\
NAME 'def' (1, 0) (1, 3)
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-12-11-00-50-00.gh-issue-112943.RHNZie.rst b/Misc/NEWS.d/next/Core and Builtins/2023-12-11-00-50-00.gh-issue-112943.RHNZie.rst
new file mode 100644
index 0000000..4bc2fe7
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-12-11-00-50-00.gh-issue-112943.RHNZie.rst
@@ -0,0 +1,2 @@
+Correctly compute end column offsets for multiline tokens in the
+:mod:`tokenize` module. Patch by Pablo Galindo.
diff --git a/Parser/pegen.c b/Parser/pegen.c
index ff02e88..cbceaae 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -18,12 +18,8 @@ _PyPegen_interactive_exit(Parser *p)
}
Py_ssize_t
-_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
+_PyPegen_byte_offset_to_character_offset_raw(const char* str, Py_ssize_t col_offset)
{
- const char *str = PyUnicode_AsUTF8(line);
- if (!str) {
- return -1;
- }
Py_ssize_t len = strlen(str);
if (col_offset > len + 1) {
col_offset = len + 1;
@@ -93,6 +89,16 @@ _PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset)
return width;
}
+Py_ssize_t
+_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
+{
+ const char *str = PyUnicode_AsUTF8(line);
+ if (!str) {
+ return -1;
+ }
+ return _PyPegen_byte_offset_to_character_offset_raw(str, col_offset);
+}
+
// Here, mark is the start of the node, while p->mark is the end.
// If node==NULL, they should be the same.
int
diff --git a/Parser/pegen.h b/Parser/pegen.h
index 268f380..c2a3e02 100644
--- a/Parser/pegen.h
+++ b/Parser/pegen.h
@@ -151,6 +151,7 @@ expr_ty _PyPegen_name_token(Parser *p);
expr_ty _PyPegen_number_token(Parser *p);
void *_PyPegen_string_token(Parser *p);
Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
+Py_ssize_t _PyPegen_byte_offset_to_character_offset_raw(const char*, Py_ssize_t col_offset);
Py_ssize_t _PyPegen_calculate_display_width(PyObject *segment, Py_ssize_t character_offset);
// Error handling functions and APIs
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 1938562..179f71a 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -224,7 +224,7 @@ tokenizeriter_next(tokenizeriterobject *it)
col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
}
if (token.end != NULL && token.end >= it->tok->line_start) {
- end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
+ end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start);
}
if (it->tok->tok_extra_tokens) {