author    Pablo Galindo Salgado <Pablogsal@gmail.com>    2021-08-24 16:50:05 (GMT)
committer GitHub <noreply@github.com>                    2021-08-24 16:50:05 (GMT)
commit    a24676bedcd332dd7e6fa5521d0449206391d190 (patch)
tree      45ccbb5c30c4debd12c08df5edbdb87e38353348 /Python
parent    9ed523159c7ba840dbf403e02498eeae1b5d3ed9 (diff)
Add tests for the C tokenizer and expose it as a private module (GH-27924)
Diffstat (limited to 'Python')
-rw-r--r--  Python/Python-tokenize.c           | 195
-rw-r--r--  Python/clinic/Python-tokenize.c.h  |  41
-rw-r--r--  Python/stdlib_module_names.h       |   1
3 files changed, 237 insertions, 0 deletions
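
The change below wires the C tokenizer from Parser/tokenizer.h into a new private extension module, _tokenize, exposing a single iterator type so the new tests can drive the C tokenizer directly from Python (the tests themselves live outside this diff's 'Python' filter). A hedged sketch of the idea, assuming an interpreter built from this commit; the integer token types should line up with the stdlib token module, since both are generated from the same grammar token definitions:

    # Hypothetical illustration: name the raw token types the C tokenizer emits.
    # Assumes a build of this commit; _tokenize is private and undocumented.
    import token
    from _tokenize import TokenizerIter

    for string, tok_type, *rest in TokenizerIter("x = 1\n"):
        print(token.tok_name[tok_type], repr(string))
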
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
new file mode 100644
index 0000000..b9fb169
--- /dev/null
+++ b/Python/Python-tokenize.c
@@ -0,0 +1,195 @@
+#include "Python.h"
+#include "../Parser/tokenizer.h"
+
+static struct PyModuleDef _tokenizemodule;
+
+typedef struct {
+ PyTypeObject* TokenizerIter;
+} tokenize_state;
+
+static tokenize_state*
+get_tokenize_state(PyObject* module)
+{
+ return (tokenize_state*)PyModule_GetState(module);
+}
+
+#define _tokenize_get_state_by_type(type) \
+ get_tokenize_state(_PyType_GetModuleByDef(type, &_tokenizemodule))
+
+#include "clinic/Python-tokenize.c.h"
+
+/*[clinic input]
+module _tokenizer
+class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
+[clinic start generated code]*/
+/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
+
+typedef struct {
+ PyObject_HEAD
+ struct tok_state* tok;
+} tokenizeriterobject;
+
+/*[clinic input]
+@classmethod
+_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
+
+ source: str
+[clinic start generated code]*/
+
+static PyObject *
+tokenizeriter_new_impl(PyTypeObject *type, const char *source)
+/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
+{
+ tokenizeriterobject* self = (tokenizeriterobject*)type->tp_alloc(type, 0);
+ if (self == NULL) {
+ return NULL;
+ }
+ PyObject* filename = PyUnicode_FromString("<string>");
+ if (filename == NULL) {
+ return NULL;
+ }
+ self->tok = PyTokenizer_FromUTF8(source, 1);
+ if (self->tok == NULL) {
+ return NULL;
+ }
+ self->tok->filename = filename;
+ return (PyObject*)self;
+}
+
+static PyObject*
+tokenizeriter_next(tokenizeriterobject* it)
+{
+ const char* start;
+ const char* end;
+ int type = PyTokenizer_Get(it->tok, &start, &end);
+ if (type == ERRORTOKEN && PyErr_Occurred()) {
+ return NULL;
+ }
+ if (type == ERRORTOKEN || type == ENDMARKER) {
+ PyErr_SetString(PyExc_StopIteration, "EOF");
+ return NULL;
+ }
+ PyObject* str = NULL;
+ if (start == NULL || end == NULL) {
+ str = PyUnicode_FromString("");
+ } else {
+ str = PyUnicode_FromStringAndSize(start, end - start);
+ }
+ if (str == NULL) {
+ return NULL;
+ }
+
+ Py_ssize_t size = it->tok->inp - it->tok->buf;
+ PyObject* line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
+ if (line == NULL) {
+ Py_DECREF(str);
+ return NULL;
+ }
+ const char* line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
+ int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
+ int end_lineno = it->tok->lineno;
+ int col_offset = -1;
+ int end_col_offset = -1;
+ if (start != NULL && start >= line_start) {
+ col_offset = (int)(start - line_start);
+ }
+ if (end != NULL && end >= it->tok->line_start) {
+ end_col_offset = (int)(end - it->tok->line_start);
+ }
+
+ return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
+}
+
+static void
+tokenizeriter_dealloc(tokenizeriterobject* it)
+{
+ PyTypeObject* tp = Py_TYPE(it);
+ PyTokenizer_Free(it->tok);
+ tp->tp_free(it);
+ Py_DECREF(tp);
+}
+
+static PyType_Slot tokenizeriter_slots[] = {
+ {Py_tp_new, tokenizeriter_new},
+ {Py_tp_dealloc, tokenizeriter_dealloc},
+ {Py_tp_getattro, PyObject_GenericGetAttr},
+ {Py_tp_iter, PyObject_SelfIter},
+ {Py_tp_iternext, tokenizeriter_next},
+ {0, NULL},
+};
+
+static PyType_Spec tokenizeriter_spec = {
+ .name = "_tokenize.TokenizerIter",
+ .basicsize = sizeof(tokenizeriterobject),
+ .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
+ .slots = tokenizeriter_slots,
+};
+
+
+static int
+tokenizemodule_exec(PyObject* m)
+{
+ tokenize_state* state = get_tokenize_state(m);
+ if (state == NULL) {
+ return -1;
+ }
+
+ state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(
+ m, &tokenizeriter_spec, NULL);
+ if (state->TokenizerIter == NULL) {
+ return -1;
+ }
+ if (PyModule_AddType(m, state->TokenizerIter) < 0) {
+ return -1;
+ }
+
+ return 0;
+}
+
+static PyMethodDef tokenize_methods[] = {
+ {NULL, NULL, 0, NULL} /* Sentinel */
+};
+
+static PyModuleDef_Slot tokenizemodule_slots[] = {
+ {Py_mod_exec, tokenizemodule_exec},
+ {0, NULL}
+};
+
+static int
+tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
+{
+ tokenize_state *state = get_tokenize_state(m);
+ Py_VISIT(state->TokenizerIter);
+ return 0;
+}
+
+static int
+tokenizemodule_clear(PyObject *m)
+{
+ tokenize_state *state = get_tokenize_state(m);
+ Py_CLEAR(state->TokenizerIter);
+ return 0;
+}
+
+static void
+tokenizemodule_free(void *m)
+{
+ tokenizemodule_clear((PyObject *)m);
+}
+
+static struct PyModuleDef _tokenizemodule = {
+ PyModuleDef_HEAD_INIT,
+ .m_name = "_tokenize",
+ .m_size = sizeof(tokenize_state),
+ .m_slots = tokenizemodule_slots,
+ .m_methods = tokenize_methods,
+ .m_traverse = tokenizemodule_traverse,
+ .m_clear = tokenizemodule_clear,
+ .m_free = tokenizemodule_free,
+};
+
+PyMODINIT_FUNC
+PyInit__tokenize(void)
+{
+ return PyModuleDef_Init(&_tokenizemodule);
+}
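
A minimal usage sketch of the iterator defined above, assuming an interpreter built from this commit; the 7-tuple layout mirrors the Py_BuildValue("(NiiiiiN)", ...) call in tokenizeriter_next:

    # Hedged sketch: iterate the raw C token stream from Python.
    from _tokenize import TokenizerIter

    for tok in TokenizerIter("a + 1\n"):
        # Layout per tokenizeriter_next:
        # (string, type, lineno, end_lineno, col_offset, end_col_offset, line)
        string, tok_type, lineno, end_lineno, col, end_col, line = tok
        print(tok_type, repr(string), (lineno, col), "->", (end_lineno, end_col))

Iteration ends when the tokenizer reports ENDMARKER (or a non-raising ERRORTOKEN), which tokenizeriter_next converts into StopIteration.
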
diff --git a/Python/clinic/Python-tokenize.c.h b/Python/clinic/Python-tokenize.c.h
new file mode 100644
index 0000000..050b4d4
--- /dev/null
+++ b/Python/clinic/Python-tokenize.c.h
@@ -0,0 +1,41 @@
+/*[clinic input]
+preserve
+[clinic start generated code]*/
+
+static PyObject *
+tokenizeriter_new_impl(PyTypeObject *type, const char *source);
+
+static PyObject *
+tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
+{
+ PyObject *return_value = NULL;
+ static const char * const _keywords[] = {"source", NULL};
+ static _PyArg_Parser _parser = {NULL, _keywords, "tokenizeriter", 0};
+ PyObject *argsbuf[1];
+ PyObject * const *fastargs;
+ Py_ssize_t nargs = PyTuple_GET_SIZE(args);
+ const char *source;
+
+ fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 0, argsbuf);
+ if (!fastargs) {
+ goto exit;
+ }
+ if (!PyUnicode_Check(fastargs[0])) {
+ _PyArg_BadArgument("tokenizeriter", "argument 'source'", "str", fastargs[0]);
+ goto exit;
+ }
+ Py_ssize_t source_length;
+ source = PyUnicode_AsUTF8AndSize(fastargs[0], &source_length);
+ if (source == NULL) {
+ goto exit;
+ }
+ if (strlen(source) != (size_t)source_length) {
+ PyErr_SetString(PyExc_ValueError, "embedded null character");
+ goto exit;
+ }
+ return_value = tokenizeriter_new_impl(type, source);
+
+exit:
+ return return_value;
+}
+/*[clinic end generated code: output=dfcd64774e01bfe6 input=a9049054013a1b77]*/
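
The clinic-generated wrapper above rejects non-str arguments and strings whose UTF-8 encoding contains an embedded NUL, since tokenizeriter_new_impl receives a plain const char *. A small illustration, again assuming a build of this commit:

    # Sketch of the validation performed by the clinic-generated wrapper.
    from _tokenize import TokenizerIter

    try:
        TokenizerIter(b"1 + 1")       # wrong type: bytes, not str
    except TypeError as exc:
        print(exc)

    try:
        TokenizerIter("1 + 1\x00")    # embedded NUL in the UTF-8 form
    except ValueError as exc:
        print(exc)                    # "embedded null character"
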
diff --git a/Python/stdlib_module_names.h b/Python/stdlib_module_names.h
index 3c5f176..2f75c2e 100644
--- a/Python/stdlib_module_names.h
+++ b/Python/stdlib_module_names.h
@@ -80,6 +80,7 @@ static const char* _Py_stdlib_module_names[] = {
"_thread",
"_threading_local",
"_tkinter",
+"_tokenize",
"_tracemalloc",
"_typing",
"_uuid",