author    Pablo Galindo Salgado <Pablogsal@gmail.com>  2021-08-24 16:50:05 (GMT)
committer GitHub <noreply@github.com>                  2021-08-24 16:50:05 (GMT)
commit    a24676bedcd332dd7e6fa5521d0449206391d190
tree      45ccbb5c30c4debd12c08df5edbdb87e38353348 /Python
parent    9ed523159c7ba840dbf403e02498eeae1b5d3ed9
Add tests for the C tokenizer and expose it as a private module (GH-27924)
Diffstat (limited to 'Python')
-rw-r--r-- | Python/Python-tokenize.c          | 195
-rw-r--r-- | Python/clinic/Python-tokenize.c.h |  41
-rw-r--r-- | Python/stdlib_module_names.h      |   1
3 files changed, 237 insertions, 0 deletions
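The commit makes the C tokenizer reachable from Python as the private, undocumented `_tokenize` module, whose only export is the `TokenizerIter` type defined below. The following is a minimal usage sketch, with the constructor signature and the 7-tuple layout read off `tokenizeriter_new_impl` and `tokenizeriter_next` in the diff that follows; this is a private API and may change without notice.

```python
import _tokenize
from token import tok_name

# Each iteration yields a 7-tuple:
# (string, type, lineno, end_lineno, col_offset, end_col_offset, line);
# iteration stops when the tokenizer reaches ENDMARKER.
for string, type_, lineno, end_lineno, col, end_col, line in \
        _tokenize.TokenizerIter("x = 1\n"):
    print(f"{tok_name[type_]:<10} {string!r} at {lineno}:{col}-{end_lineno}:{end_col}")
```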
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
new file mode 100644
index 0000000..b9fb169
--- /dev/null
+++ b/Python/Python-tokenize.c
@@ -0,0 +1,195 @@
+#include "Python.h"
+#include "../Parser/tokenizer.h"
+
+static struct PyModuleDef _tokenizemodule;
+
+typedef struct {
+    PyTypeObject* TokenizerIter;
+} tokenize_state;
+
+static tokenize_state*
+get_tokenize_state(PyObject* module)
+{
+    return (tokenize_state*)PyModule_GetState(module);
+}
+
+#define _tokenize_get_state_by_type(type) \
+    get_tokenize_state(_PyType_GetModuleByDef(type, &_tokenizemodule))
+
+#include "clinic/Python-tokenize.c.h"
+
+/*[clinic input]
+module _tokenizer
+class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
+[clinic start generated code]*/
+/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
+
+typedef struct {
+    PyObject_HEAD
+    struct tok_state* tok;
+} tokenizeriterobject;
+
+/*[clinic input]
+@classmethod
+_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
+
+    source: str
+[clinic start generated code]*/
+
+static PyObject *
+tokenizeriter_new_impl(PyTypeObject *type, const char *source)
+/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
+{
+    tokenizeriterobject* self = (tokenizeriterobject*)type->tp_alloc(type, 0);
+    if (self == NULL) {
+        return NULL;
+    }
+    PyObject* filename = PyUnicode_FromString("<string>");
+    if (filename == NULL) {
+        return NULL;
+    }
+    self->tok = PyTokenizer_FromUTF8(source, 1);
+    if (self->tok == NULL) {
+        return NULL;
+    }
+    self->tok->filename = filename;
+    return (PyObject*)self;
+}
+
+static PyObject*
+tokenizeriter_next(tokenizeriterobject* it)
+{
+    const char* start;
+    const char* end;
+    int type = PyTokenizer_Get(it->tok, &start, &end);
+    if (type == ERRORTOKEN && PyErr_Occurred()) {
+        return NULL;
+    }
+    if (type == ERRORTOKEN || type == ENDMARKER) {
+        PyErr_SetString(PyExc_StopIteration, "EOF");
+        return NULL;
+    }
+    PyObject* str = NULL;
+    if (start == NULL || end == NULL) {
+        str = PyUnicode_FromString("");
+    } else {
+        str = PyUnicode_FromStringAndSize(start, end - start);
+    }
+    if (str == NULL) {
+        return NULL;
+    }
+
+    Py_ssize_t size = it->tok->inp - it->tok->buf;
+    PyObject* line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
+    if (line == NULL) {
+        Py_DECREF(str);
+        return NULL;
+    }
+    const char* line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
+    int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
+    int end_lineno = it->tok->lineno;
+    int col_offset = -1;
+    int end_col_offset = -1;
+    if (start != NULL && start >= line_start) {
+        col_offset = (int)(start - line_start);
+    }
+    if (end != NULL && end >= it->tok->line_start) {
+        end_col_offset = (int)(end - it->tok->line_start);
+    }
+
+    return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
+}
+
+static void
+tokenizeriter_dealloc(tokenizeriterobject* it)
+{
+    PyTypeObject* tp = Py_TYPE(it);
+    PyTokenizer_Free(it->tok);
+    tp->tp_free(it);
+    Py_DECREF(tp);
+}
+
+static PyType_Slot tokenizeriter_slots[] = {
+    {Py_tp_new, tokenizeriter_new},
+    {Py_tp_dealloc, tokenizeriter_dealloc},
+    {Py_tp_getattro, PyObject_GenericGetAttr},
+    {Py_tp_iter, PyObject_SelfIter},
+    {Py_tp_iternext, tokenizeriter_next},
+    {0, NULL},
+};
+
+static PyType_Spec tokenizeriter_spec = {
+    .name = "_tokenize.TokenizerIter",
+    .basicsize = sizeof(tokenizeriterobject),
+    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
+    .slots = tokenizeriter_slots,
+};
+
+
+static int
+tokenizemodule_exec(PyObject* m)
+{
+    tokenize_state* state = get_tokenize_state(m);
+    if (state == NULL) {
+        return -1;
+    }
+
+    state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(
+        m, &tokenizeriter_spec, NULL);
+    if (state->TokenizerIter == NULL) {
+        return -1;
+    }
+    if (PyModule_AddType(m, state->TokenizerIter) < 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
+static PyMethodDef tokenize_methods[] = {
+    {NULL, NULL, 0, NULL} /* Sentinel */
+};
+
+static PyModuleDef_Slot tokenizemodule_slots[] = {
+    {Py_mod_exec, tokenizemodule_exec},
+    {0, NULL}
+};
+
+static int
+tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
+{
+    tokenize_state *state = get_tokenize_state(m);
+    Py_VISIT(state->TokenizerIter);
+    return 0;
+}
+
+static int
+tokenizemodule_clear(PyObject *m)
+{
+    tokenize_state *state = get_tokenize_state(m);
+    Py_CLEAR(state->TokenizerIter);
+    return 0;
+}
+
+static void
+tokenizemodule_free(void *m)
+{
+    tokenizemodule_clear((PyObject *)m);
+}
+
+static struct PyModuleDef _tokenizemodule = {
+    PyModuleDef_HEAD_INIT,
+    .m_name = "_tokenize",
+    .m_size = sizeof(tokenize_state),
+    .m_slots = tokenizemodule_slots,
+    .m_methods = tokenize_methods,
+    .m_traverse = tokenizemodule_traverse,
+    .m_clear = tokenizemodule_clear,
+    .m_free = tokenizemodule_free,
+};
+
+PyMODINIT_FUNC
+PyInit__tokenize(void)
+{
+    return PyModuleDef_Init(&_tokenizemodule);
+}
diff --git a/Python/clinic/Python-tokenize.c.h b/Python/clinic/Python-tokenize.c.h
new file mode 100644
index 0000000..050b4d4
--- /dev/null
+++ b/Python/clinic/Python-tokenize.c.h
@@ -0,0 +1,41 @@
+/*[clinic input]
+preserve
+[clinic start generated code]*/
+
+static PyObject *
+tokenizeriter_new_impl(PyTypeObject *type, const char *source);
+
+static PyObject *
+tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
+{
+    PyObject *return_value = NULL;
+    static const char * const _keywords[] = {"source", NULL};
+    static _PyArg_Parser _parser = {NULL, _keywords, "tokenizeriter", 0};
+    PyObject *argsbuf[1];
+    PyObject * const *fastargs;
+    Py_ssize_t nargs = PyTuple_GET_SIZE(args);
+    const char *source;
+
+    fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 0, argsbuf);
+    if (!fastargs) {
+        goto exit;
+    }
+    if (!PyUnicode_Check(fastargs[0])) {
+        _PyArg_BadArgument("tokenizeriter", "argument 'source'", "str", fastargs[0]);
+        goto exit;
+    }
+    Py_ssize_t source_length;
+    source = PyUnicode_AsUTF8AndSize(fastargs[0], &source_length);
+    if (source == NULL) {
+        goto exit;
+    }
+    if (strlen(source) != (size_t)source_length) {
+        PyErr_SetString(PyExc_ValueError, "embedded null character");
+        goto exit;
+    }
+    return_value = tokenizeriter_new_impl(type, source);
+
+exit:
+    return return_value;
+}
+/*[clinic end generated code: output=dfcd64774e01bfe6 input=a9049054013a1b77]*/
diff --git a/Python/stdlib_module_names.h b/Python/stdlib_module_names.h
index 3c5f176..2f75c2e 100644
--- a/Python/stdlib_module_names.h
+++ b/Python/stdlib_module_names.h
@@ -80,6 +80,7 @@ static const char* _Py_stdlib_module_names[] = {
 "_thread",
 "_threading_local",
 "_tkinter",
+"_tokenize",
 "_tracemalloc",
 "_typing",
 "_uuid",
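Two side effects of the diff are observable from Python; the sketch below assumes an interpreter built with this commit. The `stdlib_module_names.h` hunk registers the name in `sys.stdlib_module_names`, and in `tokenizeriter_next` an ERRORTOKEN with a pending exception propagates as a raised error, while ENDMARKER (or an ERRORTOKEN without one) just ends iteration. The exact exception type shown is an assumption about what the underlying tokenizer sets for a malformed literal.

```python
import sys
import _tokenize

# Registered by the stdlib_module_names.h hunk above.
print("_tokenize" in sys.stdlib_module_names)   # True

# Assumed: the C tokenizer rejects the invalid binary literal and sets a
# SyntaxError, which tokenizeriter_next propagates instead of yielding.
try:
    list(_tokenize.TokenizerIter("0b2\n"))
except SyntaxError as exc:
    print("tokenizer raised:", exc)
```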