diff options
author | Lysandros Nikolaou <lisandrosnik@gmail.com> | 2023-10-11 15:14:44 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-11 15:14:44 (GMT) |
commit | 01481f2dc13341c84b64d6dffc08ffed022712a6 (patch) | |
tree | 706f721ed9a7e5fa7e1c6cb3c3026191c7c95475 /Parser/tokenizer/utf8_tokenizer.c | |
parent | eb50cd37eac47dd4dc71ab42d0582dfb6eac4515 (diff) | |
download | cpython-01481f2dc13341c84b64d6dffc08ffed022712a6.zip cpython-01481f2dc13341c84b64d6dffc08ffed022712a6.tar.gz cpython-01481f2dc13341c84b64d6dffc08ffed022712a6.tar.bz2 |
gh-104169: Refactor tokenizer into lexer and wrappers (#110684)
* The lexer, which includes the actual lexeme-producing logic, goes into
the `lexer` directory.
* The wrappers, one wrapper per input mode (file, string, utf-8, and
readline), go into the `tokenizer` directory and include logic for
creating a lexer instance and managing the buffer for different modes.
---------
Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
Diffstat (limited to 'Parser/tokenizer/utf8_tokenizer.c')
-rw-r--r-- | Parser/tokenizer/utf8_tokenizer.c | 55 |
1 file changed, 55 insertions, 0 deletions
#include "Python.h"
#include "errcode.h"

#include "helpers.h"
#include "../lexer/state.h"

/* Underflow callback for the in-memory (UTF-8 string) input mode.
 *
 * Makes the next source line available to the lexer by advancing tok->inp
 * to one past the next '\n' in the buffer (so the newline stays part of
 * the line just produced).  Returns 1 on success; returns 0 and sets
 * tok->done = E_EOF when no input remains.
 *
 * NOTE(review): the whole string already lives in memory, so "underflow"
 * here only moves pointers — no reading or reallocation happens. */
static int
tok_underflow_string(struct tok_state *tok) {
    char *end = strchr(tok->inp, '\n');
    if (end != NULL) {
        /* Include the newline in the current line. */
        end++;
    }
    else {
        /* No newline left: the final (unterminated) line runs to the
         * terminating NUL. */
        end = strchr(tok->inp, '\0');
        if (end == tok->inp) {
            /* Nothing left at all — signal end of file. */
            tok->done = E_EOF;
            return 0;
        }
    }
    if (tok->start == NULL) {
        /* No token in progress: drop the already-consumed prefix by
         * resetting the buffer base to the current position.
         * (presumably safe only because tok->start == NULL means no live
         * token text is referenced behind tok->cur — confirm in lexer) */
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
    ADVANCE_LINENO();  /* line-counter bookkeeping macro from the lexer headers */
    tok->inp = end;
    return 1;
}

/* Set up tokenizer for UTF-8 string.
 *
 * str           — NUL-terminated UTF-8 source text (not modified; a
 *                 translated copy is made).
 * exec_input    — forwarded to _PyTokenizer_translate_newlines; presumably
 *                 selects exec-style newline handling — confirm in helpers.
 * preserve_crlf — forwarded likewise; presumably keeps "\r\n" intact.
 *
 * Returns a new tok_state whose underflow hook is tok_underflow_string,
 * or NULL on failure (all partially-built state is freed via
 * _PyTokenizer_Free before returning NULL).
 *
 * Ownership: the translated copy is stored in tok->input (and aliased by
 * tok->str / the buffer pointers), so it is released together with the
 * tok_state. */
struct tok_state *
_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
{
    struct tok_state *tok = _PyTokenizer_tok_new();
    char *translated;
    if (tok == NULL)
        return NULL;
    tok->input = translated = _PyTokenizer_translate_newlines(str, exec_input, preserve_crlf, tok);
    if (translated == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }
    /* Input is already decoded UTF-8: no encoding detection needed. */
    tok->decoding_state = STATE_NORMAL;
    tok->enc = NULL;
    tok->str = translated;
    tok->encoding = _PyTokenizer_new_string("utf-8", 5, tok);  /* 5 == strlen("utf-8") */
    if (!tok->encoding) {
        _PyTokenizer_Free(tok);
        return NULL;
    }

    /* Start with an "empty" window: buf == cur == inp, so the first
     * lexer read triggers tok_underflow_string to serve the first line. */
    tok->buf = tok->cur = tok->inp = translated;
    tok->end = translated;
    tok->underflow = &tok_underflow_string;
    return tok;
}