summaryrefslogtreecommitdiffstats
path: root/Parser/tokenizer/utf8_tokenizer.c
diff options
context:
space:
mode:
authorLysandros Nikolaou <lisandrosnik@gmail.com>2023-10-11 15:14:44 (GMT)
committerGitHub <noreply@github.com>2023-10-11 15:14:44 (GMT)
commit01481f2dc13341c84b64d6dffc08ffed022712a6 (patch)
tree706f721ed9a7e5fa7e1c6cb3c3026191c7c95475 /Parser/tokenizer/utf8_tokenizer.c
parenteb50cd37eac47dd4dc71ab42d0582dfb6eac4515 (diff)
downloadcpython-01481f2dc13341c84b64d6dffc08ffed022712a6.zip
cpython-01481f2dc13341c84b64d6dffc08ffed022712a6.tar.gz
cpython-01481f2dc13341c84b64d6dffc08ffed022712a6.tar.bz2
gh-104169: Refactor tokenizer into lexer and wrappers (#110684)
* The lexer, which includes the actual lexeme-producing logic, goes into the `lexer` directory. * The wrappers, one wrapper per input mode (file, string, utf-8, and readline), go into the `tokenizer` directory and include logic for creating a lexer instance and managing the buffer for different modes. --------- Co-authored-by: Pablo Galindo <pablogsal@gmail.com> Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
Diffstat (limited to 'Parser/tokenizer/utf8_tokenizer.c')
-rw-r--r--Parser/tokenizer/utf8_tokenizer.c55
1 files changed, 55 insertions, 0 deletions
diff --git a/Parser/tokenizer/utf8_tokenizer.c b/Parser/tokenizer/utf8_tokenizer.c
new file mode 100644
index 0000000..1a925f4
--- /dev/null
+++ b/Parser/tokenizer/utf8_tokenizer.c
@@ -0,0 +1,55 @@
+#include "Python.h"
+#include "errcode.h"
+
+#include "helpers.h"
+#include "../lexer/state.h"
+
+/* Underflow handler for string-backed input: make the next line of the
+   in-memory buffer available to the lexer.  Returns 1 when a new line was
+   exposed, 0 at end of input (with tok->done set to E_EOF).
+   NOTE(review): assumes tok->inp points into a NUL-terminated buffer —
+   established by _PyTokenizer_FromUTF8 below, which sets inp to the
+   translated string. */
+static int
+tok_underflow_string(struct tok_state *tok) {
+ /* Advance to one past the next '\n', so the newline stays part of
+    the current line. */
+ char *end = strchr(tok->inp, '\n');
+ if (end != NULL) {
+ end++;
+ }
+ else {
+ /* No newline left: the final (unterminated) line ends at the NUL. */
+ end = strchr(tok->inp, '\0');
+ if (end == tok->inp) {
+ /* Buffer exhausted — signal end of file. */
+ tok->done = E_EOF;
+ return 0;
+ }
+ }
+ if (tok->start == NULL) {
+ /* No token in progress, so the logical buffer start can move up
+    to the cursor. */
+ tok->buf = tok->cur;
+ }
+ tok->line_start = tok->cur;
+ ADVANCE_LINENO();
+ tok->inp = end;
+ return 1;
+}
+
+/* Set up tokenizer for a UTF-8 encoded, NUL-terminated string.
+
+   str           - source text (borrowed; a translated copy is made)
+   exec_input    - forwarded to _PyTokenizer_translate_newlines
+                   (presumably selects "exec" input normalization — confirm)
+   preserve_crlf - forwarded to _PyTokenizer_translate_newlines
+
+   Returns a new tok_state on success, or NULL on failure.  The returned
+   state owns the translated copy (stored in tok->input/tok->str) and the
+   "utf-8" encoding string; both are presumably released by
+   _PyTokenizer_Free — verify against that helper. */
+struct tok_state *
+_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
+{
+ struct tok_state *tok = _PyTokenizer_tok_new();
+ char *translated;
+ if (tok == NULL)
+ return NULL;
+ /* Normalize line endings; the copy becomes tok's owned input buffer. */
+ tok->input = translated = _PyTokenizer_translate_newlines(str, exec_input, preserve_crlf, tok);
+ if (translated == NULL) {
+ _PyTokenizer_Free(tok);
+ return NULL;
+ }
+ /* Input is already decoded: no encoding detection needed. */
+ tok->decoding_state = STATE_NORMAL;
+ tok->enc = NULL;
+ tok->str = translated;
+ tok->encoding = _PyTokenizer_new_string("utf-8", 5, tok);
+ if (!tok->encoding) {
+ _PyTokenizer_Free(tok);
+ return NULL;
+ }
+
+ /* All read pointers start at the buffer head; tok->end is also set to
+    the start — NOTE(review): presumably tok_underflow_string advances
+    tok->inp instead of relying on tok->end in string mode; confirm. */
+ tok->buf = tok->cur = tok->inp = translated;
+ tok->end = translated;
+ tok->underflow = &tok_underflow_string;
+ return tok;
+}