author     Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>    2023-05-31 10:11:53 (GMT)
committer  GitHub <noreply@github.com>    2023-05-31 10:11:53 (GMT)
commit     c687946f6815a17bc5ceacaf3bbceba5b41e73fd (patch)
tree       232c64d0c0190d8da0f3d6b9c3ab4528e4bcba0c
parent     2f8c22f1d6c22f018c78264937db66d52fb18869 (diff)
[3.12] gh-105069: Add a readline-like callable to the tokenizer to consume input iteratively (GH-105070) (#105119)
gh-105069: Add a readline-like callable to the tokenizer to consume input iteratively (GH-105070)

(cherry picked from commit 9216e69a87d16d871625721ed5a8aa302511f367)

Co-authored-by: Pablo Galindo Salgado <Pablogsal@gmail.com>
-rw-r--r--  Lib/inspect.py                      2
-rw-r--r--  Lib/test/test_tokenize.py         145
-rw-r--r--  Lib/tokenize.py                    32
-rw-r--r--  Parser/tokenizer.c                136
-rw-r--r--  Parser/tokenizer.h                  2
-rw-r--r--  Python/Python-tokenize.c           12
-rw-r--r--  Python/clinic/Python-tokenize.c.h  41
7 files changed, 274 insertions, 96 deletions
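
As a rough illustration of what this commit changes (not part of the patch itself): the public entry points now drive the C tokenizer through a readline-style callable instead of joining the whole input into one string first. A minimal sketch, assuming CPython 3.12 with this patch applied:

    import io
    import tokenize

    # generate_tokens() takes a readline callable returning str lines.
    src = io.StringIO("1 + 2\n")
    for tok in tokenize.generate_tokens(src.readline):
        print(tok)

    # tokenize() takes a readline callable returning bytes; the encoding is
    # detected first, then the C tokenizer consumes the stream line by line.
    raw = io.BytesIO(b"1 + 2\n")
    for tok in tokenize.tokenize(raw.readline):
        print(tok)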
diff --git a/Lib/inspect.py b/Lib/inspect.py
index 55530fc..15eefdb 100644
--- a/Lib/inspect.py
+++ b/Lib/inspect.py
@@ -2203,7 +2203,7 @@ def _signature_strip_non_python_syntax(signature):
add(string)
if (string == ','):
add(' ')
- clean_signature = ''.join(text).strip()
+ clean_signature = ''.join(text).strip().replace("\n", "")
return clean_signature, self_parameter
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 3adcc4e..a9a2b76 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,6 +1,6 @@
from test import support
from test.support import os_helper
-from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
+from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
open as tokenize_open, Untokenizer, generate_tokens,
NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
@@ -51,6 +51,25 @@ class TokenizeTest(TestCase):
[" ENCODING 'utf-8' (0, 0) (0, 0)"] +
expected.rstrip().splitlines())
+ def test_invalid_readline(self):
+ def gen():
+ yield "sdfosdg"
+ yield "sdfosdg"
+ with self.assertRaises(TypeError):
+ list(tokenize(gen().__next__))
+
+ def gen():
+ yield b"sdfosdg"
+ yield b"sdfosdg"
+ with self.assertRaises(TypeError):
+ list(generate_tokens(gen().__next__))
+
+ def gen():
+ yield "sdfosdg"
+ 1/0
+ with self.assertRaises(ZeroDivisionError):
+ list(generate_tokens(gen().__next__))
+
def test_implicit_newline(self):
# Make sure that the tokenizer puts in an implicit NEWLINE
# when the input lacks a trailing new line.
@@ -1161,7 +1180,8 @@ class TestTokenizerAdheresToPep0263(TestCase):
def _testFile(self, filename):
path = os.path.join(os.path.dirname(__file__), filename)
- TestRoundtrip.check_roundtrip(self, open(path, 'rb'))
+ with open(path, 'rb') as f:
+ TestRoundtrip.check_roundtrip(self, f)
def test_utf8_coding_cookie_and_no_utf8_bom(self):
f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
@@ -1206,7 +1226,8 @@ class Test_Tokenize(TestCase):
yield b''
# skip the initial encoding token and the end tokens
- tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
+ tokens = list(_generate_tokens_from_c_tokenizer(readline().__next__, encoding='utf-8',
+ extra_tokens=True))[:-2]
expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
self.assertEqual(tokens, expected_tokens,
"bytes not decoded with encoding")
@@ -1475,13 +1496,13 @@ class TestTokenize(TestCase):
def mock_detect_encoding(readline):
return encoding, [b'first', b'second']
- def mock__tokenize(readline, encoding):
+ def mock__tokenize(readline, encoding, **kwargs):
nonlocal encoding_used
encoding_used = encoding
out = []
while True:
try:
- next_line = next(readline)
+ next_line = readline()
except StopIteration:
return out
if next_line:
@@ -1498,16 +1519,16 @@ class TestTokenize(TestCase):
return str(counter).encode()
orig_detect_encoding = tokenize_module.detect_encoding
- orig__tokenize = tokenize_module._tokenize
+ orig_c_token = tokenize_module._generate_tokens_from_c_tokenizer
tokenize_module.detect_encoding = mock_detect_encoding
- tokenize_module._tokenize = mock__tokenize
+ tokenize_module._generate_tokens_from_c_tokenizer = mock__tokenize
try:
results = tokenize(mock_readline)
self.assertEqual(list(results)[1:],
[b'first', b'second', b'1', b'2', b'3', b'4'])
finally:
tokenize_module.detect_encoding = orig_detect_encoding
- tokenize_module._tokenize = orig__tokenize
+ tokenize_module._generate_tokens_from_c_tokenizer = orig_c_token
self.assertEqual(encoding_used, encoding)
@@ -1834,12 +1855,33 @@ class CTokenizeTest(TestCase):
def check_tokenize(self, s, expected):
# Format the tokens in s in a table format.
# The ENDMARKER and final NEWLINE are omitted.
+ f = StringIO(s)
with self.subTest(source=s):
result = stringify_tokens_from_source(
- _generate_tokens_from_c_tokenizer(s), s
+ _generate_tokens_from_c_tokenizer(f.readline), s
)
self.assertEqual(result, expected.rstrip().splitlines())
+ def test_encoding(self):
+ def readline(encoding):
+ yield "1+1".encode(encoding)
+
+ expected = [
+ TokenInfo(type=NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1\n'),
+ TokenInfo(type=OP, string='+', start=(1, 1), end=(1, 2), line='1+1\n'),
+ TokenInfo(type=NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1\n'),
+ TokenInfo(type=NEWLINE, string='\n', start=(1, 3), end=(1, 4), line='1+1\n'),
+ TokenInfo(type=ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
+ ]
+ for encoding in ["utf-8", "latin-1", "utf-16"]:
+ with self.subTest(encoding=encoding):
+ tokens = list(_generate_tokens_from_c_tokenizer(
+ readline(encoding).__next__,
+ extra_tokens=True,
+ encoding=encoding,
+ ))
+ self.assertEqual(tokens, expected)
+
def test_int(self):
self.check_tokenize('0xff <= 255', """\
@@ -2675,43 +2717,44 @@ async def f():
def test_invalid_syntax(self):
def get_tokens(string):
- return list(_generate_tokens_from_c_tokenizer(string))
-
- self.assertRaises(SyntaxError, get_tokens, "(1+2]")
- self.assertRaises(SyntaxError, get_tokens, "(1+2}")
- self.assertRaises(SyntaxError, get_tokens, "{1+2]")
-
- self.assertRaises(SyntaxError, get_tokens, "1_")
- self.assertRaises(SyntaxError, get_tokens, "1.2_")
- self.assertRaises(SyntaxError, get_tokens, "1e2_")
- self.assertRaises(SyntaxError, get_tokens, "1e+")
-
- self.assertRaises(SyntaxError, get_tokens, "\xa0")
- self.assertRaises(SyntaxError, get_tokens, "€")
-
- self.assertRaises(SyntaxError, get_tokens, "0b12")
- self.assertRaises(SyntaxError, get_tokens, "0b1_2")
- self.assertRaises(SyntaxError, get_tokens, "0b2")
- self.assertRaises(SyntaxError, get_tokens, "0b1_")
- self.assertRaises(SyntaxError, get_tokens, "0b")
- self.assertRaises(SyntaxError, get_tokens, "0o18")
- self.assertRaises(SyntaxError, get_tokens, "0o1_8")
- self.assertRaises(SyntaxError, get_tokens, "0o8")
- self.assertRaises(SyntaxError, get_tokens, "0o1_")
- self.assertRaises(SyntaxError, get_tokens, "0o")
- self.assertRaises(SyntaxError, get_tokens, "0x1_")
- self.assertRaises(SyntaxError, get_tokens, "0x")
- self.assertRaises(SyntaxError, get_tokens, "1_")
- self.assertRaises(SyntaxError, get_tokens, "012")
- self.assertRaises(SyntaxError, get_tokens, "1.2_")
- self.assertRaises(SyntaxError, get_tokens, "1e2_")
- self.assertRaises(SyntaxError, get_tokens, "1e+")
-
- self.assertRaises(SyntaxError, get_tokens, "'sdfsdf")
- self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''")
-
- self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
- self.assertRaises(SyntaxError, get_tokens, "]")
+ the_string = StringIO(string)
+ return list(_generate_tokens_from_c_tokenizer(the_string.readline))
+
+ for case in [
+ "(1+2]",
+ "(1+2}",
+ "{1+2]",
+ "1_",
+ "1.2_",
+ "1e2_",
+ "1e+",
+
+ "\xa0",
+ "€",
+ "0b12",
+ "0b1_2",
+ "0b2",
+ "0b1_",
+ "0b",
+ "0o18",
+ "0o1_8",
+ "0o8",
+ "0o1_",
+ "0o",
+ "0x1_",
+ "0x",
+ "1_",
+ "012",
+ "1.2_",
+ "1e2_",
+ "1e+",
+ "'sdfsdf",
+ "'''sdfsdf''",
+ "("*1000+"a"+")"*1000,
+ "]",
+ ]:
+ with self.subTest(case=case):
+ self.assertRaises(SyntaxError, get_tokens, case)
def test_max_indent(self):
MAXINDENT = 100
@@ -2722,20 +2765,24 @@ async def f():
return source
valid = generate_source(MAXINDENT - 1)
- tokens = list(_generate_tokens_from_c_tokenizer(valid))
+ the_input = StringIO(valid)
+ tokens = list(_generate_tokens_from_c_tokenizer(the_input.readline))
self.assertEqual(tokens[-2].type, DEDENT)
self.assertEqual(tokens[-1].type, ENDMARKER)
compile(valid, "<string>", "exec")
invalid = generate_source(MAXINDENT)
- self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(invalid)))
+ the_input = StringIO(invalid)
+ self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
self.assertRaises(
IndentationError, compile, invalid, "<string>", "exec"
)
def test_continuation_lines_indentation(self):
def get_tokens(string):
- return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)]
+ the_string = StringIO(string)
+ return [(kind, string) for (kind, string, *_)
+ in _generate_tokens_from_c_tokenizer(the_string.readline)]
code = dedent("""
def fib(n):
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 4895e94..380dc2a 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -34,6 +34,7 @@ import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES
+import _tokenize
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
@@ -443,12 +444,7 @@ def tokenize(readline):
# BOM will already have been stripped.
encoding = "utf-8"
yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
- yield from _tokenize(rl_gen, encoding)
-
-def _tokenize(rl_gen, encoding):
- source = b"".join(rl_gen).decode(encoding)
- for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
- yield token
+ yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True)
def generate_tokens(readline):
"""Tokenize a source reading Python code as unicode strings.
@@ -456,16 +452,7 @@ def generate_tokens(readline):
This has the same API as tokenize(), except that it expects the *readline*
callable to return str objects instead of bytes.
"""
- def _gen():
- while True:
- try:
- line = readline()
- except StopIteration:
- return
- if not line:
- return
- yield line.encode()
- return _tokenize(_gen(), 'utf-8')
+ return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True)
def main():
import argparse
@@ -502,9 +489,9 @@ def main():
tokens = list(tokenize(f.readline))
else:
filename = "<stdin>"
- tokens = _tokenize(
+ tokens = _generate_tokens_from_c_tokenizer(
(x.encode('utf-8') for x in iter(sys.stdin.readline, "")
- ), "utf-8")
+ ), "utf-8", extra_tokens=True)
# Output the tokenization
@@ -531,10 +518,13 @@ def main():
perror("unexpected error: %s" % err)
raise
-def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
+def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
"""Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
- import _tokenize as c_tokenizer
- for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
+ if encoding is None:
+ it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
+ else:
+ it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
+ for info in it:
yield TokenInfo._make(info)
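
For reference, the reworked private helper is exercised the same way the tests above do; a small sketch (the encoding and the input strings are illustrative only):

    import io
    from tokenize import _generate_tokens_from_c_tokenizer

    # The first argument is now a readline callable; passing an explicit
    # encoding makes the C tokenizer decode the bytes it gets from readline().
    def readline(encoding):
        yield "1+1".encode(encoding)

    tokens = list(_generate_tokens_from_c_tokenizer(
        readline("latin-1").__next__, encoding="latin-1", extra_tokens=True))

    # For a callable that already returns str lines, no encoding is passed.
    src = io.StringIO("0xff <= 255\n")
    tokens = list(_generate_tokens_from_c_tokenizer(src.readline))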
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index b6d63e1..fae613e 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -103,6 +103,7 @@ tok_new(void)
tok->filename = NULL;
tok->decoding_readline = NULL;
tok->decoding_buffer = NULL;
+ tok->readline = NULL;
tok->type_comments = 0;
tok->async_hacks = 0;
tok->async_def = 0;
@@ -139,8 +140,9 @@ static char *
error_ret(struct tok_state *tok) /* XXX */
{
tok->decoding_erred = 1;
- if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */
+ if ((tok->fp != NULL || tok->readline != NULL) && tok->buf != NULL) {/* see _PyTokenizer_Free */
PyMem_Free(tok->buf);
+ }
tok->buf = tok->cur = tok->inp = NULL;
tok->start = NULL;
tok->end = NULL;
@@ -900,6 +902,33 @@ _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
return tok;
}
+struct tok_state *
+_PyTokenizer_FromReadline(PyObject* readline, const char* enc,
+ int exec_input, int preserve_crlf)
+{
+ struct tok_state *tok = tok_new();
+ if (tok == NULL)
+ return NULL;
+ if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
+ _PyTokenizer_Free(tok);
+ return NULL;
+ }
+ tok->cur = tok->inp = tok->buf;
+ tok->end = tok->buf + BUFSIZ;
+ tok->fp = NULL;
+ if (enc != NULL) {
+ tok->encoding = new_string(enc, strlen(enc), tok);
+ if (!tok->encoding) {
+ _PyTokenizer_Free(tok);
+ return NULL;
+ }
+ }
+ tok->decoding_state = STATE_NORMAL;
+ Py_INCREF(readline);
+ tok->readline = readline;
+ return tok;
+}
+
/* Set up tokenizer for UTF-8 string */
struct tok_state *
@@ -969,8 +998,9 @@ _PyTokenizer_Free(struct tok_state *tok)
}
Py_XDECREF(tok->decoding_readline);
Py_XDECREF(tok->decoding_buffer);
+ Py_XDECREF(tok->readline);
Py_XDECREF(tok->filename);
- if (tok->fp != NULL && tok->buf != NULL) {
+ if ((tok->readline != NULL || tok->fp != NULL ) && tok->buf != NULL) {
PyMem_Free(tok->buf);
}
if (tok->input) {
@@ -1022,6 +1052,71 @@ tok_readline_raw(struct tok_state *tok)
}
static int
+tok_readline_string(struct tok_state* tok) {
+ PyObject* line = NULL;
+ PyObject* raw_line = PyObject_CallNoArgs(tok->readline);
+ if (raw_line == NULL) {
+ if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
+ PyErr_Clear();
+ return 1;
+ }
+ error_ret(tok);
+ goto error;
+ }
+ if(tok->encoding != NULL) {
+ if (!PyBytes_Check(raw_line)) {
+ PyErr_Format(PyExc_TypeError, "readline() returned a non-bytes object");
+ error_ret(tok);
+ goto error;
+ }
+ line = PyUnicode_Decode(PyBytes_AS_STRING(raw_line), PyBytes_GET_SIZE(raw_line),
+ tok->encoding, "replace");
+ Py_CLEAR(raw_line);
+ if (line == NULL) {
+ error_ret(tok);
+ goto error;
+ }
+ } else {
+ if(!PyUnicode_Check(raw_line)) {
+ PyErr_Format(PyExc_TypeError, "readline() returned a non-string object");
+ error_ret(tok);
+ goto error;
+ }
+ line = raw_line;
+ raw_line = NULL;
+ }
+ Py_ssize_t buflen;
+ const char* buf = PyUnicode_AsUTF8AndSize(line, &buflen);
+ if (buf == NULL) {
+ error_ret(tok);
+ goto error;
+ }
+
+ // Make room for the null terminator *and* potentially
+ // an extra newline character that we may need to artificially
+ // add.
+ size_t buffer_size = buflen + 2;
+ if (!tok_reserve_buf(tok, buffer_size)) {
+ goto error;
+ }
+ memcpy(tok->inp, buf, buflen);
+ tok->inp += buflen;
+ *tok->inp = '\0';
+
+ if (tok->start == NULL) {
+ tok->buf = tok->cur;
+ }
+ tok->line_start = tok->cur;
+
+ Py_DECREF(line);
+ return 1;
+error:
+ Py_XDECREF(raw_line);
+ Py_XDECREF(line);
+ return 0;
+}
+
+static int
tok_underflow_string(struct tok_state *tok) {
char *end = strchr(tok->inp, '\n');
if (end != NULL) {
@@ -1195,6 +1290,38 @@ tok_underflow_file(struct tok_state *tok) {
return tok->done == E_OK;
}
+static int
+tok_underflow_readline(struct tok_state* tok) {
+ assert(tok->decoding_state == STATE_NORMAL);
+ assert(tok->fp == NULL && tok->input == NULL && tok->decoding_readline == NULL);
+ if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
+ tok->cur = tok->inp = tok->buf;
+ }
+ if (!tok_readline_string(tok)) {
+ return 0;
+ }
+ if (tok->inp == tok->cur) {
+ tok->done = E_EOF;
+ return 0;
+ }
+ if (tok->inp[-1] != '\n') {
+ assert(tok->inp + 1 < tok->end);
+ /* Last line does not end in \n, fake one */
+ *tok->inp++ = '\n';
+ *tok->inp = '\0';
+ }
+
+ ADVANCE_LINENO();
+ /* The default encoding is UTF-8, so make sure we don't have any
+ non-UTF-8 sequences in it. */
+ if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
+ error_ret(tok);
+ return 0;
+ }
+ assert(tok->done == E_OK);
+ return tok->done == E_OK;
+}
+
#if defined(Py_DEBUG)
static void
print_escape(FILE *f, const char *s, Py_ssize_t size)
@@ -1238,7 +1365,10 @@ tok_nextc(struct tok_state *tok)
if (tok->done != E_OK) {
return EOF;
}
- if (tok->fp == NULL) {
+ if (tok->readline) {
+ rc = tok_underflow_readline(tok);
+ }
+ else if (tok->fp == NULL) {
rc = tok_underflow_string(tok);
}
else if (tok->prompt != NULL) {
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 02749e3..600d429 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -109,6 +109,7 @@ struct tok_state {
expression (cf. issue 16806) */
PyObject *decoding_readline; /* open(...).readline */
PyObject *decoding_buffer;
+ PyObject *readline; /* readline() function */
const char* enc; /* Encoding for the current str. */
char* str; /* Source string being tokenized (if tokenizing from a string)*/
char* input; /* Tokenizer's newline translated copy of the string. */
@@ -137,6 +138,7 @@ struct tok_state {
extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
+extern struct tok_state *_PyTokenizer_FromReadline(PyObject*, const char*, int, int);
extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
const char *, const char *);
extern void _PyTokenizer_Free(struct tok_state *);
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 2de1daa..a7933b2 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -37,15 +37,17 @@ typedef struct
@classmethod
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
- source: str
+ readline: object
+ /
*
extra_tokens: bool
+ encoding: str(c_default="NULL") = 'utf-8'
[clinic start generated code]*/
static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source,
- int extra_tokens)
-/*[clinic end generated code: output=f6f9d8b4beec8106 input=90dc5b6a5df180c2]*/
+tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
+ int extra_tokens, const char *encoding)
+/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
{
tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
if (self == NULL) {
@@ -55,7 +57,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
if (filename == NULL) {
return NULL;
}
- self->tok = _PyTokenizer_FromUTF8(source, 1, 1);
+ self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
if (self->tok == NULL) {
Py_DECREF(filename);
return NULL;
diff --git a/Python/clinic/Python-tokenize.c.h b/Python/clinic/Python-tokenize.c.h
index 7e77938..28f5075 100644
--- a/Python/clinic/Python-tokenize.c.h
+++ b/Python/clinic/Python-tokenize.c.h
@@ -9,8 +9,8 @@ preserve
static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source,
- int extra_tokens);
+tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
+ int extra_tokens, const char *encoding);
static PyObject *
tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
@@ -25,7 +25,7 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
PyObject *ob_item[NUM_KEYWORDS];
} _kwtuple = {
.ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
- .ob_item = { &_Py_ID(source), &_Py_ID(extra_tokens), },
+ .ob_item = { &_Py_ID(extra_tokens), &_Py_ID(encoding), },
};
#undef NUM_KEYWORDS
#define KWTUPLE (&_kwtuple.ob_base.ob_base)
@@ -34,43 +34,50 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
# define KWTUPLE NULL
#endif // !Py_BUILD_CORE
- static const char * const _keywords[] = {"source", "extra_tokens", NULL};
+ static const char * const _keywords[] = {"", "extra_tokens", "encoding", NULL};
static _PyArg_Parser _parser = {
.keywords = _keywords,
.fname = "tokenizeriter",
.kwtuple = KWTUPLE,
};
#undef KWTUPLE
- PyObject *argsbuf[2];
+ PyObject *argsbuf[3];
PyObject * const *fastargs;
Py_ssize_t nargs = PyTuple_GET_SIZE(args);
- const char *source;
+ Py_ssize_t noptargs = nargs + (kwargs ? PyDict_GET_SIZE(kwargs) : 0) - 2;
+ PyObject *readline;
int extra_tokens;
+ const char *encoding = NULL;
fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 1, argsbuf);
if (!fastargs) {
goto exit;
}
- if (!PyUnicode_Check(fastargs[0])) {
- _PyArg_BadArgument("tokenizeriter", "argument 'source'", "str", fastargs[0]);
+ readline = fastargs[0];
+ extra_tokens = PyObject_IsTrue(fastargs[1]);
+ if (extra_tokens < 0) {
goto exit;
}
- Py_ssize_t source_length;
- source = PyUnicode_AsUTF8AndSize(fastargs[0], &source_length);
- if (source == NULL) {
+ if (!noptargs) {
+ goto skip_optional_kwonly;
+ }
+ if (!PyUnicode_Check(fastargs[2])) {
+ _PyArg_BadArgument("tokenizeriter", "argument 'encoding'", "str", fastargs[2]);
goto exit;
}
- if (strlen(source) != (size_t)source_length) {
- PyErr_SetString(PyExc_ValueError, "embedded null character");
+ Py_ssize_t encoding_length;
+ encoding = PyUnicode_AsUTF8AndSize(fastargs[2], &encoding_length);
+ if (encoding == NULL) {
goto exit;
}
- extra_tokens = PyObject_IsTrue(fastargs[1]);
- if (extra_tokens < 0) {
+ if (strlen(encoding) != (size_t)encoding_length) {
+ PyErr_SetString(PyExc_ValueError, "embedded null character");
goto exit;
}
- return_value = tokenizeriter_new_impl(type, source, extra_tokens);
+skip_optional_kwonly:
+ return_value = tokenizeriter_new_impl(type, readline, extra_tokens, encoding);
exit:
return return_value;
}
-/*[clinic end generated code: output=940b564c67f6e0e2 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=48be65a2808bdfa6 input=a9049054013a1b77]*/
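
Putting the clinic signature together: readline is positional-only, while extra_tokens and encoding are keyword-only (encoding defaults to UTF-8). A hedged sketch of driving the internal iterator directly, which normally only Lib/tokenize.py does:

    import io
    import _tokenize  # internal C extension module, not a public API

    src = io.StringIO("a = 1\n")
    it = _tokenize.TokenizerIter(src.readline, extra_tokens=False)
    for info in it:
        print(info)  # raw token tuples that tokenize.TokenInfo._make() wraps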