From 3df85404d4bf420db3362eeae1345f2cad948a71 Mon Sep 17 00:00:00 2001 From: Paul Ganssle Date: Mon, 22 Oct 2018 12:32:52 -0400 Subject: bpo-34454: Clean up datetime.fromisoformat surrogate handling (GH-8959) * Use _PyUnicode_Copy in sanitize_isoformat_str * Use repr in fromisoformat error message This reverses commit 67b74a98b2 per Serhiy Storchaka's suggestion: I suggested to use %R in the error message because including the raw string can be confusing in the case of empty string, or string containing trailing whitespaces, invisible or unprintable characters. We agree that it is better to change both the C and pure Python versions to use repr. * Retain non-sanitized dtstr for error printing This does not create an extra string, it just holds on to a reference to the original input string for purposes of creating the error message. * PEP 7 fixes to from_isoformat * Separate handling of Unicode and other errors In the initial implementation, encoding errors and all other errors alike would raise an error indicating an invalid format, which would not be true for errors like MemoryError. * Drop needs_decref from _sanitize_isoformat_str Instead, _sanitize_isoformat_str always returns a new reference, even when the result is the original string. 
--- Lib/datetime.py | 8 +-- Lib/test/datetimetester.py | 9 +++ Modules/_datetimemodule.c | 165 +++++++++++++++++++++++++-------------------- 3 files changed, 106 insertions(+), 76 deletions(-) diff --git a/Lib/datetime.py b/Lib/datetime.py index cff9203..292919f 100644 --- a/Lib/datetime.py +++ b/Lib/datetime.py @@ -857,7 +857,7 @@ class date: assert len(date_string) == 10 return cls(*_parse_isoformat_date(date_string)) except Exception: - raise ValueError('Invalid isoformat string: %s' % date_string) + raise ValueError(f'Invalid isoformat string: {date_string!r}') # Conversions to string @@ -1369,7 +1369,7 @@ class time: try: return cls(*_parse_isoformat_time(time_string)) except Exception: - raise ValueError('Invalid isoformat string: %s' % time_string) + raise ValueError(f'Invalid isoformat string: {time_string!r}') def strftime(self, fmt): @@ -1646,13 +1646,13 @@ class datetime(date): try: date_components = _parse_isoformat_date(dstr) except ValueError: - raise ValueError('Invalid isoformat string: %s' % date_string) + raise ValueError(f'Invalid isoformat string: {date_string!r}') if tstr: try: time_components = _parse_isoformat_time(tstr) except ValueError: - raise ValueError('Invalid isoformat string: %s' % date_string) + raise ValueError(f'Invalid isoformat string: {date_string!r}') else: time_components = [0, 0, 0, 0, None] diff --git a/Lib/test/datetimetester.py b/Lib/test/datetimetester.py index 9c6e71c..122f6b5 100644 --- a/Lib/test/datetimetester.py +++ b/Lib/test/datetimetester.py @@ -13,6 +13,7 @@ import sys import os import pickle import random +import re import struct import unittest @@ -2676,6 +2677,14 @@ class TestDateTime(TestDate): with self.assertRaises(ValueError): self.theclass.fromisoformat(bad_str) + def test_fromisoformat_fails_surrogate(self): + # Test that when fromisoformat() fails with a surrogate character as + # the separator, the error message contains the original string + dtstr = "2018-01-03\ud80001:0113" + + with 
self.assertRaisesRegex(ValueError, re.escape(repr(dtstr))): + self.theclass.fromisoformat(dtstr) + def test_fromisoformat_utc(self): dt_str = '2014-04-19T13:21:13+00:00' dt = self.theclass.fromisoformat(dt_str) diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index cdfa235..bc4caa0 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -668,8 +668,8 @@ set_date_fields(PyDateTime_Date *self, int y, int m, int d) * String parsing utilities and helper functions */ -static const char* -parse_digits(const char* ptr, int* var, size_t num_digits) +static const char * +parse_digits(const char *ptr, int *var, size_t num_digits) { for (size_t i = 0; i < num_digits; ++i) { unsigned int tmp = (unsigned int)(*(ptr++) - '0'); @@ -683,15 +683,16 @@ parse_digits(const char* ptr, int* var, size_t num_digits) return ptr; } -static int parse_isoformat_date(const char *dtstr, - int* year, int *month, int* day) { +static int +parse_isoformat_date(const char *dtstr, int *year, int *month, int *day) +{ /* Parse the date components of the result of date.isoformat() - * - * Return codes: - * 0: Success - * -1: Failed to parse date component - * -2: Failed to parse dateseparator - */ + * + * Return codes: + * 0: Success + * -1: Failed to parse date component + * -2: Failed to parse dateseparator + */ const char *p = dtstr; p = parse_digits(p, year, 4); if (NULL == p) { @@ -720,8 +721,9 @@ static int parse_isoformat_date(const char *dtstr, } static int -parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, - int* hour, int* minute, int *second, int *microsecond) { +parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour, + int *minute, int *second, int *microsecond) +{ const char *p = tstr; const char *p_end = tstr_end; int *vals[3] = {hour, minute, second}; @@ -736,12 +738,15 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, char c = *(p++); if (p >= p_end) { return c != '\0'; - } else if (c == ':') { + } + else if (c == ':') { 
continue; - } else if (c == '.') { + } + else if (c == '.') { break; - } else { - return -4; // Malformed time separator + } + else { + return -4; // Malformed time separator } } @@ -765,9 +770,10 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, } static int -parse_isoformat_time(const char *dtstr, size_t dtlen, - int* hour, int *minute, int *second, int *microsecond, - int* tzoffset, int *tzmicrosecond) { +parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute, + int *second, int *microsecond, int *tzoffset, + int *tzmicrosecond) +{ // Parse the time portion of a datetime.isoformat() string // // Return codes: @@ -785,19 +791,21 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, if (*tzinfo_pos == '+' || *tzinfo_pos == '-') { break; } - } while(++tzinfo_pos < p_end); + } while (++tzinfo_pos < p_end); - int rv = parse_hh_mm_ss_ff(dtstr, tzinfo_pos, - hour, minute, second, microsecond); + int rv = parse_hh_mm_ss_ff(dtstr, tzinfo_pos, hour, minute, second, + microsecond); if (rv < 0) { return rv; - } else if (tzinfo_pos == p_end) { + } + else if (tzinfo_pos == p_end) { // We know that there's no time zone, so if there's stuff at the // end of the string it's an error. if (rv == 1) { return -5; - } else { + } + else { return 0; } } @@ -812,19 +820,18 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, return -5; } - int tzsign = (*tzinfo_pos == '-')?-1:1; + int tzsign = (*tzinfo_pos == '-') ? -1 : 1; tzinfo_pos++; int tzhour = 0, tzminute = 0, tzsecond = 0; - rv = parse_hh_mm_ss_ff(tzinfo_pos, p_end, - &tzhour, &tzminute, &tzsecond, tzmicrosecond); + rv = parse_hh_mm_ss_ff(tzinfo_pos, p_end, &tzhour, &tzminute, &tzsecond, + tzmicrosecond); *tzoffset = tzsign * ((tzhour * 3600) + (tzminute * 60) + tzsecond); *tzmicrosecond *= tzsign; - return rv?-5:1; + return rv ? -5 : 1; } - /* --------------------------------------------------------------------------- * Create various objects, mostly without range checking. 
*/ @@ -839,30 +846,33 @@ new_date_ex(int year, int month, int day, PyTypeObject *type) return NULL; } - self = (PyDateTime_Date *) (type->tp_alloc(type, 0)); + self = (PyDateTime_Date *)(type->tp_alloc(type, 0)); if (self != NULL) set_date_fields(self, year, month, day); - return (PyObject *) self; + return (PyObject *)self; } #define new_date(year, month, day) \ new_date_ex(year, month, day, &PyDateTime_DateType) // Forward declaration -static PyObject * new_datetime_ex(int, int, int, int, int, int, int, - PyObject*, PyTypeObject*); +static PyObject * +new_datetime_ex(int, int, int, int, int, int, int, PyObject *, PyTypeObject *); /* Create date instance with no range checking, or call subclass constructor */ static PyObject * -new_date_subclass_ex(int year, int month, int day, PyObject *cls) { +new_date_subclass_ex(int year, int month, int day, PyObject *cls) +{ PyObject *result; // We have "fast path" constructors for two subclasses: date and datetime if ((PyTypeObject *)cls == &PyDateTime_DateType) { result = new_date_ex(year, month, day, (PyTypeObject *)cls); - } else if ((PyTypeObject *)cls == &PyDateTime_DateTimeType) { + } + else if ((PyTypeObject *)cls == &PyDateTime_DateTimeType) { result = new_datetime_ex(year, month, day, 0, 0, 0, 0, Py_None, (PyTypeObject *)cls); - } else { + } + else { result = PyObject_CallFunction(cls, "iii", year, month, day); } @@ -1281,7 +1291,8 @@ append_keyword_fold(PyObject *repr, int fold) } static inline PyObject * -tzinfo_from_isoformat_results(int rv, int tzoffset, int tz_useconds) { +tzinfo_from_isoformat_results(int rv, int tzoffset, int tz_useconds) +{ PyObject *tzinfo; if (rv == 1) { // Create a timezone from offset in seconds (0 returns UTC) @@ -1296,7 +1307,8 @@ tzinfo_from_isoformat_results(int rv, int tzoffset, int tz_useconds) { } tzinfo = new_timezone(delta, NULL); Py_DECREF(delta); - } else { + } + else { tzinfo = Py_None; Py_INCREF(Py_None); } @@ -2886,17 +2898,19 @@ date_fromordinal(PyObject *cls, PyObject 
*args) /* Return the new date from a string as generated by date.isoformat() */ static PyObject * -date_fromisoformat(PyObject *cls, PyObject *dtstr) { +date_fromisoformat(PyObject *cls, PyObject *dtstr) +{ assert(dtstr != NULL); if (!PyUnicode_Check(dtstr)) { - PyErr_SetString(PyExc_TypeError, "fromisoformat: argument must be str"); + PyErr_SetString(PyExc_TypeError, + "fromisoformat: argument must be str"); return NULL; } Py_ssize_t len; - const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len); + const char *dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len); if (dt_ptr == NULL) { goto invalid_string_error; } @@ -2906,7 +2920,8 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) { int rv; if (len == 10) { rv = parse_isoformat_date(dt_ptr, &year, &month, &day); - } else { + } + else { rv = -1; } @@ -2917,12 +2932,10 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) { return new_date_subclass_ex(year, month, day, cls); invalid_string_error: - PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", - dtstr); + PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr); return NULL; } - /* * Date arithmetic. */ @@ -4863,52 +4876,65 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw) } static PyObject * -_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) { +_sanitize_isoformat_str(PyObject *dtstr) +{ // `fromisoformat` allows surrogate characters in exactly one position, // the separator; to allow datetime_fromisoformat to make the simplifying // assumption that all valid strings can be encoded in UTF-8, this function // replaces any surrogate character separators with `T`. 
+ // + // The result of this, if not NULL, returns a new reference Py_ssize_t len = PyUnicode_GetLength(dtstr); - *needs_decref = 0; - if (len <= 10 || !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) { + if (len < 0) { + return NULL; + } + + if (len <= 10 || + !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) { + Py_INCREF(dtstr); return dtstr; } - PyObject *str_out = PyUnicode_New(len, PyUnicode_MAX_CHAR_VALUE(dtstr)); + PyObject *str_out = _PyUnicode_Copy(dtstr); if (str_out == NULL) { return NULL; } - if (PyUnicode_CopyCharacters(str_out, 0, dtstr, 0, len) == -1 || - PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) { + if (PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) { Py_DECREF(str_out); return NULL; } - *needs_decref = 1; return str_out; } static PyObject * -datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { +datetime_fromisoformat(PyObject *cls, PyObject *dtstr) +{ assert(dtstr != NULL); if (!PyUnicode_Check(dtstr)) { - PyErr_SetString(PyExc_TypeError, "fromisoformat: argument must be str"); + PyErr_SetString(PyExc_TypeError, + "fromisoformat: argument must be str"); return NULL; } - int needs_decref = 0; - dtstr = _sanitize_isoformat_str(dtstr, &needs_decref); - if (dtstr == NULL) { + PyObject *dtstr_clean = _sanitize_isoformat_str(dtstr); + if (dtstr_clean == NULL) { goto error; } Py_ssize_t len; - const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len); + const char *dt_ptr = PyUnicode_AsUTF8AndSize(dtstr_clean, &len); if (dt_ptr == NULL) { - goto invalid_string_error; + if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) { + // Encoding errors are invalid string errors at this point + goto invalid_string_error; + } + else { + goto error; + } } const char *p = dt_ptr; @@ -4924,8 +4950,9 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { // In UTF-8, the length of multi-byte characters is encoded in the MSB if ((p[10] & 0x80) == 0) { p += 11; - } else { - switch(p[10] & 0xf0) { + } + else { + switch (p[10] & 0xf0) { 
case 0xe0: p += 13; break; @@ -4939,15 +4966,14 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { } len -= (p - dt_ptr); - rv = parse_isoformat_time(p, len, - &hour, &minute, &second, &microsecond, - &tzoffset, &tzusec); + rv = parse_isoformat_time(p, len, &hour, &minute, &second, + &microsecond, &tzoffset, &tzusec); } if (rv < 0) { goto invalid_string_error; } - PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec); + PyObject *tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec); if (tzinfo == NULL) { goto error; } @@ -4956,23 +4982,18 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { second, microsecond, tzinfo, cls); Py_DECREF(tzinfo); - if (needs_decref) { - Py_DECREF(dtstr); - } + Py_DECREF(dtstr_clean); return dt; invalid_string_error: PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr); error: - if (needs_decref) { - Py_DECREF(dtstr); - } + Py_XDECREF(dtstr_clean); return NULL; } - /* * Destructor. */ -- cgit v0.12