diff options
author | Paul Ganssle <1377457+pganssle@users.noreply.github.com> | 2022-05-06 00:31:24 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-05-06 00:31:24 (GMT) |
commit | 1303f8c927227b72d9ee9eae890be4692b4d4592 (patch) | |
tree | 32b20995417aa934e3eabbbce12c35f45f004b50 /Modules | |
parent | ada8b6d1b1b02ae7c38f161c2a0ad866559fe18b (diff) | |
download | cpython-1303f8c927227b72d9ee9eae890be4692b4d4592.zip cpython-1303f8c927227b72d9ee9eae890be4692b4d4592.tar.gz cpython-1303f8c927227b72d9ee9eae890be4692b4d4592.tar.bz2 |
gh-80010: Expand fromisoformat to include most of ISO-8601 (#92177)
This expands `fromisoformat` to cover most of the common uses of ISO 8601. We may expand the scope more in the future.
Diffstat (limited to 'Modules')
-rw-r--r-- | Modules/_datetimemodule.c | 359 |
1 files changed, 288 insertions, 71 deletions
diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index 20cdb18..efb5278 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -395,6 +395,39 @@ iso_week1_monday(int year) return week1_monday; } +static int +iso_to_ymd(const int iso_year, const int iso_week, const int iso_day, + int *year, int *month, int *day) { + if (iso_week <= 0 || iso_week >= 53) { + int out_of_range = 1; + if (iso_week == 53) { + // ISO years have 53 weeks in it on years starting with a Thursday + // and on leap years starting on Wednesday + int first_weekday = weekday(iso_year, 1, 1); + if (first_weekday == 3 || (first_weekday == 2 && is_leap(iso_year))) { + out_of_range = 0; + } + } + + if (out_of_range) { + return -2; + } + } + + if (iso_day <= 0 || iso_day >= 8) { + return -3; + } + + // Convert (Y, W, D) to (Y, M, D) in-place + int day_1 = iso_week1_monday(iso_year); + + int day_offset = (iso_week - 1)*7 + iso_day - 1; + + ord_to_ymd(day_1 + day_offset, year, month, day); + return 0; +} + + /* --------------------------------------------------------------------------- * Range checkers. */ @@ -680,6 +713,11 @@ set_date_fields(PyDateTime_Date *self, int y, int m, int d) * String parsing utilities and helper functions */ +static unsigned char +is_digit(const char c) { + return ((unsigned int)(c - '0')) < 10; +} + static const char * parse_digits(const char *ptr, int *var, size_t num_digits) { @@ -696,14 +734,17 @@ parse_digits(const char *ptr, int *var, size_t num_digits) } static int -parse_isoformat_date(const char *dtstr, int *year, int *month, int *day) +parse_isoformat_date(const char *dtstr, const size_t len, int *year, int *month, int *day) { /* Parse the date components of the result of date.isoformat() * * Return codes: * 0: Success * -1: Failed to parse date component - * -2: Failed to parse dateseparator + * -2: Inconsistent date separator usage + * -3: Failed to parse ISO week. + * -4: Failed to parse ISO day. + * -5, -6: Failure in iso_to_ymd */ const char *p = dtstr; p = parse_digits(p, year, 4); @@ -711,8 +752,42 @@ parse_isoformat_date(const char *dtstr, int *year, int *month, int *day) return -1; } - if (*(p++) != '-') { - return -2; + const unsigned char uses_separator = (*p == '-'); + if (uses_separator) { + ++p; + } + + if(*p == 'W') { + // This is an isocalendar-style date string + p++; + int iso_week = 0; + int iso_day = 0; + + p = parse_digits(p, &iso_week, 2); + if (NULL == p) { + return -3; + } + + assert(p > dtstr); + if ((size_t)(p - dtstr) < len) { + if (uses_separator && *(p++) != '-') { + return -2; + } + + p = parse_digits(p, &iso_day, 1); + if (NULL == p) { + return -4; + } + } else { + iso_day = 1; + } + + int rv = iso_to_ymd(*year, iso_week, iso_day, year, month, day); + if (rv) { + return -3 + rv; + } else { + return 0; + } } p = parse_digits(p, month, 2); @@ -720,15 +795,13 @@ parse_isoformat_date(const char *dtstr, int *year, int *month, int *day) return -1; } - if (*(p++) != '-') { + if (uses_separator && *(p++) != '-') { return -2; } - p = parse_digits(p, day, 2); if (p == NULL) { return -1; } - return 0; } @@ -736,11 +809,14 @@ static int parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour, int *minute, int *second, int *microsecond) { + *hour = *minute = *second = *microsecond = 0; const char *p = tstr; const char *p_end = tstr_end; int *vals[3] = {hour, minute, second}; + // This is initialized to satisfy an erroneous compiler warning. + unsigned char has_separator = 1; - // Parse [HH[:MM[:SS]]] + // Parse [HH[:?MM[:?SS]]] for (size_t i = 0; i < 3; ++i) { p = parse_digits(p, vals[i], 2); if (NULL == p) { @@ -748,33 +824,47 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour, } char c = *(p++); + if (i == 0) { + has_separator = (c == ':'); + } + if (p >= p_end) { return c != '\0'; } - else if (c == ':') { + else if (has_separator && (c == ':')) { continue; } - else if (c == '.') { + else if (c == '.' || c == ',') { break; - } - else { + } else if (!has_separator) { + --p; + } else { return -4; // Malformed time separator } } - // Parse .fff[fff] + // Parse fractional components size_t len_remains = p_end - p; - if (!(len_remains == 6 || len_remains == 3)) { - return -3; + size_t to_parse = len_remains; + if (len_remains >= 6) { + to_parse = 6; } - p = parse_digits(p, microsecond, len_remains); + p = parse_digits(p, microsecond, to_parse); if (NULL == p) { return -3; } - if (len_remains == 3) { - *microsecond *= 1000; + static int correction[] = { + 100000, 10000, 1000, 100, 10 + }; + + if (to_parse < 6) { + *microsecond *= correction[to_parse-1]; + } + + while (is_digit(*p)){ + ++p; // skip truncated digits } // Return 1 if it's not the end of the string @@ -800,7 +890,7 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute, const char *tzinfo_pos = p; do { - if (*tzinfo_pos == '+' || *tzinfo_pos == '-') { + if (*tzinfo_pos == 'Z' || *tzinfo_pos == '+' || *tzinfo_pos == '-') { break; } } while (++tzinfo_pos < p_end); @@ -822,14 +912,16 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute, } } - // Parse time zone component - // Valid formats are: - // - +HH:MM (len 6) - // - +HH:MM:SS (len 9) - // - +HH:MM:SS.ffffff (len 16) - size_t tzlen = p_end - tzinfo_pos; - if (!(tzlen == 6 || tzlen == 9 || tzlen == 16)) { - return -5; + // Special case UTC / Zulu time. + if (*tzinfo_pos == 'Z') { + *tzoffset = 0; + *tzmicrosecond = 0; + + if (*(tzinfo_pos + 1) != '\0') { + return -5; + } else { + return 1; + } } int tzsign = (*tzinfo_pos == '-') ? -1 : 1; @@ -2983,8 +3075,8 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) int year = 0, month = 0, day = 0; int rv; - if (len == 10) { - rv = parse_isoformat_date(dt_ptr, &year, &month, &day); + if (len == 7 || len == 8 || len == 10) { + rv = parse_isoformat_date(dt_ptr, len, &year, &month, &day); } else { rv = -1; @@ -3027,37 +3119,21 @@ date_fromisocalendar(PyObject *cls, PyObject *args, PyObject *kw) return NULL; } - if (week <= 0 || week >= 53) { - int out_of_range = 1; - if (week == 53) { - // ISO years have 53 weeks in it on years starting with a Thursday - // and on leap years starting on Wednesday - int first_weekday = weekday(year, 1, 1); - if (first_weekday == 3 || (first_weekday == 2 && is_leap(year))) { - out_of_range = 0; - } - } + int month; + int rv = iso_to_ymd(year, week, day, &year, &month, &day); - if (out_of_range) { - PyErr_Format(PyExc_ValueError, "Invalid week: %d", week); - return NULL; - } + + if (rv == -2) { + PyErr_Format(PyExc_ValueError, "Invalid week: %d", week); + return NULL; } - if (day <= 0 || day >= 8) { + if (rv == -3) { PyErr_Format(PyExc_ValueError, "Invalid day: %d (range is [1, 7])", day); return NULL; } - // Convert (Y, W, D) to (Y, M, D) in-place - int day_1 = iso_week1_monday(year); - - int month = week; - int day_offset = (month - 1)*7 + day - 1; - - ord_to_ymd(day_1 + day_offset, &year, &month, &day); - return new_date_subclass_ex(year, month, day, cls); } @@ -3489,7 +3565,7 @@ static PyMethodDef date_methods[] = { {"fromisoformat", (PyCFunction)date_fromisoformat, METH_O | METH_CLASS, - PyDoc_STR("str -> Construct a date from the output of date.isoformat()")}, + PyDoc_STR("str -> Construct a date from a string in ISO 8601 format.")}, {"fromisocalendar", _PyCFunction_CAST(date_fromisocalendar), METH_VARARGS | METH_KEYWORDS | METH_CLASS, @@ -4564,6 +4640,14 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) { goto invalid_string_error; } + // The spec actually requires that time-only ISO 8601 strings start with + // T, but the extended format allows this to be omitted as long as there + // is no ambiguity with date strings. + if (*p == 'T') { + ++p; + len -= 1; + } + int hour = 0, minute = 0, second = 0, microsecond = 0; int tzoffset, tzimicrosecond = 0; int rv = parse_isoformat_time(p, len, @@ -4671,7 +4755,7 @@ static PyMethodDef time_methods[] = { PyDoc_STR("Return time with new specified fields.")}, {"fromisoformat", (PyCFunction)time_fromisoformat, METH_O | METH_CLASS, - PyDoc_STR("string -> time from time.isoformat() output")}, + PyDoc_STR("string -> time from a string in ISO 8601 format")}, {"__reduce_ex__", (PyCFunction)time_reduce_ex, METH_VARARGS, PyDoc_STR("__reduce_ex__(proto) -> (cls, state)")}, @@ -5184,19 +5268,42 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw) static PyObject * _sanitize_isoformat_str(PyObject *dtstr) { + Py_ssize_t len = PyUnicode_GetLength(dtstr); + if (len < 7) { // All valid ISO 8601 strings are at least 7 characters long + return NULL; + } + // `fromisoformat` allows surrogate characters in exactly one position, // the separator; to allow datetime_fromisoformat to make the simplifying // assumption that all valid strings can be encoded in UTF-8, this function // replaces any surrogate character separators with `T`. // // The result of this, if not NULL, returns a new reference - Py_ssize_t len = PyUnicode_GetLength(dtstr); - if (len < 0) { - return NULL; + const void* const unicode_data = PyUnicode_DATA(dtstr); + const unsigned int kind = PyUnicode_KIND(dtstr); + + // Depending on the format of the string, the separator can only ever be + // in positions 7, 8 or 10. We'll check each of these for a surrogate and + // if we find one, replace it with `T`. If there is more than one surrogate, + // we don't have to bother sanitizing it, because the function will later + // fail when we try to encode the string as ASCII. + static const size_t potential_separators[3] = {7, 8, 10}; + size_t surrogate_separator = 0; + for(size_t idx = 0; + idx < sizeof(potential_separators) / sizeof(*potential_separators); + ++idx) { + size_t pos = potential_separators[idx]; + if (pos > (size_t)len) { + break; + } + + if(Py_UNICODE_IS_SURROGATE(PyUnicode_READ(kind, unicode_data, pos))) { + surrogate_separator = pos; + break; + } } - if (len <= 10 || - !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) { + if (surrogate_separator == 0) { Py_INCREF(dtstr); return dtstr; } @@ -5206,7 +5313,7 @@ _sanitize_isoformat_str(PyObject *dtstr) return NULL; } - if (PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) { + if (PyUnicode_WriteChar(str_out, surrogate_separator, (Py_UCS4)'T')) { Py_DECREF(str_out); return NULL; } @@ -5214,6 +5321,106 @@ _sanitize_isoformat_str(PyObject *dtstr) return str_out; } + +static Py_ssize_t +_find_isoformat_datetime_separator(const char *dtstr, Py_ssize_t len) { + // The valid date formats can all be distinguished by characters 4 and 5 + // and further narrowed down by character + // which tells us where to look for the separator character. + // Format | As-rendered | Position + // --------------------------------------- + // %Y-%m-%d | YYYY-MM-DD | 10 + // %Y%m%d | YYYYMMDD | 8 + // %Y-W%V | YYYY-Www | 8 + // %YW%V | YYYYWww | 7 + // %Y-W%V-%u | YYYY-Www-d | 10 + // %YW%V%u | YYYYWwwd | 8 + // %Y-%j | YYYY-DDD | 8 + // %Y%j | YYYYDDD | 7 + // + // Note that because we allow *any* character for the separator, in the + // case where character 4 is W, it's not straightforward to determine where + // the separator is — in the case of YYYY-Www-d, you have actual ambiguity, + // e.g. 2020-W01-0000 could be YYYY-Www-D0HH or YYYY-Www-HHMM, when the + // separator character is a number in the former case or a hyphen in the + // latter case. + // + // The case of YYYYWww can be distinguished from YYYYWwwd by tracking ahead + // to either the end of the string or the first non-numeric character — + // since the time components all come in pairs YYYYWww#HH can be + // distinguished from YYYYWwwd#HH by the fact that there will always be an + // odd number of digits before the first non-digit character in the former + // case. + static const char date_separator = '-'; + static const char week_indicator = 'W'; + + if (len == 7) { + return 7; + } + + if (dtstr[4] == date_separator) { + // YYYY-??? + + if (dtstr[5] == week_indicator) { + // YYYY-W?? + + if (len < 8) { + return -1; + } + + if (len > 8 && dtstr[8] == date_separator) { + // YYYY-Www-D (10) or YYYY-Www-HH (8) + if (len == 9) { return -1; } + if (len > 10 && is_digit(dtstr[10])) { + // This is as far as we'll try to go to resolve the + // ambiguity for the moment — if we have YYYY-Www-##, the + // separator is either a hyphen at 8 or a number at 10. + // + // We'll assume it's a hyphen at 8 because it's way more + // likely that someone will use a hyphen as a separator + // than a number, but at this point it's really best effort + // because this is an extension of the spec anyway. + return 8; + } + + return 10; + } else { + // YYYY-Www (8) + return 8; + } + } else { + // YYYY-MM-DD (10) + return 10; + } + } else { + // YYYY??? + if (dtstr[4] == week_indicator) { + // YYYYWww (7) or YYYYWwwd (8) + size_t idx = 7; + for (; idx < (size_t)len; ++idx) { + // Keep going until we run out of digits. + if (!is_digit(dtstr[idx])) { + break; + } + } + + if (idx < 9) { + return idx; + } + + if (idx % 2 == 0) { + // If the index of the last number is even, it's YYYYWww + return 7; + } else { + return 8; + } + } else { + // YYYYMMDD (8) + return 8; + } + } +} + static PyObject * datetime_fromisoformat(PyObject *cls, PyObject *dtstr) { @@ -5225,9 +5432,14 @@ datetime_fromisoformat(PyObject *cls, PyObject *dtstr) return NULL; } + // We only need to sanitize this string if the separator is a surrogate + // character. In the situation where the separator location is ambiguous, + // we don't have to sanitize it anything because that can only happen when + // the separator is either '-' or a number. This should mostly be a noop + // but it makes the reference counting easier if we still sanitize. PyObject *dtstr_clean = _sanitize_isoformat_str(dtstr); if (dtstr_clean == NULL) { - goto error; + goto invalid_string_error; } Py_ssize_t len; @@ -5243,30 +5455,35 @@ datetime_fromisoformat(PyObject *cls, PyObject *dtstr) } } + const Py_ssize_t separator_location = _find_isoformat_datetime_separator( + dt_ptr, len); + + const char *p = dt_ptr; int year = 0, month = 0, day = 0; int hour = 0, minute = 0, second = 0, microsecond = 0; int tzoffset = 0, tzusec = 0; - // date has a fixed length of 10 - int rv = parse_isoformat_date(p, &year, &month, &day); + // date runs up to separator_location + int rv = parse_isoformat_date(p, separator_location, &year, &month, &day); - if (!rv && len > 10) { + if (!rv && len > separator_location) { // In UTF-8, the length of multi-byte characters is encoded in the MSB - if ((p[10] & 0x80) == 0) { - p += 11; + p += separator_location; + if ((p[0] & 0x80) == 0) { + p += 1; } else { - switch (p[10] & 0xf0) { + switch (p[0] & 0xf0) { case 0xe0: - p += 13; + p += 3; break; case 0xf0: - p += 14; + p += 4; break; default: - p += 12; + p += 2; break; } } @@ -6327,7 +6544,7 @@ static PyMethodDef datetime_methods[] = { {"fromisoformat", (PyCFunction)datetime_fromisoformat, METH_O | METH_CLASS, - PyDoc_STR("string -> datetime from datetime.isoformat() output")}, + PyDoc_STR("string -> datetime from a string in most ISO 8601 formats")}, /* Instance methods: */ |