gh-80010: Expand fromisoformat to include most of ISO-8601 (#92177)

This expands `fromisoformat` to cover most of the common uses of ISO 8601. We may expand the scope more in the future.
author: Paul Ganssle <1377457+pganssle@users.noreply.github.com> 2022-05-06 00:31:24 (GMT)
committer: GitHub <noreply@github.com> 2022-05-06 00:31:24 (GMT)
commit: 1303f8c927227b72d9ee9eae890be4692b4d4592 (patch)
tree: 32b20995417aa934e3eabbbce12c35f45f004b50 /Modules
parent: ada8b6d1b1b02ae7c38f161c2a0ad866559fe18b (diff)
download: cpython-1303f8c927227b72d9ee9eae890be4692b4d4592.zip
cpython-1303f8c927227b72d9ee9eae890be4692b4d4592.tar.gz
cpython-1303f8c927227b72d9ee9eae890be4692b4d4592.tar.bz2
1 files changed, 288 insertions, 71 deletions
diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c
index 20cdb18..efb5278 100644
--- a/Modules/_datetimemodule.c
+++ b/Modules/_datetimemodule.c
@@ -395,6 +395,39 @@ iso_week1_monday(int year)
     return week1_monday;
 }
 
+static int
+iso_to_ymd(const int iso_year, const int iso_week, const int iso_day,
+           int *year, int *month, int *day) {
+    if (iso_week <= 0 || iso_week >= 53) {
+        int out_of_range = 1;
+        if (iso_week == 53) {
+            // ISO years have 53 weeks in it on years starting with a Thursday
+            // and on leap years starting on Wednesday
+            int first_weekday = weekday(iso_year, 1, 1);
+            if (first_weekday == 3 || (first_weekday == 2 && is_leap(iso_year))) {
+                out_of_range = 0;
+            }
+        }
+
+        if (out_of_range) {
+            return -2;
+        }
+    }
+
+    if (iso_day <= 0 || iso_day >= 8) {
+        return -3;
+    }
+
+    // Convert (Y, W, D) to (Y, M, D) in-place
+    int day_1 = iso_week1_monday(iso_year);
+
+    int day_offset = (iso_week - 1)*7 + iso_day - 1;
+
+    ord_to_ymd(day_1 + day_offset, year, month, day);
+    return 0;
+}
+
+
 /* ---------------------------------------------------------------------------
  * Range checkers.
  */
@@ -680,6 +713,11 @@ set_date_fields(PyDateTime_Date *self, int y, int m, int d)
  * String parsing utilities and helper functions
  */
 
+static unsigned char
+is_digit(const char c) {
+    return ((unsigned int)(c - '0')) < 10;
+}
+
 static const char *
 parse_digits(const char *ptr, int *var, size_t num_digits)
 {
@@ -696,14 +734,17 @@ parse_digits(const char *ptr, int *var, size_t num_digits)
 }
 
 static int
-parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
+parse_isoformat_date(const char *dtstr, const size_t len, int *year, int *month, int *day)
 {
     /* Parse the date components of the result of date.isoformat()
      *
      *  Return codes:
      *       0:  Success
      *      -1:  Failed to parse date component
-     *      -2:  Failed to parse dateseparator
+     *      -2:  Inconsistent date separator usage
+     *      -3:  Failed to parse ISO week.
+     *      -4:  Failed to parse ISO day.
+     *      -5, -6: Failure in iso_to_ymd
      */
     const char *p = dtstr;
     p = parse_digits(p, year, 4);
@@ -711,8 +752,42 @@ parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
         return -1;
     }
 
-    if (*(p++) != '-') {
-        return -2;
+    const unsigned char uses_separator = (*p == '-');
+    if (uses_separator) {
+        ++p;
+    }
+
+    if(*p == 'W') {
+        // This is an isocalendar-style date string
+        p++;
+        int iso_week = 0;
+        int iso_day = 0;
+
+        p = parse_digits(p, &iso_week, 2);
+        if (NULL == p) {
+            return -3;
+        }
+
+        assert(p > dtstr);
+        if ((size_t)(p - dtstr) < len) {
+            if (uses_separator && *(p++) != '-') {
+                return -2;
+            }
+
+            p = parse_digits(p, &iso_day, 1);
+            if (NULL == p) {
+                return -4;
+            }
+        } else {
+            iso_day = 1;
+        }
+
+        int rv = iso_to_ymd(*year, iso_week, iso_day, year, month, day);
+        if (rv) {
+            return -3 + rv;
+        } else {
+            return 0;
+        }
     }
 
     p = parse_digits(p, month, 2);
@@ -720,15 +795,13 @@ parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
         return -1;
     }
 
-    if (*(p++) != '-') {
+    if (uses_separator && *(p++) != '-') {
         return -2;
     }
-
     p = parse_digits(p, day, 2);
     if (p == NULL) {
         return -1;
     }
-
     return 0;
 }
 
@@ -736,11 +809,14 @@ static int
 parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour,
                   int *minute, int *second, int *microsecond)
 {
+    *hour = *minute = *second = *microsecond = 0;
     const char *p = tstr;
     const char *p_end = tstr_end;
     int *vals[3] = {hour, minute, second};
+    // This is initialized to satisfy an erroneous compiler warning.
+    unsigned char has_separator = 1;
 
-    // Parse [HH[:MM[:SS]]]
+    // Parse [HH[:?MM[:?SS]]]
     for (size_t i = 0; i < 3; ++i) {
         p = parse_digits(p, vals[i], 2);
         if (NULL == p) {
@@ -748,33 +824,47 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour,
         }
 
         char c = *(p++);
+        if (i == 0) {
+            has_separator = (c == ':');
+        }
+
         if (p >= p_end) {
             return c != '\0';
         }
-        else if (c == ':') {
+        else if (has_separator && (c == ':')) {
             continue;
         }
-        else if (c == '.') {
+        else if (c == '.' || c == ',') {
             break;
-        }
-        else {
+        } else if (!has_separator) {
+            --p;
+        } else {
             return -4;  // Malformed time separator
         }
     }
 
-    // Parse .fff[fff]
+    // Parse fractional components
     size_t len_remains = p_end - p;
-    if (!(len_remains == 6 || len_remains == 3)) {
-        return -3;
+    size_t to_parse = len_remains;
+    if (len_remains >= 6) {
+        to_parse = 6;
     }
 
-    p = parse_digits(p, microsecond, len_remains);
+    p = parse_digits(p, microsecond, to_parse);
     if (NULL == p) {
         return -3;
     }
 
-    if (len_remains == 3) {
-        *microsecond *= 1000;
+    static int correction[] = {
+        100000, 10000, 1000, 100, 10
+    };
+
+    if (to_parse < 6) {
+        *microsecond *= correction[to_parse-1];
+    }
+
+    while (is_digit(*p)){
+        ++p; // skip truncated digits
     }
 
     // Return 1 if it's not the end of the string
@@ -800,7 +890,7 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute,
 
     const char *tzinfo_pos = p;
     do {
-        if (*tzinfo_pos == '+' || *tzinfo_pos == '-') {
+        if (*tzinfo_pos == 'Z' || *tzinfo_pos == '+' || *tzinfo_pos == '-') {
             break;
         }
     } while (++tzinfo_pos < p_end);
@@ -822,14 +912,16 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute,
         }
     }
 
-    // Parse time zone component
-    // Valid formats are:
-    //    - +HH:MM           (len  6)
-    //    - +HH:MM:SS        (len  9)
-    //    - +HH:MM:SS.ffffff (len 16)
-    size_t tzlen = p_end - tzinfo_pos;
-    if (!(tzlen == 6 || tzlen == 9 || tzlen == 16)) {
-        return -5;
+    // Special case UTC / Zulu time.
+    if (*tzinfo_pos == 'Z') {
+        *tzoffset = 0;
+        *tzmicrosecond = 0;
+
+        if (*(tzinfo_pos + 1) != '\0') {
+            return -5;
+        } else {
+            return 1;
+        }
     }
 
     int tzsign = (*tzinfo_pos == '-') ? -1 : 1;
@@ -2983,8 +3075,8 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr)
     int year = 0, month = 0, day = 0;
 
     int rv;
-    if (len == 10) {
-        rv = parse_isoformat_date(dt_ptr, &year, &month, &day);
+    if (len == 7 || len == 8 || len == 10) {
+        rv = parse_isoformat_date(dt_ptr, len, &year, &month, &day);
     }
     else {
         rv = -1;
@@ -3027,37 +3119,21 @@ date_fromisocalendar(PyObject *cls, PyObject *args, PyObject *kw)
         return NULL;
     }
 
-    if (week <= 0 || week >= 53) {
-        int out_of_range = 1;
-        if (week == 53) {
-            // ISO years have 53 weeks in it on years starting with a Thursday
-            // and on leap years starting on Wednesday
-            int first_weekday = weekday(year, 1, 1);
-            if (first_weekday == 3 || (first_weekday == 2 && is_leap(year))) {
-                out_of_range = 0;
-            }
-        }
+    int month;
+    int rv = iso_to_ymd(year, week, day, &year, &month, &day);
 
-        if (out_of_range) {
-            PyErr_Format(PyExc_ValueError, "Invalid week: %d", week);
-            return NULL;
-        }
+
+    if (rv == -2) {
+        PyErr_Format(PyExc_ValueError, "Invalid week: %d", week);
+        return NULL;
     }
 
-    if (day <= 0 || day >= 8) {
+    if (rv == -3) {
         PyErr_Format(PyExc_ValueError, "Invalid day: %d (range is [1, 7])",
                      day);
         return NULL;
     }
 
-    // Convert (Y, W, D) to (Y, M, D) in-place
-    int day_1 = iso_week1_monday(year);
-
-    int month = week;
-    int day_offset = (month - 1)*7 + day - 1;
-
-    ord_to_ymd(day_1 + day_offset, &year, &month, &day);
-
     return new_date_subclass_ex(year, month, day, cls);
 }
 
@@ -3489,7 +3565,7 @@ static PyMethodDef date_methods[] = {
 
      {"fromisoformat", (PyCFunction)date_fromisoformat,  METH_O |
                                                          METH_CLASS,
-      PyDoc_STR("str -> Construct a date from the output of date.isoformat()")},
+      PyDoc_STR("str -> Construct a date from a string in ISO 8601 format.")},
 
      {"fromisocalendar", _PyCFunction_CAST(date_fromisocalendar),
       METH_VARARGS | METH_KEYWORDS | METH_CLASS,
@@ -4564,6 +4640,14 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
         goto invalid_string_error;
     }
 
+    // The spec actually requires that time-only ISO 8601 strings start with
+    // T, but the extended format allows this to be omitted as long as there
+    // is no ambiguity with date strings.
+    if (*p == 'T') {
+        ++p;
+        len -= 1;
+    }
+
     int hour = 0, minute = 0, second = 0, microsecond = 0;
     int tzoffset, tzimicrosecond = 0;
     int rv = parse_isoformat_time(p, len,
@@ -4671,7 +4755,7 @@ static PyMethodDef time_methods[] = {
      PyDoc_STR("Return time with new specified fields.")},
 
      {"fromisoformat", (PyCFunction)time_fromisoformat, METH_O | METH_CLASS,
-     PyDoc_STR("string -> time from time.isoformat() output")},
+     PyDoc_STR("string -> time from a string in ISO 8601 format")},
 
     {"__reduce_ex__", (PyCFunction)time_reduce_ex,        METH_VARARGS,
      PyDoc_STR("__reduce_ex__(proto) -> (cls, state)")},
@@ -5184,19 +5268,42 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw)
 static PyObject *
 _sanitize_isoformat_str(PyObject *dtstr)
 {
+    Py_ssize_t len = PyUnicode_GetLength(dtstr);
+    if (len < 7) {  // All valid ISO 8601 strings are at least 7 characters long
+        return NULL;
+    }
+
     // `fromisoformat` allows surrogate characters in exactly one position,
     // the separator; to allow datetime_fromisoformat to make the simplifying
     // assumption that all valid strings can be encoded in UTF-8, this function
     // replaces any surrogate character separators with `T`.
     //
     // The result of this, if not NULL, returns a new reference
-    Py_ssize_t len = PyUnicode_GetLength(dtstr);
-    if (len < 0) {
-        return NULL;
+    const void* const unicode_data = PyUnicode_DATA(dtstr);
+    const unsigned int kind = PyUnicode_KIND(dtstr);
+
+    // Depending on the format of the string, the separator can only ever be
+    // in positions 7, 8 or 10. We'll check each of these for a surrogate and
+    // if we find one, replace it with `T`. If there is more than one surrogate,
+    // we don't have to bother sanitizing it, because the function will later
+    // fail when we try to encode the string as ASCII.
+    static const size_t potential_separators[3] = {7, 8, 10};
+    size_t surrogate_separator = 0;
+    for(size_t idx = 0;
+         idx < sizeof(potential_separators) / sizeof(*potential_separators);
+         ++idx) {
+        size_t pos = potential_separators[idx];
+        if (pos > (size_t)len) {
+            break;
+        }
+
+        if(Py_UNICODE_IS_SURROGATE(PyUnicode_READ(kind, unicode_data, pos))) {
+            surrogate_separator = pos;
+            break;
+        }
     }
 
-    if (len <= 10 ||
-        !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
+    if (surrogate_separator == 0) {
         Py_INCREF(dtstr);
         return dtstr;
     }
@@ -5206,7 +5313,7 @@ _sanitize_isoformat_str(PyObject *dtstr)
         return NULL;
     }
 
-    if (PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
+    if (PyUnicode_WriteChar(str_out, surrogate_separator, (Py_UCS4)'T')) {
         Py_DECREF(str_out);
         return NULL;
     }
@@ -5214,6 +5321,106 @@ _sanitize_isoformat_str(PyObject *dtstr)
     return str_out;
 }
 
+
+static Py_ssize_t
+_find_isoformat_datetime_separator(const char *dtstr, Py_ssize_t len) {
+    // The valid date formats can all be distinguished by characters 4 and 5
+    // and further narrowed down by character
+    // which tells us where to look for the separator character.
+    // Format    |  As-rendered |   Position
+    // ---------------------------------------
+    // %Y-%m-%d  |  YYYY-MM-DD  |    10
+    // %Y%m%d    |  YYYYMMDD    |     8
+    // %Y-W%V    |  YYYY-Www    |     8
+    // %YW%V     |  YYYYWww     |     7
+    // %Y-W%V-%u |  YYYY-Www-d  |    10
+    // %YW%V%u   |  YYYYWwwd    |     8
+    // %Y-%j     |  YYYY-DDD    |     8
+    // %Y%j      |  YYYYDDD     |     7
+    //
+    // Note that because we allow *any* character for the separator, in the
+    // case where character 4 is W, it's not straightforward to determine where
+    // the separator is — in the case of YYYY-Www-d, you have actual ambiguity,
+    // e.g. 2020-W01-0000 could be YYYY-Www-D0HH or YYYY-Www-HHMM, when the
+    // separator character is a number in the former case or a hyphen in the
+    // latter case.
+    //
+    // The case of YYYYWww can be distinguished from YYYYWwwd by tracking ahead
+    // to either the end of the string or the first non-numeric character —
+    // since the time components all come in pairs YYYYWww#HH can be
+    // distinguished from YYYYWwwd#HH by the fact that there will always be an
+    // odd number of digits before the first non-digit character in the former
+    // case.
+    static const char date_separator = '-';
+    static const char week_indicator = 'W';
+
+    if (len == 7) {
+        return 7;
+    }
+
+    if (dtstr[4] == date_separator) {
+        // YYYY-???
+
+        if (dtstr[5] == week_indicator) {
+            // YYYY-W??
+
+            if (len < 8) {
+                return -1;
+            }
+
+            if (len > 8 && dtstr[8] == date_separator) {
+                // YYYY-Www-D (10) or YYYY-Www-HH (8)
+                if (len == 9) { return -1; }
+                if (len > 10 && is_digit(dtstr[10])) {
+                    // This is as far as we'll try to go to resolve the
+                    // ambiguity for the moment — if we have YYYY-Www-##, the
+                    // separator is either a hyphen at 8 or a number at 10.
+                    //
+                    // We'll assume it's a hyphen at 8 because it's way more
+                    // likely that someone will use a hyphen as a separator
+                    // than a number, but at this point it's really best effort
+                    // because this is an extension of the spec anyway.
+                    return 8;
+                }
+
+                return 10;
+            } else {
+                // YYYY-Www (8)
+                return 8;
+            }
+        } else {
+            // YYYY-MM-DD (10)
+            return 10;
+        }
+    } else {
+        // YYYY???
+        if (dtstr[4] == week_indicator) {
+            // YYYYWww (7) or YYYYWwwd (8)
+            size_t idx = 7;
+            for (; idx < (size_t)len; ++idx) {
+                // Keep going until we run out of digits.
+                if (!is_digit(dtstr[idx])) {
+                    break;
+                }
+            }
+
+            if (idx < 9) {
+                return idx;
+            }
+
+            if (idx % 2 == 0) {
+                // If the index of the last number is even, it's YYYYWww
+                return 7;
+            } else {
+                return 8;
+            }
+        } else {
+            // YYYYMMDD (8)
+            return 8;
+        }
+    }
+}
+
 static PyObject *
 datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
 {
@@ -5225,9 +5432,14 @@ datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
         return NULL;
     }
 
+    // We only need to sanitize this string if the separator is a surrogate
+    // character. In the situation where the separator location is ambiguous,
+    // we don't have to sanitize it anything because that can only happen when
+    // the separator is either '-' or a number. This should mostly be a noop
+    // but it makes the reference counting easier if we still sanitize.
     PyObject *dtstr_clean = _sanitize_isoformat_str(dtstr);
     if (dtstr_clean == NULL) {
-        goto error;
+        goto invalid_string_error;
     }
 
     Py_ssize_t len;
@@ -5243,30 +5455,35 @@ datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
         }
     }
 
+    const Py_ssize_t separator_location = _find_isoformat_datetime_separator(
+            dt_ptr, len);
+
+
     const char *p = dt_ptr;
 
     int year = 0, month = 0, day = 0;
     int hour = 0, minute = 0, second = 0, microsecond = 0;
     int tzoffset = 0, tzusec = 0;
 
-    // date has a fixed length of 10
-    int rv = parse_isoformat_date(p, &year, &month, &day);
+    // date runs up to separator_location
+    int rv = parse_isoformat_date(p, separator_location, &year, &month, &day);
 
-    if (!rv && len > 10) {
+    if (!rv && len > separator_location) {
         // In UTF-8, the length of multi-byte characters is encoded in the MSB
-        if ((p[10] & 0x80) == 0) {
-            p += 11;
+        p += separator_location;
+        if ((p[0] & 0x80) == 0) {
+            p += 1;
         }
         else {
-            switch (p[10] & 0xf0) {
+            switch (p[0] & 0xf0) {
                 case 0xe0:
-                    p += 13;
+                    p += 3;
                     break;
                 case 0xf0:
-                    p += 14;
+                    p += 4;
                     break;
                 default:
-                    p += 12;
+                    p += 2;
                     break;
             }
         }
@@ -6327,7 +6544,7 @@ static PyMethodDef datetime_methods[] = {
 
     {"fromisoformat", (PyCFunction)datetime_fromisoformat,
      METH_O | METH_CLASS,
-     PyDoc_STR("string -> datetime from datetime.isoformat() output")},
+     PyDoc_STR("string -> datetime from a string in most ISO 8601 formats")},
 
     /* Instance methods: */
author	Paul Ganssle <1377457+pganssle@users.noreply.github.com>	2022-05-06 00:31:24 (GMT)
committer	GitHub <noreply@github.com>	2022-05-06 00:31:24 (GMT)
commit	1303f8c927227b72d9ee9eae890be4692b4d4592 (patch)
tree	32b20995417aa934e3eabbbce12c35f45f004b50 /Modules
parent	ada8b6d1b1b02ae7c38f161c2a0ad866559fe18b (diff)
download	cpython-1303f8c927227b72d9ee9eae890be4692b4d4592.zip cpython-1303f8c927227b72d9ee9eae890be4692b4d4592.tar.gz cpython-1303f8c927227b72d9ee9eae890be4692b4d4592.tar.bz2