summary refs log tree commit diff stats
path: root/Modules
diff options
context:
space:
mode:
author Paul Ganssle <1377457+pganssle@users.noreply.github.com> 2022-05-06 00:31:24 (GMT)
committer GitHub <noreply@github.com> 2022-05-06 00:31:24 (GMT)
commit 1303f8c927227b72d9ee9eae890be4692b4d4592 (patch)
tree 32b20995417aa934e3eabbbce12c35f45f004b50 /Modules
parent ada8b6d1b1b02ae7c38f161c2a0ad866559fe18b (diff)
download cpython-1303f8c927227b72d9ee9eae890be4692b4d4592.zip
cpython-1303f8c927227b72d9ee9eae890be4692b4d4592.tar.gz
cpython-1303f8c927227b72d9ee9eae890be4692b4d4592.tar.bz2
gh-80010: Expand fromisoformat to include most of ISO-8601 (#92177)
This expands `fromisoformat` to cover most of the common uses of ISO 8601. We may expand the scope more in the future.
Diffstat (limited to 'Modules')
-rw-r--r-- Modules/_datetimemodule.c 359
1 files changed, 288 insertions, 71 deletions
diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c
index 20cdb18..efb5278 100644
--- a/Modules/_datetimemodule.c
+++ b/Modules/_datetimemodule.c
@@ -395,6 +395,39 @@ iso_week1_monday(int year)
return week1_monday;
}
+static int
+iso_to_ymd(const int iso_year, const int iso_week, const int iso_day,
+ int *year, int *month, int *day) {
+ if (iso_week <= 0 || iso_week >= 53) {
+ int out_of_range = 1;
+ if (iso_week == 53) {
+ // An ISO year has 53 weeks if it starts on a Thursday, or if it is a
+ // leap year that starts on a Wednesday
+ int first_weekday = weekday(iso_year, 1, 1);
+ if (first_weekday == 3 || (first_weekday == 2 && is_leap(iso_year))) {
+ out_of_range = 0;
+ }
+ }
+
+ if (out_of_range) {
+ return -2;
+ }
+ }
+
+ if (iso_day <= 0 || iso_day >= 8) {
+ return -3;
+ }
+
+ // Convert (Y, W, D) to (Y, M, D) in-place
+ int day_1 = iso_week1_monday(iso_year);
+
+ int day_offset = (iso_week - 1)*7 + iso_day - 1;
+
+ ord_to_ymd(day_1 + day_offset, year, month, day);
+ return 0;
+}
+
+
/* ---------------------------------------------------------------------------
* Range checkers.
*/
@@ -680,6 +713,11 @@ set_date_fields(PyDateTime_Date *self, int y, int m, int d)
* String parsing utilities and helper functions
*/
+static unsigned char
+is_digit(const char c) {
+ return ((unsigned int)(c - '0')) < 10;
+}
+
static const char *
parse_digits(const char *ptr, int *var, size_t num_digits)
{
@@ -696,14 +734,17 @@ parse_digits(const char *ptr, int *var, size_t num_digits)
}
static int
-parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
+parse_isoformat_date(const char *dtstr, const size_t len, int *year, int *month, int *day)
{
/* Parse the date portion of an ISO 8601 string: a calendar date
* (YYYY-MM-DD or YYYYMMDD) or a week date (YYYY-Www[-d] or YYYYWww[d]).
*
* Return codes:
* 0: Success
* -1: Failed to parse date component
- * -2: Failed to parse dateseparator
+ * -2: Inconsistent date separator usage
+ * -3: Failed to parse ISO week.
+ * -4: Failed to parse ISO day.
+ * -5, -6: Failure in iso_to_ymd
*/
const char *p = dtstr;
p = parse_digits(p, year, 4);
@@ -711,8 +752,42 @@ parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
return -1;
}
- if (*(p++) != '-') {
- return -2;
+ const unsigned char uses_separator = (*p == '-');
+ if (uses_separator) {
+ ++p;
+ }
+
+ if(*p == 'W') {
+ // This is an isocalendar-style date string
+ p++;
+ int iso_week = 0;
+ int iso_day = 0;
+
+ p = parse_digits(p, &iso_week, 2);
+ if (NULL == p) {
+ return -3;
+ }
+
+ assert(p > dtstr);
+ if ((size_t)(p - dtstr) < len) {
+ if (uses_separator && *(p++) != '-') {
+ return -2;
+ }
+
+ p = parse_digits(p, &iso_day, 1);
+ if (NULL == p) {
+ return -4;
+ }
+ } else {
+ iso_day = 1;
+ }
+
+ int rv = iso_to_ymd(*year, iso_week, iso_day, year, month, day);
+ if (rv) {
+ return -3 + rv;
+ } else {
+ return 0;
+ }
}
p = parse_digits(p, month, 2);
@@ -720,15 +795,13 @@ parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
return -1;
}
- if (*(p++) != '-') {
+ if (uses_separator && *(p++) != '-') {
return -2;
}
-
p = parse_digits(p, day, 2);
if (p == NULL) {
return -1;
}
-
return 0;
}
@@ -736,11 +809,14 @@ static int
parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour,
int *minute, int *second, int *microsecond)
{
+ *hour = *minute = *second = *microsecond = 0;
const char *p = tstr;
const char *p_end = tstr_end;
int *vals[3] = {hour, minute, second};
+ // This is initialized to satisfy an erroneous compiler warning.
+ unsigned char has_separator = 1;
- // Parse [HH[:MM[:SS]]]
+ // Parse [HH[:?MM[:?SS]]]
for (size_t i = 0; i < 3; ++i) {
p = parse_digits(p, vals[i], 2);
if (NULL == p) {
@@ -748,33 +824,47 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour,
}
char c = *(p++);
+ if (i == 0) {
+ has_separator = (c == ':');
+ }
+
if (p >= p_end) {
return c != '\0';
}
- else if (c == ':') {
+ else if (has_separator && (c == ':')) {
continue;
}
- else if (c == '.') {
+ else if (c == '.' || c == ',') {
break;
- }
- else {
+ } else if (!has_separator) {
+ --p;
+ } else {
return -4; // Malformed time separator
}
}
- // Parse .fff[fff]
+ // Parse fractional components
size_t len_remains = p_end - p;
- if (!(len_remains == 6 || len_remains == 3)) {
- return -3;
+ size_t to_parse = len_remains;
+ if (len_remains >= 6) {
+ to_parse = 6;
}
- p = parse_digits(p, microsecond, len_remains);
+ p = parse_digits(p, microsecond, to_parse);
if (NULL == p) {
return -3;
}
- if (len_remains == 3) {
- *microsecond *= 1000;
+ static int correction[] = {
+ 100000, 10000, 1000, 100, 10
+ };
+
+ if (to_parse < 6) {
+ *microsecond *= correction[to_parse-1];
+ }
+
+ while (is_digit(*p)){
+ ++p; // skip truncated digits
}
// Return 1 if it's not the end of the string
@@ -800,7 +890,7 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute,
const char *tzinfo_pos = p;
do {
- if (*tzinfo_pos == '+' || *tzinfo_pos == '-') {
+ if (*tzinfo_pos == 'Z' || *tzinfo_pos == '+' || *tzinfo_pos == '-') {
break;
}
} while (++tzinfo_pos < p_end);
@@ -822,14 +912,16 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute,
}
}
- // Parse time zone component
- // Valid formats are:
- // - +HH:MM (len 6)
- // - +HH:MM:SS (len 9)
- // - +HH:MM:SS.ffffff (len 16)
- size_t tzlen = p_end - tzinfo_pos;
- if (!(tzlen == 6 || tzlen == 9 || tzlen == 16)) {
- return -5;
+ // Special case UTC / Zulu time.
+ if (*tzinfo_pos == 'Z') {
+ *tzoffset = 0;
+ *tzmicrosecond = 0;
+
+ if (*(tzinfo_pos + 1) != '\0') {
+ return -5;
+ } else {
+ return 1;
+ }
}
int tzsign = (*tzinfo_pos == '-') ? -1 : 1;
@@ -2983,8 +3075,8 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr)
int year = 0, month = 0, day = 0;
int rv;
- if (len == 10) {
- rv = parse_isoformat_date(dt_ptr, &year, &month, &day);
+ if (len == 7 || len == 8 || len == 10) {
+ rv = parse_isoformat_date(dt_ptr, len, &year, &month, &day);
}
else {
rv = -1;
@@ -3027,37 +3119,21 @@ date_fromisocalendar(PyObject *cls, PyObject *args, PyObject *kw)
return NULL;
}
- if (week <= 0 || week >= 53) {
- int out_of_range = 1;
- if (week == 53) {
- // ISO years have 53 weeks in it on years starting with a Thursday
- // and on leap years starting on Wednesday
- int first_weekday = weekday(year, 1, 1);
- if (first_weekday == 3 || (first_weekday == 2 && is_leap(year))) {
- out_of_range = 0;
- }
- }
+ int month;
+ int rv = iso_to_ymd(year, week, day, &year, &month, &day);
- if (out_of_range) {
- PyErr_Format(PyExc_ValueError, "Invalid week: %d", week);
- return NULL;
- }
+
+ if (rv == -2) {
+ PyErr_Format(PyExc_ValueError, "Invalid week: %d", week);
+ return NULL;
}
- if (day <= 0 || day >= 8) {
+ if (rv == -3) {
PyErr_Format(PyExc_ValueError, "Invalid day: %d (range is [1, 7])",
day);
return NULL;
}
- // Convert (Y, W, D) to (Y, M, D) in-place
- int day_1 = iso_week1_monday(year);
-
- int month = week;
- int day_offset = (month - 1)*7 + day - 1;
-
- ord_to_ymd(day_1 + day_offset, &year, &month, &day);
-
return new_date_subclass_ex(year, month, day, cls);
}
@@ -3489,7 +3565,7 @@ static PyMethodDef date_methods[] = {
{"fromisoformat", (PyCFunction)date_fromisoformat, METH_O |
METH_CLASS,
- PyDoc_STR("str -> Construct a date from the output of date.isoformat()")},
+ PyDoc_STR("str -> Construct a date from a string in ISO 8601 format.")},
{"fromisocalendar", _PyCFunction_CAST(date_fromisocalendar),
METH_VARARGS | METH_KEYWORDS | METH_CLASS,
@@ -4564,6 +4640,14 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
goto invalid_string_error;
}
+ // The spec actually requires that time-only ISO 8601 strings start with
+ // T, but the extended format allows this to be omitted as long as there
+ // is no ambiguity with date strings.
+ if (*p == 'T') {
+ ++p;
+ len -= 1;
+ }
+
int hour = 0, minute = 0, second = 0, microsecond = 0;
int tzoffset, tzimicrosecond = 0;
int rv = parse_isoformat_time(p, len,
@@ -4671,7 +4755,7 @@ static PyMethodDef time_methods[] = {
PyDoc_STR("Return time with new specified fields.")},
{"fromisoformat", (PyCFunction)time_fromisoformat, METH_O | METH_CLASS,
- PyDoc_STR("string -> time from time.isoformat() output")},
+ PyDoc_STR("string -> time from a string in ISO 8601 format")},
{"__reduce_ex__", (PyCFunction)time_reduce_ex, METH_VARARGS,
PyDoc_STR("__reduce_ex__(proto) -> (cls, state)")},
@@ -5184,19 +5268,42 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw)
static PyObject *
_sanitize_isoformat_str(PyObject *dtstr)
{
+ Py_ssize_t len = PyUnicode_GetLength(dtstr);
+ if (len < 7) { // All valid ISO 8601 strings are at least 7 characters long
+ return NULL;
+ }
+
// `fromisoformat` allows surrogate characters in exactly one position,
// the separator; to allow datetime_fromisoformat to make the simplifying
// assumption that all valid strings can be encoded in UTF-8, this function
// replaces any surrogate character separators with `T`.
//
// The result, if not NULL, is a new reference.
- Py_ssize_t len = PyUnicode_GetLength(dtstr);
- if (len < 0) {
- return NULL;
+ const void* const unicode_data = PyUnicode_DATA(dtstr);
+ const unsigned int kind = PyUnicode_KIND(dtstr);
+
+ // Depending on the format of the string, the separator can only ever be
+ // in positions 7, 8 or 10. We'll check each of these for a surrogate and
+ // if we find one, replace it with `T`. If there is more than one surrogate,
+ // we don't have to bother sanitizing it, because the function will later
+ // fail when we try to encode the string as UTF-8.
+ static const size_t potential_separators[3] = {7, 8, 10};
+ size_t surrogate_separator = 0;
+ for(size_t idx = 0;
+ idx < sizeof(potential_separators) / sizeof(*potential_separators);
+ ++idx) {
+ size_t pos = potential_separators[idx];
+ if (pos > (size_t)len) {
+ break;
+ }
+
+ if(Py_UNICODE_IS_SURROGATE(PyUnicode_READ(kind, unicode_data, pos))) {
+ surrogate_separator = pos;
+ break;
+ }
}
- if (len <= 10 ||
- !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
+ if (surrogate_separator == 0) {
Py_INCREF(dtstr);
return dtstr;
}
@@ -5206,7 +5313,7 @@ _sanitize_isoformat_str(PyObject *dtstr)
return NULL;
}
- if (PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
+ if (PyUnicode_WriteChar(str_out, surrogate_separator, (Py_UCS4)'T')) {
Py_DECREF(str_out);
return NULL;
}
@@ -5214,6 +5321,106 @@ _sanitize_isoformat_str(PyObject *dtstr)
return str_out;
}
+
+static Py_ssize_t
+_find_isoformat_datetime_separator(const char *dtstr, Py_ssize_t len) {
+ // The valid date formats can all be distinguished by characters 4 and 5,
+ // and (for week dates) further narrowed down by the characters that follow,
+ // which tells us where to look for the separator character.
+ // Format | As-rendered | Position
+ // ---------------------------------------
+ // %Y-%m-%d | YYYY-MM-DD | 10
+ // %Y%m%d | YYYYMMDD | 8
+ // %Y-W%V | YYYY-Www | 8
+ // %YW%V | YYYYWww | 7
+ // %Y-W%V-%u | YYYY-Www-d | 10
+ // %YW%V%u | YYYYWwwd | 8
+ // %Y-%j | YYYY-DDD | 8
+ // %Y%j | YYYYDDD | 7
+ //
+ // Note that because we allow *any* character for the separator, in the
+ // case where character 4 or 5 is W, it's not straightforward to determine
+ // where the separator is — in the case of YYYY-Www-d, you have actual
+ // ambiguity, e.g. 2020-W01-0000 could be YYYY-Www-D0HH (the separator is
+ // the digit at position 10) or YYYY-Www-HHMM (the separator is the hyphen
+ // at position 8).
+ //
+ // The case of YYYYWww can be distinguished from YYYYWwwd by tracking ahead
+ // to either the end of the string or the first non-numeric character —
+ // since the time components all come in pairs YYYYWww#HH can be
+ // distinguished from YYYYWwwd#HH by the fact that there will always be an
+ // odd number of digits before the first non-digit character in the former
+ // case.
+ static const char date_separator = '-';
+ static const char week_indicator = 'W';
+
+ if (len == 7) {
+ return 7;
+ }
+
+ if (dtstr[4] == date_separator) {
+ // YYYY-???
+
+ if (dtstr[5] == week_indicator) {
+ // YYYY-W??
+
+ if (len < 8) {
+ return -1;
+ }
+
+ if (len > 8 && dtstr[8] == date_separator) {
+ // YYYY-Www-D (10) or YYYY-Www-HH (8)
+ if (len == 9) { return -1; }
+ if (len > 10 && is_digit(dtstr[10])) {
+ // This is as far as we'll try to go to resolve the
+ // ambiguity for the moment — if we have YYYY-Www-##, the
+ // separator is either a hyphen at 8 or a number at 10.
+ //
+ // We'll assume it's a hyphen at 8 because it's way more
+ // likely that someone will use a hyphen as a separator
+ // than a number, but at this point it's really best effort
+ // because this is an extension of the spec anyway.
+ return 8;
+ }
+
+ return 10;
+ } else {
+ // YYYY-Www (8)
+ return 8;
+ }
+ } else {
+ // YYYY-MM-DD (10)
+ return 10;
+ }
+ } else {
+ // YYYY???
+ if (dtstr[4] == week_indicator) {
+ // YYYYWww (7) or YYYYWwwd (8)
+ size_t idx = 7;
+ for (; idx < (size_t)len; ++idx) {
+ // Keep going until we run out of digits.
+ if (!is_digit(dtstr[idx])) {
+ break;
+ }
+ }
+
+ if (idx < 9) {
+ return idx;
+ }
+
+ if (idx % 2 == 0) {
+ // If the first non-digit is at an even index, an odd number of
+ // digits follows the week number, so it's YYYYWww
+ return 7;
+ } else {
+ return 8;
+ }
+ } else {
+ // YYYYMMDD (8)
+ return 8;
+ }
+ }
+}
+
static PyObject *
datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
{
@@ -5225,9 +5432,14 @@ datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
return NULL;
}
+ // We only need to sanitize this string if the separator is a surrogate
+ // character. In the situation where the separator location is ambiguous,
+ // we don't have to sanitize anything, because that can only happen when
+ // the separator is either '-' or a number. This should mostly be a no-op,
+ // but it makes the reference counting easier if we still sanitize.
PyObject *dtstr_clean = _sanitize_isoformat_str(dtstr);
if (dtstr_clean == NULL) {
- goto error;
+ goto invalid_string_error;
}
Py_ssize_t len;
@@ -5243,30 +5455,35 @@ datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
}
}
+ const Py_ssize_t separator_location = _find_isoformat_datetime_separator(
+ dt_ptr, len);
+
+
const char *p = dt_ptr;
int year = 0, month = 0, day = 0;
int hour = 0, minute = 0, second = 0, microsecond = 0;
int tzoffset = 0, tzusec = 0;
- // date has a fixed length of 10
- int rv = parse_isoformat_date(p, &year, &month, &day);
+ // date runs up to separator_location
+ int rv = parse_isoformat_date(p, separator_location, &year, &month, &day);
- if (!rv && len > 10) {
+ if (!rv && len > separator_location) {
// In UTF-8, the length of multi-byte characters is encoded in the MSB
- if ((p[10] & 0x80) == 0) {
- p += 11;
+ p += separator_location;
+ if ((p[0] & 0x80) == 0) {
+ p += 1;
}
else {
- switch (p[10] & 0xf0) {
+ switch (p[0] & 0xf0) {
case 0xe0:
- p += 13;
+ p += 3;
break;
case 0xf0:
- p += 14;
+ p += 4;
break;
default:
- p += 12;
+ p += 2;
break;
}
}
@@ -6327,7 +6544,7 @@ static PyMethodDef datetime_methods[] = {
{"fromisoformat", (PyCFunction)datetime_fromisoformat,
METH_O | METH_CLASS,
- PyDoc_STR("string -> datetime from datetime.isoformat() output")},
+ PyDoc_STR("string -> datetime from a string in most ISO 8601 formats")},
/* Instance methods: */