summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Doc/library/datetime.rst83
-rw-r--r--Doc/whatsnew/3.11.rst8
-rw-r--r--Lib/datetime.py252
-rw-r--r--Lib/test/datetimetester.py251
-rw-r--r--Misc/NEWS.d/next/Library/2022-05-03-12-11-27.gh-issue-80010.yG54RE.rst3
-rw-r--r--Modules/_datetimemodule.c359
6 files changed, 778 insertions, 178 deletions
diff --git a/Doc/library/datetime.rst b/Doc/library/datetime.rst
index ca17dc8..e0b28d7 100644
--- a/Doc/library/datetime.rst
+++ b/Doc/library/datetime.rst
@@ -526,18 +526,20 @@ Other constructors, all class methods:
.. classmethod:: date.fromisoformat(date_string)
- Return a :class:`date` corresponding to a *date_string* given in the format
- ``YYYY-MM-DD``::
+ Return a :class:`date` corresponding to a *date_string* given in any valid
+ ISO 8601 format, except ordinal dates (e.g. ``YYYY-DDD``)::
>>> from datetime import date
>>> date.fromisoformat('2019-12-04')
datetime.date(2019, 12, 4)
-
- This is the inverse of :meth:`date.isoformat`. It only supports the format
- ``YYYY-MM-DD``.
+ >>> date.fromisoformat('20191204')
+ datetime.date(2019, 12, 4)
+ >>> date.fromisoformat('2021-W01-1')
+ datetime.date(2021, 1, 4)
.. versionadded:: 3.7
-
+ .. versionchanged:: 3.11
+ Previously, this method only supported the format ``YYYY-MM-DD``.
.. classmethod:: date.fromisocalendar(year, week, day)
@@ -710,8 +712,6 @@ Instance methods:
>>> date(2002, 12, 4).isoformat()
'2002-12-04'
- This is the inverse of :meth:`date.fromisoformat`.
-
.. method:: date.__str__()
For a date *d*, ``str(d)`` is equivalent to ``d.isoformat()``.
@@ -994,31 +994,29 @@ Other constructors, all class methods:
.. classmethod:: datetime.fromisoformat(date_string)
- Return a :class:`.datetime` corresponding to a *date_string* in one of the
- formats emitted by :meth:`date.isoformat` and :meth:`datetime.isoformat`.
-
- Specifically, this function supports strings in the format:
+ Return a :class:`.datetime` corresponding to a *date_string* in any valid
+ ISO 8601 format, with the following exceptions:
- .. code-block:: none
-
- YYYY-MM-DD[*HH[:MM[:SS[.fff[fff]]]][+HH:MM[:SS[.ffffff]]]]
-
- where ``*`` can match any single character.
-
- .. caution::
-
- This does *not* support parsing arbitrary ISO 8601 strings - it is only intended
- as the inverse operation of :meth:`datetime.isoformat`. A more full-featured
- ISO 8601 parser, ``dateutil.parser.isoparse`` is available in the third-party package
- `dateutil <https://dateutil.readthedocs.io/en/stable/parser.html#dateutil.parser.isoparse>`__.
+ 1. Time zone offsets may have fractional seconds.
+ 2. The ``T`` separator may be replaced by any single Unicode character.
+ 3. Ordinal dates are not currently supported.
+ 4. Fractional hours and minutes are not supported.
Examples::
>>> from datetime import datetime
>>> datetime.fromisoformat('2011-11-04')
datetime.datetime(2011, 11, 4, 0, 0)
+ >>> datetime.fromisoformat('20111104')
+ datetime.datetime(2011, 11, 4, 0, 0)
>>> datetime.fromisoformat('2011-11-04T00:05:23')
datetime.datetime(2011, 11, 4, 0, 5, 23)
+ >>> datetime.fromisoformat('2011-11-04T00:05:23Z')
+ datetime.datetime(2011, 11, 4, 0, 5, 23, tzinfo=datetime.timezone.utc)
+ >>> datetime.fromisoformat('20111104T000523')
+ datetime.datetime(2011, 11, 4, 0, 5, 23)
+ >>> datetime.fromisoformat('2011-W01-2T00:05:23.283')
+ datetime.datetime(2011, 1, 4, 0, 5, 23, 283000)
>>> datetime.fromisoformat('2011-11-04 00:05:23.283')
datetime.datetime(2011, 11, 4, 0, 5, 23, 283000)
>>> datetime.fromisoformat('2011-11-04 00:05:23.283+00:00')
@@ -1028,6 +1026,10 @@ Other constructors, all class methods:
tzinfo=datetime.timezone(datetime.timedelta(seconds=14400)))
.. versionadded:: 3.7
+ .. versionchanged:: 3.11
+ Previously, this method only supported formats that could be emitted by
+ :meth:`date.isoformat()` or :meth:`datetime.isoformat()`.
+
.. classmethod:: datetime.fromisocalendar(year, week, day)
@@ -1763,30 +1765,41 @@ Other constructor:
.. classmethod:: time.fromisoformat(time_string)
- Return a :class:`.time` corresponding to a *time_string* in one of the
- formats emitted by :meth:`time.isoformat`. Specifically, this function supports
- strings in the format:
-
- .. code-block:: none
-
- HH[:MM[:SS[.fff[fff]]]][+HH:MM[:SS[.ffffff]]]
-
- .. caution::
+ Return a :class:`.time` corresponding to a *time_string* in any valid
+ ISO 8601 format, with the following exceptions:
- This does *not* support parsing arbitrary ISO 8601 strings. It is only
- intended as the inverse operation of :meth:`time.isoformat`.
+ 1. Time zone offsets may have fractional seconds.
+ 2. The leading ``T``, normally required in cases where there may be ambiguity between
+ a date and a time, is not required.
+ 3. Fractional seconds may have any number of digits (anything beyond 6 will
+ be truncated).
+ 4. Fractional hours and minutes are not supported.
Examples::
>>> from datetime import time
>>> time.fromisoformat('04:23:01')
datetime.time(4, 23, 1)
+ >>> time.fromisoformat('T04:23:01')
+ datetime.time(4, 23, 1)
+ >>> time.fromisoformat('T042301')
+ datetime.time(4, 23, 1)
>>> time.fromisoformat('04:23:01.000384')
datetime.time(4, 23, 1, 384)
+ >>> time.fromisoformat('04:23:01,000')
+ datetime.time(4, 23, 1)
>>> time.fromisoformat('04:23:01+04:00')
datetime.time(4, 23, 1, tzinfo=datetime.timezone(datetime.timedelta(seconds=14400)))
+ >>> time.fromisoformat('04:23:01Z')
+ datetime.time(4, 23, 1, tzinfo=datetime.timezone.utc)
+ >>> time.fromisoformat('04:23:01+00:00')
+ datetime.time(4, 23, 1, tzinfo=datetime.timezone.utc)
+
.. versionadded:: 3.7
+ .. versionchanged:: 3.11
+ Previously, this method only supported formats that could be emitted by
+ :meth:`time.isoformat()`.
Instance methods:
diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst
index 87dc5dd..efcfa17 100644
--- a/Doc/whatsnew/3.11.rst
+++ b/Doc/whatsnew/3.11.rst
@@ -425,6 +425,14 @@ asyncio
existing stream-based connections to TLS. (Contributed by Ian Good in
:issue:`34975`.)
+datetime
+--------
+
+* :meth:`datetime.date.fromisoformat`, :meth:`datetime.time.fromisoformat` and
+ :meth:`datetime.datetime.fromisoformat` can now be used to parse most ISO 8601
+ formats (barring only those that support fractional hours and minutes).
+ (Contributed by Paul Ganssle in :gh:`80010`.)
+
fractions
---------
diff --git a/Lib/datetime.py b/Lib/datetime.py
index 7f79aa4..afbb6fe 100644
--- a/Lib/datetime.py
+++ b/Lib/datetime.py
@@ -262,58 +262,150 @@ def _wrap_strftime(object, format, timetuple):
return _time.strftime(newformat, timetuple)
# Helpers for parsing the result of isoformat()
+def _is_ascii_digit(c):
+ return c in "0123456789"
+
+def _find_isoformat_datetime_separator(dtstr):
+ # See the comment in _datetimemodule.c:_find_isoformat_datetime_separator
+ len_dtstr = len(dtstr)
+ if len_dtstr == 7:
+ return 7
+
+ assert len_dtstr > 7
+ date_separator = "-"
+ week_indicator = "W"
+
+ if dtstr[4] == date_separator:
+ if dtstr[5] == week_indicator:
+ if len_dtstr < 8:
+ raise ValueError("Invalid ISO string")
+ if len_dtstr > 8 and dtstr[8] == date_separator:
+ if len_dtstr == 9:
+ raise ValueError("Invalid ISO string")
+ if len_dtstr > 10 and _is_ascii_digit(dtstr[10]):
+ # This is as far as we need to resolve the ambiguity for
+ # the moment - if we have YYYY-Www-##, the separator is
+ # either a hyphen at 8 or a number at 10.
+ #
+ # We'll assume it's a hyphen at 8 because it's way more
+ # likely that someone will use a hyphen as a separator than
+ # a number, but at this point it's really best effort
+ # because this is an extension of the spec anyway.
+ # TODO(pganssle): Document this
+ return 8
+ return 10
+ else:
+ # YYYY-Www (8)
+ return 8
+ else:
+ # YYYY-MM-DD (10)
+ return 10
+ else:
+ if dtstr[4] == week_indicator:
+ # YYYYWww (7) or YYYYWwwd (8)
+ idx = 7
+ while idx < len_dtstr:
+ if not _is_ascii_digit(dtstr[idx]):
+ break
+ idx += 1
+
+ if idx < 9:
+ return idx
+
+ if idx % 2 == 0:
+ # If the run of digits stops at an even index, the date part is YYYYWww
+ return 7
+ else:
+ return 8
+ else:
+ # YYYYMMDD (8)
+ return 8
+
+
def _parse_isoformat_date(dtstr):
- # It is assumed that this function will only be called with a
- # string of length exactly 10, and (though this is not used) ASCII-only
+ # It is assumed that this is an ASCII-only string of length 7, 8 or 10,
+ # see the comment on Modules/_datetimemodule.c:_find_isoformat_datetime_separator
+ assert len(dtstr) in (7, 8, 10)
year = int(dtstr[0:4])
- if dtstr[4] != '-':
- raise ValueError('Invalid date separator: %s' % dtstr[4])
+ has_sep = dtstr[4] == '-'
+
+ pos = 4 + has_sep
+ if dtstr[pos:pos + 1] == "W":
+ # YYYY-?Www-?D?
+ pos += 1
+ weekno = int(dtstr[pos:pos + 2])
+ pos += 2
- month = int(dtstr[5:7])
+ dayno = 1
+ if len(dtstr) > pos:
+ if (dtstr[pos:pos + 1] == '-') != has_sep:
+ raise ValueError("Inconsistent use of dash separator")
- if dtstr[7] != '-':
- raise ValueError('Invalid date separator')
+ pos += has_sep
- day = int(dtstr[8:10])
+ dayno = int(dtstr[pos:pos + 1])
+
+ return list(_isoweek_to_gregorian(year, weekno, dayno))
+ else:
+ month = int(dtstr[pos:pos + 2])
+ pos += 2
+ if (dtstr[pos:pos + 1] == "-") != has_sep:
+ raise ValueError("Inconsistent use of dash separator")
+
+ pos += has_sep
+ day = int(dtstr[pos:pos + 2])
+
+ return [year, month, day]
+
+
+_FRACTION_CORRECTION = [100000, 10000, 1000, 100, 10]
- return [year, month, day]
def _parse_hh_mm_ss_ff(tstr):
- # Parses things of the form HH[:MM[:SS[.fff[fff]]]]
+ # Parses things of the form HH[:?MM[:?SS[{.,}fff[fff]]]]
len_str = len(tstr)
time_comps = [0, 0, 0, 0]
pos = 0
for comp in range(0, 3):
if (len_str - pos) < 2:
- raise ValueError('Incomplete time component')
+ raise ValueError("Incomplete time component")
time_comps[comp] = int(tstr[pos:pos+2])
pos += 2
next_char = tstr[pos:pos+1]
+ if comp == 0:
+ has_sep = next_char == ':'
+
if not next_char or comp >= 2:
break
- if next_char != ':':
- raise ValueError('Invalid time separator: %c' % next_char)
+ if has_sep and next_char != ':':
+ raise ValueError("Invalid time separator: %c" % next_char)
- pos += 1
+ pos += has_sep
if pos < len_str:
- if tstr[pos] != '.':
- raise ValueError('Invalid microsecond component')
+ if tstr[pos] not in '.,':
+ raise ValueError("Invalid microsecond component")
else:
pos += 1
len_remainder = len_str - pos
- if len_remainder not in (3, 6):
- raise ValueError('Invalid microsecond component')
- time_comps[3] = int(tstr[pos:])
- if len_remainder == 3:
- time_comps[3] *= 1000
+ if len_remainder >= 6:
+ to_parse = 6
+ else:
+ to_parse = len_remainder
+
+ time_comps[3] = int(tstr[pos:(pos+to_parse)])
+ if to_parse < 6:
+ time_comps[3] *= _FRACTION_CORRECTION[to_parse-1]
+ if (len_remainder > to_parse
+ and not all(map(_is_ascii_digit, tstr[(pos+to_parse):]))):
+ raise ValueError("Non-digit values in unparsed fraction")
return time_comps
@@ -321,27 +413,34 @@ def _parse_isoformat_time(tstr):
# Format supported is HH[:MM[:SS[.fff[fff]]]][+HH:MM[:SS[.ffffff]]]
len_str = len(tstr)
if len_str < 2:
- raise ValueError('Isoformat time too short')
+ raise ValueError("Isoformat time too short")
- # This is equivalent to re.search('[+-]', tstr), but faster
- tz_pos = (tstr.find('-') + 1 or tstr.find('+') + 1)
+ # This is equivalent to re.search('[-+Z]', tstr), but faster
+ tz_pos = (tstr.find('-') + 1 or tstr.find('+') + 1 or tstr.find('Z') + 1)
timestr = tstr[:tz_pos-1] if tz_pos > 0 else tstr
time_comps = _parse_hh_mm_ss_ff(timestr)
tzi = None
- if tz_pos > 0:
+ if tz_pos == len_str and tstr[-1] == 'Z':
+ tzi = timezone.utc
+ elif tz_pos > 0:
tzstr = tstr[tz_pos:]
# Valid time zone strings are:
+ # HH len: 2
+ # HHMM len: 4
# HH:MM len: 5
+ # HHMMSS len: 6
+ # HHMMSS.f+ len: 7+
# HH:MM:SS len: 8
- # HH:MM:SS.ffffff len: 15
+ # HH:MM:SS.f+ len: 10+
- if len(tzstr) not in (5, 8, 15):
- raise ValueError('Malformed time zone string')
+ if len(tzstr) in (0, 1, 3):
+ raise ValueError("Malformed time zone string")
tz_comps = _parse_hh_mm_ss_ff(tzstr)
+
if all(x == 0 for x in tz_comps):
tzi = timezone.utc
else:
@@ -356,6 +455,38 @@ def _parse_isoformat_time(tstr):
return time_comps
+# tuple[int, int, int] -> tuple[int, int, int] version of date.fromisocalendar
+def _isoweek_to_gregorian(year, week, day):
+ # Year is bounded this way because 9999-12-31 is (9999, 52, 5)
+ if not MINYEAR <= year <= MAXYEAR:
+ raise ValueError(f"Year is out of range: {year}")
+
+ if not 0 < week < 53:
+ out_of_range = True
+
+ if week == 53:
+ # ISO years have 53 weeks in them on years starting with a
+ # Thursday and leap years starting on a Wednesday
+ first_weekday = _ymd2ord(year, 1, 1) % 7
+ if (first_weekday == 4 or (first_weekday == 3 and
+ _is_leap(year))):
+ out_of_range = False
+
+ if out_of_range:
+ raise ValueError(f"Invalid week: {week}")
+
+ if not 0 < day < 8:
+ raise ValueError(f"Invalid weekday: {day} (range is [1, 7])")
+
+ # Now compute the offset from (Y, 1, 1) in days:
+ day_offset = (week - 1) * 7 + (day - 1)
+
+ # Calculate the ordinal day for monday, week 1
+ day_1 = _isoweek1monday(year)
+ ord_day = day_1 + day_offset
+
+ return _ord2ymd(ord_day)
+
# Just raise TypeError if the arg isn't None or a string.
def _check_tzname(name):
@@ -847,12 +978,14 @@ class date:
@classmethod
def fromisoformat(cls, date_string):
- """Construct a date from the output of date.isoformat()."""
+ """Construct a date from a string in ISO 8601 format."""
if not isinstance(date_string, str):
raise TypeError('fromisoformat: argument must be str')
+ if len(date_string) not in (7, 8, 10):
+ raise ValueError(f'Invalid isoformat string: {date_string!r}')
+
try:
- assert len(date_string) == 10
return cls(*_parse_isoformat_date(date_string))
except Exception:
raise ValueError(f'Invalid isoformat string: {date_string!r}')
@@ -862,35 +995,7 @@ class date:
"""Construct a date from the ISO year, week number and weekday.
This is the inverse of the date.isocalendar() function"""
- # Year is bounded this way because 9999-12-31 is (9999, 52, 5)
- if not MINYEAR <= year <= MAXYEAR:
- raise ValueError(f"Year is out of range: {year}")
-
- if not 0 < week < 53:
- out_of_range = True
-
- if week == 53:
- # ISO years have 53 weeks in them on years starting with a
- # Thursday and leap years starting on a Wednesday
- first_weekday = _ymd2ord(year, 1, 1) % 7
- if (first_weekday == 4 or (first_weekday == 3 and
- _is_leap(year))):
- out_of_range = False
-
- if out_of_range:
- raise ValueError(f"Invalid week: {week}")
-
- if not 0 < day < 8:
- raise ValueError(f"Invalid weekday: {day} (range is [1, 7])")
-
- # Now compute the offset from (Y, 1, 1) in days:
- day_offset = (week - 1) * 7 + (day - 1)
-
- # Calculate the ordinal day for monday, week 1
- day_1 = _isoweek1monday(year)
- ord_day = day_1 + day_offset
-
- return cls(*_ord2ymd(ord_day))
+ return cls(*_isoweek_to_gregorian(year, week, day))
# Conversions to string
@@ -1427,10 +1532,15 @@ class time:
@classmethod
def fromisoformat(cls, time_string):
- """Construct a time from the output of isoformat()."""
+ """Construct a time from a string in one of the ISO 8601 formats."""
if not isinstance(time_string, str):
raise TypeError('fromisoformat: argument must be str')
+ # The spec actually requires that time-only ISO 8601 strings start with
+ # T, but the extended format allows this to be omitted as long as there
+ # is no ambiguity with date strings.
+ time_string = time_string.removeprefix('T')
+
try:
return cls(*_parse_isoformat_time(time_string))
except Exception:
@@ -1711,24 +1821,30 @@ class datetime(date):
@classmethod
def fromisoformat(cls, date_string):
- """Construct a datetime from the output of datetime.isoformat()."""
+ """Construct a datetime from a string in one of the ISO 8601 formats."""
if not isinstance(date_string, str):
raise TypeError('fromisoformat: argument must be str')
- # Split this at the separator
- dstr = date_string[0:10]
- tstr = date_string[11:]
+ if len(date_string) < 7:
+ raise ValueError(f'Invalid isoformat string: {date_string!r}')
+ # Split this at the separator
try:
+ separator_location = _find_isoformat_datetime_separator(date_string)
+ dstr = date_string[0:separator_location]
+ tstr = date_string[(separator_location+1):]
+
date_components = _parse_isoformat_date(dstr)
except ValueError:
- raise ValueError(f'Invalid isoformat string: {date_string!r}')
+ raise ValueError(
+ f'Invalid isoformat string: {date_string!r}') from None
if tstr:
try:
time_components = _parse_isoformat_time(tstr)
except ValueError:
- raise ValueError(f'Invalid isoformat string: {date_string!r}')
+ raise ValueError(
+ f'Invalid isoformat string: {date_string!r}') from None
else:
time_components = [0, 0, 0, 0, None]
@@ -2509,7 +2625,9 @@ else:
_format_time, _format_offset, _index, _is_leap, _isoweek1monday, _math,
_ord2ymd, _time, _time_class, _tzinfo_class, _wrap_strftime, _ymd2ord,
_divide_and_round, _parse_isoformat_date, _parse_isoformat_time,
- _parse_hh_mm_ss_ff, _IsoCalendarDate)
+ _parse_hh_mm_ss_ff, _IsoCalendarDate, _isoweek_to_gregorian,
+ _find_isoformat_datetime_separator, _FRACTION_CORRECTION,
+ _is_ascii_digit)
# XXX Since import * above excludes names that start with _,
# docstring does not get overwritten. In the future, it may be
# appropriate to maintain a single module level docstring and
diff --git a/Lib/test/datetimetester.py b/Lib/test/datetimetester.py
index d85b546..0495362 100644
--- a/Lib/test/datetimetester.py
+++ b/Lib/test/datetimetester.py
@@ -7,6 +7,7 @@ import itertools
import bisect
import copy
import decimal
+import functools
import sys
import os
import pickle
@@ -1840,6 +1841,41 @@ class TestDate(HarmlessMixedComparison, unittest.TestCase):
self.assertEqual(dt, dt_rt)
+ def test_fromisoformat_date_examples(self):
+ examples = [
+ ('00010101', self.theclass(1, 1, 1)),
+ ('20000101', self.theclass(2000, 1, 1)),
+ ('20250102', self.theclass(2025, 1, 2)),
+ ('99991231', self.theclass(9999, 12, 31)),
+ ('0001-01-01', self.theclass(1, 1, 1)),
+ ('2000-01-01', self.theclass(2000, 1, 1)),
+ ('2025-01-02', self.theclass(2025, 1, 2)),
+ ('9999-12-31', self.theclass(9999, 12, 31)),
+ ('2025W01', self.theclass(2024, 12, 30)),
+ ('2025-W01', self.theclass(2024, 12, 30)),
+ ('2025W014', self.theclass(2025, 1, 2)),
+ ('2025-W01-4', self.theclass(2025, 1, 2)),
+ ('2026W01', self.theclass(2025, 12, 29)),
+ ('2026-W01', self.theclass(2025, 12, 29)),
+ ('2026W013', self.theclass(2025, 12, 31)),
+ ('2026-W01-3', self.theclass(2025, 12, 31)),
+ ('2022W52', self.theclass(2022, 12, 26)),
+ ('2022-W52', self.theclass(2022, 12, 26)),
+ ('2022W527', self.theclass(2023, 1, 1)),
+ ('2022-W52-7', self.theclass(2023, 1, 1)),
+ ('2015W534', self.theclass(2015, 12, 31)), # Has week 53
+ ('2015-W53-4', self.theclass(2015, 12, 31)), # Has week 53
+ ('2015-W53-5', self.theclass(2016, 1, 1)),
+ ('2020W531', self.theclass(2020, 12, 28)), # Leap year
+ ('2020-W53-1', self.theclass(2020, 12, 28)), # Leap year
+ ('2020-W53-6', self.theclass(2021, 1, 2)),
+ ]
+
+ for input_str, expected in examples:
+ with self.subTest(input_str=input_str):
+ actual = self.theclass.fromisoformat(input_str)
+ self.assertEqual(actual, expected)
+
def test_fromisoformat_subclass(self):
class DateSubclass(self.theclass):
pass
@@ -1862,7 +1898,8 @@ class TestDate(HarmlessMixedComparison, unittest.TestCase):
'2009-12-0a', # Invalid character in day
'2009-01-32', # Invalid day
'2009-02-29', # Invalid leap day
- '20090228', # Valid ISO8601 output not from isoformat()
+ '2019-W53-1', # No week 53 in 2019
+ '2020-W54-1', # No week 54
'2009\ud80002\ud80028', # Separators are surrogate codepoints
]
@@ -3003,6 +3040,140 @@ class TestDateTime(TestDate):
dt_rt = self.theclass.fromisoformat(dtstr)
self.assertEqual(dt, dt_rt)
+ def test_fromisoformat_datetime_examples(self):
+ UTC = timezone.utc
+ BST = timezone(timedelta(hours=1), 'BST')
+ EST = timezone(timedelta(hours=-5), 'EST')
+ EDT = timezone(timedelta(hours=-4), 'EDT')
+ examples = [
+ ('2025-01-02', self.theclass(2025, 1, 2, 0, 0)),
+ ('2025-01-02T03', self.theclass(2025, 1, 2, 3, 0)),
+ ('2025-01-02T03:04', self.theclass(2025, 1, 2, 3, 4)),
+ ('2025-01-02T0304', self.theclass(2025, 1, 2, 3, 4)),
+ ('2025-01-02T03:04:05', self.theclass(2025, 1, 2, 3, 4, 5)),
+ ('2025-01-02T030405', self.theclass(2025, 1, 2, 3, 4, 5)),
+ ('2025-01-02T03:04:05.6',
+ self.theclass(2025, 1, 2, 3, 4, 5, 600000)),
+ ('2025-01-02T03:04:05,6',
+ self.theclass(2025, 1, 2, 3, 4, 5, 600000)),
+ ('2025-01-02T03:04:05.678',
+ self.theclass(2025, 1, 2, 3, 4, 5, 678000)),
+ ('2025-01-02T03:04:05.678901',
+ self.theclass(2025, 1, 2, 3, 4, 5, 678901)),
+ ('2025-01-02T03:04:05,678901',
+ self.theclass(2025, 1, 2, 3, 4, 5, 678901)),
+ ('2025-01-02T030405.678901',
+ self.theclass(2025, 1, 2, 3, 4, 5, 678901)),
+ ('2025-01-02T030405,678901',
+ self.theclass(2025, 1, 2, 3, 4, 5, 678901)),
+ ('2025-01-02T03:04:05.6789010',
+ self.theclass(2025, 1, 2, 3, 4, 5, 678901)),
+ ('2009-04-19T03:15:45.2345',
+ self.theclass(2009, 4, 19, 3, 15, 45, 234500)),
+ ('2009-04-19T03:15:45.1234567',
+ self.theclass(2009, 4, 19, 3, 15, 45, 123456)),
+ ('2025-01-02T03:04:05,678',
+ self.theclass(2025, 1, 2, 3, 4, 5, 678000)),
+ ('20250102', self.theclass(2025, 1, 2, 0, 0)),
+ ('20250102T03', self.theclass(2025, 1, 2, 3, 0)),
+ ('20250102T03:04', self.theclass(2025, 1, 2, 3, 4)),
+ ('20250102T03:04:05', self.theclass(2025, 1, 2, 3, 4, 5)),
+ ('20250102T030405', self.theclass(2025, 1, 2, 3, 4, 5)),
+ ('20250102T03:04:05.6',
+ self.theclass(2025, 1, 2, 3, 4, 5, 600000)),
+ ('20250102T03:04:05,6',
+ self.theclass(2025, 1, 2, 3, 4, 5, 600000)),
+ ('20250102T03:04:05.678',
+ self.theclass(2025, 1, 2, 3, 4, 5, 678000)),
+ ('20250102T03:04:05,678',
+ self.theclass(2025, 1, 2, 3, 4, 5, 678000)),
+ ('20250102T03:04:05.678901',
+ self.theclass(2025, 1, 2, 3, 4, 5, 678901)),
+ ('20250102T030405.678901',
+ self.theclass(2025, 1, 2, 3, 4, 5, 678901)),
+ ('20250102T030405,678901',
+ self.theclass(2025, 1, 2, 3, 4, 5, 678901)),
+ ('20250102T030405.6789010',
+ self.theclass(2025, 1, 2, 3, 4, 5, 678901)),
+ ('2022W01', self.theclass(2022, 1, 3)),
+ ('2022W52520', self.theclass(2022, 12, 26, 20, 0)),
+ ('2022W527520', self.theclass(2023, 1, 1, 20, 0)),
+ ('2026W01516', self.theclass(2025, 12, 29, 16, 0)),
+ ('2026W013516', self.theclass(2025, 12, 31, 16, 0)),
+ ('2025W01503', self.theclass(2024, 12, 30, 3, 0)),
+ ('2025W014503', self.theclass(2025, 1, 2, 3, 0)),
+ ('2025W01512', self.theclass(2024, 12, 30, 12, 0)),
+ ('2025W014512', self.theclass(2025, 1, 2, 12, 0)),
+ ('2025W014T121431', self.theclass(2025, 1, 2, 12, 14, 31)),
+ ('2026W013T162100', self.theclass(2025, 12, 31, 16, 21)),
+ ('2026W013 162100', self.theclass(2025, 12, 31, 16, 21)),
+ ('2022W527T202159', self.theclass(2023, 1, 1, 20, 21, 59)),
+ ('2022W527 202159', self.theclass(2023, 1, 1, 20, 21, 59)),
+ ('2025W014 121431', self.theclass(2025, 1, 2, 12, 14, 31)),
+ ('2025W014T030405', self.theclass(2025, 1, 2, 3, 4, 5)),
+ ('2025W014 030405', self.theclass(2025, 1, 2, 3, 4, 5)),
+ ('2020-W53-6T03:04:05', self.theclass(2021, 1, 2, 3, 4, 5)),
+ ('2020W537 03:04:05', self.theclass(2021, 1, 3, 3, 4, 5)),
+ ('2025-W01-4T03:04:05', self.theclass(2025, 1, 2, 3, 4, 5)),
+ ('2025-W01-4T03:04:05.678901',
+ self.theclass(2025, 1, 2, 3, 4, 5, 678901)),
+ ('2025-W01-4T12:14:31', self.theclass(2025, 1, 2, 12, 14, 31)),
+ ('2025-W01-4T12:14:31.012345',
+ self.theclass(2025, 1, 2, 12, 14, 31, 12345)),
+ ('2026-W01-3T16:21:00', self.theclass(2025, 12, 31, 16, 21)),
+ ('2026-W01-3T16:21:00.000000', self.theclass(2025, 12, 31, 16, 21)),
+ ('2022-W52-7T20:21:59',
+ self.theclass(2023, 1, 1, 20, 21, 59)),
+ ('2022-W52-7T20:21:59.999999',
+ self.theclass(2023, 1, 1, 20, 21, 59, 999999)),
+ ('2025-W01003+00',
+ self.theclass(2024, 12, 30, 3, 0, tzinfo=UTC)),
+ ('2025-01-02T03:04:05+00',
+ self.theclass(2025, 1, 2, 3, 4, 5, tzinfo=UTC)),
+ ('2025-01-02T03:04:05Z',
+ self.theclass(2025, 1, 2, 3, 4, 5, tzinfo=UTC)),
+ ('2025-01-02003:04:05,6+00:00:00.00',
+ self.theclass(2025, 1, 2, 3, 4, 5, 600000, tzinfo=UTC)),
+ ('2000-01-01T00+21',
+ self.theclass(2000, 1, 1, 0, 0, tzinfo=timezone(timedelta(hours=21)))),
+ ('2025-01-02T03:05:06+0300',
+ self.theclass(2025, 1, 2, 3, 5, 6,
+ tzinfo=timezone(timedelta(hours=3)))),
+ ('2025-01-02T03:05:06-0300',
+ self.theclass(2025, 1, 2, 3, 5, 6,
+ tzinfo=timezone(timedelta(hours=-3)))),
+ ('2025-01-02T03:04:05+0000',
+ self.theclass(2025, 1, 2, 3, 4, 5, tzinfo=UTC)),
+ ('2025-01-02T03:05:06+03',
+ self.theclass(2025, 1, 2, 3, 5, 6,
+ tzinfo=timezone(timedelta(hours=3)))),
+ ('2025-01-02T03:05:06-03',
+ self.theclass(2025, 1, 2, 3, 5, 6,
+ tzinfo=timezone(timedelta(hours=-3)))),
+ ('2020-01-01T03:05:07.123457-05:00',
+ self.theclass(2020, 1, 1, 3, 5, 7, 123457, tzinfo=EST)),
+ ('2020-01-01T03:05:07.123457-0500',
+ self.theclass(2020, 1, 1, 3, 5, 7, 123457, tzinfo=EST)),
+ ('2020-06-01T04:05:06.111111-04:00',
+ self.theclass(2020, 6, 1, 4, 5, 6, 111111, tzinfo=EDT)),
+ ('2020-06-01T04:05:06.111111-0400',
+ self.theclass(2020, 6, 1, 4, 5, 6, 111111, tzinfo=EDT)),
+ ('2021-10-31T01:30:00.000000+01:00',
+ self.theclass(2021, 10, 31, 1, 30, tzinfo=BST)),
+ ('2021-10-31T01:30:00.000000+0100',
+ self.theclass(2021, 10, 31, 1, 30, tzinfo=BST)),
+ ('2025-01-02T03:04:05,6+000000.00',
+ self.theclass(2025, 1, 2, 3, 4, 5, 600000, tzinfo=UTC)),
+ ('2025-01-02T03:04:05,678+00:00:10',
+ self.theclass(2025, 1, 2, 3, 4, 5, 678000,
+ tzinfo=timezone(timedelta(seconds=10)))),
+ ]
+
+ for input_str, expected in examples:
+ with self.subTest(input_str=input_str):
+ actual = self.theclass.fromisoformat(input_str)
+ self.assertEqual(actual, expected)
+
def test_fromisoformat_fails_datetime(self):
# Test that fromisoformat() fails on invalid values
bad_strs = [
@@ -3016,8 +3187,6 @@ class TestDateTime(TestDate):
'2009-04-19T03;15:45', # Bad first time separator
'2009-04-19T03:15;45', # Bad second time separator
'2009-04-19T03:15:4500:00', # Bad time zone separator
- '2009-04-19T03:15:45.2345', # Too many digits for milliseconds
- '2009-04-19T03:15:45.1234567', # Too many digits for microseconds
'2009-04-19T03:15:45.123456+24:30', # Invalid time zone offset
'2009-04-19T03:15:45.123456-24:30', # Invalid negative offset
'2009-04-10ᛇᛇᛇᛇᛇ12:15', # Too many unicode separators
@@ -3962,6 +4131,76 @@ class TestTimeTZ(TestTime, TZInfoBase, unittest.TestCase):
t_rt = self.theclass.fromisoformat(tstr)
self.assertEqual(t, t_rt)
+ def test_fromisoformat_fractions(self):
+ strs = [
+ ('12:30:45.1', (12, 30, 45, 100000)),
+ ('12:30:45.12', (12, 30, 45, 120000)),
+ ('12:30:45.123', (12, 30, 45, 123000)),
+ ('12:30:45.1234', (12, 30, 45, 123400)),
+ ('12:30:45.12345', (12, 30, 45, 123450)),
+ ('12:30:45.123456', (12, 30, 45, 123456)),
+ ('12:30:45.1234567', (12, 30, 45, 123456)),
+ ('12:30:45.12345678', (12, 30, 45, 123456)),
+ ]
+
+ for time_str, time_comps in strs:
+ expected = self.theclass(*time_comps)
+ actual = self.theclass.fromisoformat(time_str)
+
+ self.assertEqual(actual, expected)
+
+ def test_fromisoformat_time_examples(self):
+ examples = [
+ ('0000', self.theclass(0, 0)),
+ ('00:00', self.theclass(0, 0)),
+ ('000000', self.theclass(0, 0)),
+ ('00:00:00', self.theclass(0, 0)),
+ ('000000.0', self.theclass(0, 0)),
+ ('00:00:00.0', self.theclass(0, 0)),
+ ('000000.000', self.theclass(0, 0)),
+ ('00:00:00.000', self.theclass(0, 0)),
+ ('000000.000000', self.theclass(0, 0)),
+ ('00:00:00.000000', self.theclass(0, 0)),
+ ('1200', self.theclass(12, 0)),
+ ('12:00', self.theclass(12, 0)),
+ ('120000', self.theclass(12, 0)),
+ ('12:00:00', self.theclass(12, 0)),
+ ('120000.0', self.theclass(12, 0)),
+ ('12:00:00.0', self.theclass(12, 0)),
+ ('120000.000', self.theclass(12, 0)),
+ ('12:00:00.000', self.theclass(12, 0)),
+ ('120000.000000', self.theclass(12, 0)),
+ ('12:00:00.000000', self.theclass(12, 0)),
+ ('2359', self.theclass(23, 59)),
+ ('23:59', self.theclass(23, 59)),
+ ('235959', self.theclass(23, 59, 59)),
+ ('23:59:59', self.theclass(23, 59, 59)),
+ ('235959.9', self.theclass(23, 59, 59, 900000)),
+ ('23:59:59.9', self.theclass(23, 59, 59, 900000)),
+ ('235959.999', self.theclass(23, 59, 59, 999000)),
+ ('23:59:59.999', self.theclass(23, 59, 59, 999000)),
+ ('235959.999999', self.theclass(23, 59, 59, 999999)),
+ ('23:59:59.999999', self.theclass(23, 59, 59, 999999)),
+ ('00:00:00Z', self.theclass(0, 0, tzinfo=timezone.utc)),
+ ('12:00:00+0000', self.theclass(12, 0, tzinfo=timezone.utc)),
+ ('12:00:00+00:00', self.theclass(12, 0, tzinfo=timezone.utc)),
+ ('00:00:00+05',
+ self.theclass(0, 0, tzinfo=timezone(timedelta(hours=5)))),
+ ('00:00:00+05:30',
+ self.theclass(0, 0, tzinfo=timezone(timedelta(hours=5, minutes=30)))),
+ ('12:00:00-05:00',
+ self.theclass(12, 0, tzinfo=timezone(timedelta(hours=-5)))),
+ ('12:00:00-0500',
+ self.theclass(12, 0, tzinfo=timezone(timedelta(hours=-5)))),
+ ('00:00:00,000-23:59:59.999999',
+ self.theclass(0, 0, tzinfo=timezone(-timedelta(hours=23, minutes=59, seconds=59, microseconds=999999)))),
+ ]
+
+ for input_str, expected in examples:
+ with self.subTest(input_str=input_str):
+ actual = self.theclass.fromisoformat(input_str)
+ self.assertEqual(actual, expected)
+
def test_fromisoformat_fails(self):
bad_strs = [
'', # Empty string
@@ -3975,15 +4214,17 @@ class TestTimeTZ(TestTime, TZInfoBase, unittest.TestCase):
'1a:30:45.334034', # Invalid character in hours
'12:a0:45.334034', # Invalid character in minutes
'12:30:a5.334034', # Invalid character in seconds
- '12:30:45.1234', # Too many digits for milliseconds
- '12:30:45.1234567', # Too many digits for microseconds
'12:30:45.123456+24:30', # Invalid time zone offset
'12:30:45.123456-24:30', # Invalid negative offset
'12:30:45', # Uses full-width unicode colons
+ '12:30:45.123456a', # Non-numeric data after 6 fractional digits
+ '12:30:45.123456789a', # Non-numeric data after 9 fractional digits
'12:30:45․123456', # Uses \u2024 in place of decimal point
'12:30:45a', # Extra at end of basic time
'12:30:45.123a', # Extra at end of millisecond time
'12:30:45.123456a', # Extra at end of microsecond time
+ '12:30:45.123456-', # Time zone sign with no offset
+ '12:30:45.123456+', # Time zone sign with no offset
'12:30:45.123456+12:00:30a', # Extra at end of full time
]
diff --git a/Misc/NEWS.d/next/Library/2022-05-03-12-11-27.gh-issue-80010.yG54RE.rst b/Misc/NEWS.d/next/Library/2022-05-03-12-11-27.gh-issue-80010.yG54RE.rst
new file mode 100644
index 0000000..bbcef47
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-05-03-12-11-27.gh-issue-80010.yG54RE.rst
@@ -0,0 +1,3 @@
+Add support for generalized ISO 8601 parsing to
+:meth:`datetime.datetime.fromisoformat`, :meth:`datetime.date.fromisoformat`
+and :meth:`datetime.time.fromisoformat`. Patch by Paul Ganssle.
diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c
index 20cdb18..efb5278 100644
--- a/Modules/_datetimemodule.c
+++ b/Modules/_datetimemodule.c
@@ -395,6 +395,39 @@ iso_week1_monday(int year)
return week1_monday;
}
+static int
+iso_to_ymd(const int iso_year, const int iso_week, const int iso_day,
+ int *year, int *month, int *day) {
+ if (iso_week <= 0 || iso_week >= 53) {
+ int out_of_range = 1;
+ if (iso_week == 53) {
+ // ISO years have 53 weeks in them on years starting with a Thursday
+ // and on leap years starting on a Wednesday
+ int first_weekday = weekday(iso_year, 1, 1);
+ if (first_weekday == 3 || (first_weekday == 2 && is_leap(iso_year))) {
+ out_of_range = 0;
+ }
+ }
+
+ if (out_of_range) {
+ return -2;
+ }
+ }
+
+ if (iso_day <= 0 || iso_day >= 8) {
+ return -3;
+ }
+
+ // Convert (Y, W, D) to (Y, M, D) in-place
+ int day_1 = iso_week1_monday(iso_year);
+
+ int day_offset = (iso_week - 1)*7 + iso_day - 1;
+
+ ord_to_ymd(day_1 + day_offset, year, month, day);
+ return 0;
+}
+
+
/* ---------------------------------------------------------------------------
* Range checkers.
*/
@@ -680,6 +713,11 @@ set_date_fields(PyDateTime_Date *self, int y, int m, int d)
* String parsing utilities and helper functions
*/
+static unsigned char
+is_digit(const char c) {
+ return ((unsigned int)(c - '0')) < 10;
+}
+
static const char *
parse_digits(const char *ptr, int *var, size_t num_digits)
{
@@ -696,14 +734,17 @@ parse_digits(const char *ptr, int *var, size_t num_digits)
}
static int
-parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
+parse_isoformat_date(const char *dtstr, const size_t len, int *year, int *month, int *day)
{
/* Parse the date components of the result of date.isoformat()
*
* Return codes:
* 0: Success
* -1: Failed to parse date component
- * -2: Failed to parse dateseparator
+ * -2: Inconsistent date separator usage
+ * -3: Failed to parse ISO week.
+ * -4: Failed to parse ISO day.
+ * -5, -6: Failure in iso_to_ymd
*/
const char *p = dtstr;
p = parse_digits(p, year, 4);
@@ -711,8 +752,42 @@ parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
return -1;
}
- if (*(p++) != '-') {
- return -2;
+ const unsigned char uses_separator = (*p == '-');
+ if (uses_separator) {
+ ++p;
+ }
+
+ if(*p == 'W') {
+ // This is an isocalendar-style date string
+ p++;
+ int iso_week = 0;
+ int iso_day = 0;
+
+ p = parse_digits(p, &iso_week, 2);
+ if (NULL == p) {
+ return -3;
+ }
+
+ assert(p > dtstr);
+ if ((size_t)(p - dtstr) < len) {
+ if (uses_separator && *(p++) != '-') {
+ return -2;
+ }
+
+ p = parse_digits(p, &iso_day, 1);
+ if (NULL == p) {
+ return -4;
+ }
+ } else {
+ iso_day = 1;
+ }
+
+ int rv = iso_to_ymd(*year, iso_week, iso_day, year, month, day);
+ if (rv) {
+ return -3 + rv;
+ } else {
+ return 0;
+ }
}
p = parse_digits(p, month, 2);
@@ -720,15 +795,13 @@ parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
return -1;
}
- if (*(p++) != '-') {
+ if (uses_separator && *(p++) != '-') {
return -2;
}
-
p = parse_digits(p, day, 2);
if (p == NULL) {
return -1;
}
-
return 0;
}
@@ -736,11 +809,14 @@ static int
parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour,
int *minute, int *second, int *microsecond)
{
+ *hour = *minute = *second = *microsecond = 0;
const char *p = tstr;
const char *p_end = tstr_end;
int *vals[3] = {hour, minute, second};
+ // This is initialized to satisfy an erroneous compiler warning.
+ unsigned char has_separator = 1;
- // Parse [HH[:MM[:SS]]]
+ // Parse [HH[:?MM[:?SS]]]
for (size_t i = 0; i < 3; ++i) {
p = parse_digits(p, vals[i], 2);
if (NULL == p) {
@@ -748,33 +824,47 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour,
}
char c = *(p++);
+ if (i == 0) {
+ has_separator = (c == ':');
+ }
+
if (p >= p_end) {
return c != '\0';
}
- else if (c == ':') {
+ else if (has_separator && (c == ':')) {
continue;
}
- else if (c == '.') {
+ else if (c == '.' || c == ',') {
break;
- }
- else {
+ } else if (!has_separator) {
+ --p;
+ } else {
return -4; // Malformed time separator
}
}
- // Parse .fff[fff]
+ // Parse fractional components
size_t len_remains = p_end - p;
- if (!(len_remains == 6 || len_remains == 3)) {
- return -3;
+ size_t to_parse = len_remains;
+ if (len_remains >= 6) {
+ to_parse = 6;
}
- p = parse_digits(p, microsecond, len_remains);
+ p = parse_digits(p, microsecond, to_parse);
if (NULL == p) {
return -3;
}
- if (len_remains == 3) {
- *microsecond *= 1000;
+ static int correction[] = {
+ 100000, 10000, 1000, 100, 10
+ };
+
+ if (to_parse < 6) {
+ *microsecond *= correction[to_parse-1];
+ }
+
+ while (is_digit(*p)){
+ ++p; // skip truncated digits
}
// Return 1 if it's not the end of the string
@@ -800,7 +890,7 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute,
const char *tzinfo_pos = p;
do {
- if (*tzinfo_pos == '+' || *tzinfo_pos == '-') {
+ if (*tzinfo_pos == 'Z' || *tzinfo_pos == '+' || *tzinfo_pos == '-') {
break;
}
} while (++tzinfo_pos < p_end);
@@ -822,14 +912,16 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute,
}
}
- // Parse time zone component
- // Valid formats are:
- // - +HH:MM (len 6)
- // - +HH:MM:SS (len 9)
- // - +HH:MM:SS.ffffff (len 16)
- size_t tzlen = p_end - tzinfo_pos;
- if (!(tzlen == 6 || tzlen == 9 || tzlen == 16)) {
- return -5;
+ // Special case UTC / Zulu time.
+ if (*tzinfo_pos == 'Z') {
+ *tzoffset = 0;
+ *tzmicrosecond = 0;
+
+ if (*(tzinfo_pos + 1) != '\0') {
+ return -5;
+ } else {
+ return 1;
+ }
}
int tzsign = (*tzinfo_pos == '-') ? -1 : 1;
@@ -2983,8 +3075,8 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr)
int year = 0, month = 0, day = 0;
int rv;
- if (len == 10) {
- rv = parse_isoformat_date(dt_ptr, &year, &month, &day);
+ if (len == 7 || len == 8 || len == 10) {
+ rv = parse_isoformat_date(dt_ptr, len, &year, &month, &day);
}
else {
rv = -1;
@@ -3027,37 +3119,21 @@ date_fromisocalendar(PyObject *cls, PyObject *args, PyObject *kw)
return NULL;
}
- if (week <= 0 || week >= 53) {
- int out_of_range = 1;
- if (week == 53) {
- // ISO years have 53 weeks in it on years starting with a Thursday
- // and on leap years starting on Wednesday
- int first_weekday = weekday(year, 1, 1);
- if (first_weekday == 3 || (first_weekday == 2 && is_leap(year))) {
- out_of_range = 0;
- }
- }
+ int month;
+ int rv = iso_to_ymd(year, week, day, &year, &month, &day);
- if (out_of_range) {
- PyErr_Format(PyExc_ValueError, "Invalid week: %d", week);
- return NULL;
- }
+
+ if (rv == -2) {
+ PyErr_Format(PyExc_ValueError, "Invalid week: %d", week);
+ return NULL;
}
- if (day <= 0 || day >= 8) {
+ if (rv == -3) {
PyErr_Format(PyExc_ValueError, "Invalid day: %d (range is [1, 7])",
day);
return NULL;
}
- // Convert (Y, W, D) to (Y, M, D) in-place
- int day_1 = iso_week1_monday(year);
-
- int month = week;
- int day_offset = (month - 1)*7 + day - 1;
-
- ord_to_ymd(day_1 + day_offset, &year, &month, &day);
-
return new_date_subclass_ex(year, month, day, cls);
}
@@ -3489,7 +3565,7 @@ static PyMethodDef date_methods[] = {
{"fromisoformat", (PyCFunction)date_fromisoformat, METH_O |
METH_CLASS,
- PyDoc_STR("str -> Construct a date from the output of date.isoformat()")},
+ PyDoc_STR("str -> Construct a date from a string in ISO 8601 format.")},
{"fromisocalendar", _PyCFunction_CAST(date_fromisocalendar),
METH_VARARGS | METH_KEYWORDS | METH_CLASS,
@@ -4564,6 +4640,14 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
goto invalid_string_error;
}
+ // The spec actually requires that time-only ISO 8601 strings start with
+ // T, but the extended format allows this to be omitted as long as there
+ // is no ambiguity with date strings.
+ if (*p == 'T') {
+ ++p;
+ len -= 1;
+ }
+
int hour = 0, minute = 0, second = 0, microsecond = 0;
int tzoffset, tzimicrosecond = 0;
int rv = parse_isoformat_time(p, len,
@@ -4671,7 +4755,7 @@ static PyMethodDef time_methods[] = {
PyDoc_STR("Return time with new specified fields.")},
{"fromisoformat", (PyCFunction)time_fromisoformat, METH_O | METH_CLASS,
- PyDoc_STR("string -> time from time.isoformat() output")},
+ PyDoc_STR("string -> time from a string in ISO 8601 format")},
{"__reduce_ex__", (PyCFunction)time_reduce_ex, METH_VARARGS,
PyDoc_STR("__reduce_ex__(proto) -> (cls, state)")},
@@ -5184,19 +5268,42 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw)
static PyObject *
_sanitize_isoformat_str(PyObject *dtstr)
{
+ Py_ssize_t len = PyUnicode_GetLength(dtstr);
+ if (len < 7) { // All valid ISO 8601 strings are at least 7 characters long
+ return NULL;
+ }
+
// `fromisoformat` allows surrogate characters in exactly one position,
// the separator; to allow datetime_fromisoformat to make the simplifying
// assumption that all valid strings can be encoded in UTF-8, this function
// replaces any surrogate character separators with `T`.
//
// The result of this, if not NULL, returns a new reference
- Py_ssize_t len = PyUnicode_GetLength(dtstr);
- if (len < 0) {
- return NULL;
+ const void* const unicode_data = PyUnicode_DATA(dtstr);
+ const unsigned int kind = PyUnicode_KIND(dtstr);
+
+ // Depending on the format of the string, the separator can only ever be
+ // in positions 7, 8 or 10. We'll check each of these for a surrogate and
+ // if we find one, replace it with `T`. If there is more than one surrogate,
+ // we don't have to bother sanitizing it, because the function will later
+ // fail when we try to encode the string as UTF-8.
+ static const size_t potential_separators[3] = {7, 8, 10};
+ size_t surrogate_separator = 0;
+ for(size_t idx = 0;
+ idx < sizeof(potential_separators) / sizeof(*potential_separators);
+ ++idx) {
+ size_t pos = potential_separators[idx];
+ if (pos > (size_t)len) {
+ break;
+ }
+
+ if(Py_UNICODE_IS_SURROGATE(PyUnicode_READ(kind, unicode_data, pos))) {
+ surrogate_separator = pos;
+ break;
+ }
}
- if (len <= 10 ||
- !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
+ if (surrogate_separator == 0) {
Py_INCREF(dtstr);
return dtstr;
}
@@ -5206,7 +5313,7 @@ _sanitize_isoformat_str(PyObject *dtstr)
return NULL;
}
- if (PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
+ if (PyUnicode_WriteChar(str_out, surrogate_separator, (Py_UCS4)'T')) {
Py_DECREF(str_out);
return NULL;
}
@@ -5214,6 +5321,106 @@ _sanitize_isoformat_str(PyObject *dtstr)
return str_out;
}
+
+static Py_ssize_t
+_find_isoformat_datetime_separator(const char *dtstr, Py_ssize_t len) {
+ // The valid date formats can all be distinguished by characters 4 and 5,
+ // and the week-based formats are further narrowed down by later characters,
+ // which tells us where to look for the separator character.
+ // Format | As-rendered | Position
+ // ---------------------------------------
+ // %Y-%m-%d | YYYY-MM-DD | 10
+ // %Y%m%d | YYYYMMDD | 8
+ // %Y-W%V | YYYY-Www | 8
+ // %YW%V | YYYYWww | 7
+ // %Y-W%V-%u | YYYY-Www-d | 10
+ // %YW%V%u | YYYYWwwd | 8
+ // %Y-%j | YYYY-DDD | 8
+ // %Y%j | YYYYDDD | 7
+ //
+ // Note that because we allow *any* character for the separator, in the
+ // case where character 4 is W, it's not straightforward to determine where
+ // the separator is — in the case of YYYY-Www-d, you have actual ambiguity,
+ // e.g. 2020-W01-0000 could be YYYY-Www-D0HH or YYYY-Www-HHMM, when the
+ // separator character is a number in the former case or a hyphen in the
+ // latter case.
+ //
+ // The case of YYYYWww can be distinguished from YYYYWwwd by tracking ahead
+ // to either the end of the string or the first non-numeric character —
+ // since the time components all come in pairs YYYYWww#HH can be
+ // distinguished from YYYYWwwd#HH by the fact that there will always be an
+ // odd number of digits before the first non-digit character in the former
+ // case.
+ static const char date_separator = '-';
+ static const char week_indicator = 'W';
+
+ if (len == 7) {
+ return 7;
+ }
+
+ if (dtstr[4] == date_separator) {
+ // YYYY-???
+
+ if (dtstr[5] == week_indicator) {
+ // YYYY-W??
+
+ if (len < 8) {
+ return -1;
+ }
+
+ if (len > 8 && dtstr[8] == date_separator) {
+ // YYYY-Www-D (10) or YYYY-Www-HH (8)
+ if (len == 9) { return -1; }
+ if (len > 10 && is_digit(dtstr[10])) {
+ // This is as far as we'll try to go to resolve the
+ // ambiguity for the moment — if we have YYYY-Www-##, the
+ // separator is either a hyphen at 8 or a number at 10.
+ //
+ // We'll assume it's a hyphen at 8 because it's way more
+ // likely that someone will use a hyphen as a separator
+ // than a number, but at this point it's really best effort
+ // because this is an extension of the spec anyway.
+ return 8;
+ }
+
+ return 10;
+ } else {
+ // YYYY-Www (8)
+ return 8;
+ }
+ } else {
+ // YYYY-MM-DD (10)
+ return 10;
+ }
+ } else {
+ // YYYY???
+ if (dtstr[4] == week_indicator) {
+ // YYYYWww (7) or YYYYWwwd (8)
+ size_t idx = 7;
+ for (; idx < (size_t)len; ++idx) {
+ // Keep going until we run out of digits.
+ if (!is_digit(dtstr[idx])) {
+ break;
+ }
+ }
+
+ if (idx < 9) {
+ return idx;
+ }
+
+ if (idx % 2 == 0) {
+ // If the digit scan stopped at an even index, it's YYYYWww
+ return 7;
+ } else {
+ return 8;
+ }
+ } else {
+ // YYYYMMDD (8)
+ return 8;
+ }
+ }
+}
+
static PyObject *
datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
{
@@ -5225,9 +5432,14 @@ datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
return NULL;
}
+ // We only need to sanitize this string if the separator is a surrogate
+ // character. In the situation where the separator location is ambiguous,
+ // we don't have to sanitize anything because that can only happen when
+ // the separator is either '-' or a number. This should mostly be a noop
+ // but it makes the reference counting easier if we still sanitize.
PyObject *dtstr_clean = _sanitize_isoformat_str(dtstr);
if (dtstr_clean == NULL) {
- goto error;
+ goto invalid_string_error;
}
Py_ssize_t len;
@@ -5243,30 +5455,35 @@ datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
}
}
+ const Py_ssize_t separator_location = _find_isoformat_datetime_separator(
+ dt_ptr, len);
+
+
const char *p = dt_ptr;
int year = 0, month = 0, day = 0;
int hour = 0, minute = 0, second = 0, microsecond = 0;
int tzoffset = 0, tzusec = 0;
- // date has a fixed length of 10
- int rv = parse_isoformat_date(p, &year, &month, &day);
+ // date runs up to separator_location
+ int rv = parse_isoformat_date(p, separator_location, &year, &month, &day);
- if (!rv && len > 10) {
+ if (!rv && len > separator_location) {
// In UTF-8, the length of multi-byte characters is encoded in the MSB
- if ((p[10] & 0x80) == 0) {
- p += 11;
+ p += separator_location;
+ if ((p[0] & 0x80) == 0) {
+ p += 1;
}
else {
- switch (p[10] & 0xf0) {
+ switch (p[0] & 0xf0) {
case 0xe0:
- p += 13;
+ p += 3;
break;
case 0xf0:
- p += 14;
+ p += 4;
break;
default:
- p += 12;
+ p += 2;
break;
}
}
@@ -6327,7 +6544,7 @@ static PyMethodDef datetime_methods[] = {
{"fromisoformat", (PyCFunction)datetime_fromisoformat,
METH_O | METH_CLASS,
- PyDoc_STR("string -> datetime from datetime.isoformat() output")},
+ PyDoc_STR("string -> datetime from a string in most ISO 8601 formats")},
/* Instance methods: */