diff options
-rw-r--r-- | Doc/library/datetime.rst | 83 | ||||
-rw-r--r-- | Doc/whatsnew/3.11.rst | 8 | ||||
-rw-r--r-- | Lib/datetime.py | 252 | ||||
-rw-r--r-- | Lib/test/datetimetester.py | 251 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Library/2022-05-03-12-11-27.gh-issue-80010.yG54RE.rst | 3 | ||||
-rw-r--r-- | Modules/_datetimemodule.c | 359 |
6 files changed, 778 insertions, 178 deletions
diff --git a/Doc/library/datetime.rst b/Doc/library/datetime.rst index ca17dc8..e0b28d7 100644 --- a/Doc/library/datetime.rst +++ b/Doc/library/datetime.rst @@ -526,18 +526,20 @@ Other constructors, all class methods: .. classmethod:: date.fromisoformat(date_string) - Return a :class:`date` corresponding to a *date_string* given in the format - ``YYYY-MM-DD``:: + Return a :class:`date` corresponding to a *date_string* given in any valid + ISO 8601 format, except ordinal dates (e.g. ``YYYY-DDD``):: >>> from datetime import date >>> date.fromisoformat('2019-12-04') datetime.date(2019, 12, 4) - - This is the inverse of :meth:`date.isoformat`. It only supports the format - ``YYYY-MM-DD``. + >>> date.fromisoformat('20191204') + datetime.date(2019, 12, 4) + >>> date.fromisoformat('2021-W01-1') + datetime.date(2021, 1, 4) .. versionadded:: 3.7 - + .. versionchanged:: 3.11 + Previously, this method only supported the format ``YYYY-MM-DD``. .. classmethod:: date.fromisocalendar(year, week, day) @@ -710,8 +712,6 @@ Instance methods: >>> date(2002, 12, 4).isoformat() '2002-12-04' - This is the inverse of :meth:`date.fromisoformat`. - .. method:: date.__str__() For a date *d*, ``str(d)`` is equivalent to ``d.isoformat()``. @@ -994,31 +994,29 @@ Other constructors, all class methods: .. classmethod:: datetime.fromisoformat(date_string) - Return a :class:`.datetime` corresponding to a *date_string* in one of the - formats emitted by :meth:`date.isoformat` and :meth:`datetime.isoformat`. - - Specifically, this function supports strings in the format: + Return a :class:`.datetime` corresponding to a *date_string* in any valid + ISO 8601 format, with the following exceptions: - .. code-block:: none - - YYYY-MM-DD[*HH[:MM[:SS[.fff[fff]]]][+HH:MM[:SS[.ffffff]]]] - - where ``*`` can match any single character. - - .. caution:: - - This does *not* support parsing arbitrary ISO 8601 strings - it is only intended - as the inverse operation of :meth:`datetime.isoformat`. 
A more full-featured - ISO 8601 parser, ``dateutil.parser.isoparse`` is available in the third-party package - `dateutil <https://dateutil.readthedocs.io/en/stable/parser.html#dateutil.parser.isoparse>`__. + 1. Time zone offsets may have fractional seconds. + 2. The ``T`` separator may be replaced by any single Unicode character. + 3. Ordinal dates are not currently supported. + 4. Fractional hours and minutes are not supported. Examples:: >>> from datetime import datetime >>> datetime.fromisoformat('2011-11-04') datetime.datetime(2011, 11, 4, 0, 0) + >>> datetime.fromisoformat('20111104') + datetime.datetime(2011, 11, 4, 0, 0) >>> datetime.fromisoformat('2011-11-04T00:05:23') datetime.datetime(2011, 11, 4, 0, 5, 23) + >>> datetime.fromisoformat('2011-11-04T00:05:23Z') + datetime.datetime(2011, 11, 4, 0, 5, 23, tzinfo=datetime.timezone.utc) + >>> datetime.fromisoformat('20111104T000523') + datetime.datetime(2011, 11, 4, 0, 5, 23) + >>> datetime.fromisoformat('2011-W01-2T00:05:23.283') + datetime.datetime(2011, 1, 4, 0, 5, 23, 283000) >>> datetime.fromisoformat('2011-11-04 00:05:23.283') datetime.datetime(2011, 11, 4, 0, 5, 23, 283000) >>> datetime.fromisoformat('2011-11-04 00:05:23.283+00:00') @@ -1028,6 +1026,10 @@ Other constructors, all class methods: tzinfo=datetime.timezone(datetime.timedelta(seconds=14400))) .. versionadded:: 3.7 + .. versionchanged:: 3.11 + Previously, this method only supported formats that could be emitted by + :meth:`date.isoformat()` or :meth:`datetime.isoformat()`. + .. classmethod:: datetime.fromisocalendar(year, week, day) @@ -1763,30 +1765,41 @@ Other constructor: .. classmethod:: time.fromisoformat(time_string) - Return a :class:`.time` corresponding to a *time_string* in one of the - formats emitted by :meth:`time.isoformat`. Specifically, this function supports - strings in the format: - - .. code-block:: none - - HH[:MM[:SS[.fff[fff]]]][+HH:MM[:SS[.ffffff]]] - - .. 
caution:: + Return a :class:`.time` corresponding to a *time_string* in any valid + ISO 8601 format, with the following exceptions: - This does *not* support parsing arbitrary ISO 8601 strings. It is only - intended as the inverse operation of :meth:`time.isoformat`. + 1. Time zone offsets may have fractional seconds. + 2. The leading ``T``, normally required in cases where there may be ambiguity between + a date and a time, is not required. + 3. Fractional seconds may have any number of digits (anything beyond 6 will + be truncated). + 4. Fractional hours and minutes are not supported. Examples:: >>> from datetime import time >>> time.fromisoformat('04:23:01') datetime.time(4, 23, 1) + >>> time.fromisoformat('T04:23:01') + datetime.time(4, 23, 1) + >>> time.fromisoformat('T042301') + datetime.time(4, 23, 1) >>> time.fromisoformat('04:23:01.000384') datetime.time(4, 23, 1, 384) + >>> time.fromisoformat('04:23:01,000384') + datetime.time(4, 23, 1, 384) >>> time.fromisoformat('04:23:01+04:00') datetime.time(4, 23, 1, tzinfo=datetime.timezone(datetime.timedelta(seconds=14400))) + >>> time.fromisoformat('04:23:01Z') + datetime.time(4, 23, 1, tzinfo=datetime.timezone.utc) + >>> time.fromisoformat('04:23:01+00:00') + datetime.time(4, 23, 1, tzinfo=datetime.timezone.utc) + .. versionadded:: 3.7 + .. versionchanged:: 3.11 + Previously, this method only supported formats that could be emitted by + :meth:`time.isoformat()`. Instance methods: diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index 87dc5dd..efcfa17 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -425,6 +425,14 @@ asyncio existing stream-based connections to TLS. (Contributed by Ian Good in :issue:`34975`.) +datetime +-------- + +* :meth:`datetime.date.fromisoformat`, :meth:`datetime.time.fromisoformat` and + :meth:`datetime.datetime.fromisoformat` can now be used to parse most ISO 8601 + formats (barring only those that support fractional hours and minutes). 
+ (Contributed by Paul Ganssle in :gh:`80010`.) + fractions --------- diff --git a/Lib/datetime.py b/Lib/datetime.py index 7f79aa4..afbb6fe 100644 --- a/Lib/datetime.py +++ b/Lib/datetime.py @@ -262,58 +262,150 @@ def _wrap_strftime(object, format, timetuple): return _time.strftime(newformat, timetuple) # Helpers for parsing the result of isoformat() +def _is_ascii_digit(c): + return c in "0123456789" + +def _find_isoformat_datetime_separator(dtstr): + # See the comment in _datetimemodule.c:_find_isoformat_datetime_separator + len_dtstr = len(dtstr) + if len_dtstr == 7: + return 7 + + assert len_dtstr > 7 + date_separator = "-" + week_indicator = "W" + + if dtstr[4] == date_separator: + if dtstr[5] == week_indicator: + if len_dtstr < 8: + raise ValueError("Invalid ISO string") + if len_dtstr > 8 and dtstr[8] == date_separator: + if len_dtstr == 9: + raise ValueError("Invalid ISO string") + if len_dtstr > 10 and _is_ascii_digit(dtstr[10]): + # This is as far as we need to resolve the ambiguity for + # the moment - if we have YYYY-Www-##, the separator is + # either a hyphen at 8 or a number at 10. + # + # We'll assume it's a hyphen at 8 because it's way more + # likely that someone will use a hyphen as a separator than + # a number, but at this point it's really best effort + # because this is an extension of the spec anyway. 
+ # TODO(pganssle): Document this + return 8 + return 10 + else: + # YYYY-Www (8) + return 8 + else: + # YYYY-MM-DD (10) + return 10 + else: + if dtstr[4] == week_indicator: + # YYYYWww (7) or YYYYWwwd (8) + idx = 7 + while idx < len_dtstr: + if not _is_ascii_digit(dtstr[idx]): + break + idx += 1 + + if idx < 9: + return idx + + if idx % 2 == 0: + # If the index of the last number is even, it's YYYYWwwd + return 7 + else: + return 8 + else: + # YYYYMMDD (8) + return 8 + + def _parse_isoformat_date(dtstr): - # It is assumed that this function will only be called with a - # string of length exactly 10, and (though this is not used) ASCII-only + # It is assumed that this is an ASCII-only string of lengths 7, 8 or 10, + # see the comment on Modules/_datetimemodule.c:_find_isoformat_datetime_separator + assert len(dtstr) in (7, 8, 10) year = int(dtstr[0:4]) - if dtstr[4] != '-': - raise ValueError('Invalid date separator: %s' % dtstr[4]) + has_sep = dtstr[4] == '-' + + pos = 4 + has_sep + if dtstr[pos:pos + 1] == "W": + # YYYY-?Www-?D? 
+ pos += 1 + weekno = int(dtstr[pos:pos + 2]) + pos += 2 - month = int(dtstr[5:7]) + dayno = 1 + if len(dtstr) > pos: + if (dtstr[pos:pos + 1] == '-') != has_sep: + raise ValueError("Inconsistent use of dash separator") - if dtstr[7] != '-': - raise ValueError('Invalid date separator') + pos += has_sep - day = int(dtstr[8:10]) + dayno = int(dtstr[pos:pos + 1]) + + return list(_isoweek_to_gregorian(year, weekno, dayno)) + else: + month = int(dtstr[pos:pos + 2]) + pos += 2 + if (dtstr[pos:pos + 1] == "-") != has_sep: + raise ValueError("Inconsistent use of dash separator") + + pos += has_sep + day = int(dtstr[pos:pos + 2]) + + return [year, month, day] + + +_FRACTION_CORRECTION = [100000, 10000, 1000, 100, 10] - return [year, month, day] def _parse_hh_mm_ss_ff(tstr): - # Parses things of the form HH[:MM[:SS[.fff[fff]]]] + # Parses things of the form HH[:?MM[:?SS[{.,}fff[fff]]]] len_str = len(tstr) time_comps = [0, 0, 0, 0] pos = 0 for comp in range(0, 3): if (len_str - pos) < 2: - raise ValueError('Incomplete time component') + raise ValueError("Incomplete time component") time_comps[comp] = int(tstr[pos:pos+2]) pos += 2 next_char = tstr[pos:pos+1] + if comp == 0: + has_sep = next_char == ':' + if not next_char or comp >= 2: break - if next_char != ':': - raise ValueError('Invalid time separator: %c' % next_char) + if has_sep and next_char != ':': + raise ValueError("Invalid time separator: %c" % next_char) - pos += 1 + pos += has_sep if pos < len_str: - if tstr[pos] != '.': - raise ValueError('Invalid microsecond component') + if tstr[pos] not in '.,': + raise ValueError("Invalid microsecond component") else: pos += 1 len_remainder = len_str - pos - if len_remainder not in (3, 6): - raise ValueError('Invalid microsecond component') - time_comps[3] = int(tstr[pos:]) - if len_remainder == 3: - time_comps[3] *= 1000 + if len_remainder >= 6: + to_parse = 6 + else: + to_parse = len_remainder + + time_comps[3] = int(tstr[pos:(pos+to_parse)]) + if to_parse < 6: + 
time_comps[3] *= _FRACTION_CORRECTION[to_parse-1] + if (len_remainder > to_parse + and not all(map(_is_ascii_digit, tstr[(pos+to_parse):]))): + raise ValueError("Non-digit values in unparsed fraction") return time_comps @@ -321,27 +413,34 @@ def _parse_isoformat_time(tstr): # Format supported is HH[:MM[:SS[.fff[fff]]]][+HH:MM[:SS[.ffffff]]] len_str = len(tstr) if len_str < 2: - raise ValueError('Isoformat time too short') + raise ValueError("Isoformat time too short") - # This is equivalent to re.search('[+-]', tstr), but faster - tz_pos = (tstr.find('-') + 1 or tstr.find('+') + 1) + # This is equivalent to re.search('[+-Z]', tstr), but faster + tz_pos = (tstr.find('-') + 1 or tstr.find('+') + 1 or tstr.find('Z') + 1) timestr = tstr[:tz_pos-1] if tz_pos > 0 else tstr time_comps = _parse_hh_mm_ss_ff(timestr) tzi = None - if tz_pos > 0: + if tz_pos == len_str and tstr[-1] == 'Z': + tzi = timezone.utc + elif tz_pos > 0: tzstr = tstr[tz_pos:] # Valid time zone strings are: + # HH len: 2 + # HHMM len: 4 # HH:MM len: 5 + # HHMMSS len: 6 + # HHMMSS.f+ len: 7+ # HH:MM:SS len: 8 - # HH:MM:SS.ffffff len: 15 + # HH:MM:SS.f+ len: 10+ - if len(tzstr) not in (5, 8, 15): - raise ValueError('Malformed time zone string') + if len(tzstr) in (0, 1, 3): + raise ValueError("Malformed time zone string") tz_comps = _parse_hh_mm_ss_ff(tzstr) + if all(x == 0 for x in tz_comps): tzi = timezone.utc else: @@ -356,6 +455,38 @@ def _parse_isoformat_time(tstr): return time_comps +# tuple[int, int, int] -> tuple[int, int, int] version of date.fromisocalendar +def _isoweek_to_gregorian(year, week, day): + # Year is bounded this way because 9999-12-31 is (9999, 52, 5) + if not MINYEAR <= year <= MAXYEAR: + raise ValueError(f"Year is out of range: {year}") + + if not 0 < week < 53: + out_of_range = True + + if week == 53: + # ISO years have 53 weeks in them on years starting with a + # Thursday and leap years starting on a Wednesday + first_weekday = _ymd2ord(year, 1, 1) % 7 + if (first_weekday == 4 
or (first_weekday == 3 and + _is_leap(year))): + out_of_range = False + + if out_of_range: + raise ValueError(f"Invalid week: {week}") + + if not 0 < day < 8: + raise ValueError(f"Invalid weekday: {day} (range is [1, 7])") + + # Now compute the offset from (Y, 1, 1) in days: + day_offset = (week - 1) * 7 + (day - 1) + + # Calculate the ordinal day for monday, week 1 + day_1 = _isoweek1monday(year) + ord_day = day_1 + day_offset + + return _ord2ymd(ord_day) + # Just raise TypeError if the arg isn't None or a string. def _check_tzname(name): @@ -847,12 +978,14 @@ class date: @classmethod def fromisoformat(cls, date_string): - """Construct a date from the output of date.isoformat().""" + """Construct a date from a string in ISO 8601 format.""" if not isinstance(date_string, str): raise TypeError('fromisoformat: argument must be str') + if len(date_string) not in (7, 8, 10): + raise ValueError(f'Invalid isoformat string: {date_string!r}') + try: - assert len(date_string) == 10 return cls(*_parse_isoformat_date(date_string)) except Exception: raise ValueError(f'Invalid isoformat string: {date_string!r}') @@ -862,35 +995,7 @@ class date: """Construct a date from the ISO year, week number and weekday. 
This is the inverse of the date.isocalendar() function""" - # Year is bounded this way because 9999-12-31 is (9999, 52, 5) - if not MINYEAR <= year <= MAXYEAR: - raise ValueError(f"Year is out of range: {year}") - - if not 0 < week < 53: - out_of_range = True - - if week == 53: - # ISO years have 53 weeks in them on years starting with a - # Thursday and leap years starting on a Wednesday - first_weekday = _ymd2ord(year, 1, 1) % 7 - if (first_weekday == 4 or (first_weekday == 3 and - _is_leap(year))): - out_of_range = False - - if out_of_range: - raise ValueError(f"Invalid week: {week}") - - if not 0 < day < 8: - raise ValueError(f"Invalid weekday: {day} (range is [1, 7])") - - # Now compute the offset from (Y, 1, 1) in days: - day_offset = (week - 1) * 7 + (day - 1) - - # Calculate the ordinal day for monday, week 1 - day_1 = _isoweek1monday(year) - ord_day = day_1 + day_offset - - return cls(*_ord2ymd(ord_day)) + return cls(*_isoweek_to_gregorian(year, week, day)) # Conversions to string @@ -1427,10 +1532,15 @@ class time: @classmethod def fromisoformat(cls, time_string): - """Construct a time from the output of isoformat().""" + """Construct a time from a string in one of the ISO 8601 formats.""" if not isinstance(time_string, str): raise TypeError('fromisoformat: argument must be str') + # The spec actually requires that time-only ISO 8601 strings start with + # T, but the extended format allows this to be omitted as long as there + # is no ambiguity with date strings. 
+ time_string = time_string.removeprefix('T') + try: return cls(*_parse_isoformat_time(time_string)) except Exception: @@ -1711,24 +1821,30 @@ class datetime(date): @classmethod def fromisoformat(cls, date_string): - """Construct a datetime from the output of datetime.isoformat().""" + """Construct a datetime from a string in one of the ISO 8601 formats.""" if not isinstance(date_string, str): raise TypeError('fromisoformat: argument must be str') - # Split this at the separator - dstr = date_string[0:10] - tstr = date_string[11:] + if len(date_string) < 7: + raise ValueError(f'Invalid isoformat string: {date_string!r}') + # Split this at the separator try: + separator_location = _find_isoformat_datetime_separator(date_string) + dstr = date_string[0:separator_location] + tstr = date_string[(separator_location+1):] + date_components = _parse_isoformat_date(dstr) except ValueError: - raise ValueError(f'Invalid isoformat string: {date_string!r}') + raise ValueError( + f'Invalid isoformat string: {date_string!r}') from None if tstr: try: time_components = _parse_isoformat_time(tstr) except ValueError: - raise ValueError(f'Invalid isoformat string: {date_string!r}') + raise ValueError( + f'Invalid isoformat string: {date_string!r}') from None else: time_components = [0, 0, 0, 0, None] @@ -2509,7 +2625,9 @@ else: _format_time, _format_offset, _index, _is_leap, _isoweek1monday, _math, _ord2ymd, _time, _time_class, _tzinfo_class, _wrap_strftime, _ymd2ord, _divide_and_round, _parse_isoformat_date, _parse_isoformat_time, - _parse_hh_mm_ss_ff, _IsoCalendarDate) + _parse_hh_mm_ss_ff, _IsoCalendarDate, _isoweek_to_gregorian, + _find_isoformat_datetime_separator, _FRACTION_CORRECTION, + _is_ascii_digit) # XXX Since import * above excludes names that start with _, # docstring does not get overwritten. 
In the future, it may be # appropriate to maintain a single module level docstring and diff --git a/Lib/test/datetimetester.py b/Lib/test/datetimetester.py index d85b546..0495362 100644 --- a/Lib/test/datetimetester.py +++ b/Lib/test/datetimetester.py @@ -7,6 +7,7 @@ import itertools import bisect import copy import decimal +import functools import sys import os import pickle @@ -1840,6 +1841,41 @@ class TestDate(HarmlessMixedComparison, unittest.TestCase): self.assertEqual(dt, dt_rt) + def test_fromisoformat_date_examples(self): + examples = [ + ('00010101', self.theclass(1, 1, 1)), + ('20000101', self.theclass(2000, 1, 1)), + ('20250102', self.theclass(2025, 1, 2)), + ('99991231', self.theclass(9999, 12, 31)), + ('0001-01-01', self.theclass(1, 1, 1)), + ('2000-01-01', self.theclass(2000, 1, 1)), + ('2025-01-02', self.theclass(2025, 1, 2)), + ('9999-12-31', self.theclass(9999, 12, 31)), + ('2025W01', self.theclass(2024, 12, 30)), + ('2025-W01', self.theclass(2024, 12, 30)), + ('2025W014', self.theclass(2025, 1, 2)), + ('2025-W01-4', self.theclass(2025, 1, 2)), + ('2026W01', self.theclass(2025, 12, 29)), + ('2026-W01', self.theclass(2025, 12, 29)), + ('2026W013', self.theclass(2025, 12, 31)), + ('2026-W01-3', self.theclass(2025, 12, 31)), + ('2022W52', self.theclass(2022, 12, 26)), + ('2022-W52', self.theclass(2022, 12, 26)), + ('2022W527', self.theclass(2023, 1, 1)), + ('2022-W52-7', self.theclass(2023, 1, 1)), + ('2015W534', self.theclass(2015, 12, 31)), # Has week 53 + ('2015-W53-4', self.theclass(2015, 12, 31)), # Has week 53 + ('2015-W53-5', self.theclass(2016, 1, 1)), + ('2020W531', self.theclass(2020, 12, 28)), # Leap year + ('2020-W53-1', self.theclass(2020, 12, 28)), # Leap year + ('2020-W53-6', self.theclass(2021, 1, 2)), + ] + + for input_str, expected in examples: + with self.subTest(input_str=input_str): + actual = self.theclass.fromisoformat(input_str) + self.assertEqual(actual, expected) + def test_fromisoformat_subclass(self): class 
DateSubclass(self.theclass): pass @@ -1862,7 +1898,8 @@ class TestDate(HarmlessMixedComparison, unittest.TestCase): '2009-12-0a', # Invalid character in day '2009-01-32', # Invalid day '2009-02-29', # Invalid leap day - '20090228', # Valid ISO8601 output not from isoformat() + '2019-W53-1', # No week 53 in 2019 + '2020-W54-1', # No week 54 '2009\ud80002\ud80028', # Separators are surrogate codepoints ] @@ -3003,6 +3040,140 @@ class TestDateTime(TestDate): dt_rt = self.theclass.fromisoformat(dtstr) self.assertEqual(dt, dt_rt) + def test_fromisoformat_datetime_examples(self): + UTC = timezone.utc + BST = timezone(timedelta(hours=1), 'BST') + EST = timezone(timedelta(hours=-5), 'EST') + EDT = timezone(timedelta(hours=-4), 'EDT') + examples = [ + ('2025-01-02', self.theclass(2025, 1, 2, 0, 0)), + ('2025-01-02T03', self.theclass(2025, 1, 2, 3, 0)), + ('2025-01-02T03:04', self.theclass(2025, 1, 2, 3, 4)), + ('2025-01-02T0304', self.theclass(2025, 1, 2, 3, 4)), + ('2025-01-02T03:04:05', self.theclass(2025, 1, 2, 3, 4, 5)), + ('2025-01-02T030405', self.theclass(2025, 1, 2, 3, 4, 5)), + ('2025-01-02T03:04:05.6', + self.theclass(2025, 1, 2, 3, 4, 5, 600000)), + ('2025-01-02T03:04:05,6', + self.theclass(2025, 1, 2, 3, 4, 5, 600000)), + ('2025-01-02T03:04:05.678', + self.theclass(2025, 1, 2, 3, 4, 5, 678000)), + ('2025-01-02T03:04:05.678901', + self.theclass(2025, 1, 2, 3, 4, 5, 678901)), + ('2025-01-02T03:04:05,678901', + self.theclass(2025, 1, 2, 3, 4, 5, 678901)), + ('2025-01-02T030405.678901', + self.theclass(2025, 1, 2, 3, 4, 5, 678901)), + ('2025-01-02T030405,678901', + self.theclass(2025, 1, 2, 3, 4, 5, 678901)), + ('2025-01-02T03:04:05.6789010', + self.theclass(2025, 1, 2, 3, 4, 5, 678901)), + ('2009-04-19T03:15:45.2345', + self.theclass(2009, 4, 19, 3, 15, 45, 234500)), + ('2009-04-19T03:15:45.1234567', + self.theclass(2009, 4, 19, 3, 15, 45, 123456)), + ('2025-01-02T03:04:05,678', + self.theclass(2025, 1, 2, 3, 4, 5, 678000)), + ('20250102', self.theclass(2025, 1, 2, 
0, 0)), + ('20250102T03', self.theclass(2025, 1, 2, 3, 0)), + ('20250102T03:04', self.theclass(2025, 1, 2, 3, 4)), + ('20250102T03:04:05', self.theclass(2025, 1, 2, 3, 4, 5)), + ('20250102T030405', self.theclass(2025, 1, 2, 3, 4, 5)), + ('20250102T03:04:05.6', + self.theclass(2025, 1, 2, 3, 4, 5, 600000)), + ('20250102T03:04:05,6', + self.theclass(2025, 1, 2, 3, 4, 5, 600000)), + ('20250102T03:04:05.678', + self.theclass(2025, 1, 2, 3, 4, 5, 678000)), + ('20250102T03:04:05,678', + self.theclass(2025, 1, 2, 3, 4, 5, 678000)), + ('20250102T03:04:05.678901', + self.theclass(2025, 1, 2, 3, 4, 5, 678901)), + ('20250102T030405.678901', + self.theclass(2025, 1, 2, 3, 4, 5, 678901)), + ('20250102T030405,678901', + self.theclass(2025, 1, 2, 3, 4, 5, 678901)), + ('20250102T030405.6789010', + self.theclass(2025, 1, 2, 3, 4, 5, 678901)), + ('2022W01', self.theclass(2022, 1, 3)), + ('2022W52520', self.theclass(2022, 12, 26, 20, 0)), + ('2022W527520', self.theclass(2023, 1, 1, 20, 0)), + ('2026W01516', self.theclass(2025, 12, 29, 16, 0)), + ('2026W013516', self.theclass(2025, 12, 31, 16, 0)), + ('2025W01503', self.theclass(2024, 12, 30, 3, 0)), + ('2025W014503', self.theclass(2025, 1, 2, 3, 0)), + ('2025W01512', self.theclass(2024, 12, 30, 12, 0)), + ('2025W014512', self.theclass(2025, 1, 2, 12, 0)), + ('2025W014T121431', self.theclass(2025, 1, 2, 12, 14, 31)), + ('2026W013T162100', self.theclass(2025, 12, 31, 16, 21)), + ('2026W013 162100', self.theclass(2025, 12, 31, 16, 21)), + ('2022W527T202159', self.theclass(2023, 1, 1, 20, 21, 59)), + ('2022W527 202159', self.theclass(2023, 1, 1, 20, 21, 59)), + ('2025W014 121431', self.theclass(2025, 1, 2, 12, 14, 31)), + ('2025W014T030405', self.theclass(2025, 1, 2, 3, 4, 5)), + ('2025W014 030405', self.theclass(2025, 1, 2, 3, 4, 5)), + ('2020-W53-6T03:04:05', self.theclass(2021, 1, 2, 3, 4, 5)), + ('2020W537 03:04:05', self.theclass(2021, 1, 3, 3, 4, 5)), + ('2025-W01-4T03:04:05', self.theclass(2025, 1, 2, 3, 4, 5)), + 
('2025-W01-4T03:04:05.678901', + self.theclass(2025, 1, 2, 3, 4, 5, 678901)), + ('2025-W01-4T12:14:31', self.theclass(2025, 1, 2, 12, 14, 31)), + ('2025-W01-4T12:14:31.012345', + self.theclass(2025, 1, 2, 12, 14, 31, 12345)), + ('2026-W01-3T16:21:00', self.theclass(2025, 12, 31, 16, 21)), + ('2026-W01-3T16:21:00.000000', self.theclass(2025, 12, 31, 16, 21)), + ('2022-W52-7T20:21:59', + self.theclass(2023, 1, 1, 20, 21, 59)), + ('2022-W52-7T20:21:59.999999', + self.theclass(2023, 1, 1, 20, 21, 59, 999999)), + ('2025-W01003+00', + self.theclass(2024, 12, 30, 3, 0, tzinfo=UTC)), + ('2025-01-02T03:04:05+00', + self.theclass(2025, 1, 2, 3, 4, 5, tzinfo=UTC)), + ('2025-01-02T03:04:05Z', + self.theclass(2025, 1, 2, 3, 4, 5, tzinfo=UTC)), + ('2025-01-02003:04:05,6+00:00:00.00', + self.theclass(2025, 1, 2, 3, 4, 5, 600000, tzinfo=UTC)), + ('2000-01-01T00+21', + self.theclass(2000, 1, 1, 0, 0, tzinfo=timezone(timedelta(hours=21)))), + ('2025-01-02T03:05:06+0300', + self.theclass(2025, 1, 2, 3, 5, 6, + tzinfo=timezone(timedelta(hours=3)))), + ('2025-01-02T03:05:06-0300', + self.theclass(2025, 1, 2, 3, 5, 6, + tzinfo=timezone(timedelta(hours=-3)))), + ('2025-01-02T03:04:05+0000', + self.theclass(2025, 1, 2, 3, 4, 5, tzinfo=UTC)), + ('2025-01-02T03:05:06+03', + self.theclass(2025, 1, 2, 3, 5, 6, + tzinfo=timezone(timedelta(hours=3)))), + ('2025-01-02T03:05:06-03', + self.theclass(2025, 1, 2, 3, 5, 6, + tzinfo=timezone(timedelta(hours=-3)))), + ('2020-01-01T03:05:07.123457-05:00', + self.theclass(2020, 1, 1, 3, 5, 7, 123457, tzinfo=EST)), + ('2020-01-01T03:05:07.123457-0500', + self.theclass(2020, 1, 1, 3, 5, 7, 123457, tzinfo=EST)), + ('2020-06-01T04:05:06.111111-04:00', + self.theclass(2020, 6, 1, 4, 5, 6, 111111, tzinfo=EDT)), + ('2020-06-01T04:05:06.111111-0400', + self.theclass(2020, 6, 1, 4, 5, 6, 111111, tzinfo=EDT)), + ('2021-10-31T01:30:00.000000+01:00', + self.theclass(2021, 10, 31, 1, 30, tzinfo=BST)), + ('2021-10-31T01:30:00.000000+0100', + self.theclass(2021, 10, 
31, 1, 30, tzinfo=BST)), + ('2025-01-02T03:04:05,6+000000.00', + self.theclass(2025, 1, 2, 3, 4, 5, 600000, tzinfo=UTC)), + ('2025-01-02T03:04:05,678+00:00:10', + self.theclass(2025, 1, 2, 3, 4, 5, 678000, + tzinfo=timezone(timedelta(seconds=10)))), + ] + + for input_str, expected in examples: + with self.subTest(input_str=input_str): + actual = self.theclass.fromisoformat(input_str) + self.assertEqual(actual, expected) + def test_fromisoformat_fails_datetime(self): # Test that fromisoformat() fails on invalid values bad_strs = [ @@ -3016,8 +3187,6 @@ class TestDateTime(TestDate): '2009-04-19T03;15:45', # Bad first time separator '2009-04-19T03:15;45', # Bad second time separator '2009-04-19T03:15:4500:00', # Bad time zone separator - '2009-04-19T03:15:45.2345', # Too many digits for milliseconds - '2009-04-19T03:15:45.1234567', # Too many digits for microseconds '2009-04-19T03:15:45.123456+24:30', # Invalid time zone offset '2009-04-19T03:15:45.123456-24:30', # Invalid negative offset '2009-04-10ᛇᛇᛇᛇᛇ12:15', # Too many unicode separators @@ -3962,6 +4131,76 @@ class TestTimeTZ(TestTime, TZInfoBase, unittest.TestCase): t_rt = self.theclass.fromisoformat(tstr) self.assertEqual(t, t_rt) + def test_fromisoformat_fractions(self): + strs = [ + ('12:30:45.1', (12, 30, 45, 100000)), + ('12:30:45.12', (12, 30, 45, 120000)), + ('12:30:45.123', (12, 30, 45, 123000)), + ('12:30:45.1234', (12, 30, 45, 123400)), + ('12:30:45.12345', (12, 30, 45, 123450)), + ('12:30:45.123456', (12, 30, 45, 123456)), + ('12:30:45.1234567', (12, 30, 45, 123456)), + ('12:30:45.12345678', (12, 30, 45, 123456)), + ] + + for time_str, time_comps in strs: + expected = self.theclass(*time_comps) + actual = self.theclass.fromisoformat(time_str) + + self.assertEqual(actual, expected) + + def test_fromisoformat_time_examples(self): + examples = [ + ('0000', self.theclass(0, 0)), + ('00:00', self.theclass(0, 0)), + ('000000', self.theclass(0, 0)), + ('00:00:00', self.theclass(0, 0)), + ('000000.0', 
self.theclass(0, 0)), + ('00:00:00.0', self.theclass(0, 0)), + ('000000.000', self.theclass(0, 0)), + ('00:00:00.000', self.theclass(0, 0)), + ('000000.000000', self.theclass(0, 0)), + ('00:00:00.000000', self.theclass(0, 0)), + ('1200', self.theclass(12, 0)), + ('12:00', self.theclass(12, 0)), + ('120000', self.theclass(12, 0)), + ('12:00:00', self.theclass(12, 0)), + ('120000.0', self.theclass(12, 0)), + ('12:00:00.0', self.theclass(12, 0)), + ('120000.000', self.theclass(12, 0)), + ('12:00:00.000', self.theclass(12, 0)), + ('120000.000000', self.theclass(12, 0)), + ('12:00:00.000000', self.theclass(12, 0)), + ('2359', self.theclass(23, 59)), + ('23:59', self.theclass(23, 59)), + ('235959', self.theclass(23, 59, 59)), + ('23:59:59', self.theclass(23, 59, 59)), + ('235959.9', self.theclass(23, 59, 59, 900000)), + ('23:59:59.9', self.theclass(23, 59, 59, 900000)), + ('235959.999', self.theclass(23, 59, 59, 999000)), + ('23:59:59.999', self.theclass(23, 59, 59, 999000)), + ('235959.999999', self.theclass(23, 59, 59, 999999)), + ('23:59:59.999999', self.theclass(23, 59, 59, 999999)), + ('00:00:00Z', self.theclass(0, 0, tzinfo=timezone.utc)), + ('12:00:00+0000', self.theclass(12, 0, tzinfo=timezone.utc)), + ('12:00:00+00:00', self.theclass(12, 0, tzinfo=timezone.utc)), + ('00:00:00+05', + self.theclass(0, 0, tzinfo=timezone(timedelta(hours=5)))), + ('00:00:00+05:30', + self.theclass(0, 0, tzinfo=timezone(timedelta(hours=5, minutes=30)))), + ('12:00:00-05:00', + self.theclass(12, 0, tzinfo=timezone(timedelta(hours=-5)))), + ('12:00:00-0500', + self.theclass(12, 0, tzinfo=timezone(timedelta(hours=-5)))), + ('00:00:00,000-23:59:59.999999', + self.theclass(0, 0, tzinfo=timezone(-timedelta(hours=23, minutes=59, seconds=59, microseconds=999999)))), + ] + + for input_str, expected in examples: + with self.subTest(input_str=input_str): + actual = self.theclass.fromisoformat(input_str) + self.assertEqual(actual, expected) + def test_fromisoformat_fails(self): bad_strs = [ '', 
# Empty string @@ -3975,15 +4214,17 @@ class TestTimeTZ(TestTime, TZInfoBase, unittest.TestCase): '1a:30:45.334034', # Invalid character in hours '12:a0:45.334034', # Invalid character in minutes '12:30:a5.334034', # Invalid character in seconds - '12:30:45.1234', # Too many digits for milliseconds - '12:30:45.1234567', # Too many digits for microseconds '12:30:45.123456+24:30', # Invalid time zone offset '12:30:45.123456-24:30', # Invalid negative offset '12:30:45', # Uses full-width unicode colons + '12:30:45.123456a', # Non-numeric data after 6 components + '12:30:45.123456789a', # Non-numeric data after 9 components '12:30:45․123456', # Uses \u2024 in place of decimal point '12:30:45a', # Extra at tend of basic time '12:30:45.123a', # Extra at end of millisecond time '12:30:45.123456a', # Extra at end of microsecond time + '12:30:45.123456-', # Extra at end of microsecond time + '12:30:45.123456+', # Extra at end of microsecond time '12:30:45.123456+12:00:30a', # Extra at end of full time ] diff --git a/Misc/NEWS.d/next/Library/2022-05-03-12-11-27.gh-issue-80010.yG54RE.rst b/Misc/NEWS.d/next/Library/2022-05-03-12-11-27.gh-issue-80010.yG54RE.rst new file mode 100644 index 0000000..bbcef47 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-05-03-12-11-27.gh-issue-80010.yG54RE.rst @@ -0,0 +1,3 @@ +Add support for generalized ISO 8601 parsing to +:meth:`datetime.datetime.fromisoformat`, :meth:`datetime.date.fromisoformat` +and :meth:`datetime.time.fromisoformat`. Patch by Paul Ganssle. 
diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index 20cdb18..efb5278 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -395,6 +395,39 @@ iso_week1_monday(int year) return week1_monday; } +static int +iso_to_ymd(const int iso_year, const int iso_week, const int iso_day, + int *year, int *month, int *day) { + if (iso_week <= 0 || iso_week >= 53) { + int out_of_range = 1; + if (iso_week == 53) { + // ISO years have 53 weeks in it on years starting with a Thursday + // and on leap years starting on Wednesday + int first_weekday = weekday(iso_year, 1, 1); + if (first_weekday == 3 || (first_weekday == 2 && is_leap(iso_year))) { + out_of_range = 0; + } + } + + if (out_of_range) { + return -2; + } + } + + if (iso_day <= 0 || iso_day >= 8) { + return -3; + } + + // Convert (Y, W, D) to (Y, M, D) in-place + int day_1 = iso_week1_monday(iso_year); + + int day_offset = (iso_week - 1)*7 + iso_day - 1; + + ord_to_ymd(day_1 + day_offset, year, month, day); + return 0; +} + + /* --------------------------------------------------------------------------- * Range checkers. */ @@ -680,6 +713,11 @@ set_date_fields(PyDateTime_Date *self, int y, int m, int d) * String parsing utilities and helper functions */ +static unsigned char +is_digit(const char c) { + return ((unsigned int)(c - '0')) < 10; +} + static const char * parse_digits(const char *ptr, int *var, size_t num_digits) { @@ -696,14 +734,17 @@ parse_digits(const char *ptr, int *var, size_t num_digits) } static int -parse_isoformat_date(const char *dtstr, int *year, int *month, int *day) +parse_isoformat_date(const char *dtstr, const size_t len, int *year, int *month, int *day) { /* Parse the date components of the result of date.isoformat() * * Return codes: * 0: Success * -1: Failed to parse date component - * -2: Failed to parse dateseparator + * -2: Inconsistent date separator usage + * -3: Failed to parse ISO week. + * -4: Failed to parse ISO day. 
+ * -5, -6: Failure in iso_to_ymd */ const char *p = dtstr; p = parse_digits(p, year, 4); @@ -711,8 +752,42 @@ parse_isoformat_date(const char *dtstr, int *year, int *month, int *day) return -1; } - if (*(p++) != '-') { - return -2; + const unsigned char uses_separator = (*p == '-'); + if (uses_separator) { + ++p; + } + + if(*p == 'W') { + // This is an isocalendar-style date string + p++; + int iso_week = 0; + int iso_day = 0; + + p = parse_digits(p, &iso_week, 2); + if (NULL == p) { + return -3; + } + + assert(p > dtstr); + if ((size_t)(p - dtstr) < len) { + if (uses_separator && *(p++) != '-') { + return -2; + } + + p = parse_digits(p, &iso_day, 1); + if (NULL == p) { + return -4; + } + } else { + iso_day = 1; + } + + int rv = iso_to_ymd(*year, iso_week, iso_day, year, month, day); + if (rv) { + return -3 + rv; + } else { + return 0; + } } p = parse_digits(p, month, 2); @@ -720,15 +795,13 @@ parse_isoformat_date(const char *dtstr, int *year, int *month, int *day) return -1; } - if (*(p++) != '-') { + if (uses_separator && *(p++) != '-') { return -2; } - p = parse_digits(p, day, 2); if (p == NULL) { return -1; } - return 0; } @@ -736,11 +809,14 @@ static int parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour, int *minute, int *second, int *microsecond) { + *hour = *minute = *second = *microsecond = 0; const char *p = tstr; const char *p_end = tstr_end; int *vals[3] = {hour, minute, second}; + // This is initialized to satisfy an erroneous compiler warning. + unsigned char has_separator = 1; - // Parse [HH[:MM[:SS]]] + // Parse [HH[:?MM[:?SS]]] for (size_t i = 0; i < 3; ++i) { p = parse_digits(p, vals[i], 2); if (NULL == p) { @@ -748,33 +824,47 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour, } char c = *(p++); + if (i == 0) { + has_separator = (c == ':'); + } + if (p >= p_end) { return c != '\0'; } - else if (c == ':') { + else if (has_separator && (c == ':')) { continue; } - else if (c == '.') { + else if (c == '.' 
|| c == ',') { break; - } - else { + } else if (!has_separator) { + --p; + } else { return -4; // Malformed time separator } } - // Parse .fff[fff] + // Parse fractional components size_t len_remains = p_end - p; - if (!(len_remains == 6 || len_remains == 3)) { - return -3; + size_t to_parse = len_remains; + if (len_remains >= 6) { + to_parse = 6; } - p = parse_digits(p, microsecond, len_remains); + p = parse_digits(p, microsecond, to_parse); if (NULL == p) { return -3; } - if (len_remains == 3) { - *microsecond *= 1000; + static int correction[] = { + 100000, 10000, 1000, 100, 10 + }; + + if (to_parse < 6) { + *microsecond *= correction[to_parse-1]; + } + + while (is_digit(*p)){ + ++p; // skip truncated digits } // Return 1 if it's not the end of the string @@ -800,7 +890,7 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute, const char *tzinfo_pos = p; do { - if (*tzinfo_pos == '+' || *tzinfo_pos == '-') { + if (*tzinfo_pos == 'Z' || *tzinfo_pos == '+' || *tzinfo_pos == '-') { break; } } while (++tzinfo_pos < p_end); @@ -822,14 +912,16 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute, } } - // Parse time zone component - // Valid formats are: - // - +HH:MM (len 6) - // - +HH:MM:SS (len 9) - // - +HH:MM:SS.ffffff (len 16) - size_t tzlen = p_end - tzinfo_pos; - if (!(tzlen == 6 || tzlen == 9 || tzlen == 16)) { - return -5; + // Special case UTC / Zulu time. + if (*tzinfo_pos == 'Z') { + *tzoffset = 0; + *tzmicrosecond = 0; + + if (*(tzinfo_pos + 1) != '\0') { + return -5; + } else { + return 1; + } } int tzsign = (*tzinfo_pos == '-') ? 
-1 : 1; @@ -2983,8 +3075,8 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) int year = 0, month = 0, day = 0; int rv; - if (len == 10) { - rv = parse_isoformat_date(dt_ptr, &year, &month, &day); + if (len == 7 || len == 8 || len == 10) { + rv = parse_isoformat_date(dt_ptr, len, &year, &month, &day); } else { rv = -1; @@ -3027,37 +3119,21 @@ date_fromisocalendar(PyObject *cls, PyObject *args, PyObject *kw) return NULL; } - if (week <= 0 || week >= 53) { - int out_of_range = 1; - if (week == 53) { - // ISO years have 53 weeks in it on years starting with a Thursday - // and on leap years starting on Wednesday - int first_weekday = weekday(year, 1, 1); - if (first_weekday == 3 || (first_weekday == 2 && is_leap(year))) { - out_of_range = 0; - } - } + int month; + int rv = iso_to_ymd(year, week, day, &year, &month, &day); - if (out_of_range) { - PyErr_Format(PyExc_ValueError, "Invalid week: %d", week); - return NULL; - } + + if (rv == -2) { + PyErr_Format(PyExc_ValueError, "Invalid week: %d", week); + return NULL; } - if (day <= 0 || day >= 8) { + if (rv == -3) { PyErr_Format(PyExc_ValueError, "Invalid day: %d (range is [1, 7])", day); return NULL; } - // Convert (Y, W, D) to (Y, M, D) in-place - int day_1 = iso_week1_monday(year); - - int month = week; - int day_offset = (month - 1)*7 + day - 1; - - ord_to_ymd(day_1 + day_offset, &year, &month, &day); - return new_date_subclass_ex(year, month, day, cls); } @@ -3489,7 +3565,7 @@ static PyMethodDef date_methods[] = { {"fromisoformat", (PyCFunction)date_fromisoformat, METH_O | METH_CLASS, - PyDoc_STR("str -> Construct a date from the output of date.isoformat()")}, + PyDoc_STR("str -> Construct a date from a string in ISO 8601 format.")}, {"fromisocalendar", _PyCFunction_CAST(date_fromisocalendar), METH_VARARGS | METH_KEYWORDS | METH_CLASS, @@ -4564,6 +4640,14 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) { goto invalid_string_error; } + // The spec actually requires that time-only ISO 8601 strings start with + 
// T, but the extended format allows this to be omitted as long as there + // is no ambiguity with date strings. + if (*p == 'T') { + ++p; + len -= 1; + } + int hour = 0, minute = 0, second = 0, microsecond = 0; int tzoffset, tzimicrosecond = 0; int rv = parse_isoformat_time(p, len, @@ -4671,7 +4755,7 @@ static PyMethodDef time_methods[] = { PyDoc_STR("Return time with new specified fields.")}, {"fromisoformat", (PyCFunction)time_fromisoformat, METH_O | METH_CLASS, - PyDoc_STR("string -> time from time.isoformat() output")}, + PyDoc_STR("string -> time from a string in ISO 8601 format")}, {"__reduce_ex__", (PyCFunction)time_reduce_ex, METH_VARARGS, PyDoc_STR("__reduce_ex__(proto) -> (cls, state)")}, @@ -5184,19 +5268,42 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw) static PyObject * _sanitize_isoformat_str(PyObject *dtstr) { + Py_ssize_t len = PyUnicode_GetLength(dtstr); + if (len < 7) { // All valid ISO 8601 strings are at least 7 characters long + return NULL; + } + // `fromisoformat` allows surrogate characters in exactly one position, // the separator; to allow datetime_fromisoformat to make the simplifying // assumption that all valid strings can be encoded in UTF-8, this function // replaces any surrogate character separators with `T`. // // The result of this, if not NULL, returns a new reference - Py_ssize_t len = PyUnicode_GetLength(dtstr); - if (len < 0) { - return NULL; + const void* const unicode_data = PyUnicode_DATA(dtstr); + const unsigned int kind = PyUnicode_KIND(dtstr); + + // Depending on the format of the string, the separator can only ever be + // in positions 7, 8 or 10. We'll check each of these for a surrogate and + // if we find one, replace it with `T`. If there is more than one surrogate, + // we don't have to bother sanitizing it, because the function will later + // fail when we try to encode the string as ASCII. 
+ static const size_t potential_separators[3] = {7, 8, 10}; + size_t surrogate_separator = 0; + for(size_t idx = 0; + idx < sizeof(potential_separators) / sizeof(*potential_separators); + ++idx) { + size_t pos = potential_separators[idx]; + if (pos > (size_t)len) { + break; + } + + if(Py_UNICODE_IS_SURROGATE(PyUnicode_READ(kind, unicode_data, pos))) { + surrogate_separator = pos; + break; + } } - if (len <= 10 || - !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) { + if (surrogate_separator == 0) { Py_INCREF(dtstr); return dtstr; } @@ -5206,7 +5313,7 @@ _sanitize_isoformat_str(PyObject *dtstr) return NULL; } - if (PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) { + if (PyUnicode_WriteChar(str_out, surrogate_separator, (Py_UCS4)'T')) { Py_DECREF(str_out); return NULL; } @@ -5214,6 +5321,106 @@ _sanitize_isoformat_str(PyObject *dtstr) return str_out; } + +static Py_ssize_t +_find_isoformat_datetime_separator(const char *dtstr, Py_ssize_t len) { + // The valid date formats can all be distinguished by characters 4 and 5 + // and further narrowed down by the characters that follow them, + // which tell us where to look for the separator character. + // Format | As-rendered | Position + // --------------------------------------- + // %Y-%m-%d | YYYY-MM-DD | 10 + // %Y%m%d | YYYYMMDD | 8 + // %Y-W%V | YYYY-Www | 8 + // %YW%V | YYYYWww | 7 + // %Y-W%V-%u | YYYY-Www-d | 10 + // %YW%V%u | YYYYWwwd | 8 + // %Y-%j | YYYY-DDD | 8 + // %Y%j | YYYYDDD | 7 + // + // Note that because we allow *any* character for the separator, in the + // case where character 4 is W, it's not straightforward to determine where + // the separator is — in the case of YYYY-Www-d, you have actual ambiguity, + // e.g. 2020-W01-0000 could be YYYY-Www-D0HH or YYYY-Www-HHMM, when the + // separator character is a number in the former case or a hyphen in the + // latter case. 
+ // + // The case of YYYYWww can be distinguished from YYYYWwwd by tracking ahead + // to either the end of the string or the first non-numeric character — + // since the time components all come in pairs YYYYWww#HH can be + // distinguished from YYYYWwwd#HH by the fact that there will always be an + // odd number of digits before the first non-digit character in the former + // case. + static const char date_separator = '-'; + static const char week_indicator = 'W'; + + if (len == 7) { + return 7; + } + + if (dtstr[4] == date_separator) { + // YYYY-??? + + if (dtstr[5] == week_indicator) { + // YYYY-W?? + + if (len < 8) { + return -1; + } + + if (len > 8 && dtstr[8] == date_separator) { + // YYYY-Www-D (10) or YYYY-Www-HH (8) + if (len == 9) { return -1; } + if (len > 10 && is_digit(dtstr[10])) { + // This is as far as we'll try to go to resolve the + // ambiguity for the moment — if we have YYYY-Www-##, the + // separator is either a hyphen at 8 or a number at 10. + // + // We'll assume it's a hyphen at 8 because it's way more + // likely that someone will use a hyphen as a separator + // than a number, but at this point it's really best effort + // because this is an extension of the spec anyway. + return 8; + } + + return 10; + } else { + // YYYY-Www (8) + return 8; + } + } else { + // YYYY-MM-DD (10) + return 10; + } + } else { + // YYYY??? + if (dtstr[4] == week_indicator) { + // YYYYWww (7) or YYYYWwwd (8) + size_t idx = 7; + for (; idx < (size_t)len; ++idx) { + // Keep going until we run out of digits. 
+ if (!is_digit(dtstr[idx])) { + break; + } + } + + if (idx < 9) { + return idx; + } + + if (idx % 2 == 0) { + // If the index of the last number is even, it's YYYYWww + return 7; + } else { + return 8; + } + } else { + // YYYYMMDD (8) + return 8; + } + } +} + static PyObject * datetime_fromisoformat(PyObject *cls, PyObject *dtstr) { @@ -5225,9 +5432,14 @@ datetime_fromisoformat(PyObject *cls, PyObject *dtstr) return NULL; } + // We only need to sanitize this string if the separator is a surrogate + // character. In the situation where the separator location is ambiguous, + // we don't have to sanitize anything because that can only happen when + // the separator is either '-' or a number. This should mostly be a noop + // but it makes the reference counting easier if we still sanitize. PyObject *dtstr_clean = _sanitize_isoformat_str(dtstr); if (dtstr_clean == NULL) { - goto error; + goto invalid_string_error; } Py_ssize_t len; @@ -5243,30 +5455,35 @@ datetime_fromisoformat(PyObject *cls, PyObject *dtstr) } } + const Py_ssize_t separator_location = _find_isoformat_datetime_separator( + dt_ptr, len); + + const char *p = dt_ptr; int year = 0, month = 0, day = 0; int hour = 0, minute = 0, second = 0, microsecond = 0; int tzoffset = 0, tzusec = 0; - // date has a fixed length of 10 - int rv = parse_isoformat_date(p, &year, &month, &day); + // date runs up to separator_location + int rv = parse_isoformat_date(p, separator_location, &year, &month, &day); - if (!rv && len > 10) { + if (!rv && len > separator_location) { // In UTF-8, the length of multi-byte characters is encoded in the MSB - if ((p[10] & 0x80) == 0) { - p += 11; + p += separator_location; + if ((p[0] & 0x80) == 0) { + p += 1; } else { - switch (p[10] & 0xf0) { + switch (p[0] & 0xf0) { case 0xe0: - p += 13; + p += 3; break; case 0xf0: - p += 14; + p += 4; break; default: - p += 12; + p += 2; break; } } @@ -6327,7 +6544,7 @@ static PyMethodDef datetime_methods[] = { {"fromisoformat", 
(PyCFunction)datetime_fromisoformat, METH_O | METH_CLASS, - PyDoc_STR("string -> datetime from datetime.isoformat() output")}, + PyDoc_STR("string -> datetime from a string in most ISO 8601 formats")}, /* Instance methods: */ |