gh-53203: Fix strptime() for %c and %x formats on many locales (GH-124946)

In some locales (like French or Hebrew) the full or abbreviated names of the default month and weekday used in __calc_date_time can be part of other name or constant part of the %c format. The month name can also match %m with constant suffix (like in Japanese). So the code failed to correctly distinguish formats %a, %A, %b, %B and %m. Cycle all month and all days of the week to find the variable part and distinguish %a from %A and %b from %B or %m. Fixed locales for the following languges: Arabic, Bislama, Breton, Bodo, Kashubian, Chuvash, Estonian, French, Irish, Ge'ez, Gurajati, Manx Gaelic, Hebrew, Hindi, Chhattisgarhi, Haitian Kreyol, Japanese, Kannada, Korean, Marathi, Malay, Norwegian, Nynorsk, Punjabi, Rajasthani, Tok Pisin, Yoruba, Yue Chinese, Yau/Nungon and Chinese. Co-authored-by: Eli Bendersky <eliben@gmail.com>
author: Serhiy Storchaka <storchaka@gmail.com> 2024-10-12 17:46:21 (GMT)
committer: GitHub <noreply@github.com> 2024-10-12 17:46:21 (GMT)
commit: c05f9dde8a12dfd63d3ade93da616042df2dc925 (patch)
tree: 7fbcd2b9ec611130e746c12b27cebd816cab45a1
parent: dcd58c50844dae0d83517e88518a677914ea594b (diff)
download: cpython-c05f9dde8a12dfd63d3ade93da616042df2dc925.zip
cpython-c05f9dde8a12dfd63d3ade93da616042df2dc925.tar.gz
cpython-c05f9dde8a12dfd63d3ade93da616042df2dc925.tar.bz2
3 files changed, 134 insertions, 34 deletions
diff --git a/Lib/_strptime.py b/Lib/_strptime.py
index a3f8bb5..89adc17 100644
--- a/Lib/_strptime.py
+++ b/Lib/_strptime.py
@@ -28,6 +28,18 @@ def _getlang():
     # Figure out what the current language is set to.
     return locale.getlocale(locale.LC_TIME)
 
+def _findall(haystack, needle):
+    # Find all positions of needle in haystack.
+    if not needle:
+        return
+    i = 0
+    while True:
+        i = haystack.find(needle, i)
+        if i < 0:
+            break
+        yield i
+        i += len(needle)
+
 class LocaleTime(object):
     """Stores and handles locale-specific information related to time.
 
@@ -102,7 +114,8 @@ class LocaleTime(object):
         am_pm = []
         for hour in (1, 22):
             time_tuple = time.struct_time((1999,3,17,hour,44,55,2,76,0))
-            am_pm.append(time.strftime("%p", time_tuple).lower())
+            # br_FR has AM/PM info (' ',' ').
+            am_pm.append(time.strftime("%p", time_tuple).lower().strip())
         self.am_pm = am_pm
 
     def __calc_date_time(self):
@@ -114,42 +127,114 @@ class LocaleTime(object):
         # values within the format string is very important; it eliminates
         # possible ambiguity for what something represents.
         time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0))
-        date_time = [None, None, None]
-        date_time[0] = time.strftime("%c", time_tuple).lower()
-        date_time[1] = time.strftime("%x", time_tuple).lower()
-        date_time[2] = time.strftime("%X", time_tuple).lower()
-        replacement_pairs = [('%', '%%'), (self.f_weekday[2], '%A'),
-                    (self.f_month[3], '%B'), (self.a_weekday[2], '%a'),
-                    (self.a_month[3], '%b'), (self.am_pm[1], '%p'),
+        time_tuple2 = time.struct_time((1999,1,3,1,1,1,6,3,0))
+        replacement_pairs = [
                     ('1999', '%Y'), ('99', '%y'), ('22', '%H'),
                     ('44', '%M'), ('55', '%S'), ('76', '%j'),
                     ('17', '%d'), ('03', '%m'), ('3', '%m'),
                     # '3' needed for when no leading zero.
                     ('2', '%w'), ('10', '%I')]
-        replacement_pairs.extend([(tz, "%Z") for tz_values in self.timezone
-                                                for tz in tz_values])
-        for offset,directive in ((0,'%c'), (1,'%x'), (2,'%X')):
-            current_format = date_time[offset]
-            for old, new in replacement_pairs:
+        date_time = []
+        for directive in ('%c', '%x', '%X'):
+            current_format = time.strftime(directive, time_tuple).lower()
+            current_format = current_format.replace('%', '%%')
+            # The month and the day of the week formats are treated specially
+            # because of a possible ambiguity in some locales where the full
+            # and abbreviated names are equal or names of different types
+            # are equal. See doc of __find_month_format for more details.
+            lst, fmt = self.__find_weekday_format(directive)
+            if lst:
+                current_format = current_format.replace(lst[2], fmt, 1)
+            lst, fmt = self.__find_month_format(directive)
+            if lst:
+                current_format = current_format.replace(lst[3], fmt, 1)
+            if self.am_pm[1]:
                 # Must deal with possible lack of locale info
                 # manifesting itself as the empty string (e.g., Swedish's
                 # lack of AM/PM info) or a platform returning a tuple of empty
                 # strings (e.g., MacOS 9 having timezone as ('','')).
-                if old:
-                    current_format = current_format.replace(old, new)
+                current_format = current_format.replace(self.am_pm[1], '%p')
+            for tz_values in self.timezone:
+                for tz in tz_values:
+                    if tz:
+                        current_format = current_format.replace(tz, "%Z")
+            for old, new in replacement_pairs:
+                current_format = current_format.replace(old, new)
             # If %W is used, then Sunday, 2005-01-03 will fall on week 0 since
             # 2005-01-03 occurs before the first Monday of the year.  Otherwise
             # %U is used.
-            time_tuple = time.struct_time((1999,1,3,1,1,1,6,3,0))
-            if '00' in time.strftime(directive, time_tuple):
+            if '00' in time.strftime(directive, time_tuple2):
                 U_W = '%W'
             else:
                 U_W = '%U'
-            date_time[offset] = current_format.replace('11', U_W)
+            current_format = current_format.replace('11', U_W)
+            date_time.append(current_format)
         self.LC_date_time = date_time[0]
         self.LC_date = date_time[1]
         self.LC_time = date_time[2]
 
+    def __find_month_format(self, directive):
+        """Find the month format appropriate for the current locale.
+
+        In some locales (for example French and Hebrew), the default month
+        used in __calc_date_time has the same name in full and abbreviated
+        form.  Also, the month name can by accident match other part of the
+        representation: the day of the week name (for example in Morisyen)
+        or the month number (for example in Japanese).  Thus, cycle months
+        of the year and find all positions that match the month name for
+        each month,  If no common positions are found, the representation
+        does not use the month name.
+        """
+        full_indices = abbr_indices = None
+        for m in range(1, 13):
+            time_tuple = time.struct_time((1999, m, 17, 22, 44, 55, 2, 76, 0))
+            datetime = time.strftime(directive, time_tuple).lower()
+            indices = set(_findall(datetime, self.f_month[m]))
+            if full_indices is None:
+                full_indices = indices
+            else:
+                full_indices &= indices
+            indices = set(_findall(datetime, self.a_month[m]))
+            if abbr_indices is None:
+                abbr_indices = indices
+            else:
+                abbr_indices &= indices
+            if not full_indices and not abbr_indices:
+                return None, None
+        if full_indices:
+            return self.f_month, '%B'
+        if abbr_indices:
+            return self.a_month, '%b'
+        return None, None
+
+    def __find_weekday_format(self, directive):
+        """Find the day of the week format appropriate for the current locale.
+
+        Similar to __find_month_format().
+        """
+        full_indices = abbr_indices = None
+        for wd in range(7):
+            time_tuple = time.struct_time((1999, 3, 17, 22, 44, 55, wd, 76, 0))
+            datetime = time.strftime(directive, time_tuple).lower()
+            indices = set(_findall(datetime, self.f_weekday[wd]))
+            if full_indices is None:
+                full_indices = indices
+            else:
+                full_indices &= indices
+            if self.f_weekday[wd] != self.a_weekday[wd]:
+                indices = set(_findall(datetime, self.a_weekday[wd]))
+            if abbr_indices is None:
+                abbr_indices = indices
+            else:
+                abbr_indices &= indices
+            if not full_indices and not abbr_indices:
+                return None, None
+        if full_indices:
+            return self.f_weekday, '%A'
+        if abbr_indices:
+            return self.a_weekday, '%a'
+        return None, None
+
     def __calc_timezone(self):
         # Set self.timezone by using time.tzname.
         # Do not worry about possibility of time.tzname[0] == time.tzname[1]
@@ -187,7 +272,7 @@ class TimeRE(dict):
             'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
             'f': r"(?P<f>[0-9]{1,6})",
             'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
-            'I': r"(?P<I>1[0-2]|0[1-9]|[1-9])",
+            'I': r"(?P<I>1[0-2]|0[1-9]|[1-9]| [1-9])",
             'G': r"(?P<G>\d\d\d\d)",
             'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
             'm': r"(?P<m>1[0-2]|0[1-9]|[1-9])",
@@ -349,8 +434,8 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
             _regex_cache[format] = format_regex
     found = format_regex.match(data_string)
     if not found:
-        raise ValueError("time data %r does not match format %r" %
-                         (data_string, format))
+        raise ValueError("time data %r does not match format %r :: /%s/" %
+                         (data_string, format, format_regex.pattern))
     if len(data_string) != found.end():
         raise ValueError("unconverted data remains: %s" %
                           data_string[found.end():])
diff --git a/Lib/test/test_strptime.py b/Lib/test/test_strptime.py
index 37f6b08..79f48df 100644
--- a/Lib/test/test_strptime.py
+++ b/Lib/test/test_strptime.py
@@ -5,6 +5,7 @@ import time
 import locale
 import re
 import os
+import platform
 import sys
 from test import support
 from test.support import warnings_helper
@@ -13,6 +14,13 @@ from datetime import date as datetime_date
 
 import _strptime
 
+libc_ver = platform.libc_ver()
+if libc_ver[0] == 'glibc':
+    glibc_ver = tuple(map(int, libc_ver[1].split('.')))
+else:
+    glibc_ver = None
+
+
 class getlang_Tests(unittest.TestCase):
     """Test _getlang"""
     def test_basic(self):
@@ -478,16 +486,16 @@ class StrptimeTests(unittest.TestCase):
     # * Year is not included: ha_NG.
     # * Use non-Gregorian calendar: lo_LA, thai, th_TH.
     #
-    # BUG: Generates invalid regexp for br_FR, csb_PL, Arabic.
-    # BUG: Generates regexp that does not match the current date and time
-    # for fa_IR, gez_ER, gez_ET, lzh_TW, my_MM, or_IN, shn_MM, yo_NG.
     # BUG: Generates regexp that does not match the current date and time
-    # for fa_IR, gez_ER, gez_ET, lzh_TW, my_MM, or_IN, shn_MM, yo_NG,
-    # fr_FR, ja_JP, he_IL, ko_KR, zh_CN, etc.
-    @run_with_locales('LC_TIME', 'C', 'en_US', 'de_DE',
-                      'eu_ES', 'mfe_MU')
+    # for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM.
+    @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
+                      'he_IL', 'eu_ES', 'ar_AE', 'mfe_MU', 'yo_NG',
+                      'csb_PL', 'br_FR', 'gez_ET', 'brx_IN')
     def test_date_time_locale(self):
         # Test %c directive
+        loc = locale.getlocale(locale.LC_TIME)[0]
+        if glibc_ver and glibc_ver < (2, 31) and loc == 'br_FR':
+            self.skipTest('%c in locale br_FR does not include time')
         now = time.time()
         self.roundtrip('%c', slice(0, 6), time.localtime(now))
         # 1 hour 20 minutes 30 seconds ago
@@ -505,7 +513,9 @@ class StrptimeTests(unittest.TestCase):
 
     # NB: Dates before 1969 do not roundtrip on some locales:
     # bo_CN, bo_IN, dz_BT, eu_ES, eu_FR.
-    @run_with_locales('LC_TIME', 'C', 'en_US', 'de_DE', 'ja_JP')
+    @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
+                      'he_IL', 'ar_AE', 'mfe_MU', 'yo_NG',
+                      'csb_PL', 'br_FR', 'gez_ET', 'brx_IN')
     def test_date_time_locale2(self):
         # Test %c directive
         self.roundtrip('%c', slice(0, 6), (1900, 1, 1, 0, 0, 0, 0, 1, 0))
@@ -513,10 +523,9 @@ class StrptimeTests(unittest.TestCase):
     # NB: Does not roundtrip because use non-Gregorian calendar:
     # lo_LA, thai, th_TH.
     # BUG: Generates regexp that does not match the current date
-    # for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM,
-    # Arabic, ja_JP, ko_KR, zh_CN, etc.
-    @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE',
-                      'he_IL', 'eu_ES')
+    # for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM.
+    @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
+                      'he_IL', 'eu_ES', 'ar_AE')
     def test_date_locale(self):
         # Test %x directive
         now = time.time()
@@ -535,7 +544,8 @@ class StrptimeTests(unittest.TestCase):
         support.is_emscripten or support.is_wasi,
         "musl libc issue on Emscripten, bpo-46390"
     )
-    @run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP')
+    @run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
+                      'eu_ES', 'ar_AE')
     def test_date_locale2(self):
         # Test %x directive
         self.roundtrip('%x', slice(0, 3), (1900, 1, 1, 0, 0, 0, 0, 1, 0))
diff --git a/Misc/NEWS.d/next/Library/2024-10-03-20-45-57.gh-issue-53203.3Sk4Ia.rst b/Misc/NEWS.d/next/Library/2024-10-03-20-45-57.gh-issue-53203.3Sk4Ia.rst
new file mode 100644
index 0000000..6895cff
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-10-03-20-45-57.gh-issue-53203.3Sk4Ia.rst
@@ -0,0 +1,5 @@
+Fix :func:`time.strptime` for ``%c`` and ``%x`` formats in many locales:
+Arabic, Bislama, Breton, Bodo, Kashubian, Chuvash, Estonian, French, Irish,
+Ge'ez, Gurajati, Manx Gaelic, Hebrew, Hindi, Chhattisgarhi, Haitian Kreyol,
+Japanese, Kannada, Korean, Marathi, Malay, Norwegian, Nynorsk, Punjabi,
+Rajasthani, Tok Pisin, Yoruba, Yue Chinese, Yau/Nungon and Chinese.
author	Serhiy Storchaka <storchaka@gmail.com>	2024-10-12 17:46:21 (GMT)
committer	GitHub <noreply@github.com>	2024-10-12 17:46:21 (GMT)
commit	c05f9dde8a12dfd63d3ade93da616042df2dc925 (patch)
tree	7fbcd2b9ec611130e746c12b27cebd816cab45a1
parent	dcd58c50844dae0d83517e88518a677914ea594b (diff)
download	cpython-c05f9dde8a12dfd63d3ade93da616042df2dc925.zip cpython-c05f9dde8a12dfd63d3ade93da616042df2dc925.tar.gz cpython-c05f9dde8a12dfd63d3ade93da616042df2dc925.tar.bz2