summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Xiang Zhang <angwerzx@126.com> 2017-05-22 17:04:27 (GMT)
committer: GitHub <noreply@github.com> 2017-05-22 17:04:27 (GMT)
commit: 6e1b832a6c0c8f32962a196ab631ccc17471d32b (patch)
tree: b9732eeeffe1334c6ee8166b7a1bed6d2e4bc8f3
parent: 2b67c7aae7344365dfc12a31e72e4b2659e6875d (diff)
download: cpython-6e1b832a6c0c8f32962a196ab631ccc17471d32b.zip
cpython-6e1b832a6c0c8f32962a196ab631ccc17471d32b.tar.gz
cpython-6e1b832a6c0c8f32962a196ab631ccc17471d32b.tar.bz2
bpo-30003: Fix handling escape characters in HZ codec (#1720) (#1556)
-rw-r--r-- Lib/test/test_codecencodings_cn.py 4
-rw-r--r-- Misc/NEWS 3
-rw-r--r-- Modules/cjkcodecs/_codecs_cn.c 27
3 files changed, 21 insertions, 13 deletions
diff --git a/Lib/test/test_codecencodings_cn.py b/Lib/test/test_codecencodings_cn.py
index fdae538..a104937 100644
--- a/Lib/test/test_codecencodings_cn.py
+++ b/Lib/test/test_codecencodings_cn.py
@@ -82,6 +82,10 @@ class Test_HZ(test_multibytecodec_support.TestBase, unittest.TestCase):
(b'ab~cd', 'replace', u'ab\uFFFDd'),
(b'ab\xffcd', 'replace', u'ab\uFFFDcd'),
(b'ab~{\x81\x81\x41\x44~}cd', 'replace', u'ab\uFFFD\uFFFD\u804Acd'),
+ # issue 30003
+ (u'ab~cd', 'strict', b'ab~~cd'), # escape ~
+ (b'~{Dc~~:C~}', 'strict', None), # ~~ only in ASCII mode
+ (b'~{Dc~\n:C~}', 'strict', None), # ~\n only in ASCII mode
)
def test_main():
diff --git a/Misc/NEWS b/Misc/NEWS
index 254bb52..938a029 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -49,6 +49,9 @@ Extension Modules
Library
-------
+- bpo-30003: Fix handling escape characters in HZ codec. Based on patch
+ by Ma Lin.
+
- bpo-30375: Warnings emitted when compile a regular expression now always
point to the line in the user code. Previously they could point into inners
of the re module if emitted from inside of groups or conditionals.
diff --git a/Modules/cjkcodecs/_codecs_cn.c b/Modules/cjkcodecs/_codecs_cn.c
index 3bc652f..92cf06d 100644
--- a/Modules/cjkcodecs/_codecs_cn.c
+++ b/Modules/cjkcodecs/_codecs_cn.c
@@ -335,15 +335,17 @@ ENCODER(hz)
DBCHAR code;
if (c < 0x80) {
- if (state->i == 0) {
- WRITE1((unsigned char)c)
- NEXT(1, 1)
- }
- else {
- WRITE3('~', '}', (unsigned char)c)
- NEXT(1, 3)
+ if (state->i) {
+ WRITE2('~', '}')
+ NEXT_OUT(2)
state->i = 0;
}
+ WRITE1((unsigned char)c)
+ NEXT(1, 1)
+ if (c == '~') {
+ WRITE1('~')
+ NEXT_OUT(1)
+ }
continue;
}
@@ -390,20 +392,19 @@ DECODER(hz)
unsigned char c2 = IN2;
REQUIRE_INBUF(2)
- if (c2 == '~') {
+ if (c2 == '~' && state->i == 0) {
WRITE1('~')
- NEXT(2, 1)
- continue;
+ NEXT_OUT(1)
}
else if (c2 == '{' && state->i == 0)
state->i = 1; /* set GB */
+ else if (c2 == '\n' && state->i == 0)
+ ; /* line-continuation */
else if (c2 == '}' && state->i == 1)
state->i = 0; /* set ASCII */
- else if (c2 == '\n')
- ; /* line-continuation */
else
return 2;
- NEXT(2, 0);
+ NEXT_IN(2)
continue;
}