From 78b187fe71dba0f3710be978050e08fde81efd85 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 8 Apr 2016 12:23:50 +0000 Subject: Fix [8663689908d3304a74fee525cd04aa4162e86391|8663689908d3]: regexp \\w missing characters --- generic/regc_lex.c | 19 ++++++++++++++++--- tests/utf.test | 2 +- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/generic/regc_lex.c b/generic/regc_lex.c index 8d07c59..51cf8a4 100644 --- a/generic/regc_lex.c +++ b/generic/regc_lex.c @@ -256,20 +256,33 @@ static const chr brbacks[] = { /* \s within brackets */ CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), CHR(':'), CHR(']') }; + +#define PUNCT_CONN \ + CHR('_'), \ + 0x203f /* UNDERTIE */, \ + 0x2040 /* CHARACTER TIE */,\ + 0x2054 /* INVERTED UNDERTIE */,\ + 0xfe33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */, \ + 0xfe34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */, \ + 0xfe4d /* DASHED LOW LINE */, \ + 0xfe4e /* CENTRELINE LOW LINE */, \ + 0xfe4f /* WAVY LOW LINE */, \ + 0xff3f /* FULLWIDTH LOW LINE */ + static const chr backw[] = { /* \w */ CHR('['), CHR('['), CHR(':'), CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_'), CHR(']') + CHR(':'), CHR(']'), PUNCT_CONN, CHR(']') }; static const chr backW[] = { /* \W */ CHR('['), CHR('^'), CHR('['), CHR(':'), CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_'), CHR(']') + CHR(':'), CHR(']'), PUNCT_CONN, CHR(']') }; static const chr brbackw[] = { /* \w within brackets */ CHR('['), CHR(':'), CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_') + CHR(':'), CHR(']'), PUNCT_CONN }; /* diff --git a/tests/utf.test b/tests/utf.test index 30200c1..e8ee374 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -283,7 +283,7 @@ test utf-21.1 {TclUniCharIsAlnum} { } {1} test utf-21.2 {unicode alnum char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - list [regexp {^[[:alnum:]]+$} \u1040\u021f\u0220] [regexp {^\w+$} \u1040\u021f\u0220] + list [regexp {^[[:alnum:]]+$} \u1040\u021f\u0220] [regexp {^\w+$} \u1040\u021f\u0220_\u203f\u2040\u2054\ufe33\ufe34\ufe4d\ufe4e\ufe4f\uff3f] } {1 1} test utf-21.3 {unicode print char in regc_locale.c} { # this returns 1 with Unicode 7 compliance -- cgit v0.12