summaryrefslogtreecommitdiffstats
path: root/src/icu4c-test.c
blob: 3fa4a22814e43ef31d15f14b58c4a7e9881f20ff (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/*
 * This file is part of MXE. See LICENSE.md for licensing information.
 */

/*** ugrep.c ***/

/*** https://begriffs.com/posts/2019-05-23-unicode-icu.html ***/

#include <locale.h>
#include <stdlib.h>
#include <string.h>

#include <unicode/ucol.h>
#include <unicode/usearch.h>
#include <unicode/ustdio.h>
#include <unicode/ustring.h>

#define BUFSZ 1024

int main(int argc, char **argv)
{
    char *locale;
    UFILE *in;
    UCollator *col;
    UStringSearch *srch = NULL;
    UErrorCode status = U_ZERO_ERROR;
    UChar *needle, line[BUFSZ];
    UColAttributeValue strength;
    int ignoreInsignificant = 0, asymmetric = 0;
    size_t n;
    long i;

    if (argc != 3)
    {
        fprintf(stderr,
            "Usage: %s {1,2,@,3}[i] pattern\n", argv[0]);
        return EXIT_FAILURE;
    }

    /* cryptic parsing for our cryptic options */
    switch (*argv[1])
    {
        case '1':
            strength = UCOL_PRIMARY;
            break;
        case '2':
            strength = UCOL_SECONDARY;
            break;
        case '@':
            strength = UCOL_SECONDARY, asymmetric = 1;
            break;
        case '3':
            strength = UCOL_TERTIARY;
            break;
        default:
            fprintf(stderr,
                "Unknown strength: %s\n", argv[1]);
            return EXIT_FAILURE;
    }
    /* length of argv[1] is >0 or we would have died */
    ignoreInsignificant = argv[1][strlen(argv[1])-1] == 'i';

    n = strlen(argv[2]) + 1;
    /* if UTF-8 could encode it in n, then UTF-16
     * should be able to as well */
    needle = malloc(n * sizeof(*needle));
    u_strFromUTF8(needle, n, NULL, argv[2], -1, &status);

    /* searching is a degenerate case of collation,
     * so we read the LC_COLLATE locale */
    if (!(locale = setlocale(LC_COLLATE, "")))
    {
        fputs("Cannot determine system collation locale\n",
              stderr);
        return EXIT_FAILURE;
    }

    if (!(in = u_finit(stdin, NULL, NULL)))
    {
        fputs("Error opening stdin as UFILE\n", stderr);
        return EXIT_FAILURE;
    }

    col = ucol_open(locale, &status);
    ucol_setStrength(col, strength);

    if (ignoreInsignificant)
        /* shift ignorable characters down to
         * quaternary level */
        ucol_setAttribute(col, UCOL_ALTERNATE_HANDLING,
                          UCOL_SHIFTED, &status);

    /* Assumes all lines fit in BUFSZ. Should
     * fix this in real code and not increment i */
    for (i = 1; u_fgets(line, BUFSZ, in); ++i)
    {
        /* first time through, set up all options */
        if (!srch)
        {
            srch = usearch_openFromCollator(
                needle, -1, line, -1,
                col, NULL, &status
            );
            if (asymmetric)
                usearch_setAttribute(
                    srch, USEARCH_ELEMENT_COMPARISON,
                    USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD,
                    &status
                );
        }
        /* afterward just switch text */
        else
            usearch_setText(srch, line, -1, &status);

        /* check if keyword appears in line */
        if (usearch_first(srch, &status) != USEARCH_DONE)
            u_printf("%ld: %S", i, line);
    }

    usearch_close(srch);
    ucol_close(col);
    u_fclose(in);
    free(needle);

    return EXIT_SUCCESS;
}