src/utf8.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71

/******************************************************************************
 *
 * Copyright (C) 1997-2021 by Dimitri van Heesch.
 *
 * Permission to use, copy, modify, and distribute this software and its
 * documentation under the terms of the GNU General Public License is hereby
 * granted. No representations are made about the suitability of this software
 * for any purpose. It is provided "as is" without express or implied warranty.
 * See the GNU General Public License for more details.
 *
 * Documents produced by Doxygen are derivative works derived from the
 * input used in their production; they are not affected by this license.
 *
 */

#ifndef UTF8_H
#define UTF8_H

#include <cstdint>
#include <string>
#include <iostream>

/** @file
 *  @brief Various UTF8 related helper functions.
 *
 *  See https://en.wikipedia.org/wiki/UTF-8 for details on UTF8 encoding.
 */


/** Converts the input string into a lower case version, also taking into account
 *  non-ASCII characters that have a lower case variant.
 *
 *  @param input  the UTF8 encoded string to convert.
 *  @return a new string holding the lower case version; the input is taken by
 *          const reference and is not modified.
 */
std::string convertUTF8ToLower(const std::string &input);

/** Converts the input string into an upper case version, also taking into account
 *  non-ASCII characters that have an upper case variant.
 *
 *  @param input  the UTF8 encoded string to convert.
 *  @return a new string holding the upper case version; the input is taken by
 *          const reference and is not modified.
 */
std::string convertUTF8ToUpper(const std::string &input);

/** Returns the UTF8 character found at byte position pos in the input string.
 *  The resulting string can be a multi byte sequence.
 *
 *  @param input  the UTF8 encoded string to read from.
 *  @param pos    byte offset (not a character index) into input.
 *  @return the character at that position as a string of one or more bytes.
 *
 *  NOTE(review): the result for a pos that is out of range or that does not
 *  point at the first byte of a sequence is not specified here; check the
 *  implementation before relying on it.
 */
std::string getUTF8CharAt(const std::string &input,size_t pos);

/** Returns the 32bit Unicode value matching the character at byte position pos
 *  in the UTF8 encoded input.
 *
 *  @param input  the UTF8 encoded string to read from.
 *  @param pos    byte offset (not a character index) into input.
 *  @return the 32bit Unicode value of the character at that position.
 *
 *  NOTE(review): the result for an out of range pos or a malformed sequence
 *  is not specified here; check the implementation before relying on it.
 */
uint32_t getUnicodeForUTF8CharAt(const std::string &input,size_t pos);

/** Returns the number of bytes making up a single UTF8 character given the first byte
 *  in the sequence.
 *
 *  @param firstByte  the first byte of a UTF8 encoded character.
 *  @return the number of bytes in the sequence started by firstByte.
 *
 *  NOTE(review): the value returned for a byte that cannot start a sequence
 *  (e.g. a continuation byte) is not specified here; confirm in the implementation.
 */
int getUTF8CharNumBytes(char firstByte);

/** Writes the UTF8 character pointed to by s to stream t and returns a pointer
 *  to the next character.
 *
 *  @param t  the output stream that receives the character.
 *  @param s  pointer to the first byte of the character to write.
 *  @return pointer to the character following the one that was written.
 *
 *  NOTE(review): presumably s must point into a zero terminated string with a
 *  complete sequence available; confirm against the implementation.
 */
const char *writeUTF8Char(std::ostream &t,const char *s);

/** Returns true iff the last character in input is a multibyte character.
 *
 *  @param input  the UTF8 encoded string to inspect.
 *  @return true if the final character consists of more than one byte, false otherwise.
 *
 *  NOTE(review): the result for an empty input is not stated here; presumably
 *  false, but confirm in the implementation.
 */
bool lastUTF8CharIsMultibyte(const std::string &input);

/** Returns true iff the input string at byte position pos holds an upper case character.
 *
 *  @param input  the UTF8 encoded string to inspect.
 *  @param pos    byte offset (not a character index) into input.
 *  @return true if the character at pos is upper case, false otherwise.
 */
bool isUTF8CharUpperCase(const std::string &input,size_t pos);

/** Checks if the first character pointed at by input is a non-breakable whitespace character.
 *  Returns the byte size of the character if there is a match or 0 if not.
 *
 *  @param input  pointer to the first byte of the character to test.
 *  @return the size in bytes of the matched character, or 0 when the character
 *          is not a non-breakable space; the result can be used directly as a
 *          boolean.
 */
int isUTF8NonBreakableSpace(const char *input);

#endif