Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "unicode.h"
15 :
16 : #include <string.h>
17 :
/*
 * Count the characters in the NUL-terminated UTF-8 string utf8_str.
 *
 * utf8_str  NUL-terminated input string.
 * ascii     optional (may be NULL); on success set to true when every
 *           character occupies exactly one byte, false otherwise.
 *
 * Returns the number of characters, or -1 when utf8_length() rejects a lead
 * byte or when a multi-byte sequence is cut short by the terminator. On
 * failure *ascii is left untouched.
 */
int utf8_strlen(const char *utf8_str, bool *ascii)
{
	int utf8_char_count = 0;
	size_t byte_pos = 0;

	while (utf8_str[byte_pos] != '\0') {
		int width = utf8_length((unsigned char)utf8_str[byte_pos]);
		if (width < 0)
			return -1; // invalid utf8 character
		// Walk the trailing bytes one at a time so that a sequence truncated
		// by the terminator is reported instead of being skipped over, which
		// would make the scan continue past the end of the string.
		for (int k = 1; k < width; k++) {
			if (utf8_str[byte_pos + k] == '\0')
				return -1; // truncated multi-byte character
		}
		byte_pos += (size_t)width;
		utf8_char_count++;
	}
	// bytes consumed == characters counted only when every character is a
	// single byte
	if (ascii != NULL)
		*ascii = byte_pos == (size_t)utf8_char_count;
	return utf8_char_count;
}
38 :
39 0 : size_t utf32_strlen(const Py_UNICODE *utf32_str)
40 : {
41 0 : size_t i = 0;
42 0 : while (utf32_str[i] != 0)
43 0 : i++;
44 0 : return i;
45 : }
46 :
/*
 * Return the number of bytes (1-4) of the UTF-8 sequence introduced by the
 * lead byte utf8_char, or -1 when the byte cannot start a sequence.
 *
 *   0x00-0x7f  -> 1
 *   0x80-0xbf  -> -1 (continuation byte, never a lead byte)
 *   0xc0-0xdf  -> 2
 *   0xe0-0xef  -> 3
 *   0xf0-0xf7  -> 4
 *   0xf8-0xff  -> -1
 */
int utf8_length(unsigned char utf8_char)
{
	// the first byte tells us how many bytes the utf8 character uses
	if (utf8_char < 0x80)
		return 1;
	if (utf8_char < 0xc0)
		return -1; // 10xxxxxx is a continuation byte and cannot start a
				   // character
	if (utf8_char < 0xe0)
		return 2;
	if (utf8_char < 0xf0)
		return 3;
	if (utf8_char < 0xf8)
		return 4;
	return -1; // invalid utf8 character, the maximum value of the first
			   // byte is 0xf7
}
62 :
/*
 * Encode the code point utf32_char as UTF-8 into utf8_storage, starting at
 * byte index position.
 *
 * Returns the number of bytes written (1-4), or -1 without writing anything
 * when utf32_char is greater than 0x10FFFF. No terminator is appended and no
 * bounds check is performed on utf8_storage.
 */
int utf32_char_to_utf8_char(size_t position, char *utf8_storage,
							unsigned int utf32_char)
{
	// marker bits of the first byte, indexed by the encoded length
	static const unsigned char lead_marker[5] = {0x00, 0x00, 0xc0, 0xe0, 0xf0};
	char *out = utf8_storage + position;
	int utf8_size;

	if (utf32_char > 0x0010FFFF)
		return -1; // utf32 character is out of legal range

	if (utf32_char < 0x80)
		utf8_size = 1;
	else if (utf32_char < 0x800)
		utf8_size = 2;
	else if (utf32_char < 0x10000)
		utf8_size = 3;
	else
		utf8_size = 4;

	if (utf8_size == 1) {
		// plain ASCII is stored unchanged
		out[0] = (char)utf32_char;
		return utf8_size;
	}

	// fill the continuation bytes from the last one backwards, six payload
	// bits at a time, each tagged with the 10xxxxxx marker
	for (int k = utf8_size - 1; k > 0; k--) {
		out[k] = (char)((utf32_char | 0x80) & 0xbf);
		utf32_char >>= 6;
	}
	// whatever bits remain go into the first byte together with its marker
	out[0] = (char)(utf32_char | lead_marker[utf8_size]);
	return utf8_size;
}
103 :
104 0 : bool ucs2_to_utf8(size_t offset, size_t size, char *utf8_storage,
105 : const Py_UNICODE *ucs2)
106 : {
107 0 : size_t i = 0;
108 0 : int position = 0;
109 0 : int shift;
110 0 : for (i = 0; i < size; i++) {
111 0 : if (ucs2[offset + i] == 0) {
112 0 : utf8_storage[position] = '\0';
113 0 : return true;
114 : }
115 0 : shift =
116 0 : utf32_char_to_utf8_char(position, utf8_storage, ucs2[offset + i]);
117 0 : if (shift < 0)
118 : return false;
119 0 : position += shift;
120 : }
121 0 : utf8_storage[position] = '\0';
122 0 : return true;
123 : }
124 :
125 1000022 : bool utf32_to_utf8(size_t offset, size_t size, char *utf8_storage,
126 : const Py_UNICODE *utf32_input)
127 : {
128 1000022 : size_t i = 0;
129 1000022 : int position = 0;
130 1000022 : int shift;
131 1000022 : unsigned int *utf32 = (unsigned int *)utf32_input;
132 :
133 6000138 : for (i = 0; i < size; i++) {
134 5000126 : if (utf32[offset + i] == 0) {
135 10 : utf8_storage[position] = '\0';
136 10 : return true;
137 : }
138 :
139 5000116 : shift =
140 5000116 : utf32_char_to_utf8_char(position, utf8_storage, utf32[offset + i]);
141 5000116 : if (shift < 0)
142 : return false;
143 5000116 : position += shift;
144 : }
145 1000012 : utf8_storage[position] = '\0';
146 1000012 : return true;
147 : }
148 :
149 0 : bool unicode_to_utf8(size_t offset, size_t size, char *utf8_storage,
150 : const Py_UNICODE *unicode)
151 : {
152 : #if Py_UNICODE_SIZE == 2
153 : return ucs2_to_utf8(offset, size, utf8_storage, unicode);
154 : #else
155 0 : return utf32_to_utf8(offset, size, utf8_storage, unicode);
156 : #endif
157 : }
158 :
159 0 : int utf8_char_to_utf32_char(size_t position, Py_UNICODE *utf32_storage,
160 : int offset, const unsigned char *utf8_char)
161 : {
162 0 : unsigned char bytes[4];
163 0 : int utf8_size = 4;
164 0 : bytes[0] = utf8_char[offset];
165 0 : bytes[1] = 0xFF;
166 0 : bytes[2] = 0xFF;
167 0 : bytes[3] = 0xFF;
168 : // the first byte tells us how many bytes the utf8 character uses
169 0 : if (bytes[0] < 0x80)
170 : utf8_size = 1;
171 0 : else if (bytes[0] < 0xe0)
172 : utf8_size = 2;
173 0 : else if (bytes[0] < 0xf0)
174 : utf8_size = 3;
175 0 : else if (bytes[0] < 0xf8)
176 : utf8_size = 4;
177 : else
178 : return -1; // invalid utf8 character, the maximum value of the first
179 : // byte is 0xf7
180 :
181 : #if Py_UNICODE_SIZE == 2
182 : if (utf8_size > 2) {
183 : // utf-8 character out of range on a UCS2 python compilation
184 : return -1;
185 : }
186 : #endif
187 :
188 0 : switch (utf8_size) {
189 : case 4:
190 0 : bytes[3] = utf8_char[offset + 3];
191 0 : if (bytes[3] > 0xc0)
192 : return -1; // invalid utf8 character, the maximum value of the
193 : // second, third and fourth bytes is 0xbf
194 : /* fall through */
195 : case 3:
196 0 : bytes[2] = utf8_char[offset + 2];
197 0 : if (bytes[2] > 0xc0)
198 : return -1;
199 : /* fall through */
200 : case 2:
201 0 : bytes[1] = utf8_char[offset + 1];
202 0 : if (bytes[1] > 0xc0)
203 : return -1;
204 : }
205 :
206 0 : utf32_storage[position] = 0;
207 :
208 0 : switch (utf8_size) {
209 0 : case 4:
210 0 : utf32_storage[position] |= (0x3f & bytes[3]);
211 0 : utf32_storage[position] |= (0x3f & bytes[2]) << 6;
212 0 : utf32_storage[position] |= (0x3f & bytes[1]) << 12;
213 0 : utf32_storage[position] |= (0x7 & bytes[0]) << 18;
214 0 : return utf8_size;
215 0 : case 3:
216 0 : utf32_storage[position] |= (0x3f & bytes[2]);
217 0 : utf32_storage[position] |= (0x3f & bytes[1]) << 6;
218 0 : utf32_storage[position] |= (0xf & bytes[0]) << 12;
219 0 : return utf8_size;
220 0 : case 2:
221 0 : utf32_storage[position] |= (0x3f & bytes[1]);
222 0 : utf32_storage[position] |= (0x1f & bytes[0]) << 6;
223 0 : return utf8_size;
224 0 : default:
225 0 : utf32_storage[position] |= 0x7f & bytes[0];
226 0 : return utf8_size;
227 : }
228 : }
229 :
230 0 : bool utf8_to_utf32(size_t offset, size_t size, Py_UNICODE *utf32_storage,
231 : const unsigned char *utf8)
232 : {
233 0 : size_t i = 0;
234 0 : int position = 0;
235 0 : int shift;
236 0 : for (i = 0; i < size; i++) {
237 0 : if (utf8[offset + position] == 0) {
238 0 : utf32_storage[i] = '\0';
239 0 : return true;
240 : }
241 :
242 0 : shift = utf8_char_to_utf32_char((int)i, utf32_storage,
243 0 : (int)(offset + position), utf8);
244 0 : if (shift < 0)
245 : return false;
246 0 : position += shift;
247 : }
248 : return true;
249 : }
250 :
/*
 * One-time initialisation hook for this translation unit: calls
 * _import_array() and discards its result.
 * NOTE(review): _import_array() is presumably NumPy's C-API import helper,
 * whose failure status is ignored here - confirm against the caller.
 */
void _unicode_init(void)
{
	(void)_import_array();
}
|