Line data Source code
1 : /* The latest version of this library is available on GitHub;
2 : * https://github.com/sheredom/utf8.h */
3 :
4 : /* This is free and unencumbered software released into the public domain.
5 : *
6 : * Anyone is free to copy, modify, publish, use, compile, sell, or
7 : * distribute this software, either in source code form or as a compiled
8 : * binary, for any purpose, commercial or non-commercial, and by any
9 : * means.
10 : *
11 : * In jurisdictions that recognize copyright laws, the author or authors
12 : * of this software dedicate any and all copyright interest in the
13 : * software to the public domain. We make this dedication for the benefit
14 : * of the public at large and to the detriment of our heirs and
15 : * successors. We intend this dedication to be an overt act of
16 : * relinquishment in perpetuity of all present and future rights to this
17 : * software under copyright law.
18 : *
19 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 : * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 : * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
23 : * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
24 : * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 : * OTHER DEALINGS IN THE SOFTWARE.
26 : *
27 : * For more information, please refer to <http://unlicense.org/> */
28 :
29 : #ifndef SHEREDOM_UTF8_H_INCLUDED
30 : #define SHEREDOM_UTF8_H_INCLUDED
31 :
32 : #if defined(_MSC_VER)
33 : #pragma warning(push)
34 :
35 : /* disable warning: no function prototype given: converting '()' to '(void)' */
36 : #pragma warning(disable : 4255)
37 :
38 : /* disable warning: '__cplusplus' is not defined as a preprocessor macro,
39 : * replacing with '0' for '#if/#elif' */
40 : #pragma warning(disable : 4668)
41 :
42 : /* disable warning: bytes padding added after construct */
43 : #pragma warning(disable : 4820)
44 : #endif
45 :
46 : #include <stddef.h>
47 : #include <stdlib.h>
48 :
49 : #if defined(_MSC_VER)
50 : #pragma warning(pop)
51 : #endif
52 :
53 : #if defined(_MSC_VER) && (_MSC_VER < 1920)
54 : typedef __int32 utf8_int32_t;
55 : #else
56 : #include <stdint.h>
57 : typedef int32_t utf8_int32_t;
58 : #endif
59 :
60 : #if defined(__clang__)
61 : #pragma clang diagnostic push
62 : #pragma clang diagnostic ignored "-Wold-style-cast"
63 : #pragma clang diagnostic ignored "-Wcast-qual"
64 : #endif
65 :
66 : #ifdef __cplusplus
67 : extern "C" {
68 : #endif
69 :
70 : #if defined(_MSC_VER)
71 : #define utf8_nonnull
72 : #define utf8_pure
73 : #define utf8_restrict __restrict
74 : #define utf8_weak __inline
75 : #elif defined(__clang__) || defined(__GNUC__)
76 : #define utf8_nonnull __attribute__((nonnull))
77 : #define utf8_pure __attribute__((pure))
78 : #define utf8_restrict __restrict__
79 : #define utf8_weak __attribute__((weak))
80 : #else
81 : #error Non clang, non gcc, non MSVC compiler found!
82 : #endif
83 :
84 : #ifdef __cplusplus
85 : #define utf8_null NULL
86 : #else
87 : #define utf8_null 0
88 : #endif
89 :
90 : #if (defined(__cplusplus) && __cplusplus >= 201402L)
91 : #define utf8_constexpr14 constexpr
92 : #define utf8_constexpr14_impl constexpr
93 : #else
94 : /* constexpr and weak are incompatible. so only enable one of them */
95 : #define utf8_constexpr14 utf8_weak
96 : #define utf8_constexpr14_impl
97 : #endif
98 :
99 : #if defined(__cplusplus) && __cplusplus >= 202002L
100 : using utf8_int8_t = char8_t; /* Introduced in C++20 */
101 : #else
102 : typedef char utf8_int8_t;
103 : #endif
104 :
105 : /* Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
106 : * src2 respectively, case insensitive. */
107 : utf8_constexpr14 utf8_nonnull utf8_pure int
108 : utf8casecmp(const utf8_int8_t *src1, const utf8_int8_t *src2);
109 :
110 : /* Append the utf8 string src onto the utf8 string dst. */
111 : utf8_nonnull utf8_weak utf8_int8_t *
112 : utf8cat(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src);
113 :
114 : /* Find the first match of the utf8 codepoint chr in the utf8 string src. */
115 : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
116 : utf8chr(const utf8_int8_t *src, utf8_int32_t chr);
117 :
118 : /* Return less than 0, 0, greater than 0 if src1 < src2,
119 : * src1 == src2, src1 > src2 respectively. */
120 : utf8_constexpr14 utf8_nonnull utf8_pure int utf8cmp(const utf8_int8_t *src1,
121 : const utf8_int8_t *src2);
122 :
123 : /* Copy the utf8 string src onto the memory allocated in dst. */
124 : utf8_nonnull utf8_weak utf8_int8_t *
125 : utf8cpy(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src);
126 :
127 : /* Number of utf8 codepoints in the utf8 string src that consists entirely
128 : * of utf8 codepoints not from the utf8 string reject. */
129 : utf8_constexpr14 utf8_nonnull utf8_pure size_t
130 : utf8cspn(const utf8_int8_t *src, const utf8_int8_t *reject);
131 :
132 : /* Duplicate the utf8 string src by getting its size, malloc'ing a new buffer
133 : * copying over the data, and returning that. Or 0 if malloc failed. */
134 : utf8_weak utf8_int8_t *utf8dup(const utf8_int8_t *src);
135 :
136 : /* Number of utf8 codepoints in the utf8 string str,
137 : * excluding the null terminating byte. */
138 : utf8_constexpr14 utf8_nonnull utf8_pure size_t utf8len(const utf8_int8_t *str);
139 :
140 : /* Similar to utf8len, except that at most n bytes of str are examined. */
141 : utf8_constexpr14 utf8_nonnull utf8_pure size_t utf8nlen(const utf8_int8_t *str,
142 : size_t n);
143 :
144 : /* Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
145 : * src2 respectively, case insensitive. Checking at most n bytes of each utf8
146 : * string. */
147 : utf8_constexpr14 utf8_nonnull utf8_pure int
148 : utf8ncasecmp(const utf8_int8_t *src1, const utf8_int8_t *src2, size_t n);
149 :
150 : /* Append the utf8 string src onto the utf8 string dst,
151 : * writing at most n+1 bytes. Can produce an invalid utf8
152 : * string if n falls partway through a utf8 codepoint. */
153 : utf8_nonnull utf8_weak utf8_int8_t *
154 : utf8ncat(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src,
155 : size_t n);
156 :
157 : /* Return less than 0, 0, greater than 0 if src1 < src2,
158 : * src1 == src2, src1 > src2 respectively. Checking at most n
159 : * bytes of each utf8 string. */
160 : utf8_constexpr14 utf8_nonnull utf8_pure int
161 : utf8ncmp(const utf8_int8_t *src1, const utf8_int8_t *src2, size_t n);
162 :
163 : /* Copy the utf8 string src onto the memory allocated in dst.
164 : * Copies at most n bytes. If n falls partway through a utf8
165 : * codepoint, or if dst doesn't have enough room for a null
166 : * terminator, the final string will be cut short to preserve
167 : * utf8 validity. */
168 :
169 : utf8_nonnull utf8_weak utf8_int8_t *
170 : utf8ncpy(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src,
171 : size_t n);
172 :
173 : /* Similar to utf8dup, except that at most n bytes of src are copied. If src is
174 : * longer than n, only n bytes are copied and a null byte is added.
175 : *
176 : * Returns a new string if successful, 0 otherwise */
177 : utf8_weak utf8_int8_t *utf8ndup(const utf8_int8_t *src, size_t n);
178 :
179 : /* Locates the first occurrence in the utf8 string str of any codepoint in the
180 : * utf8 string accept, or 0 if no match was found. */
181 : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
182 : utf8pbrk(const utf8_int8_t *str, const utf8_int8_t *accept);
183 :
184 : /* Find the last match of the utf8 codepoint chr in the utf8 string src. */
185 : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
186 : utf8rchr(const utf8_int8_t *src, int chr);
187 :
188 : /* Number of bytes in the utf8 string str,
189 : * including the null terminating byte. */
190 : utf8_constexpr14 utf8_nonnull utf8_pure size_t utf8size(const utf8_int8_t *str);
191 :
192 : /* Similar to utf8size, except that the null terminating byte is excluded. */
193 : utf8_constexpr14 utf8_nonnull utf8_pure size_t
194 : utf8size_lazy(const utf8_int8_t *str);
195 :
196 : /* Similar to utf8size, except that at most n bytes of str are examined and
197 : * the null terminating byte is excluded. */
198 : utf8_constexpr14 utf8_nonnull utf8_pure size_t
199 : utf8nsize_lazy(const utf8_int8_t *str, size_t n);
200 :
201 : /* Number of utf8 codepoints in the utf8 string src that consists entirely
202 : * of utf8 codepoints from the utf8 string accept. */
203 : utf8_constexpr14 utf8_nonnull utf8_pure size_t
204 : utf8spn(const utf8_int8_t *src, const utf8_int8_t *accept);
205 :
206 : /* The position of the utf8 string needle in the utf8 string haystack. */
207 : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
208 : utf8str(const utf8_int8_t *haystack, const utf8_int8_t *needle);
209 :
210 : /* The position of the utf8 string needle in the utf8 string haystack, case
211 : * insensitive. */
212 : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
213 : utf8casestr(const utf8_int8_t *haystack, const utf8_int8_t *needle);
214 :
215 : /* Return 0 on success, or the position of the invalid
216 : * utf8 codepoint on failure. */
217 : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
218 : utf8valid(const utf8_int8_t *str);
219 :
220 : /* Similar to utf8valid, except that at most n bytes of str are examined. */
221 : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
222 : utf8nvalid(const utf8_int8_t *str, size_t n);
223 :
224 : /* Given a null-terminated string, makes the string valid by replacing invalid
225 : * codepoints with a 1-byte replacement. Returns 0 on success. */
226 : utf8_nonnull utf8_weak int utf8makevalid(utf8_int8_t *str,
227 : const utf8_int32_t replacement);
228 :
229 : /* Sets out_codepoint to the current utf8 codepoint in str, and returns the
230 : * address of the next utf8 codepoint after the current one in str. */
231 : utf8_constexpr14 utf8_nonnull utf8_int8_t *
232 : utf8codepoint(const utf8_int8_t *utf8_restrict str,
233 : utf8_int32_t *utf8_restrict out_codepoint);
234 :
235 : /* Calculates the size of the next utf8 codepoint in str. */
236 : utf8_constexpr14 utf8_nonnull size_t
237 : utf8codepointcalcsize(const utf8_int8_t *str);
238 :
239 : /* Returns the size of the given codepoint in bytes. */
240 : utf8_constexpr14 size_t utf8codepointsize(utf8_int32_t chr);
241 :
242 : /* Write a codepoint to the given string, and return the address to the next
243 : * place after the written codepoint. Pass how many bytes left in the buffer to
244 : * n. If there is not enough space for the codepoint, this function returns
245 : * null. */
246 : utf8_nonnull utf8_weak utf8_int8_t *
247 : utf8catcodepoint(utf8_int8_t *str, utf8_int32_t chr, size_t n);
248 :
249 : /* Returns 1 if the given character is lowercase, or 0 if it is not. */
250 : utf8_constexpr14 int utf8islower(utf8_int32_t chr);
251 :
252 : /* Returns 1 if the given character is uppercase, or 0 if it is not. */
253 : utf8_constexpr14 int utf8isupper(utf8_int32_t chr);
254 :
255 : /* Transform the given string into all lowercase codepoints. */
256 : utf8_nonnull utf8_weak void utf8lwr(utf8_int8_t *utf8_restrict str);
257 :
258 : /* Transform the given string into all uppercase codepoints. */
259 : utf8_nonnull utf8_weak void utf8upr(utf8_int8_t *utf8_restrict str);
260 :
261 : /* Make a codepoint lower case if possible. */
262 : utf8_constexpr14 utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp);
263 :
264 : /* Make a codepoint upper case if possible. */
265 : utf8_constexpr14 utf8_int32_t utf8uprcodepoint(utf8_int32_t cp);
266 :
267 : /* Sets out_codepoint to the current utf8 codepoint in str, and returns the
268 : * address of the previous utf8 codepoint before the current one in str. */
269 : utf8_constexpr14 utf8_nonnull utf8_int8_t *
270 : utf8rcodepoint(const utf8_int8_t *utf8_restrict str,
271 : utf8_int32_t *utf8_restrict out_codepoint);
272 :
273 : /* Duplicate the utf8 string src by getting its size, calling alloc_func_ptr to
274 : * copy over data to a new buffer, and returning that. Or 0 if alloc_func_ptr
275 : * returned null. */
276 : utf8_weak utf8_int8_t *utf8dup_ex(const utf8_int8_t *src,
277 : utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *,
278 : size_t),
279 : utf8_int8_t *user_data);
280 :
281 : /* Similar to utf8dup, except that at most n bytes of src are copied. If src is
282 : * longer than n, only n bytes are copied and a null byte is added.
283 : *
284 : * Returns a new string if successful, 0 otherwise. */
285 : utf8_weak utf8_int8_t *utf8ndup_ex(const utf8_int8_t *src, size_t n,
286 : utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *,
287 : size_t),
288 : utf8_int8_t *user_data);
289 :
290 : #undef utf8_weak
291 : #undef utf8_pure
292 : #undef utf8_nonnull
293 :
294 113 : utf8_constexpr14_impl int utf8casecmp(const utf8_int8_t *src1,
295 : const utf8_int8_t *src2) {
296 113 : utf8_int32_t src1_lwr_cp = 0, src2_lwr_cp = 0, src1_upr_cp = 0,
297 113 : src2_upr_cp = 0, src1_orig_cp = 0, src2_orig_cp = 0;
298 :
299 285 : for (;;) {
300 199 : src1 = utf8codepoint(src1, &src1_orig_cp);
301 199 : src2 = utf8codepoint(src2, &src2_orig_cp);
302 :
303 : /* lower the srcs if required */
304 199 : src1_lwr_cp = utf8lwrcodepoint(src1_orig_cp);
305 199 : src2_lwr_cp = utf8lwrcodepoint(src2_orig_cp);
306 :
307 : /* lower the srcs if required */
308 199 : src1_upr_cp = utf8uprcodepoint(src1_orig_cp);
309 199 : src2_upr_cp = utf8uprcodepoint(src2_orig_cp);
310 :
311 : /* check if the lowered codepoints match */
312 199 : if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
313 : return 0;
314 182 : } else if ((src1_lwr_cp == src2_lwr_cp) || (src1_upr_cp == src2_upr_cp)) {
315 86 : continue;
316 : }
317 :
318 : /* if they don't match, then we return the difference between the characters
319 : */
320 96 : return src1_lwr_cp - src2_lwr_cp;
321 : }
322 : }
323 :
324 0 : utf8_int8_t *utf8cat(utf8_int8_t *utf8_restrict dst,
325 : const utf8_int8_t *utf8_restrict src) {
326 0 : utf8_int8_t *d = dst;
327 : /* find the null terminating byte in dst */
328 0 : while ('\0' != *d) {
329 0 : d++;
330 : }
331 :
332 : /* overwriting the null terminating byte in dst, append src byte-by-byte */
333 0 : while ('\0' != *src) {
334 0 : *d++ = *src++;
335 : }
336 :
337 : /* write out a new null terminating byte into dst */
338 0 : *d = '\0';
339 :
340 0 : return dst;
341 : }
342 :
343 0 : utf8_constexpr14_impl utf8_int8_t *utf8chr(const utf8_int8_t *src,
344 : utf8_int32_t chr) {
345 0 : utf8_int8_t c[5] = {'\0', '\0', '\0', '\0', '\0'};
346 :
347 0 : if (0 == chr) {
348 : /* being asked to return position of null terminating byte, so
349 : * just run s to the end, and return! */
350 0 : while ('\0' != *src) {
351 0 : src++;
352 : }
353 0 : return (utf8_int8_t *)src;
354 0 : } else if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
355 : /* 1-byte/7-bit ascii
356 : * (0b0xxxxxxx) */
357 0 : c[0] = (utf8_int8_t)chr;
358 0 : } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
359 : /* 2-byte/11-bit utf8 code point
360 : * (0b110xxxxx 0b10xxxxxx) */
361 0 : c[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)(chr >> 6));
362 0 : c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
363 0 : } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
364 : /* 3-byte/16-bit utf8 code point
365 : * (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) */
366 0 : c[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)(chr >> 12));
367 0 : c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
368 0 : c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
369 : } else { /* if (0 == ((int)0xffe00000 & chr)) { */
370 : /* 4-byte/21-bit utf8 code point
371 : * (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) */
372 0 : c[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)(chr >> 18));
373 0 : c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
374 0 : c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
375 0 : c[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
376 : }
377 :
378 : /* we've made c into a 2 utf8 codepoint string, one for the chr we are
379 : * seeking, another for the null terminating byte. Now use utf8str to
380 : * search */
381 0 : return utf8str(src, c);
382 : }
383 :
384 0 : utf8_constexpr14_impl int utf8cmp(const utf8_int8_t *src1,
385 : const utf8_int8_t *src2) {
386 0 : while (('\0' != *src1) || ('\0' != *src2)) {
387 0 : if (*src1 < *src2) {
388 : return -1;
389 0 : } else if (*src1 > *src2) {
390 : return 1;
391 : }
392 :
393 0 : src1++;
394 0 : src2++;
395 : }
396 :
397 : /* both utf8 strings matched */
398 : return 0;
399 : }
400 :
401 : utf8_constexpr14_impl int utf8coll(const utf8_int8_t *src1,
402 : const utf8_int8_t *src2);
403 :
404 0 : utf8_int8_t *utf8cpy(utf8_int8_t *utf8_restrict dst,
405 : const utf8_int8_t *utf8_restrict src) {
406 0 : utf8_int8_t *d = dst;
407 :
408 : /* overwriting anything previously in dst, write byte-by-byte
409 : * from src */
410 0 : while ('\0' != *src) {
411 0 : *d++ = *src++;
412 : }
413 :
414 : /* append null terminating byte */
415 0 : *d = '\0';
416 :
417 0 : return dst;
418 : }
419 :
420 0 : utf8_constexpr14_impl size_t utf8cspn(const utf8_int8_t *src,
421 : const utf8_int8_t *reject) {
422 0 : size_t chars = 0;
423 :
424 0 : while ('\0' != *src) {
425 : const utf8_int8_t *r = reject;
426 : size_t offset = 0;
427 :
428 0 : while ('\0' != *r) {
429 : /* checking that if *r is the start of a utf8 codepoint
430 : * (it is not 0b10xxxxxx) and we have successfully matched
431 : * a previous character (0 < offset) - we found a match */
432 0 : if ((0x80 != (0xc0 & *r)) && (0 < offset)) {
433 0 : return chars;
434 : } else {
435 0 : if (*r == src[offset]) {
436 : /* part of a utf8 codepoint matched, so move our checking
437 : * onwards to the next byte */
438 0 : offset++;
439 0 : r++;
440 : } else {
441 : /* r could be in the middle of an unmatching utf8 code point,
442 : * so we need to march it on to the next character beginning, */
443 :
444 0 : do {
445 0 : r++;
446 0 : } while (0x80 == (0xc0 & *r));
447 :
448 : /* reset offset too as we found a mismatch */
449 : offset = 0;
450 : }
451 : }
452 : }
453 :
454 : /* found a match at the end of *r, so didn't get a chance to test it */
455 0 : if (0 < offset) {
456 0 : return chars;
457 : }
458 :
459 : /* the current utf8 codepoint in src did not match reject, but src
460 : * could have been partway through a utf8 codepoint, so we need to
461 : * march it onto the next utf8 codepoint starting byte */
462 0 : do {
463 0 : src++;
464 0 : } while ((0x80 == (0xc0 & *src)));
465 0 : chars++;
466 : }
467 :
468 : return chars;
469 : }
470 :
471 0 : utf8_int8_t *utf8dup(const utf8_int8_t *src) {
472 0 : return utf8dup_ex(src, utf8_null, utf8_null);
473 : }
474 :
475 0 : utf8_int8_t *utf8dup_ex(const utf8_int8_t *src,
476 : utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *, size_t),
477 : utf8_int8_t *user_data) {
478 0 : utf8_int8_t *n = utf8_null;
479 :
480 : /* figure out how many bytes (including the terminator) we need to copy first
481 : */
482 0 : size_t bytes = utf8size(src);
483 :
484 0 : if (alloc_func_ptr) {
485 0 : n = alloc_func_ptr(user_data, bytes);
486 : } else {
487 : #if !defined(UTF8_NO_STD_MALLOC)
488 0 : n = (utf8_int8_t *)malloc(bytes);
489 : #else
490 : return utf8_null;
491 : #endif
492 : }
493 :
494 0 : if (utf8_null == n) {
495 : /* out of memory so we bail */
496 : return utf8_null;
497 : } else {
498 : bytes = 0;
499 :
500 : /* copy src byte-by-byte into our new utf8 string */
501 0 : while ('\0' != src[bytes]) {
502 0 : n[bytes] = src[bytes];
503 0 : bytes++;
504 : }
505 :
506 : /* append null terminating byte */
507 0 : n[bytes] = '\0';
508 0 : return n;
509 : }
510 : }
511 :
512 : utf8_constexpr14_impl utf8_int8_t *utf8fry(const utf8_int8_t *str);
513 :
514 34750332 : utf8_constexpr14_impl size_t utf8len(const utf8_int8_t *str) {
515 34750332 : return utf8nlen(str, SIZE_MAX);
516 : }
517 :
518 34741394 : utf8_constexpr14_impl size_t utf8nlen(const utf8_int8_t *str, size_t n) {
519 34741394 : const utf8_int8_t *t = str;
520 34741394 : size_t length = 0;
521 :
522 1086826508 : while ((size_t)(str - t) < n && '\0' != *str) {
523 1052085114 : if (0xf0 == (0xf8 & *str)) {
524 : /* 4-byte utf8 code point (began with 0b11110xxx) */
525 3 : str += 4;
526 1052085111 : } else if (0xe0 == (0xf0 & *str)) {
527 : /* 3-byte utf8 code point (began with 0b1110xxxx) */
528 172 : str += 3;
529 1052084939 : } else if (0xc0 == (0xe0 & *str)) {
530 : /* 2-byte utf8 code point (began with 0b110xxxxx) */
531 1616 : str += 2;
532 : } else { /* if (0x00 == (0x80 & *s)) { */
533 : /* 1-byte ascii (began with 0b0xxxxxxx) */
534 1052083323 : str += 1;
535 : }
536 :
537 : /* no matter the bytes we marched s forward by, it was
538 : * only 1 utf8 codepoint */
539 1052085114 : length++;
540 : }
541 :
542 34741394 : if ((size_t)(str - t) > n) {
543 0 : length--;
544 : }
545 34741394 : return length;
546 : }
547 :
548 65 : utf8_constexpr14_impl int utf8ncasecmp(const utf8_int8_t *src1,
549 : const utf8_int8_t *src2, size_t n) {
550 65 : utf8_int32_t src1_lwr_cp = 0, src2_lwr_cp = 0, src1_upr_cp = 0,
551 65 : src2_upr_cp = 0, src1_orig_cp = 0, src2_orig_cp = 0;
552 :
553 111 : do {
554 111 : const utf8_int8_t *const s1 = src1;
555 111 : const utf8_int8_t *const s2 = src2;
556 :
557 : /* first check that we have enough bytes left in n to contain an entire
558 : * codepoint */
559 111 : if (0 == n) {
560 : return 0;
561 : }
562 :
563 111 : if ((1 == n) && ((0xc0 == (0xe0 & *s1)) || (0xc0 == (0xe0 & *s2)))) {
564 1 : const utf8_int32_t c1 = (0xe0 & *s1);
565 1 : const utf8_int32_t c2 = (0xe0 & *s2);
566 :
567 1 : if (c1 != c2) {
568 1 : return c1 - c2;
569 : } else {
570 : return 0;
571 : }
572 : }
573 :
574 110 : if ((2 >= n) && ((0xe0 == (0xf0 & *s1)) || (0xe0 == (0xf0 & *s2)))) {
575 0 : const utf8_int32_t c1 = (0xf0 & *s1);
576 0 : const utf8_int32_t c2 = (0xf0 & *s2);
577 :
578 0 : if (c1 != c2) {
579 0 : return c1 - c2;
580 : } else {
581 : return 0;
582 : }
583 : }
584 :
585 110 : if ((3 >= n) && ((0xf0 == (0xf8 & *s1)) || (0xf0 == (0xf8 & *s2)))) {
586 0 : const utf8_int32_t c1 = (0xf8 & *s1);
587 0 : const utf8_int32_t c2 = (0xf8 & *s2);
588 :
589 0 : if (c1 != c2) {
590 0 : return c1 - c2;
591 : } else {
592 : return 0;
593 : }
594 : }
595 :
596 110 : src1 = utf8codepoint(src1, &src1_orig_cp);
597 110 : src2 = utf8codepoint(src2, &src2_orig_cp);
598 110 : n -= utf8codepointsize(src1_orig_cp);
599 :
600 110 : src1_lwr_cp = utf8lwrcodepoint(src1_orig_cp);
601 110 : src2_lwr_cp = utf8lwrcodepoint(src2_orig_cp);
602 :
603 110 : src1_upr_cp = utf8uprcodepoint(src1_orig_cp);
604 110 : src2_upr_cp = utf8uprcodepoint(src2_orig_cp);
605 :
606 : /* check if the lowered codepoints match */
607 110 : if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
608 : return 0;
609 110 : } else if ((src1_lwr_cp == src2_lwr_cp) || (src1_upr_cp == src2_upr_cp)) {
610 58 : continue;
611 : }
612 :
613 : /* if they don't match, then we return the difference between the characters
614 : */
615 52 : return src1_lwr_cp - src2_lwr_cp;
616 58 : } while (0 < n);
617 :
618 : /* both utf8 strings matched */
619 : return 0;
620 : }
621 :
622 0 : utf8_int8_t *utf8ncat(utf8_int8_t *utf8_restrict dst,
623 : const utf8_int8_t *utf8_restrict src, size_t n) {
624 0 : utf8_int8_t *d = dst;
625 :
626 : /* find the null terminating byte in dst */
627 0 : while ('\0' != *d) {
628 0 : d++;
629 : }
630 :
631 : /* overwriting the null terminating byte in dst, append src byte-by-byte
632 : * stopping if we run out of space */
633 0 : while (('\0' != *src) && (0 != n--)) {
634 0 : *d++ = *src++;
635 : }
636 :
637 : /* write out a new null terminating byte into dst */
638 0 : *d = '\0';
639 :
640 0 : return dst;
641 : }
642 :
643 0 : utf8_constexpr14_impl int utf8ncmp(const utf8_int8_t *src1,
644 : const utf8_int8_t *src2, size_t n) {
645 0 : while ((0 != n--) && (('\0' != *src1) || ('\0' != *src2))) {
646 0 : if (*src1 < *src2) {
647 : return -1;
648 0 : } else if (*src1 > *src2) {
649 : return 1;
650 : }
651 :
652 0 : src1++;
653 0 : src2++;
654 : }
655 :
656 : /* both utf8 strings matched */
657 : return 0;
658 : }
659 :
660 0 : utf8_int8_t *utf8ncpy(utf8_int8_t *utf8_restrict dst,
661 : const utf8_int8_t *utf8_restrict src, size_t n) {
662 0 : utf8_int8_t *d = dst;
663 0 : size_t index = 0, check_index = 0;
664 :
665 0 : if (n == 0) {
666 : return dst;
667 : }
668 :
669 : /* overwriting anything previously in dst, write byte-by-byte
670 : * from src */
671 0 : for (index = 0; index < n; index++) {
672 0 : d[index] = src[index];
673 0 : if ('\0' == src[index]) {
674 : break;
675 : }
676 : }
677 :
678 0 : for (check_index = index - 1;
679 0 : check_index > 0 && 0x80 == (0xc0 & d[check_index]); check_index--) {
680 : /* just moving the index */
681 0 : }
682 :
683 0 : if (check_index < index &&
684 0 : (index - check_index) < utf8codepointsize(d[check_index])) {
685 0 : index = check_index;
686 : }
687 :
688 : /* append null terminating byte */
689 0 : for (; index < n; index++) {
690 0 : d[index] = 0;
691 : }
692 :
693 : return dst;
694 : }
695 :
696 0 : utf8_int8_t *utf8ndup(const utf8_int8_t *src, size_t n) {
697 0 : return utf8ndup_ex(src, n, utf8_null, utf8_null);
698 : }
699 :
700 0 : utf8_int8_t *utf8ndup_ex(const utf8_int8_t *src, size_t n,
701 : utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *, size_t),
702 : utf8_int8_t *user_data) {
703 0 : utf8_int8_t *c = utf8_null;
704 0 : size_t bytes = 0;
705 :
706 : /* Find the end of the string or stop when n is reached */
707 0 : while ('\0' != src[bytes] && bytes < n) {
708 0 : bytes++;
709 : }
710 :
711 : /* In case bytes is actually less than n, we need to set it
712 : * to be used later in the copy byte by byte. */
713 0 : n = bytes;
714 :
715 0 : if (alloc_func_ptr) {
716 0 : c = alloc_func_ptr(user_data, bytes + 1);
717 : } else {
718 : #if !defined(UTF8_NO_STD_MALLOC)
719 0 : c = (utf8_int8_t *)malloc(bytes + 1);
720 : #else
721 : c = utf8_null;
722 : #endif
723 : }
724 :
725 0 : if (utf8_null == c) {
726 : /* out of memory so we bail */
727 : return utf8_null;
728 : }
729 :
730 : bytes = 0;
731 :
732 : /* copy src byte-by-byte into our new utf8 string */
733 0 : while ('\0' != src[bytes] && bytes < n) {
734 0 : c[bytes] = src[bytes];
735 0 : bytes++;
736 : }
737 :
738 : /* append null terminating byte */
739 0 : c[bytes] = '\0';
740 0 : return c;
741 : }
742 :
743 0 : utf8_constexpr14_impl utf8_int8_t *utf8rchr(const utf8_int8_t *src, int chr) {
744 :
745 0 : utf8_int8_t *match = utf8_null;
746 0 : utf8_int8_t c[5] = {'\0', '\0', '\0', '\0', '\0'};
747 :
748 0 : if (0 == chr) {
749 : /* being asked to return position of null terminating byte, so
750 : * just run s to the end, and return! */
751 0 : while ('\0' != *src) {
752 0 : src++;
753 : }
754 0 : return (utf8_int8_t *)src;
755 0 : } else if (0 == ((int)0xffffff80 & chr)) {
756 : /* 1-byte/7-bit ascii
757 : * (0b0xxxxxxx) */
758 0 : c[0] = (utf8_int8_t)chr;
759 0 : } else if (0 == ((int)0xfffff800 & chr)) {
760 : /* 2-byte/11-bit utf8 code point
761 : * (0b110xxxxx 0b10xxxxxx) */
762 0 : c[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)(chr >> 6));
763 0 : c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
764 0 : } else if (0 == ((int)0xffff0000 & chr)) {
765 : /* 3-byte/16-bit utf8 code point
766 : * (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) */
767 0 : c[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)(chr >> 12));
768 0 : c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
769 0 : c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
770 : } else { /* if (0 == ((int)0xffe00000 & chr)) { */
771 : /* 4-byte/21-bit utf8 code point
772 : * (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) */
773 0 : c[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)(chr >> 18));
774 0 : c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
775 0 : c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
776 0 : c[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
777 : }
778 :
779 : /* we've created a 2 utf8 codepoint string in c that is
780 : * the utf8 character asked for by chr, and a null
781 : * terminating byte */
782 :
783 0 : while ('\0' != *src) {
784 : size_t offset = 0;
785 :
786 0 : while (src[offset] == c[offset]) {
787 0 : offset++;
788 : }
789 :
790 0 : if ('\0' == c[offset]) {
791 : /* we found a matching utf8 code point */
792 : match = (utf8_int8_t *)src;
793 : src += offset;
794 : } else {
795 0 : src += offset;
796 :
797 : /* need to march s along to next utf8 codepoint start
798 : * (the next byte that doesn't match 0b10xxxxxx) */
799 0 : if ('\0' != *src) {
800 0 : do {
801 0 : src++;
802 0 : } while (0x80 == (0xc0 & *src));
803 : }
804 : }
805 : }
806 :
807 : /* return the last match we found (or 0 if no match was found) */
808 : return match;
809 : }
810 :
811 0 : utf8_constexpr14_impl utf8_int8_t *utf8pbrk(const utf8_int8_t *str,
812 : const utf8_int8_t *accept) {
813 0 : while ('\0' != *str) {
814 : const utf8_int8_t *a = accept;
815 : size_t offset = 0;
816 :
817 0 : while ('\0' != *a) {
818 : /* checking that if *a is the start of a utf8 codepoint
819 : * (it is not 0b10xxxxxx) and we have successfully matched
820 : * a previous character (0 < offset) - we found a match */
821 0 : if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
822 0 : return (utf8_int8_t *)str;
823 : } else {
824 0 : if (*a == str[offset]) {
825 : /* part of a utf8 codepoint matched, so move our checking
826 : * onwards to the next byte */
827 0 : offset++;
828 0 : a++;
829 : } else {
830 : /* r could be in the middle of an unmatching utf8 code point,
831 : * so we need to march it on to the next character beginning, */
832 :
833 0 : do {
834 0 : a++;
835 0 : } while (0x80 == (0xc0 & *a));
836 :
837 : /* reset offset too as we found a mismatch */
838 : offset = 0;
839 : }
840 : }
841 : }
842 :
843 : /* we found a match on the last utf8 codepoint */
844 0 : if (0 < offset) {
845 0 : return (utf8_int8_t *)str;
846 : }
847 :
848 : /* the current utf8 codepoint in src did not match accept, but src
849 : * could have been partway through a utf8 codepoint, so we need to
850 : * march it onto the next utf8 codepoint starting byte */
851 0 : do {
852 0 : str++;
853 0 : } while ((0x80 == (0xc0 & *str)));
854 : }
855 :
856 : return utf8_null;
857 : }
858 :
859 0 : utf8_constexpr14_impl size_t utf8size(const utf8_int8_t *str) {
860 0 : return utf8size_lazy(str) + 1;
861 : }
862 :
863 0 : utf8_constexpr14_impl size_t utf8size_lazy(const utf8_int8_t *str) {
864 0 : return utf8nsize_lazy(str, SIZE_MAX);
865 : }
866 :
867 0 : utf8_constexpr14_impl size_t utf8nsize_lazy(const utf8_int8_t *str, size_t n) {
868 0 : size_t size = 0;
869 0 : while (size < n && '\0' != str[size]) {
870 0 : size++;
871 : }
872 0 : return size;
873 : }
874 :
875 0 : utf8_constexpr14_impl size_t utf8spn(const utf8_int8_t *src,
876 : const utf8_int8_t *accept) {
877 0 : size_t chars = 0;
878 :
879 0 : while ('\0' != *src) {
880 : const utf8_int8_t *a = accept;
881 : size_t offset = 0;
882 :
883 0 : while ('\0' != *a) {
884 : /* checking that if *r is the start of a utf8 codepoint
885 : * (it is not 0b10xxxxxx) and we have successfully matched
886 : * a previous character (0 < offset) - we found a match */
887 0 : if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
888 : /* found a match, so increment the number of utf8 codepoints
889 : * that have matched and stop checking whether any other utf8
890 : * codepoints in a match */
891 0 : chars++;
892 0 : src += offset;
893 0 : offset = 0;
894 0 : break;
895 : } else {
896 0 : if (*a == src[offset]) {
897 0 : offset++;
898 0 : a++;
899 : } else {
900 : /* a could be in the middle of an unmatching utf8 codepoint,
901 : * so we need to march it on to the next character beginning, */
902 0 : do {
903 0 : a++;
904 0 : } while (0x80 == (0xc0 & *a));
905 :
906 : /* reset offset too as we found a mismatch */
907 : offset = 0;
908 : }
909 : }
910 : }
911 :
912 : /* found a match at the end of *a, so didn't get a chance to test it */
913 0 : if (0 < offset) {
914 0 : chars++;
915 0 : src += offset;
916 0 : continue;
917 : }
918 :
919 : /* if a got to its terminating null byte, then we didn't find a match.
920 : * Return the current number of matched utf8 codepoints */
921 0 : if ('\0' == *a) {
922 0 : return chars;
923 : }
924 : }
925 :
926 : return chars;
927 : }
928 :
929 0 : utf8_constexpr14_impl utf8_int8_t *utf8str(const utf8_int8_t *haystack,
930 : const utf8_int8_t *needle) {
931 0 : utf8_int32_t throwaway_codepoint = 0;
932 :
933 : /* if needle has no utf8 codepoints before the null terminating
934 : * byte then return haystack */
935 0 : if ('\0' == *needle) {
936 : return (utf8_int8_t *)haystack;
937 : }
938 :
939 0 : while ('\0' != *haystack) {
940 : const utf8_int8_t *maybeMatch = haystack;
941 : const utf8_int8_t *n = needle;
942 :
943 0 : while (*haystack == *n && (*haystack != '\0' && *n != '\0')) {
944 0 : n++;
945 0 : haystack++;
946 : }
947 :
948 0 : if ('\0' == *n) {
949 : /* we found the whole utf8 string for needle in haystack at
950 : * maybeMatch, so return it */
951 0 : return (utf8_int8_t *)maybeMatch;
952 : } else {
953 : /* h could be in the middle of an unmatching utf8 codepoint,
954 : * so we need to march it on to the next character beginning
955 : * starting from the current character */
956 0 : haystack = utf8codepoint(maybeMatch, &throwaway_codepoint);
957 : }
958 : }
959 :
960 : /* no match */
961 : return utf8_null;
962 : }
963 :
964 164 : utf8_constexpr14_impl utf8_int8_t *utf8casestr(const utf8_int8_t *haystack,
965 : const utf8_int8_t *needle) {
966 : /* if needle has no utf8 codepoints before the null terminating
967 : * byte then return haystack */
968 164 : if ('\0' == *needle) {
969 : return (utf8_int8_t *)haystack;
970 : }
971 :
972 3110 : for (;;) {
973 1637 : const utf8_int8_t *maybeMatch = haystack;
974 1637 : const utf8_int8_t *n = needle;
975 1637 : utf8_int32_t h_cp = 0, n_cp = 0;
976 :
977 : /* Get the next code point and track it */
978 1637 : const utf8_int8_t *nextH = haystack = utf8codepoint(haystack, &h_cp);
979 1599 : n = utf8codepoint(n, &n_cp);
980 :
981 1751 : while ((0 != h_cp) && (0 != n_cp)) {
982 1592 : h_cp = utf8lwrcodepoint(h_cp);
983 1579 : n_cp = utf8lwrcodepoint(n_cp);
984 :
985 : /* if we find a mismatch, bail out! */
986 1640 : if (h_cp != n_cp) {
987 : break;
988 : }
989 :
990 166 : haystack = utf8codepoint(haystack, &h_cp);
991 165 : n = utf8codepoint(n, &n_cp);
992 : }
993 :
994 1635 : if (0 == n_cp) {
995 : /* we found the whole utf8 string for needle in haystack at
996 : * maybeMatch, so return it */
997 162 : return (utf8_int8_t *)maybeMatch;
998 : }
999 :
1000 1607 : if (0 == h_cp) {
1001 : /* no match */
1002 : return utf8_null;
1003 : }
1004 :
1005 : /* Roll back to the next code point in the haystack to test */
1006 1473 : haystack = nextH;
1007 : }
1008 : }
1009 :
1010 42109717 : utf8_constexpr14_impl utf8_int8_t *utf8valid(const utf8_int8_t *str) {
1011 42109717 : return utf8nvalid(str, SIZE_MAX);
1012 : }
1013 :
1014 42454067 : utf8_constexpr14_impl utf8_int8_t *utf8nvalid(const utf8_int8_t *str,
1015 : size_t n) {
1016 42454067 : const utf8_int8_t *t = str;
1017 42454067 : size_t consumed = 0;
1018 :
1019 1451246962 : while ((void)(consumed = (size_t)(str - t)), consumed < n && '\0' != *str) {
1020 1408792895 : const size_t remaining = n - consumed;
1021 :
1022 1408792895 : if (0xf0 == (0xf8 & *str)) {
1023 : /* ensure that there's 4 bytes or more remaining */
1024 3 : if (remaining < 4) {
1025 0 : return (utf8_int8_t *)str;
1026 : }
1027 :
1028 : /* ensure each of the 3 following bytes in this 4-byte
1029 : * utf8 codepoint began with 0b10xxxxxx */
1030 3 : if ((0x80 != (0xc0 & str[1])) || (0x80 != (0xc0 & str[2])) ||
1031 3 : (0x80 != (0xc0 & str[3]))) {
1032 0 : return (utf8_int8_t *)str;
1033 : }
1034 :
1035 : /* ensure that our utf8 codepoint ended after 4 bytes */
1036 3 : if ((remaining != 4) && (0x80 == (0xc0 & str[4]))) {
1037 0 : return (utf8_int8_t *)str;
1038 : }
1039 :
1040 : /* ensure that the top 5 bits of this 4-byte utf8
1041 : * codepoint were not 0, as then we could have used
1042 : * one of the smaller encodings */
1043 3 : if ((0 == (0x07 & str[0])) && (0 == (0x30 & str[1]))) {
1044 0 : return (utf8_int8_t *)str;
1045 : }
1046 :
1047 : /* 4-byte utf8 code point (began with 0b11110xxx) */
1048 3 : str += 4;
1049 1408792892 : } else if (0xe0 == (0xf0 & *str)) {
1050 : /* ensure that there's 3 bytes or more remaining */
1051 233 : if (remaining < 3) {
1052 0 : return (utf8_int8_t *)str;
1053 : }
1054 :
1055 : /* ensure each of the 2 following bytes in this 3-byte
1056 : * utf8 codepoint began with 0b10xxxxxx */
1057 233 : if ((0x80 != (0xc0 & str[1])) || (0x80 != (0xc0 & str[2]))) {
1058 0 : return (utf8_int8_t *)str;
1059 : }
1060 :
1061 : /* ensure that our utf8 codepoint ended after 3 bytes */
1062 233 : if ((remaining != 3) && (0x80 == (0xc0 & str[3]))) {
1063 0 : return (utf8_int8_t *)str;
1064 : }
1065 :
1066 : /* ensure that the top 5 bits of this 3-byte utf8
1067 : * codepoint were not 0, as then we could have used
1068 : * one of the smaller encodings */
1069 233 : if ((0 == (0x0f & str[0])) && (0 == (0x20 & str[1]))) {
1070 0 : return (utf8_int8_t *)str;
1071 : }
1072 :
1073 : /* 3-byte utf8 code point (began with 0b1110xxxx) */
1074 233 : str += 3;
1075 1408792659 : } else if (0xc0 == (0xe0 & *str)) {
1076 : /* ensure that there's 2 bytes or more remaining */
1077 2259 : if (remaining < 2) {
1078 0 : return (utf8_int8_t *)str;
1079 : }
1080 :
1081 : /* ensure the 1 following byte in this 2-byte
1082 : * utf8 codepoint began with 0b10xxxxxx */
1083 2259 : if (0x80 != (0xc0 & str[1])) {
1084 0 : return (utf8_int8_t *)str;
1085 : }
1086 :
1087 : /* ensure that our utf8 codepoint ended after 2 bytes */
1088 2259 : if ((remaining != 2) && (0x80 == (0xc0 & str[2]))) {
1089 0 : return (utf8_int8_t *)str;
1090 : }
1091 :
1092 : /* ensure that the top 4 bits of this 2-byte utf8
1093 : * codepoint were not 0, as then we could have used
1094 : * one of the smaller encodings */
1095 2259 : if (0 == (0x1e & str[0])) {
1096 0 : return (utf8_int8_t *)str;
1097 : }
1098 :
1099 : /* 2-byte utf8 code point (began with 0b110xxxxx) */
1100 2259 : str += 2;
1101 1408790400 : } else if (0x00 == (0x80 & *str)) {
1102 : /* 1-byte ascii (began with 0b0xxxxxxx) */
1103 1408790400 : str += 1;
1104 : } else {
1105 : /* we have an invalid 0b1xxxxxxx utf8 code point entry */
1106 0 : return (utf8_int8_t *)str;
1107 : }
1108 : }
1109 :
1110 : return utf8_null;
1111 : }
1112 :
1113 0 : int utf8makevalid(utf8_int8_t *str, const utf8_int32_t replacement) {
1114 0 : utf8_int8_t *read = str;
1115 0 : utf8_int8_t *write = read;
1116 0 : const utf8_int8_t r = (utf8_int8_t)replacement;
1117 0 : utf8_int32_t codepoint = 0;
1118 :
1119 0 : if (replacement > 0x7f) {
1120 : return -1;
1121 : }
1122 :
1123 0 : while ('\0' != *read) {
1124 0 : if (0xf0 == (0xf8 & *read)) {
1125 : /* ensure each of the 3 following bytes in this 4-byte
1126 : * utf8 codepoint began with 0b10xxxxxx */
1127 0 : if ((0x80 != (0xc0 & read[1])) || (0x80 != (0xc0 & read[2])) ||
1128 0 : (0x80 != (0xc0 & read[3]))) {
1129 0 : *write++ = r;
1130 0 : read++;
1131 0 : continue;
1132 : }
1133 :
1134 : /* 4-byte utf8 code point (began with 0b11110xxx) */
1135 0 : read = utf8codepoint(read, &codepoint);
1136 0 : write = utf8catcodepoint(write, codepoint, 4);
1137 0 : } else if (0xe0 == (0xf0 & *read)) {
1138 : /* ensure each of the 2 following bytes in this 3-byte
1139 : * utf8 codepoint began with 0b10xxxxxx */
1140 0 : if ((0x80 != (0xc0 & read[1])) || (0x80 != (0xc0 & read[2]))) {
1141 0 : *write++ = r;
1142 0 : read++;
1143 0 : continue;
1144 : }
1145 :
1146 : /* 3-byte utf8 code point (began with 0b1110xxxx) */
1147 0 : read = utf8codepoint(read, &codepoint);
1148 0 : write = utf8catcodepoint(write, codepoint, 3);
1149 0 : } else if (0xc0 == (0xe0 & *read)) {
1150 : /* ensure the 1 following byte in this 2-byte
1151 : * utf8 codepoint began with 0b10xxxxxx */
1152 0 : if (0x80 != (0xc0 & read[1])) {
1153 0 : *write++ = r;
1154 0 : read++;
1155 0 : continue;
1156 : }
1157 :
1158 : /* 2-byte utf8 code point (began with 0b110xxxxx) */
1159 0 : read = utf8codepoint(read, &codepoint);
1160 0 : write = utf8catcodepoint(write, codepoint, 2);
1161 0 : } else if (0x00 == (0x80 & *read)) {
1162 : /* 1-byte ascii (began with 0b0xxxxxxx) */
1163 0 : read = utf8codepoint(read, &codepoint);
1164 0 : write = utf8catcodepoint(write, codepoint, 1);
1165 : } else {
1166 : /* if we got here then we've got a dangling continuation (0b10xxxxxx) */
1167 0 : *write++ = r;
1168 0 : read++;
1169 0 : continue;
1170 : }
1171 : }
1172 :
1173 0 : *write = '\0';
1174 :
1175 0 : return 0;
1176 : }
1177 :
1178 : utf8_constexpr14_impl utf8_int8_t *
1179 4151 : utf8codepoint(const utf8_int8_t *utf8_restrict str,
1180 : utf8_int32_t *utf8_restrict out_codepoint) {
1181 4151 : if (0xf0 == (0xf8 & str[0])) {
1182 : /* 4 byte utf8 codepoint */
1183 0 : *out_codepoint = ((0x07 & str[0]) << 18) | ((0x3f & str[1]) << 12) |
1184 0 : ((0x3f & str[2]) << 6) | (0x3f & str[3]);
1185 0 : str += 4;
1186 4151 : } else if (0xe0 == (0xf0 & str[0])) {
1187 : /* 3 byte utf8 codepoint */
1188 0 : *out_codepoint = ((0x0f & str[0]) << 12) | ((0x3f & str[1]) << 6) | (0x3f & str[2]);
1189 0 : str += 3;
1190 4151 : } else if (0xc0 == (0xe0 & str[0])) {
1191 : /* 2 byte utf8 codepoint */
1192 751 : *out_codepoint = ((0x1f & str[0]) << 6) | (0x3f & str[1]);
1193 751 : str += 2;
1194 : } else {
1195 : /* 1 byte utf8 codepoint otherwise */
1196 3400 : *out_codepoint = str[0];
1197 3400 : str += 1;
1198 : }
1199 :
1200 4151 : return (utf8_int8_t *)str;
1201 : }
1202 :
1203 0 : utf8_constexpr14_impl size_t utf8codepointcalcsize(const utf8_int8_t *str) {
1204 0 : if (0xf0 == (0xf8 & str[0])) {
1205 : /* 4 byte utf8 codepoint */
1206 : return 4;
1207 0 : } else if (0xe0 == (0xf0 & str[0])) {
1208 : /* 3 byte utf8 codepoint */
1209 : return 3;
1210 0 : } else if (0xc0 == (0xe0 & str[0])) {
1211 : /* 2 byte utf8 codepoint */
1212 0 : return 2;
1213 : }
1214 :
1215 : /* 1 byte utf8 codepoint otherwise */
1216 : return 1;
1217 : }
1218 :
1219 110 : utf8_constexpr14_impl size_t utf8codepointsize(utf8_int32_t chr) {
1220 110 : if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
1221 : return 1;
1222 35 : } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
1223 : return 2;
1224 0 : } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
1225 : return 3;
1226 : } else { /* if (0 == ((int)0xffe00000 & chr)) { */
1227 0 : return 4;
1228 : }
1229 : }
1230 :
1231 0 : utf8_int8_t *utf8catcodepoint(utf8_int8_t *str, utf8_int32_t chr, size_t n) {
1232 0 : if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
1233 : /* 1-byte/7-bit ascii
1234 : * (0b0xxxxxxx) */
1235 0 : if (n < 1) {
1236 : return utf8_null;
1237 : }
1238 0 : str[0] = (utf8_int8_t)chr;
1239 0 : str += 1;
1240 0 : } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
1241 : /* 2-byte/11-bit utf8 code point
1242 : * (0b110xxxxx 0b10xxxxxx) */
1243 0 : if (n < 2) {
1244 : return utf8_null;
1245 : }
1246 0 : str[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)((chr >> 6) & 0x1f));
1247 0 : str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
1248 0 : str += 2;
1249 0 : } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
1250 : /* 3-byte/16-bit utf8 code point
1251 : * (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) */
1252 0 : if (n < 3) {
1253 : return utf8_null;
1254 : }
1255 0 : str[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)((chr >> 12) & 0x0f));
1256 0 : str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
1257 0 : str[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
1258 0 : str += 3;
1259 : } else { /* if (0 == ((int)0xffe00000 & chr)) { */
1260 : /* 4-byte/21-bit utf8 code point
1261 : * (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) */
1262 0 : if (n < 4) {
1263 : return utf8_null;
1264 : }
1265 0 : str[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)((chr >> 18) & 0x07));
1266 0 : str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
1267 0 : str[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
1268 0 : str[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
1269 0 : str += 4;
1270 : }
1271 :
1272 : return str;
1273 : }
1274 :
1275 0 : utf8_constexpr14_impl int utf8islower(utf8_int32_t chr) {
1276 0 : return chr != utf8uprcodepoint(chr);
1277 : }
1278 :
1279 0 : utf8_constexpr14_impl int utf8isupper(utf8_int32_t chr) {
1280 0 : return chr != utf8lwrcodepoint(chr);
1281 : }
1282 :
1283 0 : void utf8lwr(utf8_int8_t *utf8_restrict str) {
1284 0 : utf8_int32_t cp = 0;
1285 0 : utf8_int8_t *pn = utf8codepoint(str, &cp);
1286 :
1287 0 : while (cp != 0) {
1288 0 : const utf8_int32_t lwr_cp = utf8lwrcodepoint(cp);
1289 0 : const size_t size = utf8codepointsize(lwr_cp);
1290 :
1291 0 : if (lwr_cp != cp) {
1292 0 : utf8catcodepoint(str, lwr_cp, size);
1293 : }
1294 :
1295 0 : str = pn;
1296 0 : pn = utf8codepoint(str, &cp);
1297 : }
1298 0 : }
1299 :
1300 0 : void utf8upr(utf8_int8_t *utf8_restrict str) {
1301 0 : utf8_int32_t cp = 0;
1302 0 : utf8_int8_t *pn = utf8codepoint(str, &cp);
1303 :
1304 0 : while (cp != 0) {
1305 0 : const utf8_int32_t lwr_cp = utf8uprcodepoint(cp);
1306 0 : const size_t size = utf8codepointsize(lwr_cp);
1307 :
1308 0 : if (lwr_cp != cp) {
1309 0 : utf8catcodepoint(str, lwr_cp, size);
1310 : }
1311 :
1312 0 : str = pn;
1313 0 : pn = utf8codepoint(str, &cp);
1314 : }
1315 0 : }
1316 :
1317 3872 : utf8_constexpr14_impl utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp) {
1318 3872 : if (((0x0041 <= cp) && (0x005a >= cp)) ||
1319 3872 : ((0x00c0 <= cp) && (0x00d6 >= cp)) ||
1320 : ((0x00d8 <= cp) && (0x00de >= cp)) ||
1321 : ((0x0391 <= cp) && (0x03a1 >= cp)) ||
1322 : ((0x03a3 <= cp) && (0x03ab >= cp)) ||
1323 : ((0x0410 <= cp) && (0x042f >= cp))) {
1324 1635 : cp += 32;
1325 : } else if ((0x0400 <= cp) && (0x040f >= cp)) {
1326 0 : cp += 80;
1327 : } else if (((0x0100 <= cp) && (0x012f >= cp)) ||
1328 : ((0x0132 <= cp) && (0x0137 >= cp)) ||
1329 : ((0x014a <= cp) && (0x0177 >= cp)) ||
1330 : ((0x0182 <= cp) && (0x0185 >= cp)) ||
1331 : ((0x01a0 <= cp) && (0x01a5 >= cp)) ||
1332 : ((0x01de <= cp) && (0x01ef >= cp)) ||
1333 : ((0x01f8 <= cp) && (0x021f >= cp)) ||
1334 : ((0x0222 <= cp) && (0x0233 >= cp)) ||
1335 : ((0x0246 <= cp) && (0x024f >= cp)) ||
1336 : ((0x03d8 <= cp) && (0x03ef >= cp)) ||
1337 : ((0x0460 <= cp) && (0x0481 >= cp)) ||
1338 : ((0x048a <= cp) && (0x04ff >= cp))) {
1339 0 : cp |= 0x1;
1340 : } else if (((0x0139 <= cp) && (0x0148 >= cp)) ||
1341 : ((0x0179 <= cp) && (0x017e >= cp)) ||
1342 : ((0x01af <= cp) && (0x01b0 >= cp)) ||
1343 : ((0x01b3 <= cp) && (0x01b6 >= cp)) ||
1344 : ((0x01cd <= cp) && (0x01dc >= cp))) {
1345 0 : cp += 1;
1346 0 : cp &= ~0x1;
1347 : } else {
1348 2237 : switch (cp) {
1349 : default:
1350 : break;
1351 0 : case 0x0178:
1352 0 : cp = 0x00ff;
1353 0 : break;
1354 0 : case 0x0243:
1355 0 : cp = 0x0180;
1356 0 : break;
1357 0 : case 0x018e:
1358 0 : cp = 0x01dd;
1359 0 : break;
1360 0 : case 0x023d:
1361 0 : cp = 0x019a;
1362 0 : break;
1363 0 : case 0x0220:
1364 0 : cp = 0x019e;
1365 0 : break;
1366 0 : case 0x01b7:
1367 0 : cp = 0x0292;
1368 0 : break;
1369 0 : case 0x01c4:
1370 0 : cp = 0x01c6;
1371 0 : break;
1372 0 : case 0x01c7:
1373 0 : cp = 0x01c9;
1374 0 : break;
1375 0 : case 0x01ca:
1376 0 : cp = 0x01cc;
1377 0 : break;
1378 0 : case 0x01f1:
1379 0 : cp = 0x01f3;
1380 0 : break;
1381 0 : case 0x01f7:
1382 0 : cp = 0x01bf;
1383 0 : break;
1384 0 : case 0x0187:
1385 0 : cp = 0x0188;
1386 0 : break;
1387 0 : case 0x018b:
1388 0 : cp = 0x018c;
1389 0 : break;
1390 0 : case 0x0191:
1391 0 : cp = 0x0192;
1392 0 : break;
1393 0 : case 0x0198:
1394 0 : cp = 0x0199;
1395 0 : break;
1396 0 : case 0x01a7:
1397 0 : cp = 0x01a8;
1398 0 : break;
1399 0 : case 0x01ac:
1400 0 : cp = 0x01ad;
1401 0 : break;
1402 0 : case 0x01af:
1403 0 : cp = 0x01b0;
1404 0 : break;
1405 0 : case 0x01b8:
1406 0 : cp = 0x01b9;
1407 0 : break;
1408 0 : case 0x01bc:
1409 0 : cp = 0x01bd;
1410 0 : break;
1411 0 : case 0x01f4:
1412 0 : cp = 0x01f5;
1413 0 : break;
1414 0 : case 0x023b:
1415 0 : cp = 0x023c;
1416 0 : break;
1417 0 : case 0x0241:
1418 0 : cp = 0x0242;
1419 0 : break;
1420 0 : case 0x03fd:
1421 0 : cp = 0x037b;
1422 0 : break;
1423 0 : case 0x03fe:
1424 0 : cp = 0x037c;
1425 0 : break;
1426 0 : case 0x03ff:
1427 0 : cp = 0x037d;
1428 0 : break;
1429 0 : case 0x037f:
1430 0 : cp = 0x03f3;
1431 0 : break;
1432 0 : case 0x0386:
1433 0 : cp = 0x03ac;
1434 0 : break;
1435 0 : case 0x0388:
1436 0 : cp = 0x03ad;
1437 0 : break;
1438 0 : case 0x0389:
1439 0 : cp = 0x03ae;
1440 0 : break;
1441 0 : case 0x038a:
1442 0 : cp = 0x03af;
1443 0 : break;
1444 0 : case 0x038c:
1445 0 : cp = 0x03cc;
1446 0 : break;
1447 0 : case 0x038e:
1448 0 : cp = 0x03cd;
1449 0 : break;
1450 0 : case 0x038f:
1451 0 : cp = 0x03ce;
1452 0 : break;
1453 0 : case 0x0370:
1454 0 : cp = 0x0371;
1455 0 : break;
1456 0 : case 0x0372:
1457 0 : cp = 0x0373;
1458 0 : break;
1459 0 : case 0x0376:
1460 0 : cp = 0x0377;
1461 0 : break;
1462 0 : case 0x03f4:
1463 0 : cp = 0x03b8;
1464 0 : break;
1465 0 : case 0x03cf:
1466 0 : cp = 0x03d7;
1467 0 : break;
1468 0 : case 0x03f9:
1469 0 : cp = 0x03f2;
1470 0 : break;
1471 0 : case 0x03f7:
1472 0 : cp = 0x03f8;
1473 0 : break;
1474 0 : case 0x03fa:
1475 0 : cp = 0x03fb;
1476 0 : break;
1477 : }
1478 : }
1479 :
1480 3872 : return cp;
1481 : }
1482 :
1483 618 : utf8_constexpr14_impl utf8_int32_t utf8uprcodepoint(utf8_int32_t cp) {
1484 618 : if (((0x0061 <= cp) && (0x007a >= cp)) ||
1485 618 : ((0x00e0 <= cp) && (0x00f6 >= cp)) ||
1486 : ((0x00f8 <= cp) && (0x00fe >= cp)) ||
1487 : ((0x03b1 <= cp) && (0x03c1 >= cp)) ||
1488 : ((0x03c3 <= cp) && (0x03cb >= cp)) ||
1489 : ((0x0430 <= cp) && (0x044f >= cp))) {
1490 326 : cp -= 32;
1491 : } else if ((0x0450 <= cp) && (0x045f >= cp)) {
1492 0 : cp -= 80;
1493 : } else if (((0x0100 <= cp) && (0x012f >= cp)) ||
1494 : ((0x0132 <= cp) && (0x0137 >= cp)) ||
1495 : ((0x014a <= cp) && (0x0177 >= cp)) ||
1496 : ((0x0182 <= cp) && (0x0185 >= cp)) ||
1497 : ((0x01a0 <= cp) && (0x01a5 >= cp)) ||
1498 : ((0x01de <= cp) && (0x01ef >= cp)) ||
1499 : ((0x01f8 <= cp) && (0x021f >= cp)) ||
1500 : ((0x0222 <= cp) && (0x0233 >= cp)) ||
1501 : ((0x0246 <= cp) && (0x024f >= cp)) ||
1502 : ((0x03d8 <= cp) && (0x03ef >= cp)) ||
1503 : ((0x0460 <= cp) && (0x0481 >= cp)) ||
1504 : ((0x048a <= cp) && (0x04ff >= cp))) {
1505 0 : cp &= ~0x1;
1506 : } else if (((0x0139 <= cp) && (0x0148 >= cp)) ||
1507 : ((0x0179 <= cp) && (0x017e >= cp)) ||
1508 : ((0x01af <= cp) && (0x01b0 >= cp)) ||
1509 : ((0x01b3 <= cp) && (0x01b6 >= cp)) ||
1510 : ((0x01cd <= cp) && (0x01dc >= cp))) {
1511 0 : cp -= 1;
1512 0 : cp |= 0x1;
1513 : } else {
1514 292 : switch (cp) {
1515 : default:
1516 : break;
1517 0 : case 0x00ff:
1518 0 : cp = 0x0178;
1519 0 : break;
1520 0 : case 0x0180:
1521 0 : cp = 0x0243;
1522 0 : break;
1523 0 : case 0x01dd:
1524 0 : cp = 0x018e;
1525 0 : break;
1526 0 : case 0x019a:
1527 0 : cp = 0x023d;
1528 0 : break;
1529 0 : case 0x019e:
1530 0 : cp = 0x0220;
1531 0 : break;
1532 0 : case 0x0292:
1533 0 : cp = 0x01b7;
1534 0 : break;
1535 0 : case 0x01c6:
1536 0 : cp = 0x01c4;
1537 0 : break;
1538 0 : case 0x01c9:
1539 0 : cp = 0x01c7;
1540 0 : break;
1541 0 : case 0x01cc:
1542 0 : cp = 0x01ca;
1543 0 : break;
1544 0 : case 0x01f3:
1545 0 : cp = 0x01f1;
1546 0 : break;
1547 0 : case 0x01bf:
1548 0 : cp = 0x01f7;
1549 0 : break;
1550 0 : case 0x0188:
1551 0 : cp = 0x0187;
1552 0 : break;
1553 0 : case 0x018c:
1554 0 : cp = 0x018b;
1555 0 : break;
1556 0 : case 0x0192:
1557 0 : cp = 0x0191;
1558 0 : break;
1559 0 : case 0x0199:
1560 0 : cp = 0x0198;
1561 0 : break;
1562 0 : case 0x01a8:
1563 0 : cp = 0x01a7;
1564 0 : break;
1565 0 : case 0x01ad:
1566 0 : cp = 0x01ac;
1567 0 : break;
1568 0 : case 0x01b0:
1569 0 : cp = 0x01af;
1570 0 : break;
1571 0 : case 0x01b9:
1572 0 : cp = 0x01b8;
1573 0 : break;
1574 0 : case 0x01bd:
1575 0 : cp = 0x01bc;
1576 0 : break;
1577 0 : case 0x01f5:
1578 0 : cp = 0x01f4;
1579 0 : break;
1580 0 : case 0x023c:
1581 0 : cp = 0x023b;
1582 0 : break;
1583 0 : case 0x0242:
1584 0 : cp = 0x0241;
1585 0 : break;
1586 0 : case 0x037b:
1587 0 : cp = 0x03fd;
1588 0 : break;
1589 0 : case 0x037c:
1590 0 : cp = 0x03fe;
1591 0 : break;
1592 0 : case 0x037d:
1593 0 : cp = 0x03ff;
1594 0 : break;
1595 0 : case 0x03f3:
1596 0 : cp = 0x037f;
1597 0 : break;
1598 0 : case 0x03ac:
1599 0 : cp = 0x0386;
1600 0 : break;
1601 0 : case 0x03ad:
1602 0 : cp = 0x0388;
1603 0 : break;
1604 12 : case 0x03ae:
1605 12 : cp = 0x0389;
1606 12 : break;
1607 0 : case 0x03af:
1608 0 : cp = 0x038a;
1609 0 : break;
1610 0 : case 0x03cc:
1611 0 : cp = 0x038c;
1612 0 : break;
1613 0 : case 0x03cd:
1614 0 : cp = 0x038e;
1615 0 : break;
1616 0 : case 0x03ce:
1617 0 : cp = 0x038f;
1618 0 : break;
1619 0 : case 0x0371:
1620 0 : cp = 0x0370;
1621 0 : break;
1622 0 : case 0x0373:
1623 0 : cp = 0x0372;
1624 0 : break;
1625 0 : case 0x0377:
1626 0 : cp = 0x0376;
1627 0 : break;
1628 0 : case 0x03d1:
1629 0 : cp = 0x0398;
1630 0 : break;
1631 0 : case 0x03d7:
1632 0 : cp = 0x03cf;
1633 0 : break;
1634 0 : case 0x03f2:
1635 0 : cp = 0x03f9;
1636 0 : break;
1637 0 : case 0x03f8:
1638 0 : cp = 0x03f7;
1639 0 : break;
1640 0 : case 0x03fb:
1641 0 : cp = 0x03fa;
1642 0 : break;
1643 : }
1644 : }
1645 :
1646 618 : return cp;
1647 : }
1648 :
1649 : utf8_constexpr14_impl utf8_int8_t *
1650 0 : utf8rcodepoint(const utf8_int8_t *utf8_restrict str,
1651 : utf8_int32_t *utf8_restrict out_codepoint) {
1652 0 : const utf8_int8_t *s = (const utf8_int8_t *)str;
1653 :
1654 0 : if (0xf0 == (0xf8 & s[0])) {
1655 : /* 4 byte utf8 codepoint */
1656 0 : *out_codepoint = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) |
1657 0 : ((0x3f & s[2]) << 6) | (0x3f & s[3]);
1658 0 : } else if (0xe0 == (0xf0 & s[0])) {
1659 : /* 3 byte utf8 codepoint */
1660 0 : *out_codepoint = ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]);
1661 0 : } else if (0xc0 == (0xe0 & s[0])) {
1662 : /* 2 byte utf8 codepoint */
1663 0 : *out_codepoint = ((0x1f & s[0]) << 6) | (0x3f & s[1]);
1664 : } else {
1665 : /* 1 byte utf8 codepoint otherwise */
1666 0 : *out_codepoint = s[0];
1667 : }
1668 :
1669 0 : do {
1670 0 : s--;
1671 0 : } while ((0 != (0x80 & s[0])) && (0x80 == (0xc0 & s[0])));
1672 :
1673 0 : return (utf8_int8_t *)s;
1674 : }
1675 :
1676 : #undef utf8_restrict
1677 : #undef utf8_constexpr14
1678 : #undef utf8_null
1679 :
1680 : #ifdef __cplusplus
1681 : } /* extern "C" */
1682 : #endif
1683 :
1684 : #if defined(__clang__)
1685 : #pragma clang diagnostic pop
1686 : #endif
1687 :
1688 : #endif /* SHEREDOM_UTF8_H_INCLUDED */
|