LCOV - code coverage report
Current view: top level - monetdb5/modules/atoms - utf8.h (source / functions) Hit Total Coverage
Test: coverage.info Lines: 129 722 17.9 %
Date: 2024-04-25 20:03:45 Functions: 11 38 28.9 %

          Line data    Source code
       1             : /* The latest version of this library is available on GitHub;
       2             :  * https://github.com/sheredom/utf8.h */
       3             : 
       4             : /* This is free and unencumbered software released into the public domain.
       5             :  *
       6             :  * Anyone is free to copy, modify, publish, use, compile, sell, or
       7             :  * distribute this software, either in source code form or as a compiled
       8             :  * binary, for any purpose, commercial or non-commercial, and by any
       9             :  * means.
      10             :  *
      11             :  * In jurisdictions that recognize copyright laws, the author or authors
      12             :  * of this software dedicate any and all copyright interest in the
      13             :  * software to the public domain. We make this dedication for the benefit
      14             :  * of the public at large and to the detriment of our heirs and
      15             :  * successors. We intend this dedication to be an overt act of
      16             :  * relinquishment in perpetuity of all present and future rights to this
      17             :  * software under copyright law.
      18             :  *
      19             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
      20             :  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      21             :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      22             :  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
      23             :  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
      24             :  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
      25             :  * OTHER DEALINGS IN THE SOFTWARE.
      26             :  *
      27             :  * For more information, please refer to <http://unlicense.org/> */
      28             : 
      29             : #ifndef SHEREDOM_UTF8_H_INCLUDED
      30             : #define SHEREDOM_UTF8_H_INCLUDED
      31             : 
      32             : #if defined(_MSC_VER)
      33             : #pragma warning(push)
      34             : 
      35             : /* disable warning: no function prototype given: converting '()' to '(void)' */
      36             : #pragma warning(disable : 4255)
      37             : 
      38             : /* disable warning: '__cplusplus' is not defined as a preprocessor macro,
      39             :  * replacing with '0' for '#if/#elif' */
      40             : #pragma warning(disable : 4668)
      41             : 
      42             : /* disable warning: bytes padding added after construct */
      43             : #pragma warning(disable : 4820)
      44             : #endif
      45             : 
      46             : #include <stddef.h>
      47             : #include <stdlib.h>
      48             : 
      49             : #if defined(_MSC_VER)
      50             : #pragma warning(pop)
      51             : #endif
      52             : 
      53             : #if defined(_MSC_VER) && (_MSC_VER < 1920)
      54             : typedef __int32 utf8_int32_t;
      55             : #else
      56             : #include <stdint.h>
      57             : typedef int32_t utf8_int32_t;
      58             : #endif
      59             : 
      60             : #if defined(__clang__)
      61             : #pragma clang diagnostic push
      62             : #pragma clang diagnostic ignored "-Wold-style-cast"
      63             : #pragma clang diagnostic ignored "-Wcast-qual"
      64             : #endif
      65             : 
      66             : #ifdef __cplusplus
      67             : extern "C" {
      68             : #endif
      69             : 
      70             : #if defined(_MSC_VER)
      71             : #define utf8_nonnull
      72             : #define utf8_pure
      73             : #define utf8_restrict __restrict
      74             : #define utf8_weak __inline
      75             : #elif defined(__clang__) || defined(__GNUC__)
      76             : #define utf8_nonnull __attribute__((nonnull))
      77             : #define utf8_pure __attribute__((pure))
      78             : #define utf8_restrict __restrict__
      79             : #define utf8_weak __attribute__((weak))
      80             : #else
      81             : #error Non clang, non gcc, non MSVC compiler found!
      82             : #endif
      83             : 
      84             : #ifdef __cplusplus
      85             : #define utf8_null NULL
      86             : #else
      87             : #define utf8_null 0
      88             : #endif
      89             : 
      90             : #if (defined(__cplusplus) && __cplusplus >= 201402L)
      91             : #define utf8_constexpr14 constexpr
      92             : #define utf8_constexpr14_impl constexpr
      93             : #else
      94             : /* constexpr and weak are incompatible, so only enable one of them. */
      95             : #define utf8_constexpr14 utf8_weak
      96             : #define utf8_constexpr14_impl
      97             : #endif
      98             : 
      99             : #if defined(__cplusplus) && __cplusplus >= 202002L
     100             : using utf8_int8_t = char8_t; /* Introduced in C++20 */
     101             : #else
     102             : typedef char utf8_int8_t;
     103             : #endif
     104             : 
     105             : /* Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
     106             :  * src2 respectively, case insensitive. */
     107             : utf8_constexpr14 utf8_nonnull utf8_pure int
     108             : utf8casecmp(const utf8_int8_t *src1, const utf8_int8_t *src2);
     109             : 
     110             : /* Append the utf8 string src onto the utf8 string dst. */
     111             : utf8_nonnull utf8_weak utf8_int8_t *
     112             : utf8cat(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src);
     113             : 
     114             : /* Find the first match of the utf8 codepoint chr in the utf8 string src. */
     115             : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
     116             : utf8chr(const utf8_int8_t *src, utf8_int32_t chr);
     117             : 
     118             : /* Return less than 0, 0, greater than 0 if src1 < src2,
     119             :  * src1 == src2, src1 > src2 respectively. */
     120             : utf8_constexpr14 utf8_nonnull utf8_pure int utf8cmp(const utf8_int8_t *src1,
     121             :                                                     const utf8_int8_t *src2);
     122             : 
     123             : /* Copy the utf8 string src onto the memory allocated in dst. */
     124             : utf8_nonnull utf8_weak utf8_int8_t *
     125             : utf8cpy(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src);
     126             : 
     127             : /* Number of utf8 codepoints in the utf8 string src that consists entirely
     128             :  * of utf8 codepoints not from the utf8 string reject. */
     129             : utf8_constexpr14 utf8_nonnull utf8_pure size_t
     130             : utf8cspn(const utf8_int8_t *src, const utf8_int8_t *reject);
     131             : 
     132             : /* Duplicate the utf8 string src by getting its size, malloc'ing a new buffer
     133             :  * copying over the data, and returning that. Or 0 if malloc failed. */
     134             : utf8_weak utf8_int8_t *utf8dup(const utf8_int8_t *src);
     135             : 
     136             : /* Number of utf8 codepoints in the utf8 string str,
     137             :  * excluding the null terminating byte. */
     138             : utf8_constexpr14 utf8_nonnull utf8_pure size_t utf8len(const utf8_int8_t *str);
     139             : 
     140             : /* Similar to utf8len, except that at most n bytes of str are examined. */
     141             : utf8_constexpr14 utf8_nonnull utf8_pure size_t utf8nlen(const utf8_int8_t *str,
     142             :                                                         size_t n);
     143             : 
     144             : /* Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
     145             :  * src2 respectively, case insensitive. Checking at most n bytes of each utf8
     146             :  * string. */
     147             : utf8_constexpr14 utf8_nonnull utf8_pure int
     148             : utf8ncasecmp(const utf8_int8_t *src1, const utf8_int8_t *src2, size_t n);
     149             : 
     150             : /* Append the utf8 string src onto the utf8 string dst,
     151             :  * writing at most n+1 bytes. Can produce an invalid utf8
     152             :  * string if n falls partway through a utf8 codepoint. */
     153             : utf8_nonnull utf8_weak utf8_int8_t *
     154             : utf8ncat(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src,
     155             :          size_t n);
     156             : 
     157             : /* Return less than 0, 0, greater than 0 if src1 < src2,
     158             :  * src1 == src2, src1 > src2 respectively. Checking at most n
     159             :  * bytes of each utf8 string. */
     160             : utf8_constexpr14 utf8_nonnull utf8_pure int
     161             : utf8ncmp(const utf8_int8_t *src1, const utf8_int8_t *src2, size_t n);
     162             : 
     163             : /* Copy the utf8 string src onto the memory allocated in dst.
     164             :  * Copies at most n bytes. If n falls partway through a utf8
     165             :  * codepoint, or if dst doesn't have enough room for a null
     166             :  * terminator, the final string will be cut short to preserve
     167             :  * utf8 validity. */
     168             : 
     169             : utf8_nonnull utf8_weak utf8_int8_t *
     170             : utf8ncpy(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src,
     171             :          size_t n);
     172             : 
     173             : /* Similar to utf8dup, except that at most n bytes of src are copied. If src is
     174             :  * longer than n, only n bytes are copied and a null byte is added.
     175             :  *
     176             :  * Returns a new string if successful, 0 otherwise */
     177             : utf8_weak utf8_int8_t *utf8ndup(const utf8_int8_t *src, size_t n);
     178             : 
     179             : /* Locates the first occurrence in the utf8 string str of any byte in the
     180             :  * utf8 string accept, or 0 if no match was found. */
     181             : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
     182             : utf8pbrk(const utf8_int8_t *str, const utf8_int8_t *accept);
     183             : 
     184             : /* Find the last match of the utf8 codepoint chr in the utf8 string src. */
     185             : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
     186             : utf8rchr(const utf8_int8_t *src, int chr);
     187             : 
     188             : /* Number of bytes in the utf8 string str,
     189             :  * including the null terminating byte. */
     190             : utf8_constexpr14 utf8_nonnull utf8_pure size_t utf8size(const utf8_int8_t *str);
     191             : 
     192             : /* Similar to utf8size, except that the null terminating byte is excluded. */
     193             : utf8_constexpr14 utf8_nonnull utf8_pure size_t
     194             : utf8size_lazy(const utf8_int8_t *str);
     195             : 
     196             : /* Similar to utf8size, except that at most n bytes of str are examined and
     197             :  * the null terminating byte is excluded. */
     198             : utf8_constexpr14 utf8_nonnull utf8_pure size_t
     199             : utf8nsize_lazy(const utf8_int8_t *str, size_t n);
     200             : 
     201             : /* Number of utf8 codepoints in the utf8 string src that consists entirely
     202             :  * of utf8 codepoints from the utf8 string accept. */
     203             : utf8_constexpr14 utf8_nonnull utf8_pure size_t
     204             : utf8spn(const utf8_int8_t *src, const utf8_int8_t *accept);
     205             : 
     206             : /* The position of the utf8 string needle in the utf8 string haystack. */
     207             : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
     208             : utf8str(const utf8_int8_t *haystack, const utf8_int8_t *needle);
     209             : 
     210             : /* The position of the utf8 string needle in the utf8 string haystack, case
     211             :  * insensitive. */
     212             : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
     213             : utf8casestr(const utf8_int8_t *haystack, const utf8_int8_t *needle);
     214             : 
     215             : /* Return 0 on success, or the position of the invalid
     216             :  * utf8 codepoint on failure. */
     217             : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
     218             : utf8valid(const utf8_int8_t *str);
     219             : 
     220             : /* Similar to utf8valid, except that at most n bytes of str are examined. */
     221             : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
     222             : utf8nvalid(const utf8_int8_t *str, size_t n);
     223             : 
     224             : /* Given a null-terminated string, makes the string valid by replacing invalid
     225             :  * codepoints with a 1-byte replacement. Returns 0 on success. */
     226             : utf8_nonnull utf8_weak int utf8makevalid(utf8_int8_t *str,
     227             :                                          const utf8_int32_t replacement);
     228             : 
     229             : /* Sets out_codepoint to the current utf8 codepoint in str, and returns the
     230             :  * address of the next utf8 codepoint after the current one in str. */
     231             : utf8_constexpr14 utf8_nonnull utf8_int8_t *
     232             : utf8codepoint(const utf8_int8_t *utf8_restrict str,
     233             :               utf8_int32_t *utf8_restrict out_codepoint);
     234             : 
     235             : /* Calculates the size of the next utf8 codepoint in str. */
     236             : utf8_constexpr14 utf8_nonnull size_t
     237             : utf8codepointcalcsize(const utf8_int8_t *str);
     238             : 
     239             : /* Returns the size of the given codepoint in bytes. */
     240             : utf8_constexpr14 size_t utf8codepointsize(utf8_int32_t chr);
     241             : 
     242             : /* Write a codepoint to the given string, and return the address to the next
     243             :  * place after the written codepoint. Pass how many bytes left in the buffer to
     244             :  * n. If there is not enough space for the codepoint, this function returns
     245             :  * null. */
     246             : utf8_nonnull utf8_weak utf8_int8_t *
     247             : utf8catcodepoint(utf8_int8_t *str, utf8_int32_t chr, size_t n);
     248             : 
     249             : /* Returns 1 if the given character is lowercase, or 0 if it is not. */
     250             : utf8_constexpr14 int utf8islower(utf8_int32_t chr);
     251             : 
     252             : /* Returns 1 if the given character is uppercase, or 0 if it is not. */
     253             : utf8_constexpr14 int utf8isupper(utf8_int32_t chr);
     254             : 
     255             : /* Transform the given string into all lowercase codepoints. */
     256             : utf8_nonnull utf8_weak void utf8lwr(utf8_int8_t *utf8_restrict str);
     257             : 
     258             : /* Transform the given string into all uppercase codepoints. */
     259             : utf8_nonnull utf8_weak void utf8upr(utf8_int8_t *utf8_restrict str);
     260             : 
     261             : /* Make a codepoint lower case if possible. */
     262             : utf8_constexpr14 utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp);
     263             : 
     264             : /* Make a codepoint upper case if possible. */
     265             : utf8_constexpr14 utf8_int32_t utf8uprcodepoint(utf8_int32_t cp);
     266             : 
     267             : /* Sets out_codepoint to the current utf8 codepoint in str, and returns the
     268             :  * address of the previous utf8 codepoint before the current one in str. */
     269             : utf8_constexpr14 utf8_nonnull utf8_int8_t *
     270             : utf8rcodepoint(const utf8_int8_t *utf8_restrict str,
     271             :                utf8_int32_t *utf8_restrict out_codepoint);
     272             : 
     273             : /* Duplicate the utf8 string src by getting its size, calling alloc_func_ptr to
     274             :  * obtain a new buffer, copying the data into it, and returning that. Or 0 if
     275             :  * alloc_func_ptr returned null. */
     276             : utf8_weak utf8_int8_t *utf8dup_ex(const utf8_int8_t *src,
     277             :                                   utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *,
     278             :                                                                  size_t),
     279             :                                   utf8_int8_t *user_data);
     280             : 
     281             : /* Similar to utf8dup, except that at most n bytes of src are copied. If src is
     282             :  * longer than n, only n bytes are copied and a null byte is added.
     283             :  *
     284             :  * Returns a new string if successful, 0 otherwise. */
     285             : utf8_weak utf8_int8_t *utf8ndup_ex(const utf8_int8_t *src, size_t n,
     286             :                                    utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *,
     287             :                                                                   size_t),
     288             :                                    utf8_int8_t *user_data);
     289             : 
     290             : #undef utf8_weak
     291             : #undef utf8_pure
     292             : #undef utf8_nonnull
     293             : 
     294         113 : utf8_constexpr14_impl int utf8casecmp(const utf8_int8_t *src1,
     295             :                                       const utf8_int8_t *src2) {
     296         113 :   utf8_int32_t src1_lwr_cp = 0, src2_lwr_cp = 0, src1_upr_cp = 0,
     297         113 :                src2_upr_cp = 0, src1_orig_cp = 0, src2_orig_cp = 0;
     298             : 
     299         285 :   for (;;) {
     300         199 :     src1 = utf8codepoint(src1, &src1_orig_cp);
     301         199 :     src2 = utf8codepoint(src2, &src2_orig_cp);
     302             : 
     303             :     /* lower the srcs if required */
     304         199 :     src1_lwr_cp = utf8lwrcodepoint(src1_orig_cp);
     305         199 :     src2_lwr_cp = utf8lwrcodepoint(src2_orig_cp);
     306             : 
     307             :     /* lower the srcs if required */
     308         199 :     src1_upr_cp = utf8uprcodepoint(src1_orig_cp);
     309         199 :     src2_upr_cp = utf8uprcodepoint(src2_orig_cp);
     310             : 
     311             :     /* check if the lowered codepoints match */
     312         199 :     if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
     313             :       return 0;
     314         182 :     } else if ((src1_lwr_cp == src2_lwr_cp) || (src1_upr_cp == src2_upr_cp)) {
     315          86 :       continue;
     316             :     }
     317             : 
     318             :     /* if they don't match, then we return the difference between the characters
     319             :      */
     320          96 :     return src1_lwr_cp - src2_lwr_cp;
     321             :   }
     322             : }
     323             : 
     324           0 : utf8_int8_t *utf8cat(utf8_int8_t *utf8_restrict dst,
     325             :                      const utf8_int8_t *utf8_restrict src) {
     326           0 :   utf8_int8_t *d = dst;
     327             :   /* find the null terminating byte in dst */
     328           0 :   while ('\0' != *d) {
     329           0 :     d++;
     330             :   }
     331             : 
     332             :   /* overwriting the null terminating byte in dst, append src byte-by-byte */
     333           0 :   while ('\0' != *src) {
     334           0 :     *d++ = *src++;
     335             :   }
     336             : 
     337             :   /* write out a new null terminating byte into dst */
     338           0 :   *d = '\0';
     339             : 
     340           0 :   return dst;
     341             : }
     342             : 
     343           0 : utf8_constexpr14_impl utf8_int8_t *utf8chr(const utf8_int8_t *src,
     344             :                                            utf8_int32_t chr) {
     345           0 :   utf8_int8_t c[5] = {'\0', '\0', '\0', '\0', '\0'};
     346             : 
     347           0 :   if (0 == chr) {
     348             :     /* being asked to return position of null terminating byte, so
     349             :      * just run s to the end, and return! */
     350           0 :     while ('\0' != *src) {
     351           0 :       src++;
     352             :     }
     353           0 :     return (utf8_int8_t *)src;
     354           0 :   } else if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
     355             :     /* 1-byte/7-bit ascii
     356             :      * (0b0xxxxxxx) */
     357           0 :     c[0] = (utf8_int8_t)chr;
     358           0 :   } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
     359             :     /* 2-byte/11-bit utf8 code point
     360             :      * (0b110xxxxx 0b10xxxxxx) */
     361           0 :     c[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)(chr >> 6));
     362           0 :     c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
     363           0 :   } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
     364             :     /* 3-byte/16-bit utf8 code point
     365             :      * (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) */
     366           0 :     c[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)(chr >> 12));
     367           0 :     c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
     368           0 :     c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
     369             :   } else { /* if (0 == ((int)0xffe00000 & chr)) { */
     370             :     /* 4-byte/21-bit utf8 code point
     371             :      * (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) */
     372           0 :     c[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)(chr >> 18));
     373           0 :     c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
     374           0 :     c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
     375           0 :     c[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
     376             :   }
     377             : 
     378             :   /* we've made c into a 2 utf8 codepoint string, one for the chr we are
     379             :    * seeking, another for the null terminating byte. Now use utf8str to
     380             :    * search */
     381           0 :   return utf8str(src, c);
     382             : }
     383             : 
     384           0 : utf8_constexpr14_impl int utf8cmp(const utf8_int8_t *src1,
     385             :                                   const utf8_int8_t *src2) {
     386           0 :   while (('\0' != *src1) || ('\0' != *src2)) {
     387           0 :     if (*src1 < *src2) {
     388             :       return -1;
     389           0 :     } else if (*src1 > *src2) {
     390             :       return 1;
     391             :     }
     392             : 
     393           0 :     src1++;
     394           0 :     src2++;
     395             :   }
     396             : 
     397             :   /* both utf8 strings matched */
     398             :   return 0;
     399             : }
     400             : 
     401             : utf8_constexpr14_impl int utf8coll(const utf8_int8_t *src1,
     402             :                                    const utf8_int8_t *src2);
     403             : 
     404           0 : utf8_int8_t *utf8cpy(utf8_int8_t *utf8_restrict dst,
     405             :                      const utf8_int8_t *utf8_restrict src) {
     406           0 :   utf8_int8_t *d = dst;
     407             : 
     408             :   /* overwriting anything previously in dst, write byte-by-byte
     409             :    * from src */
     410           0 :   while ('\0' != *src) {
     411           0 :     *d++ = *src++;
     412             :   }
     413             : 
     414             :   /* append null terminating byte */
     415           0 :   *d = '\0';
     416             : 
     417           0 :   return dst;
     418             : }
     419             : 
     420           0 : utf8_constexpr14_impl size_t utf8cspn(const utf8_int8_t *src,
     421             :                                       const utf8_int8_t *reject) {
     422           0 :   size_t chars = 0;
     423             : 
     424           0 :   while ('\0' != *src) {
     425             :     const utf8_int8_t *r = reject;
     426             :     size_t offset = 0;
     427             : 
     428           0 :     while ('\0' != *r) {
     429             :       /* checking that if *r is the start of a utf8 codepoint
     430             :        * (it is not 0b10xxxxxx) and we have successfully matched
     431             :        * a previous character (0 < offset) - we found a match */
     432           0 :       if ((0x80 != (0xc0 & *r)) && (0 < offset)) {
     433           0 :         return chars;
     434             :       } else {
     435           0 :         if (*r == src[offset]) {
     436             :           /* part of a utf8 codepoint matched, so move our checking
     437             :            * onwards to the next byte */
     438           0 :           offset++;
     439           0 :           r++;
     440             :         } else {
     441             :           /* r could be in the middle of an unmatching utf8 code point,
     442             :            * so we need to march it on to the next character beginning, */
     443             : 
     444           0 :           do {
     445           0 :             r++;
     446           0 :           } while (0x80 == (0xc0 & *r));
     447             : 
     448             :           /* reset offset too as we found a mismatch */
     449             :           offset = 0;
     450             :         }
     451             :       }
     452             :     }
     453             : 
     454             :     /* found a match at the end of *r, so didn't get a chance to test it */
     455           0 :     if (0 < offset) {
     456           0 :       return chars;
     457             :     }
     458             : 
     459             :     /* the current utf8 codepoint in src did not match reject, but src
     460             :      * could have been partway through a utf8 codepoint, so we need to
     461             :      * march it onto the next utf8 codepoint starting byte */
     462           0 :     do {
     463           0 :       src++;
     464           0 :     } while ((0x80 == (0xc0 & *src)));
     465           0 :     chars++;
     466             :   }
     467             : 
     468             :   return chars;
     469             : }
     470             : 
     471           0 : utf8_int8_t *utf8dup(const utf8_int8_t *src) {
     472           0 :   return utf8dup_ex(src, utf8_null, utf8_null);
     473             : }
     474             : 
     475           0 : utf8_int8_t *utf8dup_ex(const utf8_int8_t *src,
     476             :                         utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *, size_t),
     477             :                         utf8_int8_t *user_data) {
     478           0 :   utf8_int8_t *n = utf8_null;
     479             : 
     480             :   /* figure out how many bytes (including the terminator) we need to copy first
     481             :    */
     482           0 :   size_t bytes = utf8size(src);
     483             : 
     484           0 :   if (alloc_func_ptr) {
     485           0 :     n = alloc_func_ptr(user_data, bytes);
     486             :   } else {
     487             : #if !defined(UTF8_NO_STD_MALLOC)
     488           0 :     n = (utf8_int8_t *)malloc(bytes);
     489             : #else
     490             :     return utf8_null;
     491             : #endif
     492             :   }
     493             : 
     494           0 :   if (utf8_null == n) {
     495             :     /* out of memory so we bail */
     496             :     return utf8_null;
     497             :   } else {
     498             :     bytes = 0;
     499             : 
     500             :     /* copy src byte-by-byte into our new utf8 string */
     501           0 :     while ('\0' != src[bytes]) {
     502           0 :       n[bytes] = src[bytes];
     503           0 :       bytes++;
     504             :     }
     505             : 
     506             :     /* append null terminating byte */
     507           0 :     n[bytes] = '\0';
     508           0 :     return n;
     509             :   }
     510             : }
     511             : 
     512             : utf8_constexpr14_impl utf8_int8_t *utf8fry(const utf8_int8_t *str);
     513             : 
     514    34750332 : utf8_constexpr14_impl size_t utf8len(const utf8_int8_t *str) {
     515    34750332 :   return utf8nlen(str, SIZE_MAX);
     516             : }
     517             : 
     518    34741394 : utf8_constexpr14_impl size_t utf8nlen(const utf8_int8_t *str, size_t n) {
     519    34741394 :   const utf8_int8_t *t = str;
     520    34741394 :   size_t length = 0;
     521             : 
     522  1086826508 :   while ((size_t)(str - t) < n && '\0' != *str) {
     523  1052085114 :     if (0xf0 == (0xf8 & *str)) {
     524             :       /* 4-byte utf8 code point (began with 0b11110xxx) */
     525           3 :       str += 4;
     526  1052085111 :     } else if (0xe0 == (0xf0 & *str)) {
     527             :       /* 3-byte utf8 code point (began with 0b1110xxxx) */
     528         172 :       str += 3;
     529  1052084939 :     } else if (0xc0 == (0xe0 & *str)) {
     530             :       /* 2-byte utf8 code point (began with 0b110xxxxx) */
     531        1616 :       str += 2;
     532             :     } else { /* if (0x00 == (0x80 & *s)) { */
     533             :       /* 1-byte ascii (began with 0b0xxxxxxx) */
     534  1052083323 :       str += 1;
     535             :     }
     536             : 
     537             :     /* no matter the bytes we marched s forward by, it was
     538             :      * only 1 utf8 codepoint */
     539  1052085114 :     length++;
     540             :   }
     541             : 
     542    34741394 :   if ((size_t)(str - t) > n) {
     543           0 :     length--;
     544             :   }
     545    34741394 :   return length;
     546             : }
     547             : 
     548          65 : utf8_constexpr14_impl int utf8ncasecmp(const utf8_int8_t *src1,
     549             :                                        const utf8_int8_t *src2, size_t n) {
     550          65 :   utf8_int32_t src1_lwr_cp = 0, src2_lwr_cp = 0, src1_upr_cp = 0,
     551          65 :                src2_upr_cp = 0, src1_orig_cp = 0, src2_orig_cp = 0;
     552             : 
     553         111 :   do {
     554         111 :     const utf8_int8_t *const s1 = src1;
     555         111 :     const utf8_int8_t *const s2 = src2;
     556             : 
     557             :     /* first check that we have enough bytes left in n to contain an entire
     558             :      * codepoint */
     559         111 :     if (0 == n) {
     560             :       return 0;
     561             :     }
     562             : 
     563         111 :     if ((1 == n) && ((0xc0 == (0xe0 & *s1)) || (0xc0 == (0xe0 & *s2)))) {
     564           1 :       const utf8_int32_t c1 = (0xe0 & *s1);
     565           1 :       const utf8_int32_t c2 = (0xe0 & *s2);
     566             : 
     567           1 :       if (c1 != c2) {
     568           1 :         return c1 - c2;
     569             :       } else {
     570             :         return 0;
     571             :       }
     572             :     }
     573             : 
     574         110 :     if ((2 >= n) && ((0xe0 == (0xf0 & *s1)) || (0xe0 == (0xf0 & *s2)))) {
     575           0 :       const utf8_int32_t c1 = (0xf0 & *s1);
     576           0 :       const utf8_int32_t c2 = (0xf0 & *s2);
     577             : 
     578           0 :       if (c1 != c2) {
     579           0 :         return c1 - c2;
     580             :       } else {
     581             :         return 0;
     582             :       }
     583             :     }
     584             : 
     585         110 :     if ((3 >= n) && ((0xf0 == (0xf8 & *s1)) || (0xf0 == (0xf8 & *s2)))) {
     586           0 :       const utf8_int32_t c1 = (0xf8 & *s1);
     587           0 :       const utf8_int32_t c2 = (0xf8 & *s2);
     588             : 
     589           0 :       if (c1 != c2) {
     590           0 :         return c1 - c2;
     591             :       } else {
     592             :         return 0;
     593             :       }
     594             :     }
     595             : 
     596         110 :     src1 = utf8codepoint(src1, &src1_orig_cp);
     597         110 :     src2 = utf8codepoint(src2, &src2_orig_cp);
     598         110 :     n -= utf8codepointsize(src1_orig_cp);
     599             : 
     600         110 :     src1_lwr_cp = utf8lwrcodepoint(src1_orig_cp);
     601         110 :     src2_lwr_cp = utf8lwrcodepoint(src2_orig_cp);
     602             : 
     603         110 :     src1_upr_cp = utf8uprcodepoint(src1_orig_cp);
     604         110 :     src2_upr_cp = utf8uprcodepoint(src2_orig_cp);
     605             : 
     606             :     /* check if the lowered codepoints match */
     607         110 :     if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
     608             :       return 0;
     609         110 :     } else if ((src1_lwr_cp == src2_lwr_cp) || (src1_upr_cp == src2_upr_cp)) {
     610          58 :       continue;
     611             :     }
     612             : 
     613             :     /* if they don't match, then we return the difference between the characters
     614             :      */
     615          52 :     return src1_lwr_cp - src2_lwr_cp;
     616          58 :   } while (0 < n);
     617             : 
     618             :   /* both utf8 strings matched */
     619             :   return 0;
     620             : }
     621             : 
     622           0 : utf8_int8_t *utf8ncat(utf8_int8_t *utf8_restrict dst,
     623             :                       const utf8_int8_t *utf8_restrict src, size_t n) {
     624           0 :   utf8_int8_t *d = dst;
     625             : 
     626             :   /* find the null terminating byte in dst */
     627           0 :   while ('\0' != *d) {
     628           0 :     d++;
     629             :   }
     630             : 
     631             :   /* overwriting the null terminating byte in dst, append src byte-by-byte
     632             :    * stopping if we run out of space */
     633           0 :   while (('\0' != *src) && (0 != n--)) {
     634           0 :     *d++ = *src++;
     635             :   }
     636             : 
     637             :   /* write out a new null terminating byte into dst */
     638           0 :   *d = '\0';
     639             : 
     640           0 :   return dst;
     641             : }
     642             : 
     643           0 : utf8_constexpr14_impl int utf8ncmp(const utf8_int8_t *src1,
     644             :                                    const utf8_int8_t *src2, size_t n) {
     645           0 :   while ((0 != n--) && (('\0' != *src1) || ('\0' != *src2))) {
     646           0 :     if (*src1 < *src2) {
     647             :       return -1;
     648           0 :     } else if (*src1 > *src2) {
     649             :       return 1;
     650             :     }
     651             : 
     652           0 :     src1++;
     653           0 :     src2++;
     654             :   }
     655             : 
     656             :   /* both utf8 strings matched */
     657             :   return 0;
     658             : }
     659             : 
     660           0 : utf8_int8_t *utf8ncpy(utf8_int8_t *utf8_restrict dst,
     661             :                       const utf8_int8_t *utf8_restrict src, size_t n) {
     662           0 :   utf8_int8_t *d = dst;
     663           0 :   size_t index = 0, check_index = 0;
     664             : 
     665           0 :   if (n == 0) {
     666             :     return dst;
     667             :   }
     668             : 
     669             :   /* overwriting anything previously in dst, write byte-by-byte
     670             :    * from src */
     671           0 :   for (index = 0; index < n; index++) {
     672           0 :     d[index] = src[index];
     673           0 :     if ('\0' == src[index]) {
     674             :       break;
     675             :     }
     676             :   }
     677             : 
     678           0 :   for (check_index = index - 1;
     679           0 :        check_index > 0 && 0x80 == (0xc0 & d[check_index]); check_index--) {
     680             :     /* just moving the index */
     681           0 :   }
     682             : 
     683           0 :   if (check_index < index &&
     684           0 :       (index - check_index) < utf8codepointsize(d[check_index])) {
     685           0 :     index = check_index;
     686             :   }
     687             : 
     688             :   /* append null terminating byte */
     689           0 :   for (; index < n; index++) {
     690           0 :     d[index] = 0;
     691             :   }
     692             : 
     693             :   return dst;
     694             : }
     695             : 
     696           0 : utf8_int8_t *utf8ndup(const utf8_int8_t *src, size_t n) {
     697           0 :   return utf8ndup_ex(src, n, utf8_null, utf8_null);
     698             : }
     699             : 
     700           0 : utf8_int8_t *utf8ndup_ex(const utf8_int8_t *src, size_t n,
     701             :                          utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *, size_t),
     702             :                          utf8_int8_t *user_data) {
     703           0 :   utf8_int8_t *c = utf8_null;
     704           0 :   size_t bytes = 0;
     705             : 
     706             :   /* Find the end of the string or stop when n is reached */
     707           0 :   while ('\0' != src[bytes] && bytes < n) {
     708           0 :     bytes++;
     709             :   }
     710             : 
     711             :   /* In case bytes is actually less than n, we need to set it
     712             :    * to be used later in the copy byte by byte. */
     713           0 :   n = bytes;
     714             : 
     715           0 :   if (alloc_func_ptr) {
     716           0 :     c = alloc_func_ptr(user_data, bytes + 1);
     717             :   } else {
     718             : #if !defined(UTF8_NO_STD_MALLOC)
     719           0 :     c = (utf8_int8_t *)malloc(bytes + 1);
     720             : #else
     721             :     c = utf8_null;
     722             : #endif
     723             :   }
     724             : 
     725           0 :   if (utf8_null == c) {
     726             :     /* out of memory so we bail */
     727             :     return utf8_null;
     728             :   }
     729             : 
     730             :   bytes = 0;
     731             : 
     732             :   /* copy src byte-by-byte into our new utf8 string */
     733           0 :   while ('\0' != src[bytes] && bytes < n) {
     734           0 :     c[bytes] = src[bytes];
     735           0 :     bytes++;
     736             :   }
     737             : 
     738             :   /* append null terminating byte */
     739           0 :   c[bytes] = '\0';
     740           0 :   return c;
     741             : }
     742             : 
     743           0 : utf8_constexpr14_impl utf8_int8_t *utf8rchr(const utf8_int8_t *src, int chr) {
     744             : 
     745           0 :   utf8_int8_t *match = utf8_null;
     746           0 :   utf8_int8_t c[5] = {'\0', '\0', '\0', '\0', '\0'};
     747             : 
     748           0 :   if (0 == chr) {
     749             :     /* being asked to return position of null terminating byte, so
     750             :      * just run s to the end, and return! */
     751           0 :     while ('\0' != *src) {
     752           0 :       src++;
     753             :     }
     754           0 :     return (utf8_int8_t *)src;
     755           0 :   } else if (0 == ((int)0xffffff80 & chr)) {
     756             :     /* 1-byte/7-bit ascii
     757             :      * (0b0xxxxxxx) */
     758           0 :     c[0] = (utf8_int8_t)chr;
     759           0 :   } else if (0 == ((int)0xfffff800 & chr)) {
     760             :     /* 2-byte/11-bit utf8 code point
     761             :      * (0b110xxxxx 0b10xxxxxx) */
     762           0 :     c[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)(chr >> 6));
     763           0 :     c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
     764           0 :   } else if (0 == ((int)0xffff0000 & chr)) {
     765             :     /* 3-byte/16-bit utf8 code point
     766             :      * (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) */
     767           0 :     c[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)(chr >> 12));
     768           0 :     c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
     769           0 :     c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
     770             :   } else { /* if (0 == ((int)0xffe00000 & chr)) { */
     771             :     /* 4-byte/21-bit utf8 code point
     772             :      * (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) */
     773           0 :     c[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)(chr >> 18));
     774           0 :     c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
     775           0 :     c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
     776           0 :     c[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
     777             :   }
     778             : 
     779             :   /* we've created a 2 utf8 codepoint string in c that is
     780             :    * the utf8 character asked for by chr, and a null
     781             :    * terminating byte */
     782             : 
     783           0 :   while ('\0' != *src) {
     784             :     size_t offset = 0;
     785             : 
     786           0 :     while (src[offset] == c[offset]) {
     787           0 :       offset++;
     788             :     }
     789             : 
     790           0 :     if ('\0' == c[offset]) {
     791             :       /* we found a matching utf8 code point */
     792             :       match = (utf8_int8_t *)src;
     793             :       src += offset;
     794             :     } else {
     795           0 :       src += offset;
     796             : 
     797             :       /* need to march s along to next utf8 codepoint start
     798             :        * (the next byte that doesn't match 0b10xxxxxx) */
     799           0 :       if ('\0' != *src) {
     800           0 :         do {
     801           0 :           src++;
     802           0 :         } while (0x80 == (0xc0 & *src));
     803             :       }
     804             :     }
     805             :   }
     806             : 
     807             :   /* return the last match we found (or 0 if no match was found) */
     808             :   return match;
     809             : }
     810             : 
     811           0 : utf8_constexpr14_impl utf8_int8_t *utf8pbrk(const utf8_int8_t *str,
     812             :                                             const utf8_int8_t *accept) {
     813           0 :   while ('\0' != *str) {
     814             :     const utf8_int8_t *a = accept;
     815             :     size_t offset = 0;
     816             : 
     817           0 :     while ('\0' != *a) {
     818             :       /* checking that if *a is the start of a utf8 codepoint
     819             :        * (it is not 0b10xxxxxx) and we have successfully matched
     820             :        * a previous character (0 < offset) - we found a match */
     821           0 :       if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
     822           0 :         return (utf8_int8_t *)str;
     823             :       } else {
     824           0 :         if (*a == str[offset]) {
     825             :           /* part of a utf8 codepoint matched, so move our checking
     826             :            * onwards to the next byte */
     827           0 :           offset++;
     828           0 :           a++;
     829             :         } else {
     830             :           /* r could be in the middle of an unmatching utf8 code point,
     831             :            * so we need to march it on to the next character beginning, */
     832             : 
     833           0 :           do {
     834           0 :             a++;
     835           0 :           } while (0x80 == (0xc0 & *a));
     836             : 
     837             :           /* reset offset too as we found a mismatch */
     838             :           offset = 0;
     839             :         }
     840             :       }
     841             :     }
     842             : 
     843             :     /* we found a match on the last utf8 codepoint */
     844           0 :     if (0 < offset) {
     845           0 :       return (utf8_int8_t *)str;
     846             :     }
     847             : 
     848             :     /* the current utf8 codepoint in src did not match accept, but src
     849             :      * could have been partway through a utf8 codepoint, so we need to
     850             :      * march it onto the next utf8 codepoint starting byte */
     851           0 :     do {
     852           0 :       str++;
     853           0 :     } while ((0x80 == (0xc0 & *str)));
     854             :   }
     855             : 
     856             :   return utf8_null;
     857             : }
     858             : 
     859           0 : utf8_constexpr14_impl size_t utf8size(const utf8_int8_t *str) {
     860           0 :   return utf8size_lazy(str) + 1;
     861             : }
     862             : 
     863           0 : utf8_constexpr14_impl size_t utf8size_lazy(const utf8_int8_t *str) {
     864           0 :   return utf8nsize_lazy(str, SIZE_MAX);
     865             : }
     866             : 
     867           0 : utf8_constexpr14_impl size_t utf8nsize_lazy(const utf8_int8_t *str, size_t n) {
     868           0 :   size_t size = 0;
     869           0 :   while (size < n && '\0' != str[size]) {
     870           0 :     size++;
     871             :   }
     872           0 :   return size;
     873             : }
     874             : 
     875           0 : utf8_constexpr14_impl size_t utf8spn(const utf8_int8_t *src,
     876             :                                      const utf8_int8_t *accept) {
     877           0 :   size_t chars = 0;
     878             : 
     879           0 :   while ('\0' != *src) {
     880             :     const utf8_int8_t *a = accept;
     881             :     size_t offset = 0;
     882             : 
     883           0 :     while ('\0' != *a) {
     884             :       /* checking that if *r is the start of a utf8 codepoint
     885             :        * (it is not 0b10xxxxxx) and we have successfully matched
     886             :        * a previous character (0 < offset) - we found a match */
     887           0 :       if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
     888             :         /* found a match, so increment the number of utf8 codepoints
     889             :          * that have matched and stop checking whether any other utf8
     890             :          * codepoints in a match */
     891           0 :         chars++;
     892           0 :         src += offset;
     893           0 :         offset = 0;
     894           0 :         break;
     895             :       } else {
     896           0 :         if (*a == src[offset]) {
     897           0 :           offset++;
     898           0 :           a++;
     899             :         } else {
     900             :           /* a could be in the middle of an unmatching utf8 codepoint,
     901             :            * so we need to march it on to the next character beginning, */
     902           0 :           do {
     903           0 :             a++;
     904           0 :           } while (0x80 == (0xc0 & *a));
     905             : 
     906             :           /* reset offset too as we found a mismatch */
     907             :           offset = 0;
     908             :         }
     909             :       }
     910             :     }
     911             : 
     912             :     /* found a match at the end of *a, so didn't get a chance to test it */
     913           0 :     if (0 < offset) {
     914           0 :       chars++;
     915           0 :       src += offset;
     916           0 :       continue;
     917             :     }
     918             : 
     919             :     /* if a got to its terminating null byte, then we didn't find a match.
     920             :      * Return the current number of matched utf8 codepoints */
     921           0 :     if ('\0' == *a) {
     922           0 :       return chars;
     923             :     }
     924             :   }
     925             : 
     926             :   return chars;
     927             : }
     928             : 
     929           0 : utf8_constexpr14_impl utf8_int8_t *utf8str(const utf8_int8_t *haystack,
     930             :                                            const utf8_int8_t *needle) {
     931           0 :   utf8_int32_t throwaway_codepoint = 0;
     932             : 
     933             :   /* if needle has no utf8 codepoints before the null terminating
     934             :    * byte then return haystack */
     935           0 :   if ('\0' == *needle) {
     936             :     return (utf8_int8_t *)haystack;
     937             :   }
     938             : 
     939           0 :   while ('\0' != *haystack) {
     940             :     const utf8_int8_t *maybeMatch = haystack;
     941             :     const utf8_int8_t *n = needle;
     942             : 
     943           0 :     while (*haystack == *n && (*haystack != '\0' && *n != '\0')) {
     944           0 :       n++;
     945           0 :       haystack++;
     946             :     }
     947             : 
     948           0 :     if ('\0' == *n) {
     949             :       /* we found the whole utf8 string for needle in haystack at
     950             :        * maybeMatch, so return it */
     951           0 :       return (utf8_int8_t *)maybeMatch;
     952             :     } else {
     953             :       /* h could be in the middle of an unmatching utf8 codepoint,
     954             :        * so we need to march it on to the next character beginning
     955             :        * starting from the current character */
     956           0 :       haystack = utf8codepoint(maybeMatch, &throwaway_codepoint);
     957             :     }
     958             :   }
     959             : 
     960             :   /* no match */
     961             :   return utf8_null;
     962             : }
     963             : 
     964         164 : utf8_constexpr14_impl utf8_int8_t *utf8casestr(const utf8_int8_t *haystack,
     965             :                                                const utf8_int8_t *needle) {
     966             :   /* if needle has no utf8 codepoints before the null terminating
     967             :    * byte then return haystack */
     968         164 :   if ('\0' == *needle) {
     969             :     return (utf8_int8_t *)haystack;
     970             :   }
     971             : 
     972        3110 :   for (;;) {
     973        1637 :     const utf8_int8_t *maybeMatch = haystack;
     974        1637 :     const utf8_int8_t *n = needle;
     975        1637 :     utf8_int32_t h_cp = 0, n_cp = 0;
     976             : 
     977             :     /* Get the next code point and track it */
     978        1637 :     const utf8_int8_t *nextH = haystack = utf8codepoint(haystack, &h_cp);
     979        1599 :     n = utf8codepoint(n, &n_cp);
     980             : 
     981        1751 :     while ((0 != h_cp) && (0 != n_cp)) {
     982        1592 :       h_cp = utf8lwrcodepoint(h_cp);
     983        1579 :       n_cp = utf8lwrcodepoint(n_cp);
     984             : 
     985             :       /* if we find a mismatch, bail out! */
     986        1640 :       if (h_cp != n_cp) {
     987             :         break;
     988             :       }
     989             : 
     990         166 :       haystack = utf8codepoint(haystack, &h_cp);
     991         165 :       n = utf8codepoint(n, &n_cp);
     992             :     }
     993             : 
     994        1635 :     if (0 == n_cp) {
     995             :       /* we found the whole utf8 string for needle in haystack at
     996             :        * maybeMatch, so return it */
     997         162 :       return (utf8_int8_t *)maybeMatch;
     998             :     }
     999             : 
    1000        1607 :     if (0 == h_cp) {
    1001             :       /* no match */
    1002             :       return utf8_null;
    1003             :     }
    1004             : 
    1005             :     /* Roll back to the next code point in the haystack to test */
    1006        1473 :     haystack = nextH;
    1007             :   }
    1008             : }
    1009             : 
    1010    42109717 : utf8_constexpr14_impl utf8_int8_t *utf8valid(const utf8_int8_t *str) {
    1011    42109717 :   return utf8nvalid(str, SIZE_MAX);
    1012             : }
    1013             : 
    1014    42454067 : utf8_constexpr14_impl utf8_int8_t *utf8nvalid(const utf8_int8_t *str,
    1015             :                                               size_t n) {
    1016    42454067 :   const utf8_int8_t *t = str;
    1017    42454067 :   size_t consumed = 0;
    1018             : 
    1019  1451246962 :   while ((void)(consumed = (size_t)(str - t)), consumed < n && '\0' != *str) {
    1020  1408792895 :     const size_t remaining = n - consumed;
    1021             : 
    1022  1408792895 :     if (0xf0 == (0xf8 & *str)) {
    1023             :       /* ensure that there's 4 bytes or more remaining */
    1024           3 :       if (remaining < 4) {
    1025           0 :         return (utf8_int8_t *)str;
    1026             :       }
    1027             : 
    1028             :       /* ensure each of the 3 following bytes in this 4-byte
    1029             :        * utf8 codepoint began with 0b10xxxxxx */
    1030           3 :       if ((0x80 != (0xc0 & str[1])) || (0x80 != (0xc0 & str[2])) ||
    1031           3 :           (0x80 != (0xc0 & str[3]))) {
    1032           0 :         return (utf8_int8_t *)str;
    1033             :       }
    1034             : 
    1035             :       /* ensure that our utf8 codepoint ended after 4 bytes */
    1036           3 :       if ((remaining != 4) && (0x80 == (0xc0 & str[4]))) {
    1037           0 :         return (utf8_int8_t *)str;
    1038             :       }
    1039             : 
    1040             :       /* ensure that the top 5 bits of this 4-byte utf8
    1041             :        * codepoint were not 0, as then we could have used
    1042             :        * one of the smaller encodings */
    1043           3 :       if ((0 == (0x07 & str[0])) && (0 == (0x30 & str[1]))) {
    1044           0 :         return (utf8_int8_t *)str;
    1045             :       }
    1046             : 
    1047             :       /* 4-byte utf8 code point (began with 0b11110xxx) */
    1048           3 :       str += 4;
    1049  1408792892 :     } else if (0xe0 == (0xf0 & *str)) {
    1050             :       /* ensure that there's 3 bytes or more remaining */
    1051         233 :       if (remaining < 3) {
    1052           0 :         return (utf8_int8_t *)str;
    1053             :       }
    1054             : 
    1055             :       /* ensure each of the 2 following bytes in this 3-byte
    1056             :        * utf8 codepoint began with 0b10xxxxxx */
    1057         233 :       if ((0x80 != (0xc0 & str[1])) || (0x80 != (0xc0 & str[2]))) {
    1058           0 :         return (utf8_int8_t *)str;
    1059             :       }
    1060             : 
    1061             :       /* ensure that our utf8 codepoint ended after 3 bytes */
    1062         233 :       if ((remaining != 3) && (0x80 == (0xc0 & str[3]))) {
    1063           0 :         return (utf8_int8_t *)str;
    1064             :       }
    1065             : 
    1066             :       /* ensure that the top 5 bits of this 3-byte utf8
    1067             :        * codepoint were not 0, as then we could have used
    1068             :        * one of the smaller encodings */
    1069         233 :       if ((0 == (0x0f & str[0])) && (0 == (0x20 & str[1]))) {
    1070           0 :         return (utf8_int8_t *)str;
    1071             :       }
    1072             : 
    1073             :       /* 3-byte utf8 code point (began with 0b1110xxxx) */
    1074         233 :       str += 3;
    1075  1408792659 :     } else if (0xc0 == (0xe0 & *str)) {
    1076             :       /* ensure that there's 2 bytes or more remaining */
    1077        2259 :       if (remaining < 2) {
    1078           0 :         return (utf8_int8_t *)str;
    1079             :       }
    1080             : 
    1081             :       /* ensure the 1 following byte in this 2-byte
    1082             :        * utf8 codepoint began with 0b10xxxxxx */
    1083        2259 :       if (0x80 != (0xc0 & str[1])) {
    1084           0 :         return (utf8_int8_t *)str;
    1085             :       }
    1086             : 
    1087             :       /* ensure that our utf8 codepoint ended after 2 bytes */
    1088        2259 :       if ((remaining != 2) && (0x80 == (0xc0 & str[2]))) {
    1089           0 :         return (utf8_int8_t *)str;
    1090             :       }
    1091             : 
    1092             :       /* ensure that the top 4 bits of this 2-byte utf8
    1093             :        * codepoint were not 0, as then we could have used
    1094             :        * one of the smaller encodings */
    1095        2259 :       if (0 == (0x1e & str[0])) {
    1096           0 :         return (utf8_int8_t *)str;
    1097             :       }
    1098             : 
    1099             :       /* 2-byte utf8 code point (began with 0b110xxxxx) */
    1100        2259 :       str += 2;
    1101  1408790400 :     } else if (0x00 == (0x80 & *str)) {
    1102             :       /* 1-byte ascii (began with 0b0xxxxxxx) */
    1103  1408790400 :       str += 1;
    1104             :     } else {
    1105             :       /* we have an invalid 0b1xxxxxxx utf8 code point entry */
    1106           0 :       return (utf8_int8_t *)str;
    1107             :     }
    1108             :   }
    1109             : 
    1110             :   return utf8_null;
    1111             : }
    1112             : 
    1113           0 : int utf8makevalid(utf8_int8_t *str, const utf8_int32_t replacement) {
    1114           0 :   utf8_int8_t *read = str;
    1115           0 :   utf8_int8_t *write = read;
    1116           0 :   const utf8_int8_t r = (utf8_int8_t)replacement;
    1117           0 :   utf8_int32_t codepoint = 0;
    1118             : 
    1119           0 :   if (replacement > 0x7f) {
    1120             :     return -1;
    1121             :   }
    1122             : 
    1123           0 :   while ('\0' != *read) {
    1124           0 :     if (0xf0 == (0xf8 & *read)) {
    1125             :       /* ensure each of the 3 following bytes in this 4-byte
    1126             :        * utf8 codepoint began with 0b10xxxxxx */
    1127           0 :       if ((0x80 != (0xc0 & read[1])) || (0x80 != (0xc0 & read[2])) ||
    1128           0 :           (0x80 != (0xc0 & read[3]))) {
    1129           0 :         *write++ = r;
    1130           0 :         read++;
    1131           0 :         continue;
    1132             :       }
    1133             : 
    1134             :       /* 4-byte utf8 code point (began with 0b11110xxx) */
    1135           0 :       read = utf8codepoint(read, &codepoint);
    1136           0 :       write = utf8catcodepoint(write, codepoint, 4);
    1137           0 :     } else if (0xe0 == (0xf0 & *read)) {
    1138             :       /* ensure each of the 2 following bytes in this 3-byte
    1139             :        * utf8 codepoint began with 0b10xxxxxx */
    1140           0 :       if ((0x80 != (0xc0 & read[1])) || (0x80 != (0xc0 & read[2]))) {
    1141           0 :         *write++ = r;
    1142           0 :         read++;
    1143           0 :         continue;
    1144             :       }
    1145             : 
    1146             :       /* 3-byte utf8 code point (began with 0b1110xxxx) */
    1147           0 :       read = utf8codepoint(read, &codepoint);
    1148           0 :       write = utf8catcodepoint(write, codepoint, 3);
    1149           0 :     } else if (0xc0 == (0xe0 & *read)) {
    1150             :       /* ensure the 1 following byte in this 2-byte
    1151             :        * utf8 codepoint began with 0b10xxxxxx */
    1152           0 :       if (0x80 != (0xc0 & read[1])) {
    1153           0 :         *write++ = r;
    1154           0 :         read++;
    1155           0 :         continue;
    1156             :       }
    1157             : 
    1158             :       /* 2-byte utf8 code point (began with 0b110xxxxx) */
    1159           0 :       read = utf8codepoint(read, &codepoint);
    1160           0 :       write = utf8catcodepoint(write, codepoint, 2);
    1161           0 :     } else if (0x00 == (0x80 & *read)) {
    1162             :       /* 1-byte ascii (began with 0b0xxxxxxx) */
    1163           0 :       read = utf8codepoint(read, &codepoint);
    1164           0 :       write = utf8catcodepoint(write, codepoint, 1);
    1165             :     } else {
    1166             :       /* if we got here then we've got a dangling continuation (0b10xxxxxx) */
    1167           0 :       *write++ = r;
    1168           0 :       read++;
    1169           0 :       continue;
    1170             :     }
    1171             :   }
    1172             : 
    1173           0 :   *write = '\0';
    1174             : 
    1175           0 :   return 0;
    1176             : }
    1177             : 
    1178             : utf8_constexpr14_impl utf8_int8_t *
    1179        4151 : utf8codepoint(const utf8_int8_t *utf8_restrict str,
    1180             :               utf8_int32_t *utf8_restrict out_codepoint) {
    1181        4151 :   if (0xf0 == (0xf8 & str[0])) {
    1182             :     /* 4 byte utf8 codepoint */
    1183           0 :     *out_codepoint = ((0x07 & str[0]) << 18) | ((0x3f & str[1]) << 12) |
    1184           0 :                      ((0x3f & str[2]) << 6) | (0x3f & str[3]);
    1185           0 :     str += 4;
    1186        4151 :   } else if (0xe0 == (0xf0 & str[0])) {
    1187             :     /* 3 byte utf8 codepoint */
    1188           0 :           *out_codepoint = ((0x0f & str[0]) << 12) | ((0x3f & str[1]) << 6) | (0x3f & str[2]);
    1189           0 :     str += 3;
    1190        4151 :   } else if (0xc0 == (0xe0 & str[0])) {
    1191             :     /* 2 byte utf8 codepoint */
    1192         751 :     *out_codepoint = ((0x1f & str[0]) << 6) | (0x3f & str[1]);
    1193         751 :     str += 2;
    1194             :   } else {
    1195             :     /* 1 byte utf8 codepoint otherwise */
    1196        3400 :     *out_codepoint = str[0];
    1197        3400 :     str += 1;
    1198             :   }
    1199             : 
    1200        4151 :   return (utf8_int8_t *)str;
    1201             : }
    1202             : 
    1203           0 : utf8_constexpr14_impl size_t utf8codepointcalcsize(const utf8_int8_t *str) {
    1204           0 :   if (0xf0 == (0xf8 & str[0])) {
    1205             :     /* 4 byte utf8 codepoint */
    1206             :     return 4;
    1207           0 :   } else if (0xe0 == (0xf0 & str[0])) {
    1208             :     /* 3 byte utf8 codepoint */
    1209             :     return 3;
    1210           0 :   } else if (0xc0 == (0xe0 & str[0])) {
    1211             :     /* 2 byte utf8 codepoint */
    1212           0 :     return 2;
    1213             :   }
    1214             : 
    1215             :   /* 1 byte utf8 codepoint otherwise */
    1216             :   return 1;
    1217             : }
    1218             : 
    1219         110 : utf8_constexpr14_impl size_t utf8codepointsize(utf8_int32_t chr) {
    1220         110 :   if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
    1221             :     return 1;
    1222          35 :   } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
    1223             :     return 2;
    1224           0 :   } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
    1225             :     return 3;
    1226             :   } else { /* if (0 == ((int)0xffe00000 & chr)) { */
    1227           0 :     return 4;
    1228             :   }
    1229             : }
    1230             : 
    1231           0 : utf8_int8_t *utf8catcodepoint(utf8_int8_t *str, utf8_int32_t chr, size_t n) {
    1232           0 :   if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
    1233             :     /* 1-byte/7-bit ascii
    1234             :      * (0b0xxxxxxx) */
    1235           0 :     if (n < 1) {
    1236             :       return utf8_null;
    1237             :     }
    1238           0 :     str[0] = (utf8_int8_t)chr;
    1239           0 :     str += 1;
    1240           0 :   } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
    1241             :     /* 2-byte/11-bit utf8 code point
    1242             :      * (0b110xxxxx 0b10xxxxxx) */
    1243           0 :     if (n < 2) {
    1244             :       return utf8_null;
    1245             :     }
    1246           0 :     str[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)((chr >> 6) & 0x1f));
    1247           0 :     str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
    1248           0 :     str += 2;
    1249           0 :   } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
    1250             :     /* 3-byte/16-bit utf8 code point
    1251             :      * (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) */
    1252           0 :     if (n < 3) {
    1253             :       return utf8_null;
    1254             :     }
    1255           0 :     str[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)((chr >> 12) & 0x0f));
    1256           0 :     str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
    1257           0 :     str[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
    1258           0 :     str += 3;
    1259             :   } else { /* if (0 == ((int)0xffe00000 & chr)) { */
    1260             :     /* 4-byte/21-bit utf8 code point
    1261             :      * (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) */
    1262           0 :     if (n < 4) {
    1263             :       return utf8_null;
    1264             :     }
    1265           0 :     str[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)((chr >> 18) & 0x07));
    1266           0 :     str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
    1267           0 :     str[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
    1268           0 :     str[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
    1269           0 :     str += 4;
    1270             :   }
    1271             : 
    1272             :   return str;
    1273             : }
    1274             : 
    1275           0 : utf8_constexpr14_impl int utf8islower(utf8_int32_t chr) {
    1276           0 :   return chr != utf8uprcodepoint(chr);
    1277             : }
    1278             : 
    1279           0 : utf8_constexpr14_impl int utf8isupper(utf8_int32_t chr) {
    1280           0 :   return chr != utf8lwrcodepoint(chr);
    1281             : }
    1282             : 
    1283           0 : void utf8lwr(utf8_int8_t *utf8_restrict str) {
    1284           0 :   utf8_int32_t cp = 0;
    1285           0 :   utf8_int8_t *pn = utf8codepoint(str, &cp);
    1286             : 
    1287           0 :   while (cp != 0) {
    1288           0 :     const utf8_int32_t lwr_cp = utf8lwrcodepoint(cp);
    1289           0 :     const size_t size = utf8codepointsize(lwr_cp);
    1290             : 
    1291           0 :     if (lwr_cp != cp) {
    1292           0 :       utf8catcodepoint(str, lwr_cp, size);
    1293             :     }
    1294             : 
    1295           0 :     str = pn;
    1296           0 :     pn = utf8codepoint(str, &cp);
    1297             :   }
    1298           0 : }
    1299             : 
    1300           0 : void utf8upr(utf8_int8_t *utf8_restrict str) {
    1301           0 :   utf8_int32_t cp = 0;
    1302           0 :   utf8_int8_t *pn = utf8codepoint(str, &cp);
    1303             : 
    1304           0 :   while (cp != 0) {
    1305           0 :     const utf8_int32_t lwr_cp = utf8uprcodepoint(cp);
    1306           0 :     const size_t size = utf8codepointsize(lwr_cp);
    1307             : 
    1308           0 :     if (lwr_cp != cp) {
    1309           0 :       utf8catcodepoint(str, lwr_cp, size);
    1310             :     }
    1311             : 
    1312           0 :     str = pn;
    1313           0 :     pn = utf8codepoint(str, &cp);
    1314             :   }
    1315           0 : }
    1316             : 
    1317        3872 : utf8_constexpr14_impl utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp) {
    1318        3872 :   if (((0x0041 <= cp) && (0x005a >= cp)) ||
    1319        3872 :       ((0x00c0 <= cp) && (0x00d6 >= cp)) ||
    1320             :       ((0x00d8 <= cp) && (0x00de >= cp)) ||
    1321             :       ((0x0391 <= cp) && (0x03a1 >= cp)) ||
    1322             :       ((0x03a3 <= cp) && (0x03ab >= cp)) ||
    1323             :       ((0x0410 <= cp) && (0x042f >= cp))) {
    1324        1635 :     cp += 32;
    1325             :   } else if ((0x0400 <= cp) && (0x040f >= cp)) {
    1326           0 :     cp += 80;
    1327             :   } else if (((0x0100 <= cp) && (0x012f >= cp)) ||
    1328             :              ((0x0132 <= cp) && (0x0137 >= cp)) ||
    1329             :              ((0x014a <= cp) && (0x0177 >= cp)) ||
    1330             :              ((0x0182 <= cp) && (0x0185 >= cp)) ||
    1331             :              ((0x01a0 <= cp) && (0x01a5 >= cp)) ||
    1332             :              ((0x01de <= cp) && (0x01ef >= cp)) ||
    1333             :              ((0x01f8 <= cp) && (0x021f >= cp)) ||
    1334             :              ((0x0222 <= cp) && (0x0233 >= cp)) ||
    1335             :              ((0x0246 <= cp) && (0x024f >= cp)) ||
    1336             :              ((0x03d8 <= cp) && (0x03ef >= cp)) ||
    1337             :              ((0x0460 <= cp) && (0x0481 >= cp)) ||
    1338             :              ((0x048a <= cp) && (0x04ff >= cp))) {
    1339           0 :     cp |= 0x1;
    1340             :   } else if (((0x0139 <= cp) && (0x0148 >= cp)) ||
    1341             :              ((0x0179 <= cp) && (0x017e >= cp)) ||
    1342             :              ((0x01af <= cp) && (0x01b0 >= cp)) ||
    1343             :              ((0x01b3 <= cp) && (0x01b6 >= cp)) ||
    1344             :              ((0x01cd <= cp) && (0x01dc >= cp))) {
    1345           0 :     cp += 1;
    1346           0 :     cp &= ~0x1;
    1347             :   } else {
    1348        2237 :     switch (cp) {
    1349             :     default:
    1350             :       break;
    1351           0 :     case 0x0178:
    1352           0 :       cp = 0x00ff;
    1353           0 :       break;
    1354           0 :     case 0x0243:
    1355           0 :       cp = 0x0180;
    1356           0 :       break;
    1357           0 :     case 0x018e:
    1358           0 :       cp = 0x01dd;
    1359           0 :       break;
    1360           0 :     case 0x023d:
    1361           0 :       cp = 0x019a;
    1362           0 :       break;
    1363           0 :     case 0x0220:
    1364           0 :       cp = 0x019e;
    1365           0 :       break;
    1366           0 :     case 0x01b7:
    1367           0 :       cp = 0x0292;
    1368           0 :       break;
    1369           0 :     case 0x01c4:
    1370           0 :       cp = 0x01c6;
    1371           0 :       break;
    1372           0 :     case 0x01c7:
    1373           0 :       cp = 0x01c9;
    1374           0 :       break;
    1375           0 :     case 0x01ca:
    1376           0 :       cp = 0x01cc;
    1377           0 :       break;
    1378           0 :     case 0x01f1:
    1379           0 :       cp = 0x01f3;
    1380           0 :       break;
    1381           0 :     case 0x01f7:
    1382           0 :       cp = 0x01bf;
    1383           0 :       break;
    1384           0 :     case 0x0187:
    1385           0 :       cp = 0x0188;
    1386           0 :       break;
    1387           0 :     case 0x018b:
    1388           0 :       cp = 0x018c;
    1389           0 :       break;
    1390           0 :     case 0x0191:
    1391           0 :       cp = 0x0192;
    1392           0 :       break;
    1393           0 :     case 0x0198:
    1394           0 :       cp = 0x0199;
    1395           0 :       break;
    1396           0 :     case 0x01a7:
    1397           0 :       cp = 0x01a8;
    1398           0 :       break;
    1399           0 :     case 0x01ac:
    1400           0 :       cp = 0x01ad;
    1401           0 :       break;
    1402           0 :     case 0x01af:
    1403           0 :       cp = 0x01b0;
    1404           0 :       break;
    1405           0 :     case 0x01b8:
    1406           0 :       cp = 0x01b9;
    1407           0 :       break;
    1408           0 :     case 0x01bc:
    1409           0 :       cp = 0x01bd;
    1410           0 :       break;
    1411           0 :     case 0x01f4:
    1412           0 :       cp = 0x01f5;
    1413           0 :       break;
    1414           0 :     case 0x023b:
    1415           0 :       cp = 0x023c;
    1416           0 :       break;
    1417           0 :     case 0x0241:
    1418           0 :       cp = 0x0242;
    1419           0 :       break;
    1420           0 :     case 0x03fd:
    1421           0 :       cp = 0x037b;
    1422           0 :       break;
    1423           0 :     case 0x03fe:
    1424           0 :       cp = 0x037c;
    1425           0 :       break;
    1426           0 :     case 0x03ff:
    1427           0 :       cp = 0x037d;
    1428           0 :       break;
    1429           0 :     case 0x037f:
    1430           0 :       cp = 0x03f3;
    1431           0 :       break;
    1432           0 :     case 0x0386:
    1433           0 :       cp = 0x03ac;
    1434           0 :       break;
    1435           0 :     case 0x0388:
    1436           0 :       cp = 0x03ad;
    1437           0 :       break;
    1438           0 :     case 0x0389:
    1439           0 :       cp = 0x03ae;
    1440           0 :       break;
    1441           0 :     case 0x038a:
    1442           0 :       cp = 0x03af;
    1443           0 :       break;
    1444           0 :     case 0x038c:
    1445           0 :       cp = 0x03cc;
    1446           0 :       break;
    1447           0 :     case 0x038e:
    1448           0 :       cp = 0x03cd;
    1449           0 :       break;
    1450           0 :     case 0x038f:
    1451           0 :       cp = 0x03ce;
    1452           0 :       break;
    1453           0 :     case 0x0370:
    1454           0 :       cp = 0x0371;
    1455           0 :       break;
    1456           0 :     case 0x0372:
    1457           0 :       cp = 0x0373;
    1458           0 :       break;
    1459           0 :     case 0x0376:
    1460           0 :       cp = 0x0377;
    1461           0 :       break;
    1462           0 :     case 0x03f4:
    1463           0 :       cp = 0x03b8;
    1464           0 :       break;
    1465           0 :     case 0x03cf:
    1466           0 :       cp = 0x03d7;
    1467           0 :       break;
    1468           0 :     case 0x03f9:
    1469           0 :       cp = 0x03f2;
    1470           0 :       break;
    1471           0 :     case 0x03f7:
    1472           0 :       cp = 0x03f8;
    1473           0 :       break;
    1474           0 :     case 0x03fa:
    1475           0 :       cp = 0x03fb;
    1476           0 :       break;
    1477             :     }
    1478             :   }
    1479             : 
    1480        3872 :   return cp;
    1481             : }
    1482             : 
    1483         618 : utf8_constexpr14_impl utf8_int32_t utf8uprcodepoint(utf8_int32_t cp) {
    1484         618 :   if (((0x0061 <= cp) && (0x007a >= cp)) ||
    1485         618 :       ((0x00e0 <= cp) && (0x00f6 >= cp)) ||
    1486             :       ((0x00f8 <= cp) && (0x00fe >= cp)) ||
    1487             :       ((0x03b1 <= cp) && (0x03c1 >= cp)) ||
    1488             :       ((0x03c3 <= cp) && (0x03cb >= cp)) ||
    1489             :       ((0x0430 <= cp) && (0x044f >= cp))) {
    1490         326 :     cp -= 32;
    1491             :   } else if ((0x0450 <= cp) && (0x045f >= cp)) {
    1492           0 :     cp -= 80;
    1493             :   } else if (((0x0100 <= cp) && (0x012f >= cp)) ||
    1494             :              ((0x0132 <= cp) && (0x0137 >= cp)) ||
    1495             :              ((0x014a <= cp) && (0x0177 >= cp)) ||
    1496             :              ((0x0182 <= cp) && (0x0185 >= cp)) ||
    1497             :              ((0x01a0 <= cp) && (0x01a5 >= cp)) ||
    1498             :              ((0x01de <= cp) && (0x01ef >= cp)) ||
    1499             :              ((0x01f8 <= cp) && (0x021f >= cp)) ||
    1500             :              ((0x0222 <= cp) && (0x0233 >= cp)) ||
    1501             :              ((0x0246 <= cp) && (0x024f >= cp)) ||
    1502             :              ((0x03d8 <= cp) && (0x03ef >= cp)) ||
    1503             :              ((0x0460 <= cp) && (0x0481 >= cp)) ||
    1504             :              ((0x048a <= cp) && (0x04ff >= cp))) {
    1505           0 :     cp &= ~0x1;
    1506             :   } else if (((0x0139 <= cp) && (0x0148 >= cp)) ||
    1507             :              ((0x0179 <= cp) && (0x017e >= cp)) ||
    1508             :              ((0x01af <= cp) && (0x01b0 >= cp)) ||
    1509             :              ((0x01b3 <= cp) && (0x01b6 >= cp)) ||
    1510             :              ((0x01cd <= cp) && (0x01dc >= cp))) {
    1511           0 :     cp -= 1;
    1512           0 :     cp |= 0x1;
    1513             :   } else {
    1514         292 :     switch (cp) {
    1515             :     default:
    1516             :       break;
    1517           0 :     case 0x00ff:
    1518           0 :       cp = 0x0178;
    1519           0 :       break;
    1520           0 :     case 0x0180:
    1521           0 :       cp = 0x0243;
    1522           0 :       break;
    1523           0 :     case 0x01dd:
    1524           0 :       cp = 0x018e;
    1525           0 :       break;
    1526           0 :     case 0x019a:
    1527           0 :       cp = 0x023d;
    1528           0 :       break;
    1529           0 :     case 0x019e:
    1530           0 :       cp = 0x0220;
    1531           0 :       break;
    1532           0 :     case 0x0292:
    1533           0 :       cp = 0x01b7;
    1534           0 :       break;
    1535           0 :     case 0x01c6:
    1536           0 :       cp = 0x01c4;
    1537           0 :       break;
    1538           0 :     case 0x01c9:
    1539           0 :       cp = 0x01c7;
    1540           0 :       break;
    1541           0 :     case 0x01cc:
    1542           0 :       cp = 0x01ca;
    1543           0 :       break;
    1544           0 :     case 0x01f3:
    1545           0 :       cp = 0x01f1;
    1546           0 :       break;
    1547           0 :     case 0x01bf:
    1548           0 :       cp = 0x01f7;
    1549           0 :       break;
    1550           0 :     case 0x0188:
    1551           0 :       cp = 0x0187;
    1552           0 :       break;
    1553           0 :     case 0x018c:
    1554           0 :       cp = 0x018b;
    1555           0 :       break;
    1556           0 :     case 0x0192:
    1557           0 :       cp = 0x0191;
    1558           0 :       break;
    1559           0 :     case 0x0199:
    1560           0 :       cp = 0x0198;
    1561           0 :       break;
    1562           0 :     case 0x01a8:
    1563           0 :       cp = 0x01a7;
    1564           0 :       break;
    1565           0 :     case 0x01ad:
    1566           0 :       cp = 0x01ac;
    1567           0 :       break;
    1568           0 :     case 0x01b0:
    1569           0 :       cp = 0x01af;
    1570           0 :       break;
    1571           0 :     case 0x01b9:
    1572           0 :       cp = 0x01b8;
    1573           0 :       break;
    1574           0 :     case 0x01bd:
    1575           0 :       cp = 0x01bc;
    1576           0 :       break;
    1577           0 :     case 0x01f5:
    1578           0 :       cp = 0x01f4;
    1579           0 :       break;
    1580           0 :     case 0x023c:
    1581           0 :       cp = 0x023b;
    1582           0 :       break;
    1583           0 :     case 0x0242:
    1584           0 :       cp = 0x0241;
    1585           0 :       break;
    1586           0 :     case 0x037b:
    1587           0 :       cp = 0x03fd;
    1588           0 :       break;
    1589           0 :     case 0x037c:
    1590           0 :       cp = 0x03fe;
    1591           0 :       break;
    1592           0 :     case 0x037d:
    1593           0 :       cp = 0x03ff;
    1594           0 :       break;
    1595           0 :     case 0x03f3:
    1596           0 :       cp = 0x037f;
    1597           0 :       break;
    1598           0 :     case 0x03ac:
    1599           0 :       cp = 0x0386;
    1600           0 :       break;
    1601           0 :     case 0x03ad:
    1602           0 :       cp = 0x0388;
    1603           0 :       break;
    1604          12 :     case 0x03ae:
    1605          12 :       cp = 0x0389;
    1606          12 :       break;
    1607           0 :     case 0x03af:
    1608           0 :       cp = 0x038a;
    1609           0 :       break;
    1610           0 :     case 0x03cc:
    1611           0 :       cp = 0x038c;
    1612           0 :       break;
    1613           0 :     case 0x03cd:
    1614           0 :       cp = 0x038e;
    1615           0 :       break;
    1616           0 :     case 0x03ce:
    1617           0 :       cp = 0x038f;
    1618           0 :       break;
    1619           0 :     case 0x0371:
    1620           0 :       cp = 0x0370;
    1621           0 :       break;
    1622           0 :     case 0x0373:
    1623           0 :       cp = 0x0372;
    1624           0 :       break;
    1625           0 :     case 0x0377:
    1626           0 :       cp = 0x0376;
    1627           0 :       break;
    1628           0 :     case 0x03d1:
    1629           0 :       cp = 0x0398;
    1630           0 :       break;
    1631           0 :     case 0x03d7:
    1632           0 :       cp = 0x03cf;
    1633           0 :       break;
    1634           0 :     case 0x03f2:
    1635           0 :       cp = 0x03f9;
    1636           0 :       break;
    1637           0 :     case 0x03f8:
    1638           0 :       cp = 0x03f7;
    1639           0 :       break;
    1640           0 :     case 0x03fb:
    1641           0 :       cp = 0x03fa;
    1642           0 :       break;
    1643             :     }
    1644             :   }
    1645             : 
    1646         618 :   return cp;
    1647             : }
    1648             : 
    1649             : utf8_constexpr14_impl utf8_int8_t *
    1650           0 : utf8rcodepoint(const utf8_int8_t *utf8_restrict str,
    1651             :                utf8_int32_t *utf8_restrict out_codepoint) {
    1652           0 :   const utf8_int8_t *s = (const utf8_int8_t *)str;
    1653             : 
    1654           0 :   if (0xf0 == (0xf8 & s[0])) {
    1655             :     /* 4 byte utf8 codepoint */
    1656           0 :     *out_codepoint = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) |
    1657           0 :                      ((0x3f & s[2]) << 6) | (0x3f & s[3]);
    1658           0 :   } else if (0xe0 == (0xf0 & s[0])) {
    1659             :     /* 3 byte utf8 codepoint */
    1660           0 :           *out_codepoint = ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]);
    1661           0 :   } else if (0xc0 == (0xe0 & s[0])) {
    1662             :     /* 2 byte utf8 codepoint */
    1663           0 :     *out_codepoint = ((0x1f & s[0]) << 6) | (0x3f & s[1]);
    1664             :   } else {
    1665             :     /* 1 byte utf8 codepoint otherwise */
    1666           0 :     *out_codepoint = s[0];
    1667             :   }
    1668             : 
    1669           0 :   do {
    1670           0 :     s--;
    1671           0 :   } while ((0 != (0x80 & s[0])) && (0x80 == (0xc0 & s[0])));
    1672             : 
    1673           0 :   return (utf8_int8_t *)s;
    1674             : }
    1675             : 
    1676             : #undef utf8_restrict
    1677             : #undef utf8_constexpr14
    1678             : #undef utf8_null
    1679             : 
    1680             : #ifdef __cplusplus
    1681             : } /* extern "C" */
    1682             : #endif
    1683             : 
    1684             : #if defined(__clang__)
    1685             : #pragma clang diagnostic pop
    1686             : #endif
    1687             : 
    1688             : #endif /* SHEREDOM_UTF8_H_INCLUDED */

Generated by: LCOV version 1.14