Line data Source code
1 : /* 2 : * SPDX-License-Identifier: MPL-2.0 3 : * 4 : * This Source Code Form is subject to the terms of the Mozilla Public 5 : * License, v. 2.0. If a copy of the MPL was not distributed with this 6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 : * 8 : * Copyright 2024 MonetDB Foundation; 9 : * Copyright August 2008 - 2023 MonetDB B.V.; 10 : * Copyright 1997 - July 2008 CWI. 11 : */ 12 : 13 : #ifndef _MSTRING_H_ 14 : #define _MSTRING_H_ 15 : 16 : #include <stdarg.h> /* va_list etc. */ 17 : #include <string.h> /* strlen */ 18 : 19 : #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 4)) 20 : /* not on CentOS 6 (GCC 4.4.7) */ 21 : #define GCC_Pragma(pragma) _Pragma(pragma) 22 : #else 23 : #define GCC_Pragma(pragma) 24 : #endif 25 : 26 : /* copy at most (n-1) bytes from src to dst and add a terminating NULL 27 : * byte; return length of src (i.e. can be more than what is copied) */ 28 : static inline size_t 29 255812597 : strcpy_len(char *restrict dst, const char *restrict src, size_t n) 30 : { 31 255812597 : if (dst != NULL && n != 0) { 32 4929455347 : for (size_t i = 0; i < n; i++) { 33 4925742689 : if ((dst[i] = src[i]) == 0) 34 252099939 : return i; 35 : } 36 : /* for correctness, the decrement isn't needed (just assigning 0 37 : * to dst[n-1] would be sufficient), but to work around a too 38 : * strict GNU C compiler, we do need it */ 39 3712658 : dst[--n] = 0; 40 : /* in some versions of GCC (at least gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 41 : * 7.5.0), the error just can't be turned off when using 42 : * --enable-strict, so we just use the (more) expensive way of getting the 43 : * right answer (rescan the whole string) */ 44 : #if !defined(__GNUC__) || __GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ > 5) 45 : /* This code is correct, but GCC gives a warning in certain 46 : * conditions, so we disable the warning temporarily. 47 : * The warning happens e.g. in 48 : * strcpy_len(buf, "fixed string", sizeof(buf)) 49 : * where buf is larger than the string. In that case we never get here 50 : * since return is executed in the loop above, but the compiler 51 : * complains anyway about reading out-of-bounds. 52 : * For GCC we use _Pragma to disable the warning (and hence error). 53 : * Since other compilers may warn (and hence error out) on 54 : * unrecognized pragmas, we use some preprocessor trickery. */ 55 : GCC_Pragma("GCC diagnostic push") 56 : GCC_Pragma("GCC diagnostic ignored \"-Warray-bounds\"") 57 3712658 : return n + strlen(src + n); 58 : GCC_Pragma("GCC diagnostic pop") 59 : #endif 60 : } 61 0 : return strlen(src); 62 : } 63 : 64 : /* copy the NULL terminated list of src strings with a maximum of n 65 : * bytes to dst; return the combined length of the src strings */ 66 : static inline size_t 67 54653383 : strconcat_len(char *restrict dst, size_t n, const char *restrict src, ...) 68 : { 69 54653383 : va_list ap; 70 54653383 : size_t i = 0; 71 : 72 54653383 : va_start(ap, src); 73 221551313 : while (src) { 74 166907745 : size_t l; 75 166907745 : if (dst && i < n) 76 166897568 : l = strcpy_len(dst + i, src, n - i); 77 : else 78 10177 : l = strlen(src); 79 166890228 : i += l; 80 166890228 : src = va_arg(ap, const char *); 81 : } 82 54643568 : va_end(ap); 83 54643568 : return i; 84 : } 85 : 86 : #ifndef __GNUC__ 87 : /* __builtin_expect returns its first argument; it is expected to be 88 : * equal to the second argument */ 89 : #define __builtin_expect(expr, expect) (expr) 90 : #endif 91 : 92 : /* 93 : * UTF-8 encoding is as follows: 94 : * U-00000000 - U-0000007F: 0xxxxxxx 95 : * U-00000080 - U-000007FF: 110zzzzx 10xxxxxx 96 : * U-00000800 - U-0000FFFF: 1110zzzz 10zxxxxx 10xxxxxx 97 : * U-00010000 - U-0010FFFF: 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx 98 : * 99 : * To be correctly coded UTF-8, the sequence should be the shortest 100 : * possible encoding of the value being encoded. This means that at 101 : * least one of the z bits must be non-zero. Also note that the four 102 : * byte sequence can encode more than is allowed and that the values 103 : * U+D800..U+DFFF are not allowed to be encoded. 104 : */ 105 : static inline bool 106 65364846 : checkUTF8(const char *v) 107 : { 108 : /* It is unlikely that this functions returns false, because 109 : * it is likely that the string presented is a correctly coded 110 : * UTF-8 string. So we annotate the tests that are very 111 : * unlikely to succeed, i.e. the ones that lead to a return of 112 : * false, as being expected to return 0 using the 113 : * __builtin_expect function. */ 114 65364846 : if (v != NULL) { 115 65364846 : if (v[0] != '\200' || v[1] != '\0') { 116 : /* check that string is correctly encoded UTF-8 */ 117 2380445398 : for (size_t i = 0; v[i]; i++) { 118 : /* we do not annotate all tests, only the ones 119 : * leading directly to an unlikely return 120 : * statement */ 121 2316118045 : if ((v[i] & 0x80) == 0) { 122 : ; 123 261589 : } else if ((v[i] & 0xE0) == 0xC0) { 124 248904 : if (__builtin_expect(((v[i] & 0x1E) == 0), 0)) 125 : return false; 126 248904 : if (__builtin_expect(((v[++i] & 0xC0) != 0x80), 0)) 127 : return false; 128 12685 : } else if ((v[i] & 0xF0) == 0xE0) { 129 12553 : if ((v[i++] & 0x0F) == 0) { 130 20 : if (__builtin_expect(((v[i] & 0xE0) != 0xA0), 0)) 131 : return false; 132 : } else { 133 12533 : if (__builtin_expect(((v[i] & 0xC0) != 0x80), 0)) 134 : return false; 135 : } 136 12553 : if (__builtin_expect(((v[++i] & 0xC0) != 0x80), 0)) 137 : return false; 138 132 : } else if (__builtin_expect(((v[i] & 0xF8) == 0xF0), 1)) { 139 131 : if ((v[i++] & 0x07) == 0) { 140 129 : if (__builtin_expect(((v[i] & 0x30) == 0), 0)) 141 : return false; 142 : } 143 131 : if (__builtin_expect(((v[i] & 0xC0) != 0x80), 0)) 144 : return false; 145 129 : if (__builtin_expect(((v[++i] & 0xC0) != 0x80), 0)) 146 : return false; 147 129 : if (__builtin_expect(((v[++i] & 0xC0) != 0x80), 0)) 148 : return false; 149 : } else { 150 : return false; 151 : } 152 : } 153 : } 154 : } 155 : return true; 156 : } 157 : 158 : #ifndef __GNUC__ 159 : #undef __builtin_expect 160 : #endif 161 : 162 : #endif