Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #ifndef _MSTRING_H_
14 : #define _MSTRING_H_
15 :
16 : #include <stdarg.h> /* va_list etc. */
17 : #include <string.h> /* strlen */
18 :
/* GCC_Pragma(pragma) hands the given pragma string to the _Pragma
 * operator when compiling with GCC newer than 4.4, and expands to
 * nothing for any other compiler (or an older GCC), so those never get
 * to see the pragma at all. */
#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 4))
/* not on CentOS 6 (GCC 4.4.7) */
#define GCC_Pragma(pragma) _Pragma(pragma)
#else
#define GCC_Pragma(pragma)
#endif
25 :
/* Bounded string copy.
 *
 * Copy at most (n-1) bytes from src to dst and add a terminating NUL
 * byte, so that dst is always a valid string when dst is not NULL and
 * n is not 0.  If dst is NULL or n is 0, nothing is written.
 *
 * dst: destination buffer of at least n bytes, may be NULL
 * src: NUL-terminated source string, must not be NULL
 * n:   size of the destination buffer in bytes
 *
 * Returns the length of src (i.e. can be more than what is copied);
 * the copy was truncated exactly when the return value is >= n. */
static inline size_t
strcpy_len(char *restrict dst, const char *restrict src, size_t n)
{
	if (dst != NULL && n != 0) {
		/* copy byte by byte; if the terminating NUL is reached within
		 * the first n bytes, everything fitted and the index of the
		 * NUL is the length of src */
		for (size_t i = 0; i < n; i++) {
			if ((dst[i] = src[i]) == 0)
				return i;
		}
		/* getting here means n non-NUL bytes were copied: src does
		 * not fit, so the last copied byte is replaced by the
		 * terminator */
		/* for correctness, the decrement isn't needed (just assigning 0
		 * to dst[n-1] would be sufficient), but to work around a too
		 * strict GNU C compiler, we do need it */
		dst[--n] = 0;
		/* in some versions of GCC (at least gcc (Ubuntu 7.5.0-3ubuntu1~18.04)
		 * 7.5.0), the error just can't be turned off when using
		 * --enable-strict, so we just use the (more) expensive way of getting the
		 * right answer (rescan the whole string) */
#if !defined(__GNUC__) || __GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ > 5)
		/* This code is correct, but GCC gives a warning in certain
		 * conditions, so we disable the warning temporarily.
		 * The warning happens e.g. in
		 *   strcpy_len(buf, "fixed string", sizeof(buf))
		 * where buf is larger than the string.  In that case we never get here
		 * since return is executed in the loop above, but the compiler
		 * complains anyway about reading out-of-bounds.
		 * For GCC we use _Pragma to disable the warning (and hence error).
		 * Since other compilers may warn (and hence error out) on
		 * unrecognized pragmas, we use some preprocessor trickery. */
GCC_Pragma("GCC diagnostic push")
GCC_Pragma("GCC diagnostic ignored \"-Warray-bounds\"")
		/* n now equals the number of bytes kept in dst, and src + n is
		 * the first byte that was not kept, so only the remainder of
		 * src still has to be measured */
		return n + strlen(src + n);
GCC_Pragma("GCC diagnostic pop")
#endif
	}
	/* nothing was copied (dst is NULL or n is 0), or the shortcut above
	 * was compiled out: measure src from the start */
	return strlen(src);
}
63 :
/* Concatenate a list of strings into a bounded buffer.
 *
 * The strings src, ... are copied one after the other into dst; the
 * argument list must be terminated by a NULL pointer.  At most n bytes
 * are written to dst, including the terminating NUL byte, so the result
 * is silently truncated when the strings do not fit.  When dst is not
 * NULL and n is not 0, dst is always NUL-terminated on return, also
 * when the list of strings is empty (dst then holds the empty string).
 * When dst is NULL or n is 0, nothing is written.
 *
 * dst: destination buffer of at least n bytes, may be NULL
 * n:   size of the destination buffer in bytes
 * src: first string, or NULL for an empty list
 *
 * Returns the combined length of the src strings, which can be more
 * than what was copied; truncation happened exactly when dst is not
 * NULL and the return value is >= n. */
static inline size_t
strconcat_len(char *restrict dst, size_t n, const char *restrict src, ...)
{
	va_list ap;
	size_t total = 0;

	/* an empty list concatenates to the empty string; without this
	 * the loop below would never touch dst */
	if (src == NULL && dst != NULL && n != 0)
		dst[0] = '\0';

	va_start(ap, src);
	while (src != NULL) {
		const size_t len = strlen(src);

		if (dst != NULL && total < n) {
			/* one byte of the remaining space is reserved for the
			 * terminator */
			const size_t room = n - total - 1;
			const size_t ncopy = len < room ? len : room;

			memcpy(dst + total, src, ncopy);
			dst[total + ncopy] = '\0';
		}
		/* keep counting after the buffer is full so that the caller
		 * learns how much space would have been needed */
		total += len;
		src = va_arg(ap, const char *);
	}
	va_end(ap);
	return total;
}
85 :
/* Branch-prediction hints, used by checkUTF8 below.  Both macros
 * evaluate expr exactly once; they only tell the compiler which
 * outcome to optimize for and never change the meaning of the code. */
#ifdef __has_builtin
#if __has_builtin(__builtin_expect)
/* __builtin_expect returns its first argument; it is expected to be
 * equal to the second argument */
#define unlikely(expr)	__builtin_expect((expr) != 0, 0)
#define likely(expr)	__builtin_expect((expr) != 0, 1)
#endif
#endif
#ifndef unlikely
/* No hint available.  Note that MSVC's __assume() must not be used
 * here: it is a promise to the optimizer that the expression is always
 * true, not a hint, so the "unlikely" outcome would become undefined
 * behavior instead of merely slow. */
#define unlikely(expr)	(expr)
#define likely(expr)	(expr)
#endif

/*
 * UTF-8 encoding is as follows:
 * U-00000000 - U-0000007F: 0xxxxxxx
 * U-00000080 - U-000007FF: 110zzzzx 10xxxxxx
 * U-00000800 - U-0000FFFF: 1110zzzz 10zxxxxx 10xxxxxx
 * U-00010000 - U-0010FFFF: 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx
 *
 * To be correctly coded UTF-8, the sequence should be the shortest
 * possible encoding of the value being encoded.  This means that at
 * least one of the z bits must be non-zero.  Also note that the four
 * byte sequence can encode more than is allowed (anything above
 * U+10FFFF) and that the values U+D800..U+DFFF are not allowed to be
 * encoded.
 *
 * checkUTF8 returns true if v is NULL, if v is the two byte string
 * "\200" (NOTE(review): presumably the nil string representation --
 * confirm against callers), or if v is a NUL-terminated string that
 * consists solely of correctly coded UTF-8 sequences according to the
 * rules above; it returns false otherwise.  The string is never read
 * beyond its terminating NUL byte, also not when it ends in the middle
 * of a multi-byte sequence.
 */
static inline bool
checkUTF8(const char *v)
{
	/* It is unlikely that this function returns false, because it is
	 * likely that the string presented is a correctly coded UTF-8
	 * string.  So we annotate the tests that lead directly to a
	 * return of false.  This can help the compiler produce more
	 * efficient code. */
	if (v == NULL)
		return true;
	if (v[0] == '\200' && v[1] == '\0')
		return true;

	/* look at the bytes as unsigned values so that range comparisons
	 * do not depend on the signedness of char */
	const unsigned char *s = (const unsigned char *) v;

	for (size_t i = 0; s[i] != 0; i++) {
		const unsigned char lead = s[i];

		if (lead < 0x80) {
			/* single byte (ASCII) */
		} else if ((lead & 0xE0) == 0xC0) {
			/* two bytes; C0 and C1 would be overlong */
			if (unlikely(lead < 0xC2))
				return false;
			if (unlikely((s[++i] & 0xC0) != 0x80))
				return false;
		} else if ((lead & 0xF0) == 0xE0) {
			/* three bytes; a NUL byte fails each of the tests below,
			 * so we never step past the end of the string */
			const unsigned char second = s[++i];

			if (lead == 0xE0) {
				/* A0..BF, anything lower would be overlong */
				if (unlikely((second & 0xE0) != 0xA0))
					return false;
			} else if (lead == 0xED) {
				/* 80..9F, anything higher is U+D800..U+DFFF */
				if (unlikely((second & 0xE0) != 0x80))
					return false;
			} else {
				if (unlikely((second & 0xC0) != 0x80))
					return false;
			}
			if (unlikely((s[++i] & 0xC0) != 0x80))
				return false;
		} else if (likely((lead & 0xF8) == 0xF0)) {
			/* four bytes; F5..F7 can only encode values above
			 * U+10FFFF */
			if (unlikely(lead > 0xF4))
				return false;

			const unsigned char second = s[++i];

			if (lead == 0xF0) {
				/* 90..BF, anything lower would be overlong */
				if (unlikely(second < 0x90 || second > 0xBF))
					return false;
			} else if (lead == 0xF4) {
				/* 80..8F, anything higher is above U+10FFFF */
				if (unlikely((second & 0xF0) != 0x80))
					return false;
			} else {
				if (unlikely((second & 0xC0) != 0x80))
					return false;
			}
			if (unlikely((s[++i] & 0xC0) != 0x80))
				return false;
			if (unlikely((s[++i] & 0xC0) != 0x80))
				return false;
		} else {
			/* stray continuation byte, or F8..FF which never occur
			 * in UTF-8 */
			return false;
		}
	}
	return true;
}
168 :
static inline int vreallocprintf(char **buf, size_t *pos, size_t *capacity, const char *fmt, va_list ap)
	__attribute__((__format__(__printf__, 4, 0)));

/* Append formatted output to a growable, heap-allocated buffer.
 *
 * buf:      address of the buffer pointer; *buf may be NULL, in which
 *           case a buffer is allocated; the buffer is (re)allocated
 *           with realloc, so *buf may change
 * pos:      address of the offset in *buf at which the output is to be
 *           written; advanced by the number of bytes written, not
 *           counting the terminating NUL byte
 * capacity: address of the allocated size of *buf; updated whenever the
 *           buffer is resized (if *buf is NULL, a non-zero value is
 *           used as the minimum size of the initial allocation)
 * fmt, args: printf-style format and arguments; args is not consumed,
 *           the function works on a copy
 *
 * On success the number of bytes written (not counting the terminating
 * NUL byte) is returned and (*buf)[*pos] is that NUL byte.  If memory
 * allocation fails, -1 is returned; if vsnprintf reports an error, its
 * negative result is returned.  In both error cases *buf remains valid
 * and *pos is unchanged.  If output was already written at *pos before
 * the error occurred, (*buf)[*pos] is reset to NUL so that no partial
 * output is left behind. */
static inline int
vreallocprintf(char **buf, size_t *pos, size_t *capacity, const char *fmt, va_list args)
{
	assert(*pos <= *capacity);
	assert(*buf == NULL || *capacity > 0);

	/* first guess at what is needed: the format itself, the trailing
	 * NUL, and some space for the expanded items */
	size_t need_at_least = strlen(fmt) + 1 + 80;
	/* set once a first attempt has written truncated output at *pos */
	bool truncated = false;

	for (;;) {
		// Common cases:
		// 1. buf=NULL, pos=cap=0: allocate reasonable amount
		// 2. buf=NULL, pos=0, cap=something: start with allocating cap
		// 3. buf not NULL, cap=something: allocate larger cap
		if (*buf == NULL || need_at_least > *capacity - *pos) {
			size_t cap1 = *pos + need_at_least;
			size_t cap2 = *capacity;
			if (*buf != NULL)
				cap2 += cap2 / 2;	/* grow by at least half */
			size_t new_cap = cap1 > cap2 ? cap1 : cap2;
			char *new_buf = realloc(*buf, new_cap);
			if (new_buf == NULL) {
				/* the old buffer is still valid; wipe the
				 * truncated output of the previous attempt */
				if (truncated)
					(*buf)[*pos] = '\0';
				return -1;
			}
			*buf = new_buf;
			*capacity = new_cap;
		}
		assert(*buf);
		assert(need_at_least <= *capacity - *pos);
		char *output = &(*buf)[*pos];
		size_t avail = *capacity - *pos;
		assert(avail >= 1);

		/* work on a copy: we may have to format a second time */
		va_list ap;
		va_copy(ap, args);
		int n = vsnprintf(output, avail, fmt, ap);
		va_end(ap);

		if (n < 0) {
			/* don't leave whatever was written so far behind */
			output[0] = '\0';
			return n;
		}
		size_t needed = (size_t) n;
		if (needed < avail) {
			// it wanted to print n chars and it could
			*pos += needed;
			return n;
		}
		/* output was truncated, but now we know exactly how much
		 * space is required: enlarge the buffer and try again */
		need_at_least = needed + 1;
		truncated = true;
	}
}

static inline int reallocprintf(char **buf, size_t *pos, size_t *capacity, const char *fmt, ...)
	__attribute__((__format__(__printf__, 4, 5)));

/* Variadic front end of vreallocprintf: append the output of the
 * printf-style format fmt and its arguments at offset *pos of the
 * growable buffer *buf, see vreallocprintf for the details of the
 * parameters and the return value. */
static inline int
reallocprintf(char **buf, size_t *pos, size_t *capacity, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	int n = vreallocprintf(buf, pos, capacity, fmt, ap);
	va_end(ap);
	return n;
}
235 :
/* unlikely/likely are only meant for use inside this header; remove
 * the definitions again so that they are not visible to the code that
 * includes it */
#undef unlikely
#undef likely
238 :
239 : #endif
|