Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "gdk.h"
15 : #include "gdk_private.h"
16 : #include "gdk_cand.h"
17 :
18 : /* String Atom Implementation
19 : *
20 : * Strings are stored in two parts. The first part is the normal tail
21 : * heap which contains a list of offsets. The second part is the
22 : * theap which contains the actual strings. The offsets in the tail
23 : * heap (a.k.a. offset heap) point into the theap (a.k.a. string
24 : * heap). Strings are NUL-terminated and are stored without any
25 : * escape sequences. Strings are encoded using the UTF-8 encoding
26 : * of Unicode. This means that individual "characters" (really,
27 : * Unicode code points) can be between one and four bytes long.
28 : *
29 : * Because in many typical situations there are lots of duplicated
30 : * string values that are being stored in a table, but also in many
31 : * (other) typical situations there are very few duplicated string
32 : * values stored, a scheme has been introduced to cater to both
33 : * situations.
34 : *
35 : * When the string heap is "small" (defined as less than 64KiB), the
36 : * string heap is fully duplicate eliminated. When the string heap
37 : * grows beyond this size, the heap is not kept free of duplicate
38 : * strings, but there is then a heuristic that tries to limit the
39 : * number of duplicates.
40 : *
41 : * This is done by having a fixed sized hash table at the start of the
42 : * string heap, and allocating space for collision lists in the first
43 : * 64KiB of the string heap. After the first 64KiB no extra space is
44 : * allocated for lists, so hash collisions cannot be resolved.
45 : */
46 :
/* Digit helpers for parsing octal and hexadecimal escape sequences
 * (some of these also exist in gdk_atoms.c).
 *
 * numNN(x)  - is x a digit in base NN?
 * baseNN(x) - numeric value of the digit x (x must be a valid digit)
 * multNN(x) - x multiplied by the base, i.e. shifted one digit left */
#define num08(x)	('0' <= (x) && (x) <= '7')
#define base08(x)	((x) - '0')
#define mult08(x)	((x) << 3)

#define num16(x)	isxdigit((unsigned char) (x))
#define base16(x)					\
	(((x) >= 'a' && (x) <= 'f')			\
	 ? ((x) - 'a' + 10)				\
	 : (((x) >= 'A' && (x) <= 'F')			\
	    ? ((x) - 'A' + 10)				\
	    : ((x) - '0')))
#define mult16(x)	((x) << 4)
55 :
/* Make sure the buffer *dst (with capacity *len) can hold at least
 * `size' bytes.  An existing buffer that is large enough is kept;
 * otherwise it is freed and a new one of exactly `size' bytes is
 * allocated.  The macro expects variables `dst' (char **) and `len'
 * (size_t *) in scope.  On allocation failure *len is set to 0 and
 * the *enclosing function* returns -1. */
#define atommem(size)						\
	do {							\
		if (*dst != NULL && *len >= (size))		\
			break;	/* current buffer suffices */	\
		GDKfree(*dst);					\
		*len = (size);					\
		if ((*dst = GDKmalloc(*len)) == NULL) {	\
			*len = 0;				\
			return -1;				\
		}						\
	} while (0)
68 :
/* the string nil value: the single byte 0x80 followed by a NUL */
const char str_nil[2] = "\200";
70 :
71 : gdk_return
72 587357 : strHeap(Heap *d, size_t cap)
73 : {
74 587357 : size_t size;
75 :
76 587357 : cap = MAX(cap, BATTINY);
77 587357 : size = GDK_STRHASHTABLE * sizeof(stridx_t) + MIN(GDK_ELIMLIMIT, cap * GDK_VARALIGN);
78 587357 : return HEAPalloc(d, size, 1);
79 : }
80 :
81 :
82 : void
83 4362 : strCleanHash(Heap *h, bool rebuild)
84 : {
85 4362 : stridx_t newhash[GDK_STRHASHTABLE];
86 4362 : size_t pad, pos;
87 4362 : BUN off, strhash;
88 4362 : const char *s;
89 :
90 4362 : (void) rebuild;
91 4362 : if (!h->cleanhash)
92 818 : return;
93 3544 : if (h->size < GDK_STRHASHTABLE * sizeof(stridx_t) &&
94 0 : HEAPextend(h, GDK_STRHASHTABLE * sizeof(stridx_t) + BATTINY * GDK_VARALIGN, true) != GDK_SUCCEED) {
95 0 : GDKclrerr();
96 0 : if (h->size > 0)
97 0 : memset(h->base, 0, h->size);
98 0 : return;
99 : }
100 :
101 : /* rebuild hash table for double elimination
102 : *
103 : * If appending strings to the BAT was aborted, if the heap
104 : * was memory mapped, the hash in the string heap may well be
105 : * incorrect. Therefore we don't trust it when we read in a
106 : * string heap and we rebuild the complete table (it is small,
107 : * so this won't take any time at all).
108 : * Note that we will only do this the first time the heap is
109 : * loaded, and only for heaps that existed when the server was
110 : * started. */
111 3544 : memset(newhash, 0, sizeof(newhash));
112 3544 : pos = GDK_STRHASHSIZE;
113 340922 : while (pos < h->free) {
114 337624 : pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1));
115 337624 : if (pad < sizeof(stridx_t))
116 289006 : pad += GDK_VARALIGN;
117 337624 : pos += pad;
118 337624 : if (pos >= GDK_ELIMLIMIT)
119 : break;
120 337378 : s = h->base + pos;
121 337378 : strhash = strHash(s);
122 337378 : off = strhash & GDK_STRHASHMASK;
123 337378 : newhash[off] = (stridx_t) (pos - sizeof(stridx_t));
124 337378 : pos += strlen(s) + 1;
125 : }
126 : /* only set dirty flag if the hash table actually changed */
127 3544 : if (memcmp(newhash, h->base, sizeof(newhash)) != 0) {
128 180 : memcpy(h->base, newhash, sizeof(newhash));
129 180 : if (h->storage == STORE_MMAP) {
130 31 : if (!(ATOMIC_GET(&GDKdebug) & NOSYNCMASK))
131 0 : (void) MT_msync(h->base, GDK_STRHASHSIZE);
132 : } else
133 149 : h->dirty = true;
134 : }
135 : #ifndef NDEBUG
136 3544 : if (GDK_ELIMDOUBLES(h)) {
137 : pos = GDK_STRHASHSIZE;
138 233275 : while (pos < h->free) {
139 229977 : pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1));
140 229977 : if (pad < sizeof(stridx_t))
141 191729 : pad += GDK_VARALIGN;
142 229977 : pos += pad;
143 229977 : s = h->base + pos;
144 229977 : assert(strLocate(h, s) != 0);
145 229977 : pos += strlen(s) + 1;
146 : }
147 : }
148 : #endif
149 3544 : h->cleanhash = false;
150 : }
151 :
152 : /*
153 : * The strPut routine. The routine strLocate can be used to identify
154 : * the location of a string in the heap if it exists. Otherwise it
155 : * returns (var_t) -2 (-1 is reserved for error).
156 : */
157 : var_t
158 406884 : strLocate(Heap *h, const char *v)
159 : {
160 406884 : stridx_t *ref, *next;
161 :
162 : /* search hash-table, if double-elimination is still in place */
163 406884 : BUN off;
164 406884 : if (h->free == 0) {
165 : /* empty, so there are no strings */
166 : return (var_t) -2;
167 : }
168 :
169 406884 : off = strHash(v);
170 406884 : off &= GDK_STRHASHMASK;
171 :
172 : /* should only use strLocate iff fully double eliminated */
173 406884 : assert(GDK_ELIMBASE(h->free) == 0);
174 :
175 : /* search the linked list */
176 452705 : for (ref = ((stridx_t *) h->base) + off; *ref; ref = next) {
177 449695 : next = (stridx_t *) (h->base + *ref);
178 449695 : if (strcmp(v, (str) (next + 1)) == 0)
179 403874 : return (var_t) ((sizeof(stridx_t) + *ref)); /* found */
180 : }
181 : return (var_t) -2;
182 : }
183 :
184 : var_t
185 94954177 : strPut(BAT *b, var_t *dst, const void *V)
186 : {
187 94954177 : const char *v = V;
188 94954177 : Heap *h = b->tvheap;
189 94954177 : size_t pad;
190 94954177 : size_t pos, len = strlen(v) + 1;
191 94954177 : stridx_t *bucket;
192 94954177 : BUN off;
193 :
194 94954177 : if (h->free == 0) {
195 228954 : if (h->size < GDK_STRHASHTABLE * sizeof(stridx_t) + BATTINY * GDK_VARALIGN) {
196 0 : if (HEAPgrow(&b->tvheap, GDK_STRHASHTABLE * sizeof(stridx_t) + BATTINY * GDK_VARALIGN, true) != GDK_SUCCEED) {
197 : return (var_t) -1;
198 : }
199 0 : h = b->tvheap;
200 : }
201 228954 : h->free = GDK_STRHASHTABLE * sizeof(stridx_t);
202 228954 : h->dirty = true;
203 : #ifdef NDEBUG
204 : memset(h->base, 0, h->free);
205 : #else
206 : /* fill should solve initialization problems within valgrind */
207 228954 : memset(h->base, 0, h->size);
208 : #endif
209 : }
210 :
211 94954177 : off = strHash(v);
212 94954177 : off &= GDK_STRHASHMASK;
213 94954177 : bucket = ((stridx_t *) h->base) + off;
214 :
215 94954177 : if (*bucket) {
216 92457443 : assert(*bucket < h->free);
217 : /* the hash list is not empty */
218 92457443 : if (*bucket < GDK_ELIMLIMIT) {
219 : /* small string heap (<64KiB) -- fully double
220 : * eliminated: search the linked list */
221 : const stridx_t *ref = bucket;
222 :
223 46917727 : do {
224 46917727 : pos = *ref + sizeof(stridx_t);
225 46917727 : assert(pos < h->free);
226 46917727 : if (strcmp(v, h->base + pos) == 0) {
227 : /* found */
228 42987067 : return *dst = (var_t) pos;
229 : }
230 3930660 : ref = (stridx_t *) (h->base + *ref);
231 3930660 : } while (*ref);
232 : } else {
233 : /* large string heap (>=64KiB) -- there is no
234 : * linked list, so only look at single
235 : * entry */
236 48386362 : pos = *bucket;
237 48386362 : if (strcmp(v, h->base + pos) == 0) {
238 : /* already in heap: reuse */
239 828624 : return *dst = (var_t) pos;
240 : }
241 : }
242 : }
243 : /* the string was not found in the heap, we need to enter it */
244 :
245 : /* check that string is correctly encoded UTF-8; there was no
246 : * need to do this earlier: if the string was found above, it
247 : * must have gone through here in the past */
248 : #ifndef NDEBUG
249 51138486 : if (!checkUTF8(v)) {
250 0 : GDKerror("incorrectly encoded UTF-8\n");
251 0 : return (var_t) -1;
252 : }
253 : #endif
254 :
255 51182741 : pad = GDK_VARALIGN - (h->free & (GDK_VARALIGN - 1));
256 51182741 : if (GDK_ELIMBASE(h->free + pad) == 0) { /* i.e. h->free+pad < GDK_ELIMLIMIT */
257 2854030 : if (pad < sizeof(stridx_t)) {
258 : /* make room for hash link */
259 2211003 : pad += GDK_VARALIGN;
260 : }
261 48328711 : } else if (GDK_ELIMBASE(h->free) != 0) {
262 : /* no extra padding needed when no hash links needed
263 : * (but only when padding doesn't cross duplicate
264 : * elimination boundary) */
265 48329340 : pad = 0;
266 : }
267 :
268 : /* check heap for space (limited to a certain maximum after
269 : * which nils are inserted) */
270 51182741 : if (h->free + pad + len >= h->size) {
271 8820 : size_t newsize = MAX(h->size, 4096);
272 :
273 : /* double the heap size until we have enough space */
274 8857 : do {
275 8857 : if (newsize < 4 * 1024 * 1024)
276 8596 : newsize <<= 1;
277 : else
278 261 : newsize += 4 * 1024 * 1024;
279 8857 : } while (newsize <= h->free + pad + len);
280 :
281 8820 : assert(newsize);
282 :
283 8820 : if (h->free + pad + len >= (size_t) VAR_MAX) {
284 0 : GDKerror("string heap gets larger than %zuGiB.\n", (size_t) VAR_MAX >> 30);
285 0 : return (var_t) -1;
286 : }
287 8820 : TRC_DEBUG(HEAP, "HEAPextend in strPut %s %zu %zu\n", h->filename, h->size, newsize);
288 8820 : if (HEAPgrow(&b->tvheap, newsize, true) != GDK_SUCCEED) {
289 : return (var_t) -1;
290 : }
291 8820 : h = b->tvheap;
292 :
293 : /* make bucket point into the new heap */
294 8820 : bucket = ((stridx_t *) h->base) + off;
295 : }
296 :
297 : /* insert string */
298 51182741 : pos = h->free + pad;
299 51182741 : *dst = (var_t) pos;
300 51182741 : if (pad > 0)
301 2852361 : memset(h->base + h->free, 0, pad);
302 51182741 : memcpy(h->base + pos, v, len);
303 51182741 : h->free += pad + len;
304 51182741 : h->dirty = true;
305 :
306 : /* maintain hash table */
307 51182741 : if (GDK_ELIMBASE(pos) == 0) { /* small string heap: link the next pointer */
308 : /* the stridx_t next pointer directly precedes the
309 : * string */
310 2853824 : pos -= sizeof(stridx_t);
311 2853824 : *(stridx_t *) (h->base + pos) = *bucket;
312 : }
313 51182741 : *bucket = (stridx_t) pos; /* set bucket to the new string */
314 :
315 51182741 : return *dst;
316 : }
317 :
318 : /*
319 : * Convert an "" separated string to a GDK string value, checking that
320 : * the input is correct UTF-8.
321 : */
322 :
/* Branch prediction hints.  With GCC-compatible compilers the
 * expression is normalized to 0/1 and passed to __builtin_expect
 * (which returns its first argument and is told the value it is
 * expected to have); elsewhere the macros are transparent. */
#if defined(__GNUC__)
#define likely(expr)	__builtin_expect((expr) != 0, 1)
#define unlikely(expr)	__builtin_expect((expr) != 0, 0)
#else
#define likely(expr)	(expr)
#define unlikely(expr)	(expr)
#endif
332 :
333 : ssize_t
334 345708535 : GDKstrFromStr(unsigned char *restrict dst, const unsigned char *restrict src, ssize_t len, char quote)
335 : {
336 345708535 : unsigned char *p = dst;
337 345708535 : const unsigned char *cur = src, *end = src + len;
338 345708535 : bool escaped = false;
339 345708535 : int mask = 0, n, c, utf8char = 0;
340 :
341 345708535 : if (len >= 2 && strNil((const char *) src)) {
342 0 : strcpy((char *) dst, str_nil);
343 0 : return 1;
344 : }
345 :
346 : /* copy it in, while performing the correct escapes */
347 : /* n is the number of follow-on bytes left in a multi-byte
348 : * UTF-8 sequence */
349 2223369862 : for (cur = src, n = 0; cur < end || escaped; cur++) {
350 : /* first convert any \ escapes and store value in c */
351 1877661331 : if (escaped) {
352 547321 : switch (*cur) {
353 3825 : case '0':
354 : case '1':
355 : case '2':
356 : case '3':
357 : case '4':
358 : case '5':
359 : case '6':
360 : case '7':
361 : /* \ with up to three octal digits */
362 3825 : c = base08(*cur);
363 3825 : if (num08(cur[1])) {
364 3825 : cur++;
365 3825 : c = mult08(c) + base08(*cur);
366 3825 : if (num08(cur[1])) {
367 3825 : if (unlikely(c > 037)) {
368 : /* octal
369 : * escape
370 : * sequence
371 : * out or
372 : * range */
373 1 : GDKerror("not an octal number\n");
374 1 : return -1;
375 : }
376 3824 : cur++;
377 3824 : c = mult08(c) + base08(*cur);
378 3824 : assert(c >= 0 && c <= 0377);
379 : }
380 : }
381 : break;
382 57 : case 'x':
383 : /* \x with one or two hexadecimal digits */
384 57 : if (num16(cur[1])) {
385 57 : cur++;
386 57 : c = base16(*cur);
387 57 : if (num16(cur[1])) {
388 57 : cur++;
389 57 : c = mult16(c) + base16(*cur);
390 : }
391 : } else
392 : c = 'x';
393 : break;
394 1 : case 'u':
395 : case 'U':
396 : /* \u with four hexadecimal digits or
397 : * \U with eight hexadecimal digits */
398 1 : if (unlikely(n > 0)) {
399 : /* not when in the middle of a
400 : * UTF-8 sequence */
401 0 : goto notutf8;
402 : }
403 1 : c = 0;
404 2 : for (n = *cur == 'U' ? 8 : 4; n > 0; n--) {
405 1 : cur++;
406 1 : if (unlikely(!num16(*cur))) {
407 1 : GDKerror("not a Unicode code point escape\n");
408 1 : return -1;
409 : }
410 0 : c = c << 4 | base16(*cur);
411 : }
412 : /* n == 0 now */
413 0 : if (unlikely(c == 0 || c > 0x10FFFF ||
414 : (c & 0xFFF800) == 0xD800)) {
415 0 : GDKerror("illegal Unicode code point\n");
416 0 : return -1;
417 : }
418 0 : if (c < 0x80) {
419 0 : *p++ = (unsigned char) c;
420 : } else {
421 0 : if (c < 0x800) {
422 0 : *p++ = 0xC0 | (c >> 6);
423 : } else {
424 0 : if (c < 0x10000) {
425 0 : *p++ = 0xE0 | (c >> 12);
426 : } else {
427 0 : *p++ = 0xF0 | (c >> 18);
428 0 : *p++ = 0x80 | ((c >> 12) & 0x3F);
429 : }
430 0 : *p++ = 0x80 | ((c >> 6) & 0x3F);
431 : }
432 0 : *p++ = 0x80 | (c & 0x3F);
433 : }
434 0 : escaped = false;
435 0 : continue;
436 : case 'a':
437 : c = '\a';
438 : break;
439 1 : case 'b':
440 1 : c = '\b';
441 1 : break;
442 5 : case 'f':
443 5 : c = '\f';
444 5 : break;
445 10963 : case 'n':
446 10963 : c = '\n';
447 10963 : break;
448 12 : case 'r':
449 12 : c = '\r';
450 12 : break;
451 1996 : case 't':
452 1996 : c = '\t';
453 1996 : break;
454 0 : case '\0':
455 0 : c = '\\';
456 0 : break;
457 530461 : case '\'':
458 : case '\\':
459 : /* \' and \\ can be handled by the
460 : * default case */
461 : default:
462 : /* unrecognized \ escape, just copy
463 : * the backslashed character */
464 530461 : c = *cur;
465 530461 : break;
466 : }
467 : escaped = false;
468 1877114010 : } else if ((c = *cur) == '\\') {
469 547321 : escaped = true;
470 547321 : continue;
471 1876566689 : } else if (c == quote && cur[1] == quote) {
472 5558 : assert(c != 0);
473 5558 : if (unlikely(n > 0))
474 0 : goto notutf8;
475 5558 : *p++ = quote;
476 5558 : cur++;
477 5558 : continue;
478 : }
479 :
480 1877108450 : if (n > 0) {
481 : /* we're still expecting follow-up bytes in a
482 : * UTF-8 sequence */
483 0 : if (unlikely((c & 0xC0) != 0x80)) {
484 : /* incorrect UTF-8 sequence: byte is
485 : * not 10xxxxxx */
486 0 : goto notutf8;
487 : }
488 0 : utf8char = (utf8char << 6) | (c & 0x3F);
489 0 : n--;
490 0 : if (n == 0) {
491 : /* this was the last byte in the sequence */
492 26824 : if (unlikely((utf8char & mask) == 0)) {
493 : /* incorrect UTF-8 sequence:
494 : * not shortest possible */
495 0 : goto notutf8;
496 : }
497 26824 : if (unlikely(utf8char > 0x10FFFF)) {
498 : /* incorrect UTF-8 sequence:
499 : * value too large */
500 0 : goto notutf8;
501 : }
502 26824 : if (unlikely((utf8char & 0x1FFF800) == 0xD800)) {
503 : /* incorrect UTF-8 sequence:
504 : * low or high surrogate
505 : * encoded as UTF-8 */
506 0 : goto notutf8;
507 : }
508 : }
509 1895508491 : } else if ((c & 0x80) == 0) {
510 : ;
511 26826 : } else if ((c & 0xE0) == 0xC0) {
512 1888 : n = 1;
513 1888 : mask = 0x000780;
514 1888 : utf8char = c & 0x1F;
515 24938 : } else if ((c & 0xF0) == 0xE0) {
516 24926 : n = 2;
517 24926 : mask = 0x00F800;
518 24926 : utf8char = c & 0x0F;
519 12 : } else if ((c & 0xF8) == 0xF0) {
520 10 : n = 3;
521 10 : mask = 0x1F0000;
522 10 : utf8char = c & 0x07;
523 : } else {
524 : /* incorrect UTF-8 sequence */
525 2 : goto notutf8;
526 : }
527 1877108448 : *p++ = c;
528 : }
529 345708531 : if (unlikely(n > 0)) {
530 : /* incomplete UTF-8 sequence */
531 0 : goto notutf8;
532 : }
533 345708531 : *p++ = 0;
534 345708531 : return len;
535 2 : notutf8:
536 2 : GDKerror("not a proper UTF-8 sequence\n");
537 2 : return -1;
538 : }
539 :
540 : ssize_t
541 28589011 : strFromStr(const char *restrict src, size_t *restrict len, char **restrict dst, bool external)
542 : {
543 28589011 : const char *cur = src, *start = NULL;
544 28589011 : size_t l = 1;
545 28589011 : bool escaped = false;
546 :
547 28589011 : if (!external) {
548 28588960 : size_t sz = strLen(src);
549 28588960 : atommem(sz);
550 28591678 : return (ssize_t) strcpy_len(*dst, src, sz);
551 : }
552 :
553 51 : if (strNil(src)) {
554 0 : atommem(2);
555 0 : strcpy(*dst, str_nil);
556 0 : return 1;
557 : }
558 :
559 51 : while (GDKisspace(*cur))
560 0 : cur++;
561 51 : if (*cur != '"') {
562 0 : if (strncmp(cur, "nil", 3) == 0) {
563 0 : atommem(2);
564 0 : strcpy(*dst, str_nil);
565 0 : return (ssize_t) (cur - src) + 3;
566 : }
567 0 : GDKerror("not a quoted string\n");
568 0 : return -1;
569 : }
570 :
571 : /* scout the string to find out its length and whether it was
572 : * properly quoted */
573 193 : for (start = ++cur; *cur != '"' || escaped; cur++) {
574 142 : if (*cur == 0) {
575 0 : GDKerror("no closing quotes\n");
576 0 : return -1;
577 142 : } else if (*cur == '\\' && !escaped) {
578 : escaped = true;
579 : } else {
580 133 : escaped = false;
581 133 : l++;
582 : }
583 : }
584 :
585 : /* alloc new memory */
586 51 : if (*dst == NULL || *len < l) {
587 51 : GDKfree(*dst);
588 51 : *dst = GDKmalloc(*len = l);
589 51 : if (*dst == NULL) {
590 0 : *len = 0;
591 0 : return -1;
592 : }
593 : }
594 :
595 51 : return GDKstrFromStr((unsigned char *) *dst,
596 : (const unsigned char *) start,
597 : (ssize_t) (cur - start),
598 : '\0');
599 : }
600 :
601 : /*
602 : * Convert a GDK string value to something printable.
603 : */
/* all but control characters (in range 0 to 31) and DEL; bytes with
 * the high bit set (i.e. parts of multi-byte UTF-8 sequences) count
 * as printable */
#define printable_chr(ch)	((' ' <= (ch) && (ch) <= '~') || ((ch) & 0x80) != 0)

/* Return an upper bound on the number of bytes (excluding the
 * terminating NUL) that escapedStr produces for src with the same
 * sep1, sep2 and quote arguments.
 *
 * Bytes are classified in the same order as escapedStr does it:
 * control characters first, and only then backslash, quote and
 * separators.  That way a separator that consists of (or starts with)
 * a control character is counted with the width of its octal escape
 * rather than as a two-byte backslash escape. */
size_t
escapedStrlen(const char *restrict src, const char *sep1, const char *sep2, int quote)
{
	size_t end, sz = 0;
	size_t sep1len = sep1 ? strlen(sep1) : 0;
	size_t sep2len = sep2 ? strlen(sep2) : 0;

	for (end = 0; src[end]; end++) {
		if (src[end] == (char) '\302' &&
		    0200 <= ((int) src[end + 1] & 0377) &&
		    ((int) src[end + 1] & 0377) <= 0237) {
			/* Unicode control character (code point range
			 * U+0080 through U+009F) encoded in UTF-8 */
			/* for the first one of the two UTF-8 bytes we
			 * count a width of 7 and for the second one
			 * (at least) 1, together that's 8, i.e. the
			 * width of two backslash-escaped octal coded
			 * characters */
			sz += 7;
		} else if (!printable_chr(src[end])) {
			/* at most a backslash and three octal digits */
			sz += 4;
		} else if (src[end] == '\\'
			   || src[end] == quote
			   || (sep1len && strncmp(src + end, sep1, sep1len) == 0)
			   || (sep2len && strncmp(src + end, sep2, sep2len) == 0)) {
			/* backslash plus the character itself */
			sz += 2;
		} else {
			sz++;
		}
	}
	return sz;
}

/* Write an escaped, NUL-terminated copy of src to dst, which has
 * room for dstlen bytes, and return the number of bytes written
 * (excluding the NUL).
 *
 * Control characters become \t, \n, \r, \f or a three-digit octal
 * escape, as do both bytes of a UTF-8 encoded C1 control character
 * (U+0080..U+009F).  A backslash, the quote character, and the first
 * byte of any occurrence of sep1 or sep2 are preceded by a backslash.
 *
 * Nothing is ever written beyond dst[dstlen - 1]: when the next
 * (escaped) character plus the terminator no longer fits, the output
 * ends there.  If dstlen is 0, nothing is written and 0 is returned.
 * Use escapedStrlen to size dst so that no truncation happens. */
size_t
escapedStr(char *restrict dst, const char *restrict src, size_t dstlen, const char *sep1, const char *sep2, int quote)
{
	size_t cur, l = 0;
	size_t sep1len = sep1 ? strlen(sep1) : 0;
	size_t sep2len = sep2 ? strlen(sep2) : 0;

	if (dstlen == 0)
		return 0;	/* no room for even the terminator */

	for (cur = 0; src[cur]; cur++) {
		/* bytes still available, keeping one for the NUL */
		const size_t room = dstlen - 1 - l;
		/* first byte of a UTF-8 encoded C1 control character */
		const bool c1_lead = src[cur] == '\302'
			&& 0200 <= (src[cur + 1] & 0377)
			&& ((int) src[cur + 1] & 0377) <= 0237;
		/* second byte of a UTF-8 encoded C1 control character */
		const bool c1_trail = cur > 0
			&& src[cur - 1] == '\302'
			&& 0200 <= (src[cur] & 0377)
			&& (src[cur] & 0377) <= 0237;

		if (!printable_chr(src[cur]) || c1_lead || c1_trail) {
			char mnemonic = 0;

			switch (src[cur]) {
			case '\t':
				mnemonic = 't';
				break;
			case '\n':
				mnemonic = 'n';
				break;
			case '\r':
				mnemonic = 'r';
				break;
			case '\f':
				mnemonic = 'f';
				break;
			default:
				break;
			}
			if (mnemonic) {
				if (room < 2)
					break;	/* out of space */
				dst[l++] = '\\';
				dst[l++] = mnemonic;
			} else {
				if (room < 4)
					break;	/* out of space */
				dst[l++] = '\\';
				/* snprintf also writes a NUL, for which
				 * there is room since room >= 4 */
				snprintf(dst + l, dstlen - l, "%03o", (unsigned char) src[cur]);
				l += 3;
			}
		} else if (src[cur] == '\\'
			   || src[cur] == quote
			   || (sep1len && strncmp(src + cur, sep1, sep1len) == 0)
			   || (sep2len && strncmp(src + cur, sep2, sep2len) == 0)) {
			if (room < 2)
				break;	/* out of space */
			dst[l++] = '\\';
			dst[l++] = src[cur];
		} else {
			if (room < 1)
				break;	/* out of space */
			dst[l++] = src[cur];
		}
	}
	assert(l < dstlen);
	dst[l] = 0;
	return l;
}
689 :
/* Convert string atom src to its textual form in *dst (a buffer of
 * *len bytes that is reallocated when absent or too small).
 *
 * If external is false, src is copied as is.  Otherwise the nil
 * string is rendered as the word nil, and any other string as its
 * escaped form between double quotes.  Returns the length of the
 * result, or -1 if memory could not be allocated. */
ssize_t
strToStr(char **restrict dst, size_t *restrict len, const char *restrict src, bool external)
{
	if (!external) {
		size_t sz = strLen(src);

		atommem(sz);
		return (ssize_t) strcpy_len(*dst, src, sz);
	}
	if (strNil(src)) {
		atommem(4);
		strcpy(*dst, "nil");
		return 3;
	}

	/* escaped text plus two quotes plus the terminating NUL */
	size_t esclen = escapedStrlen(src, NULL, NULL, '"');

	atommem(esclen + 3);

	char *out = *dst;
	ssize_t n = (ssize_t) escapedStr(out + 1, src, *len - 1, NULL, NULL, '"');

	out[0] = '"';
	out[n + 1] = '"';
	out[n + 2] = 0;
	return n + 2;
}
716 :
717 : str
718 92 : strRead(str a, size_t *dstlen, stream *s, size_t cnt)
719 : {
720 92 : int len;
721 :
722 92 : (void) cnt;
723 92 : assert(cnt == 1);
724 92 : if (mnstr_readInt(s, &len) != 1 || len < 0)
725 : return NULL;
726 92 : if (a == NULL || *dstlen < (size_t) len + 1) {
727 0 : if ((a = GDKrealloc(a, len + 1)) == NULL)
728 : return NULL;
729 0 : *dstlen = len + 1;
730 : }
731 92 : if (len && mnstr_read(s, a, len, 1) != 1) {
732 0 : GDKfree(a);
733 0 : return NULL;
734 : }
735 92 : a[len] = 0;
736 92 : return a;
737 : }
738 :
739 : gdk_return
740 92 : strWrite(const char *a, stream *s, size_t cnt)
741 : {
742 92 : size_t len = strlen(a);
743 :
744 92 : (void) cnt;
745 92 : assert(cnt == 1);
746 92 : if (!checkUTF8(a)) {
747 0 : GDKerror("incorrectly encoded UTF-8\n");
748 0 : return GDK_FAIL;
749 : }
750 92 : if (mnstr_writeInt(s, (int) len) && mnstr_write(s, a, len, 1) == 1)
751 : return GDK_SUCCEED;
752 : else
753 0 : return GDK_FAIL;
754 : }
755 :
756 : static gdk_return
757 89 : concat_strings(BAT **bnp, ValPtr pt, BAT *b, oid seqb,
758 : BUN ngrp, struct canditer *restrict ci,
759 : const oid *restrict gids, oid min, oid max, bool skip_nils,
760 : BAT *sep, const char *restrict separator, BUN *has_nils)
761 : {
762 89 : oid gid;
763 89 : BUN i, p, nils = 0;
764 89 : size_t *restrict lengths = NULL, separator_length = 0, next_length;
765 89 : str *restrict astrings = NULL;
766 89 : BATiter bi, bis = (BATiter) {0};
767 89 : BAT *bn = NULL;
768 89 : gdk_return rres = GDK_FAIL;
769 :
770 89 : lng timeoffset = 0;
771 89 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
772 89 : if (qry_ctx != NULL) {
773 89 : timeoffset = (qry_ctx->starttime && qry_ctx->querytimeout) ? (qry_ctx->starttime + qry_ctx->querytimeout) : 0;
774 : }
775 :
776 : /* exactly one of bnp and pt must be NULL, the other non-NULL */
777 89 : assert((bnp == NULL) != (pt == NULL));
778 : /* if pt not NULL, only a single group allowed */
779 89 : assert(pt == NULL || ngrp == 1);
780 :
781 89 : if (bnp) {
782 34 : if ((bn = COLnew(min, TYPE_str, ngrp, TRANSIENT)) == NULL)
783 : return GDK_FAIL;
784 34 : *bnp = bn;
785 : }
786 :
787 89 : bi = bat_iterator(b);
788 89 : bis = bat_iterator(sep);
789 89 : if (separator)
790 58 : separator_length = strlen(separator);
791 :
792 89 : if (ngrp == 1) {
793 62 : size_t offset = 0, single_length = 0;
794 62 : bool empty = true;
795 :
796 62 : if (separator) {
797 43 : assert(sep == NULL);
798 18251 : TIMEOUT_LOOP_IDX(i, ci->ncand, timeoffset) {
799 18121 : p = canditer_next(ci) - seqb;
800 18121 : const char *s = BUNtvar(bi, p);
801 18121 : if (strNil(s)) {
802 15 : if (!skip_nils) {
803 : nils = 1;
804 : break;
805 : }
806 : } else {
807 18106 : single_length += strlen(s);
808 18106 : if (!empty)
809 18065 : single_length += separator_length;
810 : empty = false;
811 : }
812 : }
813 : } else { /* sep case */
814 19 : assert(sep != NULL);
815 354 : TIMEOUT_LOOP_IDX(i, ci->ncand, timeoffset) {
816 297 : p = canditer_next(ci) - seqb;
817 297 : const char *s = BUNtvar(bi, p);
818 297 : const char *sl = BUNtvar(bis, p);
819 297 : if (strNil(s)) {
820 4 : if (!skip_nils) {
821 : nils = 1;
822 : break;
823 : }
824 : } else {
825 293 : single_length += strlen(s);
826 293 : if (!empty) {
827 274 : if (strNil(sl)) {
828 23 : if (!skip_nils) {
829 : nils = 1;
830 : break;
831 : }
832 : } else
833 251 : single_length += strlen(sl);
834 : }
835 : empty = false;
836 : }
837 : }
838 : }
839 62 : canditer_reset(ci);
840 62 : TIMEOUT_CHECK(timeoffset, GOTO_LABEL_TIMEOUT_HANDLER(bailout));
841 :
842 62 : if (nils == 0 && !empty) {
843 60 : char *single_str = NULL;
844 :
845 60 : if ((single_str = GDKmalloc(single_length + 1)) == NULL) {
846 0 : bat_iterator_end(&bi);
847 0 : bat_iterator_end(&bis);
848 0 : BBPreclaim(bn);
849 0 : return GDK_FAIL;
850 : }
851 60 : empty = true;
852 60 : if (separator) {
853 18232 : TIMEOUT_LOOP_IDX(i, ci->ncand, timeoffset) {
854 18108 : p = canditer_next(ci) - seqb;
855 18108 : const char *s = BUNtvar(bi, p);
856 18108 : if (strNil(s))
857 2 : continue;
858 18106 : if (!empty) {
859 18065 : memcpy(single_str + offset, separator, separator_length);
860 18065 : offset += separator_length;
861 : }
862 18106 : next_length = strlen(s);
863 18106 : memcpy(single_str + offset, s, next_length);
864 18106 : offset += next_length;
865 18106 : empty = false;
866 : }
867 : } else { /* sep case */
868 19 : assert(sep != NULL);
869 354 : TIMEOUT_LOOP_IDX(i, ci->ncand, timeoffset) {
870 297 : p = canditer_next(ci) - seqb;
871 297 : const char *s = BUNtvar(bi, p);
872 297 : const char *sl = BUNtvar(bis, p);
873 297 : if (strNil(s))
874 4 : continue;
875 567 : if (!empty && !strNil(sl)) {
876 251 : next_length = strlen(sl);
877 251 : memcpy(single_str + offset, sl, next_length);
878 251 : offset += next_length;
879 : }
880 293 : next_length = strlen(s);
881 293 : memcpy(single_str + offset, s, next_length);
882 293 : offset += next_length;
883 293 : empty = false;
884 : }
885 : }
886 :
887 60 : single_str[offset] = '\0';
888 60 : TIMEOUT_CHECK(timeoffset, do { GDKfree(single_str); GOTO_LABEL_TIMEOUT_HANDLER(bailout); } while (0));
889 60 : if (bn) {
890 7 : if (BUNappend(bn, single_str, false) != GDK_SUCCEED) {
891 0 : GDKfree(single_str);
892 0 : bat_iterator_end(&bi);
893 0 : bat_iterator_end(&bis);
894 0 : BBPreclaim(bn);
895 0 : return GDK_FAIL;
896 : }
897 : } else {
898 53 : pt->len = offset + 1;
899 53 : pt->val.sval = single_str;
900 53 : single_str = NULL; /* don't free */
901 : }
902 67 : GDKfree(single_str);
903 2 : } else if (bn) {
904 0 : if (BUNappend(bn, str_nil, false) != GDK_SUCCEED) {
905 0 : bat_iterator_end(&bi);
906 0 : bat_iterator_end(&bis);
907 0 : BBPreclaim(bn);
908 0 : return GDK_FAIL;
909 : }
910 : } else {
911 2 : if (VALinit(pt, TYPE_str, str_nil) == NULL) {
912 0 : bat_iterator_end(&bi);
913 0 : bat_iterator_end(&bis);
914 0 : return GDK_FAIL;
915 : }
916 : }
917 62 : bat_iterator_end(&bi);
918 62 : bat_iterator_end(&bis);
919 62 : return GDK_SUCCEED;
920 : } else {
921 : /* first used to calculated the total length of
922 : * each group, then the the total offset */
923 27 : lengths = GDKzalloc(ngrp * sizeof(*lengths));
924 27 : astrings = GDKmalloc(ngrp * sizeof(str));
925 27 : if (lengths == NULL || astrings == NULL) {
926 0 : goto finish;
927 : }
928 : /* at first, set astrings[i] to str_nil, then for each
929 : * non-empty group (even if all strings in the group
930 : * are empty), set to NULL */
931 2141 : for (i = 0; i < ngrp; i++)
932 2114 : astrings[i] = (char *) str_nil;
933 :
934 27 : if (separator) {
935 208 : TIMEOUT_LOOP_IDX(p, ci->ncand, timeoffset) {
936 163 : i = canditer_next(ci) - seqb;
937 163 : if (gids[i] >= min && gids[i] <= max) {
938 163 : gid = gids[i] - min;
939 163 : if (lengths[gid] == (size_t) -1)
940 0 : continue;
941 163 : const char *s = BUNtvar(bi, i);
942 326 : if (!strNil(s)) {
943 155 : lengths[gid] += strlen(s) + separator_length;
944 155 : astrings[gid] = NULL;
945 8 : } else if (!skip_nils) {
946 0 : nils++;
947 0 : lengths[gid] = (size_t) -1;
948 0 : astrings[gid] = (char *) str_nil;
949 : }
950 : }
951 : }
952 : } else { /* sep case */
953 12 : assert(sep != NULL);
954 999759 : TIMEOUT_LOOP_IDX(p, ci->ncand, timeoffset) {
955 999663 : i = canditer_next(ci) - seqb;
956 999663 : if (gids[i] >= min && gids[i] <= max) {
957 999663 : gid = gids[i] - min;
958 999663 : if (lengths[gid] == (size_t) -1)
959 0 : continue;
960 999663 : const char *s = BUNtvar(bi, i);
961 999663 : const char *sl = BUNtvar(bis, i);
962 1999326 : if (!strNil(s)) {
963 999340 : lengths[gid] += strlen(s);
964 1998680 : if (!strNil(sl)) {
965 999209 : next_length = strlen(sl);
966 999209 : lengths[gid] += next_length;
967 : }
968 999340 : astrings[gid] = NULL;
969 323 : } else if (!skip_nils) {
970 0 : nils++;
971 0 : lengths[gid] = (size_t) -1;
972 0 : astrings[gid] = (char *) str_nil;
973 : }
974 : }
975 : }
976 : }
977 27 : TIMEOUT_CHECK(timeoffset, GOTO_LABEL_TIMEOUT_HANDLER(finish));
978 :
979 27 : if (separator) {
980 69 : for (i = 0; i < ngrp; i++) {
981 54 : if (astrings[i] == NULL) {
982 52 : if ((astrings[i] = GDKmalloc(lengths[i] + 1)) == NULL) {
983 0 : goto finish;
984 : }
985 52 : astrings[i][0] = 0;
986 52 : lengths[i] = 0;
987 : } else
988 2 : astrings[i] = NULL;
989 : }
990 : } else { /* sep case */
991 12 : assert(sep != NULL);
992 2072 : for (i = 0; i < ngrp; i++) {
993 2060 : if (astrings[i] == NULL) {
994 2058 : if ((astrings[i] = GDKmalloc(lengths[i] + 1)) == NULL) {
995 0 : goto finish;
996 : }
997 2058 : astrings[i][0] = 0;
998 2058 : lengths[i] = 0;
999 : } else
1000 2 : astrings[i] = NULL;
1001 : }
1002 : }
1003 27 : canditer_reset(ci);
1004 :
1005 27 : if (separator) {
1006 208 : TIMEOUT_LOOP_IDX(p, ci->ncand, timeoffset) {
1007 163 : i = canditer_next(ci) - seqb;
1008 163 : if (gids[i] >= min && gids[i] <= max) {
1009 163 : gid = gids[i] - min;
1010 163 : if (astrings[gid]) {
1011 160 : const char *s = BUNtvar(bi, i);
1012 160 : if (strNil(s))
1013 5 : continue;
1014 155 : if (astrings[gid][lengths[gid]]) {
1015 103 : memcpy(astrings[gid] + lengths[gid], separator, separator_length);
1016 103 : lengths[gid] += separator_length;
1017 : }
1018 155 : next_length = strlen(s);
1019 155 : memcpy(astrings[gid] + lengths[gid], s, next_length);
1020 155 : lengths[gid] += next_length;
1021 155 : astrings[gid][lengths[gid]] = 1;
1022 : }
1023 : }
1024 : }
1025 : } else { /* sep case */
1026 12 : assert(sep != NULL);
1027 999759 : TIMEOUT_LOOP_IDX(p, ci->ncand, timeoffset) {
1028 999663 : i = canditer_next(ci) - seqb;
1029 999663 : if (gids[i] >= min && gids[i] <= max) {
1030 999663 : gid = gids[i] - min;
1031 999663 : if (astrings[gid]) {
1032 999342 : const char *s = BUNtvar(bi, i);
1033 999342 : const char *sl = BUNtvar(bis, i);
1034 999342 : if (strNil(s))
1035 2 : continue;
1036 1996622 : if (astrings[gid][lengths[gid]] && !strNil(sl)) {
1037 997156 : next_length = strlen(sl);
1038 997156 : memcpy(astrings[gid] + lengths[gid], sl, next_length);
1039 997156 : lengths[gid] += next_length;
1040 : }
1041 999340 : next_length = strlen(s);
1042 999340 : memcpy(astrings[gid] + lengths[gid], s, next_length);
1043 999340 : lengths[gid] += next_length;
1044 999340 : astrings[gid][lengths[gid]] = 1;
1045 : }
1046 : }
1047 : }
1048 : }
1049 27 : TIMEOUT_CHECK(timeoffset, GOTO_LABEL_TIMEOUT_HANDLER(finish));
1050 :
1051 2141 : for (i = 0; i < ngrp; i++) {
1052 2114 : if (astrings[i]) {
1053 2110 : astrings[i][lengths[i]] = '\0';
1054 2110 : if (BUNappend(bn, astrings[i], false) != GDK_SUCCEED) {
1055 0 : goto finish;
1056 : }
1057 4 : } else if (BUNappend(bn, str_nil, false) != GDK_SUCCEED) {
1058 0 : goto finish;
1059 : }
1060 : }
1061 : rres = GDK_SUCCEED;
1062 : }
1063 :
1064 27 : finish:
1065 27 : bat_iterator_end(&bi);
1066 27 : bat_iterator_end(&bis);
1067 27 : if (has_nils)
1068 27 : *has_nils = nils;
1069 27 : GDKfree(lengths);
1070 27 : if (astrings) {
1071 2141 : for (i = 0; i < ngrp; i++) {
1072 2114 : if (astrings[i] != str_nil)
1073 2114 : GDKfree(astrings[i]);
1074 : }
1075 27 : GDKfree(astrings);
1076 : }
1077 27 : if (rres != GDK_SUCCEED)
1078 0 : BBPreclaim(bn);
1079 :
1080 : return rres;
1081 :
1082 0 : bailout:
1083 0 : bat_iterator_end(&bi);
1084 0 : bat_iterator_end(&bis);
1085 0 : BBPreclaim(bn);
1086 : return GDK_FAIL;
1087 : }
1088 :
1089 : gdk_return
1090 56 : BATstr_group_concat(ValPtr res, BAT *b, BAT *s, BAT *sep, bool skip_nils,
1091 : bool nil_if_empty, const char *restrict separator)
1092 : {
1093 56 : struct canditer ci;
1094 56 : gdk_return r = GDK_SUCCEED;
1095 56 : bool free_nseparator = false;
1096 56 : char *nseparator = (char *)separator;
1097 :
1098 56 : assert((nseparator && !sep) || (!nseparator && sep)); /* only one of them must be set */
1099 56 : res->vtype = TYPE_str;
1100 :
1101 56 : canditer_init(&ci, b, s);
1102 :
1103 56 : if (sep && BATcount(sep) == 1) { /* Only one element in sep */
1104 0 : BATiter bi = bat_iterator(sep);
1105 0 : nseparator = GDKstrdup(BUNtvar(bi, 0));
1106 0 : bat_iterator_end(&bi);
1107 0 : if (!nseparator)
1108 0 : return GDK_FAIL;
1109 0 : free_nseparator = true;
1110 0 : sep = NULL;
1111 : }
1112 :
1113 56 : if (ci.ncand == 0 || (nseparator && strNil(nseparator))) {
1114 1 : if (VALinit(res, TYPE_str, nil_if_empty ? str_nil : "") == NULL)
1115 0 : r = GDK_FAIL;
1116 1 : if (free_nseparator)
1117 0 : GDKfree(nseparator);
1118 1 : return r;
1119 : }
1120 :
1121 55 : r = concat_strings(NULL, res, b, b->hseqbase, 1, &ci, NULL, 0, 0,
1122 : skip_nils, sep, nseparator, NULL);
1123 55 : if (free_nseparator)
1124 0 : GDKfree(nseparator);
1125 : return r;
1126 : }
1127 :
1128 : BAT *
1129 54 : BATgroupstr_group_concat(BAT *b, BAT *g, BAT *e, BAT *s, BAT *sep, bool skip_nils,
1130 : const char *restrict separator)
1131 : {
1132 54 : BAT *bn = NULL;
1133 54 : oid min, max;
1134 54 : BUN ngrp, nils = 0;
1135 54 : struct canditer ci;
1136 54 : const char *err;
1137 54 : gdk_return res;
1138 54 : bool free_nseparator = false;
1139 54 : char *nseparator = (char *)separator;
1140 :
1141 54 : assert((nseparator && !sep) || (!nseparator && sep)); /* only one of them must be set */
1142 54 : (void) skip_nils;
1143 :
1144 54 : if ((err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp,
1145 : &ci)) != NULL) {
1146 0 : GDKerror("%s\n", err);
1147 0 : return NULL;
1148 : }
1149 54 : if (g == NULL) {
1150 0 : GDKerror("b and g must be aligned\n");
1151 0 : return NULL;
1152 : }
1153 :
1154 54 : if (sep && BATcount(sep) == 1) { /* Only one element in sep */
1155 0 : BATiter bi = bat_iterator(sep);
1156 0 : nseparator = GDKstrdup(BUNtvar(bi, 0));
1157 0 : bat_iterator_end(&bi);
1158 0 : if (!nseparator)
1159 0 : return NULL;
1160 0 : free_nseparator = true;
1161 0 : sep = NULL;
1162 : }
1163 :
1164 54 : if (ci.ncand == 0 || ngrp == 0 || (nseparator && strNil(nseparator))) {
1165 : /* trivial: no strings to concat, so return bat
1166 : * aligned with g with nil in the tail */
1167 5 : bn = BATconstant(ngrp == 0 ? 0 : min, TYPE_str, str_nil, ngrp, TRANSIENT);
1168 5 : goto done;
1169 : }
1170 :
1171 49 : if (BATtdense(g) || (g->tkey && g->tnonil)) {
1172 : /* trivial: singleton groups, so all results are equal
1173 : * to the inputs (but possibly a different type) */
1174 15 : bn = BATconvert(b, s, TYPE_str, 0, 0, 0);
1175 15 : goto done;
1176 : }
1177 :
1178 68 : res = concat_strings(&bn, NULL, b, b->hseqbase, ngrp, &ci,
1179 34 : (const oid *) Tloc(g, 0), min, max, skip_nils, sep,
1180 : nseparator, &nils);
1181 34 : if (res != GDK_SUCCEED)
1182 0 : bn = NULL;
1183 :
1184 34 : done:
1185 54 : if (free_nseparator)
1186 0 : GDKfree(nseparator);
1187 54 : return bn;
1188 : }
1189 :
/* Build in single_str the concatenation of the non-nil values found
 * through iterator bi at positions [START, END).  Between two values
 * either the constant separator is inserted or, when separator is not
 * set, the non-nil value found through iterator sepi at the position of
 * the second value.  If there is no non-nil value at all, single_str is
 * set to str_nil and has_nils is raised.
 *
 * The buffer single_str is reused between invocations and only grows
 * (its capacity is tracked in max_group_length).  The invoking code
 * must have set next_group_length and offset to 0 and empty to true,
 * and must provide the label allocation_error. */
#define compute_next_single_str(START, END) \
	do { \
		/* pass 1: compute the length of the result */ \
		for (oid row = (START); row < (END); row++) { \
			const char *val = BUNtvar(bi, row); \
			\
			if (strNil(val)) \
				continue; \
			next_group_length += strlen(val); \
			if (!empty) { \
				if (separator) { \
					next_group_length += separator_length; \
				} else { /* sep case */ \
					assert(sep != NULL); \
					const char *rowsep = BUNtvar(sepi, row); \
					\
					if (!strNil(rowsep)) \
						next_group_length += strlen(rowsep); \
				} \
			} \
			empty = false; \
		} \
		\
		/* reuse the same buffer, resize it when needed; an \
		 * all-nil range only needs room for str_nil */ \
		size_t needed_length = empty ? 1 : next_group_length; \
		if (single_str == NULL) { \
			max_group_length = needed_length; \
			if ((single_str = GDKmalloc(max_group_length + 1)) == NULL) \
				goto allocation_error; \
		} else if (needed_length > max_group_length) { \
			max_group_length = needed_length; \
			if ((next_single_str = GDKrealloc(single_str, max_group_length + 1)) == NULL) \
				goto allocation_error; \
			single_str = next_single_str; \
		} \
		\
		if (empty) { \
			strcpy(single_str, str_nil); \
			has_nils = true; \
		} else { \
			/* pass 2: copy the pieces; empty again tracks \
			 * whether a value was already written */ \
			empty = true; \
			for (oid row = (START); row < (END); row++) { \
				const char *val = BUNtvar(bi, row); \
				\
				if (strNil(val)) \
					continue; \
				if (!empty) { \
					if (separator) { \
						memcpy(single_str + offset, separator, separator_length); \
						offset += separator_length; \
					} else { /* sep case */ \
						assert(sep != NULL); \
						const char *rowsep = BUNtvar(sepi, row); \
						\
						if (!strNil(rowsep)) { \
							next_length = strlen(rowsep); \
							memcpy(single_str + offset, rowsep, next_length); \
							offset += next_length; \
						} \
					} \
				} \
				next_length = strlen(val); \
				memcpy(single_str + offset, val, next_length); \
				offset += next_length; \
				empty = false; \
			} \
			\
			single_str[offset] = '\0'; \
		} \
	} while (0)
1275 :
/* Frame "unbounded until current row" for the rows [k, i): the string
 * for the whole range is computed once by compute_next_single_str, and
 * every row then receives a prefix of it.  Rows are handled in runs
 * that end just before the next position where op is set; all rows of
 * a run receive the same prefix.  Runs that come before the first
 * non-nil value receive str_nil. */
#define ANALYTICAL_STR_GROUP_CONCAT_UNBOUNDED_TILL_CURRENT_ROW \
	do { \
		size_t slice_length = 0; \
		\
		next_group_length = 0; \
		next_length = 0; \
		offset = 0; \
		empty = true; \
		/* compute the entire string then slice it starting from the beginning */ \
		compute_next_single_str(k, i); \
		empty = true; \
		while (k < i) { \
			const oid first = k; \
			\
			/* find the end of the current run */ \
			k++; \
			while (k < i && !op[k]) \
				k++; \
			/* extend the prefix length with the rows of this run */ \
			for (j = first; j < k; j++) { \
				const char *val = BUNtvar(bi, j); \
				\
				if (strNil(val)) \
					continue; \
				slice_length += strlen(val); \
				if (!empty) { \
					const char *cursep; \
					\
					if (separator) { \
						cursep = (const char *) separator; \
					} else { /* sep case */ \
						assert(sep != NULL); \
						cursep = BUNtvar(sepi, j); \
					} \
					if (!strNil(cursep)) \
						slice_length += strlen(cursep); \
				} \
				empty = false; \
			} \
			if (empty) { \
				/* no non-nil value seen so far */ \
				for (j = first; j < k; j++) \
					if (tfastins_nocheckVAR(r, j, str_nil) != GDK_SUCCEED) \
						goto allocation_error; \
				has_nils = true; \
			} else { \
				/* temporarily terminate the string at the prefix end */ \
				const char saved = single_str[slice_length]; \
				\
				single_str[slice_length] = '\0'; \
				for (j = first; j < k; j++) \
					if (tfastins_nocheckVAR(r, j, single_str) != GDK_SUCCEED) \
						goto allocation_error; \
				single_str[slice_length] = saved; \
			} \
		} \
	} while (0)
1322 :
/* Frame "all rows": compute one string for the rows [k, i) and store
 * that same string in r for each of those rows. */
#define ANALYTICAL_STR_GROUP_CONCAT_ALL_ROWS \
	do { \
		next_group_length = 0; \
		next_length = 0; \
		offset = 0; \
		empty = true; \
		compute_next_single_str(k, i); \
		while (k < i) { \
			if (tfastins_nocheckVAR(r, k, single_str) != GDK_SUCCEED) \
				goto allocation_error; \
			k++; \
		} \
	} while (0)
1332 :
/* Frame "current row": every row in [k, i) gets its own input value
 * unchanged; has_nils is raised when such a value is nil. */
#define ANALYTICAL_STR_GROUP_CONCAT_CURRENT_ROW \
	do { \
		while (k < i) { \
			const char *cur = BUNtvar(bi, k); \
			\
			if (tfastins_nocheckVAR(r, k, cur) != GDK_SUCCEED) \
				goto allocation_error; \
			if (strNil(cur)) \
				has_nils = true; \
			k++; \
		} \
	} while (0)
1342 :
/* Any other frame: for every row k in [k, i) compute the string over
 * the positions [start[k], end[k]) and store it in r at position k. */
#define ANALYTICAL_STR_GROUP_CONCAT_OTHERS \
	do { \
		while (k < i) { \
			next_group_length = 0; \
			next_length = 0; \
			offset = 0; \
			empty = true; \
			compute_next_single_str(start[k], end[k]); \
			if (tfastins_nocheckVAR(r, k, single_str) != GDK_SUCCEED) \
				goto allocation_error; \
			k++; \
		} \
	} while (0)
1353 :
/* Run IMP once per partition.  When a partition BAT p is given, IMP is
 * executed each time i reaches a position where np is set; IMP then
 * handles the rows it has not consumed yet, up to but excluding i.  A
 * final run with i == cnt handles the remaining rows (or all rows when
 * there is no p). */
#define ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(IMP) \
	do { \
		if (p != NULL) { \
			for (; i < cnt; i++) { \
				if (np[i]) { \
					IMP; \
				} \
			} \
		} \
		/* last (or only) partition */ \
		i = cnt; \
		IMP; \
	} while (0)
1365 :
1366 : gdk_return
1367 53 : GDKanalytical_str_group_concat(BAT *r, BAT *p, BAT *o, BAT *b, BAT *sep, BAT *s, BAT *e, const char *restrict separator, int frame_type)
1368 : {
1369 53 : bool has_nils = false, empty;
1370 53 : BATiter pi = bat_iterator(p);
1371 53 : BATiter oi = bat_iterator(o);
1372 53 : BATiter bi = bat_iterator(b);
1373 53 : BATiter sepi = bat_iterator(sep);
1374 53 : BATiter si = bat_iterator(s);
1375 53 : BATiter ei = bat_iterator(e);
1376 53 : oid i = 0, j = 0, k = 0, cnt = bi.count, *restrict start = si.base, *restrict end = ei.base;
1377 53 : bit *np = pi.base, *op = oi.base;
1378 53 : str single_str = NULL, next_single_str;
1379 53 : size_t separator_length = 0, next_group_length, max_group_length = 0, next_length, offset;
1380 :
1381 53 : assert((sep && !separator && bi.count == sepi.count) || (!sep && separator));
1382 53 : if (b->ttype != TYPE_str || r->ttype != TYPE_str || (sep && sep->ttype != TYPE_str)) {
1383 0 : GDKerror("only string type is supported\n");
1384 0 : bat_iterator_end(&pi);
1385 0 : bat_iterator_end(&oi);
1386 0 : bat_iterator_end(&bi);
1387 0 : bat_iterator_end(&sepi);
1388 0 : bat_iterator_end(&si);
1389 0 : bat_iterator_end(&ei);
1390 0 : return GDK_FAIL;
1391 : }
1392 30 : if (sep && sepi.count == 1) { /* Only one element in sep */
1393 0 : separator = BUNtvar(sepi, 0);
1394 0 : sep = NULL;
1395 : }
1396 :
1397 53 : if (sep == NULL)
1398 23 : separator_length = strlen(separator);
1399 :
1400 53 : if (cnt > 0) {
1401 52 : switch (frame_type) {
1402 29 : case 3: /* unbounded until current row */
1403 166925 : ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_UNBOUNDED_TILL_CURRENT_ROW);
1404 : break;
1405 0 : case 4: /* current row until unbounded */
1406 0 : goto notimplemented;
1407 23 : case 5: /* all rows */
1408 847 : ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_ALL_ROWS);
1409 : break;
1410 0 : case 6: /* current row */
1411 0 : ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_CURRENT_ROW);
1412 : break;
1413 0 : default:
1414 0 : ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_OTHERS);
1415 : break;
1416 : }
1417 : }
1418 :
1419 53 : bat_iterator_end(&pi);
1420 53 : bat_iterator_end(&oi);
1421 53 : bat_iterator_end(&bi);
1422 53 : bat_iterator_end(&sepi);
1423 53 : bat_iterator_end(&si);
1424 53 : bat_iterator_end(&ei);
1425 53 : GDKfree(single_str);
1426 53 : BATsetcount(r, cnt);
1427 53 : r->tnonil = !has_nils;
1428 53 : r->tnil = has_nils;
1429 53 : return GDK_SUCCEED;
1430 0 : allocation_error:
1431 0 : bat_iterator_end(&pi);
1432 0 : bat_iterator_end(&oi);
1433 0 : bat_iterator_end(&bi);
1434 0 : bat_iterator_end(&sepi);
1435 0 : bat_iterator_end(&si);
1436 0 : bat_iterator_end(&ei);
1437 0 : GDKfree(single_str);
1438 0 : return GDK_FAIL;
1439 0 : notimplemented:
1440 0 : bat_iterator_end(&pi);
1441 0 : bat_iterator_end(&oi);
1442 0 : bat_iterator_end(&bi);
1443 0 : bat_iterator_end(&sepi);
1444 0 : bat_iterator_end(&si);
1445 0 : bat_iterator_end(&ei);
1446 0 : GDKerror("str_group_concat not yet implemented for current row until unbounded case\n");
1447 0 : return GDK_FAIL;
1448 : }
|