Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "gdk.h"
15 : #include "gdk_private.h"
16 : #include "gdk_cand.h"
17 :
18 : /* String Atom Implementation
19 : *
20 : * Strings are stored in two parts. The first part is the normal tail
21 : * heap which contains a list of offsets. The second part is the
22 : * theap which contains the actual strings. The offsets in the tail
23 : * heap (a.k.a. offset heap) point into the theap (a.k.a. string
24 : * heap). Strings are NULL-terminated and are stored without any
25 : * escape sequences. Strings are encoded using the UTF-8 encoding
26 : * of Unicode. This means that individual "characters" (really,
27 : * Unicode code points) can be between one and four bytes long.
28 : *
29 : * Because in many typical situations there are lots of duplicated
30 : * string values that are being stored in a table, but also in many
31 : * (other) typical situations there are very few duplicated string
32 : * values stored, a scheme has been introduced to cater to both
33 : * situations.
34 : *
35 : * When the string heap is "small" (defined as less than 64KiB), the
36 : * string heap is fully duplicate eliminated. When the string heap
37 : * grows beyond this size, the heap is not kept free of duplicate
38 : * strings, but there is then a heuristic that tries to limit the
39 : * number of duplicates.
40 : *
41 : * This is done by having a fixed sized hash table at the start of the
42 : * string heap, and allocating space for collision lists in the first
43 : * 64KiB of the string heap. After the first 64KiB no extra space is
44 : * allocated for lists, so hash collisions cannot be resolved.
45 : */
46 :
/* Helpers for decoding octal and hexadecimal escape sequences (some of
 * these macros are duplicates from gdk_atoms.c).  The arguments may be
 * evaluated more than once, so they must not have side effects. */

/* octal: digit test, digit value, shift by one octal position */
#define num08(x)	('0' <= (x) && (x) <= '7')
#define base08(x)	((x) - '0')
#define mult08(x)	((x) << 3)

/* hexadecimal: digit test, digit value, shift by one hex position */
#define num16(x)	isxdigit((unsigned char) (x))
#define base16(x)	(('a' <= (x) && (x) <= 'f') ? ((x) - 'a' + 10) : ('A' <= (x) && (x) <= 'F') ? ((x) - 'A' + 10) : ((x) - '0'))
#define mult16(x)	((x) << 4)

/* Make sure the buffer *dst (whose capacity is kept in *len) can hold
 * at least `size' bytes.  A buffer that is missing or too small is
 * freed and replaced by a fresh one of exactly `size' bytes; when that
 * allocation fails, *len is reset to 0 and the *enclosing function*
 * returns -1.  Relies on variables named `dst' and `len' being in
 * scope at the point of use. */
#define atommem(size)					\
	do {						\
		if (*dst == NULL || *len < (size)) {	\
			GDKfree(*dst);			\
			*len = (size);			\
			*dst = GDKmalloc(*len);		\
			if (*dst == NULL) {		\
				*len = 0;		\
				return -1;		\
			}				\
		}					\
	} while (0)

/* the nil string: the single byte 0x80 followed by the terminator */
const char str_nil[2] = { '\200', 0 };
70 :
71 : gdk_return
72 744773 : strHeap(Heap *d, size_t cap)
73 : {
74 744773 : size_t size;
75 :
76 744773 : cap = MAX(cap, BATTINY);
77 744773 : size = GDK_STRHASHTABLE * sizeof(stridx_t) + MIN(GDK_ELIMLIMIT, cap * GDK_VARALIGN);
78 744773 : return HEAPalloc(d, size, 1);
79 : }
80 :
81 :
82 : void
83 4613 : strCleanHash(Heap *h, bool rebuild)
84 : {
85 4613 : stridx_t newhash[GDK_STRHASHTABLE];
86 4613 : size_t pad, pos;
87 4613 : BUN off, strhash;
88 4613 : const char *s;
89 :
90 4613 : (void) rebuild;
91 4613 : if (!h->cleanhash)
92 712 : return;
93 3901 : if (h->size < GDK_STRHASHTABLE * sizeof(stridx_t) &&
94 0 : HEAPextend(h, GDK_STRHASHTABLE * sizeof(stridx_t) + BATTINY * GDK_VARALIGN, true) != GDK_SUCCEED) {
95 0 : GDKclrerr();
96 0 : if (h->size > 0)
97 0 : memset(h->base, 0, h->size);
98 0 : return;
99 : }
100 :
101 : /* rebuild hash table for double elimination
102 : *
103 : * If appending strings to the BAT was aborted, if the heap
104 : * was memory mapped, the hash in the string heap may well be
105 : * incorrect. Therefore we don't trust it when we read in a
106 : * string heap and we rebuild the complete table (it is small,
107 : * so this won't take any time at all).
108 : * Note that we will only do this the first time the heap is
109 : * loaded, and only for heaps that existed when the server was
110 : * started. */
111 3901 : memset(newhash, 0, sizeof(newhash));
112 3901 : pos = GDK_STRHASHSIZE;
113 362364 : while (pos < h->free) {
114 358723 : pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1));
115 358723 : if (pad < sizeof(stridx_t))
116 307279 : pad += GDK_VARALIGN;
117 358723 : pos += pad;
118 358723 : if (pos >= GDK_ELIMLIMIT)
119 : break;
120 358463 : s = h->base + pos;
121 358463 : strhash = strHash(s);
122 358463 : off = strhash & GDK_STRHASHMASK;
123 358463 : newhash[off] = (stridx_t) (pos - sizeof(stridx_t));
124 358463 : pos += strlen(s) + 1;
125 : }
126 : /* only set dirty flag if the hash table actually changed */
127 3901 : if (memcmp(newhash, h->base, sizeof(newhash)) != 0) {
128 197 : memcpy(h->base, newhash, sizeof(newhash));
129 197 : if (h->storage == STORE_MMAP) {
130 31 : if (!(ATOMIC_GET(&GDKdebug) & NOSYNCMASK))
131 0 : (void) MT_msync(h->base, GDK_STRHASHSIZE);
132 : } else
133 166 : h->dirty = true;
134 : }
135 : #ifndef NDEBUG
136 3901 : if (GDK_ELIMDOUBLES(h)) {
137 : pos = GDK_STRHASHSIZE;
138 248273 : while (pos < h->free) {
139 244632 : pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1));
140 244632 : if (pad < sizeof(stridx_t))
141 204326 : pad += GDK_VARALIGN;
142 244632 : pos += pad;
143 244632 : s = h->base + pos;
144 244632 : assert(strLocate(h, s) != 0);
145 244632 : pos += strlen(s) + 1;
146 : }
147 : }
148 : #endif
149 3901 : h->cleanhash = false;
150 : }
151 :
152 : /*
153 : * The strPut routine. The routine strLocate can be used to identify
154 : * the location of a string in the heap if it exists. Otherwise it
155 : * returns (var_t) -2 (-1 is reserved for error).
156 : */
157 : var_t
158 438248 : strLocate(Heap *h, const char *v)
159 : {
160 438248 : stridx_t *ref, *next;
161 :
162 : /* search hash-table, if double-elimination is still in place */
163 438248 : BUN off;
164 438248 : if (h->free == 0) {
165 : /* empty, so there are no strings */
166 : return (var_t) -2;
167 : }
168 :
169 438248 : off = strHash(v);
170 438248 : off &= GDK_STRHASHMASK;
171 :
172 : /* should only use strLocate iff fully double eliminated */
173 438248 : assert(GDK_ELIMBASE(h->free) == 0);
174 :
175 : /* search the linked list */
176 486964 : for (ref = ((stridx_t *) h->base) + off; *ref; ref = next) {
177 484049 : next = (stridx_t *) (h->base + *ref);
178 484049 : if (strcmp(v, (str) (next + 1)) == 0)
179 435333 : return (var_t) ((sizeof(stridx_t) + *ref)); /* found */
180 : }
181 : return (var_t) -2;
182 : }
183 :
/* Store string V in the string heap of BAT b (b->tvheap).
 *
 * If an identical string is found through the hash table at the start
 * of the heap, the existing copy is reused; otherwise the string is
 * appended, growing the heap when needed (which may replace
 * b->tvheap).
 *
 * On success the heap offset of the string is stored in *dst and also
 * returned.  On failure (var_t) -1 is returned: the heap could not be
 * grown, the heap would reach VAR_MAX, or (only in builds without
 * NDEBUG) the string is not valid UTF-8. */
var_t
strPut(BAT *b, var_t *dst, const void *V)
{
	const char *v = V;
	Heap *h = b->tvheap;
	size_t pad;
	size_t pos, len = strlen(v) + 1;	/* len includes the terminator */
	stridx_t *bucket;
	BUN off;

	if (h->free == 0) {
		/* first string in this heap: make sure there is room
		 * for the hash table and clear it */
		if (h->size < GDK_STRHASHTABLE * sizeof(stridx_t) + BATTINY * GDK_VARALIGN) {
			if (HEAPgrow(&b->tvheap, GDK_STRHASHTABLE * sizeof(stridx_t) + BATTINY * GDK_VARALIGN, true) != GDK_SUCCEED) {
				return (var_t) -1;
			}
			/* HEAPgrow may have replaced the heap */
			h = b->tvheap;
		}
		h->free = GDK_STRHASHTABLE * sizeof(stridx_t);
		h->dirty = true;
#ifdef NDEBUG
		memset(h->base, 0, h->free);
#else
		/* fill should solve initialization problems within valgrind */
		memset(h->base, 0, h->size);
#endif
	}

	off = strHash(v);
	off &= GDK_STRHASHMASK;
	bucket = ((stridx_t *) h->base) + off;

	if (*bucket) {
		assert(*bucket < h->free);
		/* the hash list is not empty */
		if (*bucket < GDK_ELIMLIMIT) {
			/* small string heap (<64KiB) -- fully double
			 * eliminated: search the linked list */
			const stridx_t *ref = bucket;

			do {
				/* *ref is the offset of a link; the
				 * string follows the link */
				pos = *ref + sizeof(stridx_t);
				assert(pos < h->free);
				if (strcmp(v, h->base + pos) == 0) {
					/* found */
					return *dst = (var_t) pos;
				}
				ref = (stridx_t *) (h->base + *ref);
			} while (*ref);
		} else {
			/* large string heap (>=64KiB) -- there is no
			 * linked list, so only look at single
			 * entry */
			pos = *bucket;
			if (strcmp(v, h->base + pos) == 0) {
				/* already in heap: reuse */
				return *dst = (var_t) pos;
			}
		}
	}
	/* the string was not found in the heap, we need to enter it */

	/* check that string is correctly encoded UTF-8; there was no
	 * need to do this earlier: if the string was found above, it
	 * must have gone through here in the past */
#ifndef NDEBUG
	if (!checkUTF8(v)) {
		GDKerror("incorrectly encoded UTF-8\n");
		return (var_t) -1;
	}
#endif

	/* number of bytes up to the next GDK_VARALIGN boundary (a full
	 * GDK_VARALIGN when h->free is already aligned) */
	pad = GDK_VARALIGN - (h->free & (GDK_VARALIGN - 1));
	if (GDK_ELIMBASE(h->free + pad) == 0) {	/* i.e. h->free+pad < GDK_ELIMLIMIT */
		if (pad < sizeof(stridx_t)) {
			/* make room for hash link */
			pad += GDK_VARALIGN;
		}
	} else if (GDK_ELIMBASE(h->free) != 0) {
		/* no extra padding needed when no hash links needed
		 * (but only when padding doesn't cross duplicate
		 * elimination boundary) */
		pad = 0;
	}

	/* check heap for space (limited to a certain maximum after
	 * which nils are inserted) */
	if (h->free + pad + len >= h->size) {
		size_t newsize = MAX(h->size, 4096);

		/* double the heap size until we have enough space */
		/* (once the size has reached 4MiB it grows in steps of
		 * 4MiB instead) */
		do {
			if (newsize < 4 * 1024 * 1024)
				newsize <<= 1;
			else
				newsize += 4 * 1024 * 1024;
		} while (newsize <= h->free + pad + len);

		assert(newsize);

		if (h->free + pad + len >= (size_t) VAR_MAX) {
			GDKerror("string heap gets larger than %zuGiB.\n", (size_t) VAR_MAX >> 30);
			return (var_t) -1;
		}
		TRC_DEBUG(HEAP, "HEAPextend in strPut %s %zu %zu\n", h->filename, h->size, newsize);
		if (HEAPgrow(&b->tvheap, newsize, true) != GDK_SUCCEED) {
			return (var_t) -1;
		}
		h = b->tvheap;

		/* make bucket point into the new heap */
		bucket = ((stridx_t *) h->base) + off;
	}

	/* insert string */
	pos = h->free + pad;
	*dst = (var_t) pos;
	if (pad > 0)
		memset(h->base + h->free, 0, pad);
	memcpy(h->base + pos, v, len);
	h->free += pad + len;
	h->dirty = true;

	/* maintain hash table */
	if (GDK_ELIMBASE(pos) == 0) {	/* small string heap: link the next pointer */
		/* the stridx_t next pointer directly precedes the
		 * string */
		pos -= sizeof(stridx_t);
		*(stridx_t *) (h->base + pos) = *bucket;
	}
	/* in a small heap pos is now the offset of the link, in a large
	 * heap it is the offset of the string itself; this matches the
	 * two lookup cases above */
	*bucket = (stridx_t) pos;	/* set bucket to the new string */

	return *dst;
}
317 :
318 : /*
319 : * Convert an "" separated string to a GDK string value, checking that
320 : * the input is correct UTF-8.
321 : */
322 :
323 : #ifdef __GNUC__
324 : /* __builtin_expect returns its first argument; it is expected to be
325 : * equal to the second argument */
326 : #define unlikely(expr) __builtin_expect((expr) != 0, 0)
327 : #define likely(expr) __builtin_expect((expr) != 0, 1)
328 : #else
329 : #define unlikely(expr) (expr)
330 : #define likely(expr) (expr)
331 : #endif
332 :
333 : ssize_t
334 320660098 : GDKstrFromStr(unsigned char *restrict dst, const unsigned char *restrict src, ssize_t len, char quote)
335 : {
336 320660098 : unsigned char *p = dst;
337 320660098 : const unsigned char *cur = src, *end = src + len;
338 320660098 : bool escaped = false;
339 320660098 : int mask = 0, n, c, utf8char = 0;
340 :
341 320660098 : if (len >= 2 && strNil((const char *) src)) {
342 0 : strcpy((char *) dst, str_nil);
343 0 : return 1;
344 : }
345 :
346 : /* copy it in, while performing the correct escapes */
347 : /* n is the number of follow-on bytes left in a multi-byte
348 : * UTF-8 sequence */
349 2074720915 : for (cur = src, n = 0; cur < end || escaped; cur++) {
350 : /* first convert any \ escapes and store value in c */
351 1754060820 : if (escaped) {
352 547237 : switch (*cur) {
353 3825 : case '0':
354 : case '1':
355 : case '2':
356 : case '3':
357 : case '4':
358 : case '5':
359 : case '6':
360 : case '7':
361 : /* \ with up to three octal digits */
362 3825 : c = base08(*cur);
363 3825 : if (num08(cur[1])) {
364 3825 : cur++;
365 3825 : c = mult08(c) + base08(*cur);
366 3825 : if (num08(cur[1])) {
367 3825 : if (unlikely(c > 037)) {
368 : /* octal
369 : * escape
370 : * sequence
371 : * out or
372 : * range */
373 1 : GDKerror("not an octal number\n");
374 1 : return -1;
375 : }
376 3824 : cur++;
377 3824 : c = mult08(c) + base08(*cur);
378 3824 : assert(c >= 0 && c <= 0377);
379 : }
380 : }
381 : break;
382 57 : case 'x':
383 : /* \x with one or two hexadecimal digits */
384 57 : if (num16(cur[1])) {
385 57 : cur++;
386 57 : c = base16(*cur);
387 57 : if (num16(cur[1])) {
388 57 : cur++;
389 57 : c = mult16(c) + base16(*cur);
390 : }
391 : } else
392 : c = 'x';
393 : break;
394 0 : case 'u':
395 : case 'U':
396 : /* \u with four hexadecimal digits or
397 : * \U with eight hexadecimal digits */
398 0 : if (unlikely(n > 0)) {
399 : /* not when in the middle of a
400 : * UTF-8 sequence */
401 0 : goto notutf8;
402 : }
403 0 : c = 0;
404 0 : for (n = *cur == 'U' ? 8 : 4; n > 0; n--) {
405 0 : cur++;
406 0 : if (unlikely(!num16(*cur))) {
407 0 : GDKerror("not a Unicode code point escape\n");
408 0 : return -1;
409 : }
410 0 : c = c << 4 | base16(*cur);
411 : }
412 : /* n == 0 now */
413 0 : if (unlikely(c == 0 || c > 0x10FFFF ||
414 : (c & 0xFFF800) == 0xD800)) {
415 0 : GDKerror("illegal Unicode code point\n");
416 0 : return -1;
417 : }
418 0 : if (c < 0x80) {
419 0 : *p++ = (unsigned char) c;
420 : } else {
421 0 : if (c < 0x800) {
422 0 : *p++ = 0xC0 | (c >> 6);
423 : } else {
424 0 : if (c < 0x10000) {
425 0 : *p++ = 0xE0 | (c >> 12);
426 : } else {
427 0 : *p++ = 0xF0 | (c >> 18);
428 0 : *p++ = 0x80 | ((c >> 12) & 0x3F);
429 : }
430 0 : *p++ = 0x80 | ((c >> 6) & 0x3F);
431 : }
432 0 : *p++ = 0x80 | (c & 0x3F);
433 : }
434 0 : escaped = false;
435 0 : continue;
436 : case 'a':
437 : c = '\a';
438 : break;
439 1 : case 'b':
440 1 : c = '\b';
441 1 : break;
442 5 : case 'f':
443 5 : c = '\f';
444 5 : break;
445 10916 : case 'n':
446 10916 : c = '\n';
447 10916 : break;
448 12 : case 'r':
449 12 : c = '\r';
450 12 : break;
451 1999 : case 't':
452 1999 : c = '\t';
453 1999 : break;
454 0 : case '\0':
455 0 : c = '\\';
456 0 : break;
457 530422 : case '\'':
458 : case '\\':
459 : /* \' and \\ can be handled by the
460 : * default case */
461 : default:
462 : /* unrecognized \ escape, just copy
463 : * the backslashed character */
464 530422 : c = *cur;
465 530422 : break;
466 : }
467 : escaped = false;
468 1753513583 : } else if ((c = *cur) == '\\') {
469 547237 : escaped = true;
470 547237 : continue;
471 1752966346 : } else if (c == quote && cur[1] == quote) {
472 5529 : assert(c != 0);
473 5529 : if (unlikely(n > 0))
474 0 : goto notutf8;
475 5529 : *p++ = quote;
476 5529 : cur++;
477 5529 : continue;
478 : }
479 :
480 1753508053 : if (n > 0) {
481 : /* we're still expecting follow-up bytes in a
482 : * UTF-8 sequence */
483 0 : if (unlikely((c & 0xC0) != 0x80)) {
484 : /* incorrect UTF-8 sequence: byte is
485 : * not 10xxxxxx */
486 0 : goto notutf8;
487 : }
488 0 : utf8char = (utf8char << 6) | (c & 0x3F);
489 0 : n--;
490 0 : if (n == 0) {
491 : /* this was the last byte in the sequence */
492 26690 : if (unlikely((utf8char & mask) == 0)) {
493 : /* incorrect UTF-8 sequence:
494 : * not shortest possible */
495 0 : goto notutf8;
496 : }
497 26690 : if (unlikely(utf8char > 0x10FFFF)) {
498 : /* incorrect UTF-8 sequence:
499 : * value too large */
500 0 : goto notutf8;
501 : }
502 26690 : if (unlikely((utf8char & 0x1FFF800) == 0xD800)) {
503 : /* incorrect UTF-8 sequence:
504 : * low or high surrogate
505 : * encoded as UTF-8 */
506 0 : goto notutf8;
507 : }
508 : }
509 1775080736 : } else if ((c & 0x80) == 0) {
510 : ;
511 26692 : } else if ((c & 0xE0) == 0xC0) {
512 1762 : n = 1;
513 1762 : mask = 0x000780;
514 1762 : utf8char = c & 0x1F;
515 24930 : } else if ((c & 0xF0) == 0xE0) {
516 24916 : n = 2;
517 24916 : mask = 0x00F800;
518 24916 : utf8char = c & 0x0F;
519 14 : } else if ((c & 0xF8) == 0xF0) {
520 12 : n = 3;
521 12 : mask = 0x1F0000;
522 12 : utf8char = c & 0x07;
523 : } else {
524 : /* incorrect UTF-8 sequence */
525 2 : goto notutf8;
526 : }
527 1753508051 : *p++ = c;
528 : }
529 320660095 : if (unlikely(n > 0)) {
530 : /* incomplete UTF-8 sequence */
531 0 : goto notutf8;
532 : }
533 320660095 : *p++ = 0;
534 320660095 : return len;
535 2 : notutf8:
536 2 : GDKerror("not a proper UTF-8 sequence\n");
537 2 : return -1;
538 : }
539 :
540 : ssize_t
541 28549937 : strFromStr(const char *restrict src, size_t *restrict len, char **restrict dst, bool external)
542 : {
543 28549937 : const char *cur = src, *start = NULL;
544 28549937 : size_t l = 1;
545 28549937 : bool escaped = false;
546 :
547 28549937 : if (!external) {
548 28549889 : size_t sz = strLen(src);
549 28549889 : atommem(sz);
550 28568608 : return (ssize_t) strcpy_len(*dst, src, sz);
551 : }
552 :
553 48 : if (strNil(src)) {
554 0 : atommem(2);
555 0 : strcpy(*dst, str_nil);
556 0 : return 1;
557 : }
558 :
559 48 : while (GDKisspace(*cur))
560 0 : cur++;
561 48 : if (*cur != '"') {
562 0 : if (strncmp(cur, "nil", 3) == 0) {
563 0 : atommem(2);
564 0 : strcpy(*dst, str_nil);
565 0 : return (ssize_t) (cur - src) + 3;
566 : }
567 0 : GDKerror("not a quoted string\n");
568 0 : return -1;
569 : }
570 :
571 : /* scout the string to find out its length and whether it was
572 : * properly quoted */
573 186 : for (start = ++cur; *cur != '"' || escaped; cur++) {
574 138 : if (*cur == 0) {
575 0 : GDKerror("no closing quotes\n");
576 0 : return -1;
577 138 : } else if (*cur == '\\' && !escaped) {
578 : escaped = true;
579 : } else {
580 129 : escaped = false;
581 129 : l++;
582 : }
583 : }
584 :
585 : /* alloc new memory */
586 48 : if (*dst == NULL || *len < l) {
587 48 : GDKfree(*dst);
588 48 : *dst = GDKmalloc(*len = l);
589 48 : if (*dst == NULL) {
590 0 : *len = 0;
591 0 : return -1;
592 : }
593 : }
594 :
595 48 : return GDKstrFromStr((unsigned char *) *dst,
596 : (const unsigned char *) start,
597 : (ssize_t) (cur - start),
598 : '\0');
599 : }
600 :
/*
 * Convert a GDK string value to something printable.
 */
/* all but control characters (in range 0 to 31) and DEL */
#define printable_chr(ch)	((' ' <= (ch) && (ch) <= '~') || ((ch) & 0x80) != 0)

/* Return an upper bound on the number of bytes (not counting the
 * terminator) that escapedStr needs to write src with the same sep1,
 * sep2 and quote arguments.
 *
 * The characters are classified in the same order as escapedStr does
 * it, so that a character that is both a control character and (part
 * of) a separator or the quote is counted as the longer octal escape. */
size_t
escapedStrlen(const char *restrict src, const char *sep1, const char *sep2, int quote)
{
	size_t end, sz = 0;
	size_t sep1len, sep2len;

	sep1len = sep1 ? strlen(sep1) : 0;
	sep2len = sep2 ? strlen(sep2) : 0;
	for (end = 0; src[end]; end++) {
		if (src[end] == (char) '\302' &&
		    0200 <= ((int) src[end + 1] & 0377) &&
		    ((int) src[end + 1] & 0377) <= 0237) {
			/* Unicode control character (code point range
			 * U-00000080 through U-0000009F encoded in
			 * UTF-8 */
			/* for the first one of the two UTF-8 bytes we
			 * count a width of 7 and for the second one
			 * 1, together that's 8, i.e. the width of two
			 * backslash-escaped octal coded characters */
			sz += 7;
		} else if (!printable_chr(src[end])) {
			/* at most a backslash and three octal digits */
			sz += 4;
		} else if (src[end] == '\\'
			   || src[end] == quote
			   || (sep1len && strncmp(src + end, sep1, sep1len) == 0)
			   || (sep2len && strncmp(src + end, sep2, sep2len) == 0)) {
			/* backslash followed by the character itself */
			sz += 2;
		} else {
			sz++;
		}
	}
	return sz;
}

/* Write an escaped copy of src to dst, which has room for dstlen
 * bytes, and return the number of bytes written (not counting the
 * terminator).
 *
 * Control characters, DEL and the UTF-8 encoded code points U+0080
 * through U+009F are written as \t, \n, \r, \f or as backslash plus
 * three octal digits per byte; a backslash, the quote character and
 * the first character of any occurrence of sep1 or sep2 are preceded
 * by a backslash.
 *
 * The output is always NUL terminated and never extends beyond
 * dst[dstlen - 1]: if the buffer is too small, the output is cut off
 * before the first character whose (escaped) form does not fit
 * anymore.  If dstlen is 0, nothing is written at all.  Use
 * escapedStrlen(...) + 1 to get a buffer size that is large enough. */
size_t
escapedStr(char *restrict dst, const char *restrict src, size_t dstlen, const char *sep1, const char *sep2, int quote)
{
	size_t cur = 0, l = 0;
	size_t sep1len, sep2len;

	if (dstlen == 0)
		return 0;	/* not even room for the terminator */

	sep1len = sep1 ? strlen(sep1) : 0;
	sep2len = sep2 ? strlen(sep2) : 0;
	/* invariant: l < dstlen, so dst[l] can always take the
	 * terminator */
	for (; src[cur]; cur++) {
		if (!printable_chr(src[cur])
		    || (src[cur] == '\302'
			&& 0200 <= (src[cur + 1] & 0377)
			&& ((int) src[cur + 1] & 0377) <= 0237)
		    || (cur > 0
			&& src[cur - 1] == '\302'
			&& 0200 <= (src[cur] & 0377)
			&& (src[cur] & 0377) <= 0237)) {
			char esc = 0;	/* letter of a short escape, 0 for octal */

			switch (src[cur]) {
			case '\t':
				esc = 't';
				break;
			case '\n':
				esc = 'n';
				break;
			case '\r':
				esc = 'r';
				break;
			case '\f':
				esc = 'f';
				break;
			default:
				break;
			}
			/* need 2 or 4 bytes plus the terminator */
			if ((size_t) (esc ? 2 : 4) >= dstlen - l)
				break;
			dst[l++] = '\\';
			if (esc) {
				dst[l++] = esc;
			} else {
				snprintf(dst + l, dstlen - l, "%03o", (unsigned char) src[cur]);
				l += 3;
			}
		} else if (src[cur] == '\\'
			   || src[cur] == quote
			   || (sep1len && strncmp(src + cur, sep1, sep1len) == 0)
			   || (sep2len && strncmp(src + cur, sep2, sep2len) == 0)) {
			if (2 >= dstlen - l)
				break;
			dst[l++] = '\\';
			dst[l++] = src[cur];
		} else {
			if (1 >= dstlen - l)
				break;
			dst[l++] = src[cur];
		}
	}
	assert(l < dstlen);
	dst[l] = 0;
	return l;
}
689 :
/* Convert string atom src to its textual form in the buffer *dst
 * (capacity *len), (re)allocating the buffer when it is missing or too
 * small.
 *
 * When external is false, src is copied as is.  When external is
 * true, nil is written as the word nil and any other value is written
 * between double quotes with special characters escaped.  Returns the
 * length of the result, or -1 if memory allocation failed. */
ssize_t
strToStr(char **restrict dst, size_t *restrict len, const char *restrict src, bool external)
{
	if (external) {
		size_t esclen;
		ssize_t n;

		if (strNil(src)) {
			atommem(4);
			strcpy(*dst, "nil");
			return 3;
		}

		/* escaped text plus two quotes plus terminator */
		esclen = escapedStrlen(src, NULL, NULL, '"');
		atommem(esclen + 3);
		/* leave the first position free for the opening quote */
		n = (ssize_t) escapedStr((*dst) + 1, src, *len - 1, NULL, NULL, '"');
		n++;
		(*dst)[n] = '"';
		(*dst)[0] = '"';
		n++;
		(*dst)[n] = 0;
		return n;
	}

	size_t sz = strLen(src);

	atommem(sz);
	return (ssize_t) strcpy_len(*dst, src, sz);
}
716 :
717 : str
718 96 : strRead(str a, size_t *dstlen, stream *s, size_t cnt)
719 : {
720 96 : int len;
721 :
722 96 : (void) cnt;
723 96 : assert(cnt == 1);
724 96 : if (mnstr_readInt(s, &len) != 1 || len < 0)
725 : return NULL;
726 96 : if (a == NULL || *dstlen < (size_t) len + 1) {
727 0 : if ((a = GDKrealloc(a, len + 1)) == NULL)
728 : return NULL;
729 0 : *dstlen = len + 1;
730 : }
731 96 : if (len && mnstr_read(s, a, len, 1) != 1) {
732 0 : GDKfree(a);
733 0 : return NULL;
734 : }
735 96 : a[len] = 0;
736 96 : return a;
737 : }
738 :
739 : gdk_return
740 96 : strWrite(const char *a, stream *s, size_t cnt)
741 : {
742 96 : size_t len = strlen(a);
743 :
744 96 : (void) cnt;
745 96 : assert(cnt == 1);
746 96 : if (!checkUTF8(a)) {
747 0 : GDKerror("incorrectly encoded UTF-8\n");
748 0 : return GDK_FAIL;
749 : }
750 96 : if (mnstr_writeInt(s, (int) len) && mnstr_write(s, a, len, 1) == 1)
751 : return GDK_SUCCEED;
752 : else
753 0 : return GDK_FAIL;
754 : }
755 :
756 : static gdk_return
757 88 : concat_strings(BAT **bnp, ValPtr pt, BAT *b, oid seqb,
758 : BUN ngrp, struct canditer *restrict ci,
759 : const oid *restrict gids, oid min, oid max, bool skip_nils,
760 : BAT *sep, const char *restrict separator, BUN *has_nils)
761 : {
762 88 : oid gid;
763 88 : BUN i, p, nils = 0;
764 88 : size_t *restrict lengths = NULL, separator_length = 0, next_length;
765 88 : str *restrict astrings = NULL;
766 88 : BATiter bi, bis = (BATiter) {0};
767 88 : BAT *bn = NULL;
768 88 : gdk_return rres = GDK_FAIL;
769 :
770 88 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
771 :
772 : /* exactly one of bnp and pt must be NULL, the other non-NULL */
773 88 : assert((bnp == NULL) != (pt == NULL));
774 : /* if pt not NULL, only a single group allowed */
775 88 : assert(pt == NULL || ngrp == 1);
776 :
777 88 : if (bnp) {
778 34 : if ((bn = COLnew(min, TYPE_str, ngrp, TRANSIENT)) == NULL)
779 : return GDK_FAIL;
780 34 : *bnp = bn;
781 : }
782 :
783 88 : bi = bat_iterator(b);
784 88 : bis = bat_iterator(sep);
785 88 : if (separator)
786 57 : separator_length = strlen(separator);
787 :
788 88 : if (ngrp == 1) {
789 61 : size_t offset = 0, single_length = 0;
790 61 : bool empty = true;
791 :
792 61 : if (separator) {
793 42 : assert(sep == NULL);
794 671 : TIMEOUT_LOOP_IDX(i, ci->ncand, qry_ctx) {
795 545 : p = canditer_next(ci) - seqb;
796 545 : const char *s = BUNtvar(bi, p);
797 545 : if (strNil(s)) {
798 15 : if (!skip_nils) {
799 : nils = 1;
800 : break;
801 : }
802 : } else {
803 530 : single_length += strlen(s);
804 530 : if (!empty)
805 490 : single_length += separator_length;
806 : empty = false;
807 : }
808 : }
809 : } else { /* sep case */
810 19 : assert(sep != NULL);
811 354 : TIMEOUT_LOOP_IDX(i, ci->ncand, qry_ctx) {
812 297 : p = canditer_next(ci) - seqb;
813 297 : const char *s = BUNtvar(bi, p);
814 297 : const char *sl = BUNtvar(bis, p);
815 297 : if (strNil(s)) {
816 4 : if (!skip_nils) {
817 : nils = 1;
818 : break;
819 : }
820 : } else {
821 293 : single_length += strlen(s);
822 293 : if (!empty) {
823 274 : if (strNil(sl)) {
824 23 : if (!skip_nils) {
825 : nils = 1;
826 : break;
827 : }
828 : } else
829 251 : single_length += strlen(sl);
830 : }
831 : empty = false;
832 : }
833 : }
834 : }
835 61 : canditer_reset(ci);
836 61 : TIMEOUT_CHECK(qry_ctx, GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx));
837 :
838 61 : if (nils == 0 && !empty) {
839 59 : char *single_str = NULL;
840 :
841 59 : if ((single_str = GDKmalloc(single_length + 1)) == NULL) {
842 0 : bat_iterator_end(&bi);
843 0 : bat_iterator_end(&bis);
844 0 : BBPreclaim(bn);
845 0 : return GDK_FAIL;
846 : }
847 59 : empty = true;
848 59 : if (separator) {
849 652 : TIMEOUT_LOOP_IDX(i, ci->ncand, qry_ctx) {
850 532 : p = canditer_next(ci) - seqb;
851 532 : const char *s = BUNtvar(bi, p);
852 532 : if (strNil(s))
853 2 : continue;
854 530 : if (!empty) {
855 490 : memcpy(single_str + offset, separator, separator_length);
856 490 : offset += separator_length;
857 : }
858 530 : next_length = strlen(s);
859 530 : memcpy(single_str + offset, s, next_length);
860 530 : offset += next_length;
861 530 : empty = false;
862 : }
863 : } else { /* sep case */
864 19 : assert(sep != NULL);
865 354 : TIMEOUT_LOOP_IDX(i, ci->ncand, qry_ctx) {
866 297 : p = canditer_next(ci) - seqb;
867 297 : const char *s = BUNtvar(bi, p);
868 297 : const char *sl = BUNtvar(bis, p);
869 297 : if (strNil(s))
870 4 : continue;
871 567 : if (!empty && !strNil(sl)) {
872 251 : next_length = strlen(sl);
873 251 : memcpy(single_str + offset, sl, next_length);
874 251 : offset += next_length;
875 : }
876 293 : next_length = strlen(s);
877 293 : memcpy(single_str + offset, s, next_length);
878 293 : offset += next_length;
879 293 : empty = false;
880 : }
881 : }
882 :
883 59 : single_str[offset] = '\0';
884 59 : TIMEOUT_CHECK(qry_ctx, do { GDKfree(single_str); GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx); } while (0));
885 59 : if (bn) {
886 7 : if (BUNappend(bn, single_str, false) != GDK_SUCCEED) {
887 0 : GDKfree(single_str);
888 0 : bat_iterator_end(&bi);
889 0 : bat_iterator_end(&bis);
890 0 : BBPreclaim(bn);
891 0 : return GDK_FAIL;
892 : }
893 : } else {
894 52 : pt->len = offset + 1;
895 52 : pt->val.sval = single_str;
896 52 : single_str = NULL; /* don't free */
897 : }
898 66 : GDKfree(single_str);
899 2 : } else if (bn) {
900 0 : if (BUNappend(bn, str_nil, false) != GDK_SUCCEED) {
901 0 : bat_iterator_end(&bi);
902 0 : bat_iterator_end(&bis);
903 0 : BBPreclaim(bn);
904 0 : return GDK_FAIL;
905 : }
906 : } else {
907 2 : if (VALinit(pt, TYPE_str, str_nil) == NULL) {
908 0 : bat_iterator_end(&bi);
909 0 : bat_iterator_end(&bis);
910 0 : return GDK_FAIL;
911 : }
912 : }
913 61 : bat_iterator_end(&bi);
914 61 : bat_iterator_end(&bis);
915 61 : return GDK_SUCCEED;
916 : } else {
917 : /* first used to calculated the total length of
918 : * each group, then the the total offset */
919 27 : lengths = GDKzalloc(ngrp * sizeof(*lengths));
920 27 : astrings = GDKmalloc(ngrp * sizeof(str));
921 27 : if (lengths == NULL || astrings == NULL) {
922 0 : goto finish;
923 : }
924 : /* at first, set astrings[i] to str_nil, then for each
925 : * non-empty group (even if all strings in the group
926 : * are empty), set to NULL */
927 2141 : for (i = 0; i < ngrp; i++)
928 2114 : astrings[i] = (char *) str_nil;
929 :
930 27 : if (separator) {
931 208 : TIMEOUT_LOOP_IDX(p, ci->ncand, qry_ctx) {
932 163 : i = canditer_next(ci) - seqb;
933 163 : if (gids[i] >= min && gids[i] <= max) {
934 163 : gid = gids[i] - min;
935 163 : if (lengths[gid] == (size_t) -1)
936 0 : continue;
937 163 : const char *s = BUNtvar(bi, i);
938 326 : if (!strNil(s)) {
939 155 : lengths[gid] += strlen(s) + separator_length;
940 155 : astrings[gid] = NULL;
941 8 : } else if (!skip_nils) {
942 0 : nils++;
943 0 : lengths[gid] = (size_t) -1;
944 0 : astrings[gid] = (char *) str_nil;
945 : }
946 : }
947 : }
948 : } else { /* sep case */
949 12 : assert(sep != NULL);
950 999759 : TIMEOUT_LOOP_IDX(p, ci->ncand, qry_ctx) {
951 999663 : i = canditer_next(ci) - seqb;
952 999663 : if (gids[i] >= min && gids[i] <= max) {
953 999663 : gid = gids[i] - min;
954 999663 : if (lengths[gid] == (size_t) -1)
955 0 : continue;
956 999663 : const char *s = BUNtvar(bi, i);
957 999663 : const char *sl = BUNtvar(bis, i);
958 1999326 : if (!strNil(s)) {
959 999340 : lengths[gid] += strlen(s);
960 1998680 : if (!strNil(sl)) {
961 999209 : next_length = strlen(sl);
962 999209 : lengths[gid] += next_length;
963 : }
964 999340 : astrings[gid] = NULL;
965 323 : } else if (!skip_nils) {
966 0 : nils++;
967 0 : lengths[gid] = (size_t) -1;
968 0 : astrings[gid] = (char *) str_nil;
969 : }
970 : }
971 : }
972 : }
973 27 : TIMEOUT_CHECK(qry_ctx, GOTO_LABEL_TIMEOUT_HANDLER(finish, qry_ctx));
974 :
975 27 : if (separator) {
976 69 : for (i = 0; i < ngrp; i++) {
977 54 : if (astrings[i] == NULL) {
978 52 : if ((astrings[i] = GDKmalloc(lengths[i] + 1)) == NULL) {
979 0 : goto finish;
980 : }
981 52 : astrings[i][0] = 0;
982 52 : lengths[i] = 0;
983 : } else
984 2 : astrings[i] = NULL;
985 : }
986 : } else { /* sep case */
987 12 : assert(sep != NULL);
988 2072 : for (i = 0; i < ngrp; i++) {
989 2060 : if (astrings[i] == NULL) {
990 2058 : if ((astrings[i] = GDKmalloc(lengths[i] + 1)) == NULL) {
991 0 : goto finish;
992 : }
993 2058 : astrings[i][0] = 0;
994 2058 : lengths[i] = 0;
995 : } else
996 2 : astrings[i] = NULL;
997 : }
998 : }
999 27 : canditer_reset(ci);
1000 :
1001 27 : if (separator) {
1002 193 : TIMEOUT_LOOP_IDX(p, ci->ncand, qry_ctx) {
1003 163 : i = canditer_next(ci) - seqb;
1004 163 : if (gids[i] >= min && gids[i] <= max) {
1005 163 : gid = gids[i] - min;
1006 163 : if (astrings[gid]) {
1007 160 : const char *s = BUNtvar(bi, i);
1008 160 : if (strNil(s))
1009 5 : continue;
1010 155 : if (astrings[gid][lengths[gid]]) {
1011 103 : memcpy(astrings[gid] + lengths[gid], separator, separator_length);
1012 103 : lengths[gid] += separator_length;
1013 : }
1014 155 : next_length = strlen(s);
1015 155 : memcpy(astrings[gid] + lengths[gid], s, next_length);
1016 155 : lengths[gid] += next_length;
1017 155 : astrings[gid][lengths[gid]] = 1;
1018 : }
1019 : }
1020 : }
1021 : } else { /* sep case */
1022 12 : assert(sep != NULL);
1023 999747 : TIMEOUT_LOOP_IDX(p, ci->ncand, qry_ctx) {
1024 999663 : i = canditer_next(ci) - seqb;
1025 999663 : if (gids[i] >= min && gids[i] <= max) {
1026 999663 : gid = gids[i] - min;
1027 999663 : if (astrings[gid]) {
1028 999342 : const char *s = BUNtvar(bi, i);
1029 999342 : const char *sl = BUNtvar(bis, i);
1030 999342 : if (strNil(s))
1031 2 : continue;
1032 1996622 : if (astrings[gid][lengths[gid]] && !strNil(sl)) {
1033 997156 : next_length = strlen(sl);
1034 997156 : memcpy(astrings[gid] + lengths[gid], sl, next_length);
1035 997156 : lengths[gid] += next_length;
1036 : }
1037 999340 : next_length = strlen(s);
1038 999340 : memcpy(astrings[gid] + lengths[gid], s, next_length);
1039 999340 : lengths[gid] += next_length;
1040 999340 : astrings[gid][lengths[gid]] = 1;
1041 : }
1042 : }
1043 : }
1044 : }
1045 27 : TIMEOUT_CHECK(qry_ctx, GOTO_LABEL_TIMEOUT_HANDLER(finish, qry_ctx));
1046 :
1047 2141 : for (i = 0; i < ngrp; i++) {
1048 2114 : if (astrings[i]) {
1049 2110 : astrings[i][lengths[i]] = '\0';
1050 2110 : if (BUNappend(bn, astrings[i], false) != GDK_SUCCEED) {
1051 0 : goto finish;
1052 : }
1053 4 : } else if (BUNappend(bn, str_nil, false) != GDK_SUCCEED) {
1054 0 : goto finish;
1055 : }
1056 : }
1057 : rres = GDK_SUCCEED;
1058 : }
1059 :
1060 27 : finish:
1061 27 : bat_iterator_end(&bi);
1062 27 : bat_iterator_end(&bis);
1063 27 : if (has_nils)
1064 27 : *has_nils = nils;
1065 27 : GDKfree(lengths);
1066 27 : if (astrings) {
1067 2141 : for (i = 0; i < ngrp; i++) {
1068 2114 : if (astrings[i] != str_nil)
1069 2114 : GDKfree(astrings[i]);
1070 : }
1071 27 : GDKfree(astrings);
1072 : }
1073 27 : if (rres != GDK_SUCCEED)
1074 0 : BBPreclaim(bn);
1075 :
1076 : return rres;
1077 :
1078 0 : bailout:
1079 0 : bat_iterator_end(&bi);
1080 0 : bat_iterator_end(&bis);
1081 0 : BBPreclaim(bn);
1082 : return GDK_FAIL;
1083 : }
1084 :
/* Concatenate the string values of b, restricted by the candidate
 * list s, into one string value that is stored in *res.
 *
 * Exactly one of `separator' (a constant separator) and `sep' (a BAT
 * holding separators) must be supplied; this is asserted.  When sep
 * holds exactly one value, that value is copied and used as a
 * constant separator instead.
 *
 * If there are no candidates, or the constant separator is nil, the
 * result is str_nil when nil_if_empty is set and the empty string
 * otherwise.  In all other cases the work is delegated to
 * concat_strings().
 *
 * Returns GDK_SUCCEED on success and GDK_FAIL on failure. */
1085 : gdk_return
1086 55 : BATstr_group_concat(ValPtr res, BAT *b, BAT *s, BAT *sep, bool skip_nils,
1087 : bool nil_if_empty, const char *restrict separator)
1088 : {
1089 55 : struct canditer ci;
1090 55 : gdk_return r = GDK_SUCCEED;
1091 55 : bool free_nseparator = false;
1092 55 : char *nseparator = (char *)separator;
1093 :
1094 55 : assert((nseparator && !sep) || (!nseparator && sep)); /* only one of them must be set */
/* start from an empty string-typed value record */
1095 55 : *res = (ValRecord) {.vtype = TYPE_str};
1096 :
1097 55 : canditer_init(&ci, b, s);
1098 :
1099 55 : if (sep && BATcount(sep) == 1) { /* Only one element in sep */
/* take a private copy of the single separator so that the
 * iterator on sep can be released right away; the copy is
 * freed again before returning (see free_nseparator) */
1100 0 : BATiter bi = bat_iterator(sep);
1101 0 : nseparator = GDKstrdup(BUNtvar(bi, 0));
1102 0 : bat_iterator_end(&bi);
1103 0 : if (!nseparator)
1104 0 : return GDK_FAIL;
1105 0 : free_nseparator = true;
1106 0 : sep = NULL;
1107 : }
1108 :
/* trivial result: nothing to concatenate or a nil separator */
1109 55 : if (ci.ncand == 0 || (nseparator && strNil(nseparator))) {
1110 1 : if (VALinit(res, TYPE_str, nil_if_empty ? str_nil : "") == NULL)
1111 0 : r = GDK_FAIL;
1112 1 : if (free_nseparator)
1113 0 : GDKfree(nseparator);
1114 1 : return r;
1115 : }
1116 :
/* ungrouped call: the result goes to res instead of to a BAT, and
 * 1 is passed where the grouped variant passes its group count
 * while no group ids, bounds or nil counter are supplied */
1117 54 : r = concat_strings(NULL, res, b, b->hseqbase, 1, &ci, NULL, 0, 0,
1118 : skip_nils, sep, nseparator, NULL);
1119 54 : if (free_nseparator)
1120 0 : GDKfree(nseparator);
1121 : return r;
1122 : }
1123 :
/* Grouped variant of string concatenation: b holds the values, g the
 * group ids, e the group extents (as handed to BATgroupaggrinit) and
 * s an optional candidate list.
 *
 * Exactly one of `separator' (a constant separator) and `sep' (a BAT
 * holding separators) must be supplied; this is asserted.  When sep
 * holds exactly one value, that value is copied and used as a
 * constant separator instead.
 *
 * Returns a new BAT with the result, or NULL on error. */
1124 : BAT *
1125 54 : BATgroupstr_group_concat(BAT *b, BAT *g, BAT *e, BAT *s, BAT *sep, bool skip_nils,
1126 : const char *restrict separator)
1127 : {
1128 54 : BAT *bn = NULL;
1129 54 : oid min, max;
1130 54 : BUN ngrp, nils = 0;
1131 54 : struct canditer ci;
1132 54 : const char *err;
1133 54 : gdk_return res;
1134 54 : bool free_nseparator = false;
1135 54 : char *nseparator = (char *)separator;
1136 :
1137 54 : assert((nseparator && !sep) || (!nseparator && sep)); /* only one of them must be set */
/* NOTE(review): this cast is redundant, skip_nils is passed on to
 * concat_strings() further down */
1138 54 : (void) skip_nils;
1139 :
/* BATgroupaggrinit fills in min, max, ngrp and ci, and returns an
 * error message on failure */
1140 54 : if ((err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp,
1141 : &ci)) != NULL) {
1142 0 : GDKerror("%s\n", err);
1143 0 : return NULL;
1144 : }
1145 54 : if (g == NULL) {
1146 0 : GDKerror("b and g must be aligned\n");
1147 0 : return NULL;
1148 : }
1149 :
1150 54 : if (sep && BATcount(sep) == 1) { /* Only one element in sep */
/* take a private copy of the single separator; it is freed at
 * the `done' label */
1151 0 : BATiter bi = bat_iterator(sep);
1152 0 : nseparator = GDKstrdup(BUNtvar(bi, 0));
1153 0 : bat_iterator_end(&bi);
1154 0 : if (!nseparator)
1155 0 : return NULL;
1156 0 : free_nseparator = true;
1157 0 : sep = NULL;
1158 : }
1159 :
1160 54 : if (ci.ncand == 0 || ngrp == 0 || (nseparator && strNil(nseparator))) {
1161 : /* trivial: no strings to concat, so return bat
1162 : * aligned with g with nil in the tail */
1163 5 : bn = BATconstant(ngrp == 0 ? 0 : min, TYPE_str, str_nil, ngrp, TRANSIENT);
1164 5 : goto done;
1165 : }
1166 :
1167 49 : if (BATtdense(g) || (g->tkey && g->tnonil)) {
1168 : /* trivial: singleton groups, so all results are equal
1169 : * to the inputs (but possibly a different type) */
1170 15 : bn = BATconvert(b, s, TYPE_str, 0, 0, 0);
1171 15 : goto done;
1172 : }
1173 :
/* general case: concat_strings() is handed &bn for the result BAT
 * and &nils for the nil count; the nil count is not used here
 * afterwards */
1174 68 : res = concat_strings(&bn, NULL, b, b->hseqbase, ngrp, &ci,
1175 34 : (const oid *) Tloc(g, 0), min, max, skip_nils, sep,
1176 : nseparator, &nils);
1177 34 : if (res != GDK_SUCCEED)
1178 0 : bn = NULL;
1179 :
1180 34 : done:
1181 54 : if (free_nseparator)
1182 0 : GDKfree(nseparator);
1183 54 : return bn;
1184 : }
1185 :
/* Build in single_str the concatenation of the non-nil strings of bi
 * in rows [START, END).
 *
 * The macro works in two passes.  The first pass adds up the length
 * of the result in next_group_length: the length of every non-nil
 * string plus, for every non-nil string after the first one, the
 * length of the separator (the constant `separator' or, in the sep
 * case, the non-nil value of sepi in the same row).  The second pass
 * copies the pieces into single_str, using offset as write position.
 *
 * single_str is a buffer that is reused between invocations; it is
 * (re)allocated only when the required length exceeds
 * max_group_length.  If no non-nil string was found, single_str is
 * set to str_nil and has_nils is set (this assumes that str_nil,
 * including its terminator, fits in two bytes -- TODO confirm).
 *
 * The caller must set next_group_length and offset to 0 and empty to
 * true before each use.  All other names used here (bi, sepi, sep,
 * separator, separator_length, next_single_str, next_length, ...)
 * are variables of the enclosing function.  On allocation failure
 * control jumps to the allocation_error label. */
1186 : #define compute_next_single_str(START, END) \
1187 : do { \
1188 : for (oid m = START; m < END; m++) { \
1189 : const char *sb = BUNtvar(bi, m); \
1190 : \
1191 : if (separator) { \
1192 : if (!strNil(sb)) { \
1193 : next_group_length += strlen(sb); \
1194 : if (!empty) \
1195 : next_group_length += separator_length; \
1196 : empty = false; \
1197 : } \
1198 : } else { /* sep case */ \
1199 : assert(sep != NULL); \
1200 : const char *sl = BUNtvar(sepi, m); \
1201 : \
1202 : if (!strNil(sb)) { \
1203 : next_group_length += strlen(sb); \
1204 : if (!empty && !strNil(sl)) \
1205 : next_group_length += strlen(sl); \
1206 : empty = false; \
1207 : } \
1208 : } \
1209 : } \
1210 : if (empty) { \
1211 : if (single_str == NULL) { /* reuse the same buffer, resize it when needed */ \
1212 : max_group_length = 1; \
1213 : if ((single_str = GDKmalloc(max_group_length + 1)) == NULL) \
1214 : goto allocation_error; \
1215 : } else if (1 > max_group_length) { \
1216 : max_group_length = 1; \
1217 : if ((next_single_str = GDKrealloc(single_str, max_group_length + 1)) == NULL) \
1218 : goto allocation_error; \
1219 : single_str = next_single_str; \
1220 : } \
1221 : strcpy(single_str, str_nil); \
1222 : has_nils = true; \
1223 : } else { \
1224 : empty = true; \
1225 : if (single_str == NULL) { /* reuse the same buffer, resize it when needed */ \
1226 : max_group_length = next_group_length; \
1227 : if ((single_str = GDKmalloc(max_group_length + 1)) == NULL) \
1228 : goto allocation_error; \
1229 : } else if (next_group_length > max_group_length) { \
1230 : max_group_length = next_group_length; \
1231 : if ((next_single_str = GDKrealloc(single_str, max_group_length + 1)) == NULL) \
1232 : goto allocation_error; \
1233 : single_str = next_single_str; \
1234 : } \
1235 : \
1236 : for (oid m = START; m < END; m++) { \
1237 : const char *sb = BUNtvar(bi, m); \
1238 : \
1239 : if (separator) { \
1240 : if (strNil(sb)) \
1241 : continue; \
1242 : if (!empty) { \
1243 : memcpy(single_str + offset, separator, separator_length); \
1244 : offset += separator_length; \
1245 : } \
1246 : next_length = strlen(sb); \
1247 : memcpy(single_str + offset, sb, next_length); \
1248 : offset += next_length; \
1249 : empty = false; \
1250 : } else { /* sep case */ \
1251 : assert(sep != NULL); \
1252 : const char *sl = BUNtvar(sepi, m); \
1253 : \
1254 : if (strNil(sb)) \
1255 : continue; \
1256 : if (!empty && !strNil(sl)) { \
1257 : next_length = strlen(sl); \
1258 : memcpy(single_str + offset, sl, next_length); \
1259 : offset += next_length; \
1260 : } \
1261 : next_length = strlen(sb); \
1262 : memcpy(single_str + offset, sb, next_length); \
1263 : offset += next_length; \
1264 : empty = false; \
1265 : } \
1266 : } \
1267 : \
1268 : single_str[offset] = '\0'; \
1269 : } \
1270 : } while (0)
1271 :
/* Frame "unbounded preceding until current row" for rows [k, i).
 *
 * The concatenation of all rows in [k, i) is built once in
 * single_str.  The rows are then processed in runs: a run starts at
 * k and extends up to (not including) the next row whose op[] flag
 * is set (presumably the start of the next set of order-by peers --
 * TODO confirm).  For each run, slice_length is advanced by the
 * lengths of the non-nil strings (and their separators) in the run,
 * and every row of the run receives the prefix of single_str of
 * that length.  The prefix is produced by temporarily overwriting
 * the byte at slice_length with a NUL and restoring it afterwards.
 *
 * As long as no non-nil string has been seen, the rows receive
 * str_nil and has_nils is set.  A failed insert jumps to the
 * allocation_error label. */
1272 : #define ANALYTICAL_STR_GROUP_CONCAT_UNBOUNDED_TILL_CURRENT_ROW \
1273 : do { \
1274 : size_t slice_length = 0; \
1275 : next_group_length = next_length = offset = 0; \
1276 : empty = true; \
1277 : compute_next_single_str(k, i); /* compute the entire string then slice it starting from the beginning */ \
1278 : empty = true; \
1279 : for (; k < i;) { \
1280 : const char *nsep; \
1281 : oid m = k; \
1282 : j = k; \
1283 : do { \
1284 : k++; \
1285 : } while (k < i && !op[k]); \
1286 : for (; j < k; j++) { \
1287 : const char *nstr = BUNtvar(bi, j); \
1288 : if (!strNil(nstr)) { \
1289 : slice_length += strlen(nstr); \
1290 : if (!empty) { \
1291 : if (separator) { \
1292 : nsep = (const char *) separator; \
1293 : } else { /* sep case */ \
1294 : assert(sep != NULL); \
1295 : nsep = BUNtvar(sepi, j); \
1296 : } \
1297 : if (!strNil(nsep)) \
1298 : slice_length += strlen(nsep); \
1299 : } \
1300 : empty = false; \
1301 : } \
1302 : } \
1303 : if (empty) { \
1304 : for (j = m; j < k; j++) \
1305 : if (tfastins_nocheckVAR(r, j, str_nil) != GDK_SUCCEED) \
1306 : goto allocation_error; \
1307 : has_nils = true; \
1308 : } else { \
1309 : char save = single_str[slice_length]; \
1310 : single_str[slice_length] = '\0'; \
1311 : for (j = m; j < k; j++) \
1312 : if (tfastins_nocheckVAR(r, j, single_str) != GDK_SUCCEED) \
1313 : goto allocation_error; \
1314 : single_str[slice_length] = save; \
1315 : } \
1316 : } \
1317 : } while (0)
1318 :
/* Frame "all rows" for rows [k, i): the concatenation of the rows is
 * computed once with compute_next_single_str() and the same string
 * is inserted into r for every row of the range.  A failed insert
 * jumps to the allocation_error label. */
1319 : #define ANALYTICAL_STR_GROUP_CONCAT_ALL_ROWS \
1320 : do { \
1321 : next_group_length = next_length = offset = 0; \
1322 : empty = true; \
1323 : compute_next_single_str(k, i); \
1324 : for (; k < i; k++) \
1325 : if (tfastins_nocheckVAR(r, k, single_str) != GDK_SUCCEED) \
1326 : goto allocation_error; \
1327 : } while (0)
1328 :
/* Frame "current row" for rows [k, i): every row receives its own
 * input string unchanged; has_nils is set when a nil string is
 * copied.  A failed insert jumps to the allocation_error label. */
1329 : #define ANALYTICAL_STR_GROUP_CONCAT_CURRENT_ROW \
1330 : do { \
1331 : for (; k < i; k++) { \
1332 : const char *next = BUNtvar(bi, k); \
1333 : if (tfastins_nocheckVAR(r, k, next) != GDK_SUCCEED) \
1334 : goto allocation_error; \
1335 : has_nils |= strNil(next); \
1336 : } \
1337 : } while (0)
1338 :
/* Any other frame, for rows [k, i): for every row k the
 * concatenation of the rows [start[k], end[k]) is computed afresh
 * with compute_next_single_str() and inserted into r.  A failed
 * insert jumps to the allocation_error label. */
1339 : #define ANALYTICAL_STR_GROUP_CONCAT_OTHERS \
1340 : do { \
1341 : for (; k < i; k++) { \
1342 : next_group_length = next_length = offset = 0; \
1343 : empty = true; \
1344 : compute_next_single_str(start[k], end[k]); \
1345 : if (tfastins_nocheckVAR(r, k, single_str) != GDK_SUCCEED) \
1346 : goto allocation_error; \
1347 : } \
1348 : } while (0)
1349 :
/* Driver that applies IMP once per partition.  When a partition BAT
 * p is given, i is advanced over the rows and IMP is expanded at
 * every row whose np[] flag is set; the IMP macros above then handle
 * the pending rows [k, i) and leave k equal to i.  Finally i is set
 * to cnt and IMP is expanded once more for the remaining rows. */
1350 : #define ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(IMP) \
1351 : do { \
1352 : if (p) { \
1353 : for (; i < cnt; i++) { \
1354 : if (np[i]) \
1355 : IMP; \
1356 : } \
1357 : } \
1358 : i = cnt; \
1359 : IMP; \
1360 : } while (0)
1361 :
/* Analytical (window function) string concatenation.
 *
 * r is the result BAT, b holds the input strings, p and o supply the
 * np[] and op[] flag arrays used by the macros above, and s and e
 * supply the start[] and end[] arrays used for generic frames.
 * Either sep (a BAT of separators with as many values as b) or
 * `separator' (a constant separator) must be supplied; this is
 * asserted.  frame_type selects the frame implementation, see the
 * switch below.
 *
 * Only string BATs are supported for b, r and sep.  On success the
 * count of r is set to the count of b, its nil properties are
 * updated and GDK_SUCCEED is returned; on failure GDK_FAIL is
 * returned. */
1362 : gdk_return
1363 53 : GDKanalytical_str_group_concat(BAT *r, BAT *p, BAT *o, BAT *b, BAT *sep, BAT *s, BAT *e, const char *restrict separator, int frame_type)
1364 : {
1365 53 : bool has_nils = false, empty;
1366 53 : BATiter pi = bat_iterator(p);
1367 53 : BATiter oi = bat_iterator(o);
1368 53 : BATiter bi = bat_iterator(b);
1369 53 : BATiter sepi = bat_iterator(sep);
1370 53 : BATiter si = bat_iterator(s);
1371 53 : BATiter ei = bat_iterator(e);
1372 53 : oid i = 0, j = 0, k = 0, cnt = bi.count, *restrict start = si.base, *restrict end = ei.base;
1373 53 : bit *np = pi.base, *op = oi.base;
1374 53 : str single_str = NULL, next_single_str;
1375 53 : size_t separator_length = 0, next_group_length, max_group_length = 0, next_length, offset;
1376 :
1377 53 : assert((sep && !separator && bi.count == sepi.count) || (!sep && separator));
1378 53 : if (b->ttype != TYPE_str || r->ttype != TYPE_str || (sep && sep->ttype != TYPE_str)) {
1379 0 : GDKerror("only string type is supported\n");
1380 0 : bat_iterator_end(&pi);
1381 0 : bat_iterator_end(&oi);
1382 0 : bat_iterator_end(&bi);
1383 0 : bat_iterator_end(&sepi);
1384 0 : bat_iterator_end(&si);
1385 0 : bat_iterator_end(&ei);
1386 0 : return GDK_FAIL;
1387 : }
1388 30 : if (sep && sepi.count == 1) { /* Only one element in sep */
/* use the single value as constant separator; no copy is made,
 * the pointer is obtained through sepi, which is only ended
 * when this function returns */
1389 0 : separator = BUNtvar(sepi, 0);
1390 0 : sep = NULL;
1391 : }
1392 :
1393 53 : if (sep == NULL)
1394 23 : separator_length = strlen(separator);
1395 :
1396 53 : if (cnt > 0) {
1397 52 : switch (frame_type) {
1398 29 : case 3: /* unbounded until current row */
1399 153851 : ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_UNBOUNDED_TILL_CURRENT_ROW);
1400 : break;
1401 0 : case 4: /* current row until unbounded */
1402 0 : goto notimplemented;
1403 23 : case 5: /* all rows */
1404 844 : ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_ALL_ROWS);
1405 : break;
1406 0 : case 6: /* current row */
1407 0 : ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_CURRENT_ROW);
1408 : break;
1409 0 : default:
1410 0 : ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_OTHERS);
1411 : break;
1412 : }
1413 : }
1414 :
1415 53 : bat_iterator_end(&pi);
1416 53 : bat_iterator_end(&oi);
1417 53 : bat_iterator_end(&bi);
1418 53 : bat_iterator_end(&sepi);
1419 53 : bat_iterator_end(&si);
1420 53 : bat_iterator_end(&ei);
1421 53 : GDKfree(single_str);
1422 53 : BATsetcount(r, cnt);
1423 53 : r->tnonil = !has_nils;
1424 53 : r->tnil = has_nils;
1425 53 : return GDK_SUCCEED;
/* reached from the macros above when an allocation or an insert
 * into r fails */
1426 0 : allocation_error:
1427 0 : bat_iterator_end(&pi);
1428 0 : bat_iterator_end(&oi);
1429 0 : bat_iterator_end(&bi);
1430 0 : bat_iterator_end(&sepi);
1431 0 : bat_iterator_end(&si);
1432 0 : bat_iterator_end(&ei);
1433 0 : GDKfree(single_str);
1434 0 : return GDK_FAIL;
/* single_str is still NULL here: the only jump to this label is
 * taken in the switch before any buffer has been allocated, so
 * there is nothing to free */
1435 0 : notimplemented:
1436 0 : bat_iterator_end(&pi);
1437 0 : bat_iterator_end(&oi);
1438 0 : bat_iterator_end(&bi);
1439 0 : bat_iterator_end(&sepi);
1440 0 : bat_iterator_end(&si);
1441 0 : bat_iterator_end(&ei);
1442 0 : GDKerror("str_group_concat not yet implemented for current row until unbounded case\n");
1443 0 : return GDK_FAIL;
1444 : }
|