LCOV - code coverage report
Current view: top level - gdk - gdk_string.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 571 737 77.5 %
Date: 2024-04-26 00:35:57 Functions: 15 15 100.0 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : #include "monetdb_config.h"
      14             : #include "gdk.h"
      15             : #include "gdk_private.h"
      16             : #include "gdk_cand.h"
      17             : 
      18             : /* String Atom Implementation
      19             :  *
      20             :  * Strings are stored in two parts.  The first part is the normal tail
      21             :  * heap which contains a list of offsets.  The second part is the
      22             :  * theap which contains the actual strings.  The offsets in the tail
      23             :  * heap (a.k.a. offset heap) point into the theap (a.k.a. string
      24             :  * heap).  Strings are NUL-terminated and are stored without any
      25             :  * escape sequences.  Strings are encoded using the UTF-8 encoding
      26             :  * of Unicode.  This means that individual "characters" (really,
      27             :  * Unicode code points) can be between one and four bytes long.
      28             :  *
      29             :  * Because in many typical situations there are lots of duplicated
      30             :  * string values that are being stored in a table, but also in many
      31             :  * (other) typical situations there are very few duplicated string
      32             :  * values stored, a scheme has been introduced to cater to both
      33             :  * situations.
      34             :  *
      35             :  * When the string heap is "small" (defined as less than 64KiB), the
      36             :  * string heap is fully duplicate eliminated.  When the string heap
      37             :  * grows beyond this size, the heap is not kept free of duplicate
      38             :  * strings, but there is then a heuristic that tries to limit the
      39             :  * number of duplicates.
      40             :  *
      41             :  * This is done by having a fixed sized hash table at the start of the
      42             :  * string heap, and allocating space for collision lists in the first
      43             :  * 64KiB of the string heap.  After the first 64KiB no extra space is
      44             :  * allocated for lists, so hash collisions cannot be resolved.
      45             :  */
      46             : 
      47             : /* some of these macros are duplicates from gdk_atoms.c */
      48             : #define num08(x)        ((x) >= '0' && (x) <= '7')
      49             : #define base08(x)       ((x) - '0')
      50             : #define mult08(x)       ((x) << 3)
      51             : 
      52             : #define num16(x)        isxdigit((unsigned char) (x))
      53             : #define base16(x)       (((x) >= 'a' && (x) <= 'f') ? ((x) - 'a' + 10) : ((x) >= 'A' && (x) <= 'F') ? ((x) - 'A' + 10) : (x) - '0')
      54             : #define mult16(x)       ((x) << 4)
      55             : 
      56             : #define atommem(size)                                   \
      57             :         do {                                            \
      58             :                 if (*dst == NULL || *len < (size)) { \
      59             :                         GDKfree(*dst);                  \
      60             :                         *len = (size);                  \
      61             :                         *dst = GDKmalloc(*len);         \
      62             :                         if (*dst == NULL) {             \
      63             :                                 *len = 0;               \
      64             :                                 return -1;              \
      65             :                         }                               \
      66             :                 }                                       \
      67             :         } while (0)
      68             : 
      69             : const char str_nil[2] = { '\200', 0 };
      70             : 
      71             : gdk_return
      72      744773 : strHeap(Heap *d, size_t cap)
      73             : {
      74      744773 :         size_t size;
      75             : 
      76      744773 :         cap = MAX(cap, BATTINY);
      77      744773 :         size = GDK_STRHASHTABLE * sizeof(stridx_t) + MIN(GDK_ELIMLIMIT, cap * GDK_VARALIGN);
      78      744773 :         return HEAPalloc(d, size, 1);
      79             : }
      80             : 
      81             : 
      82             : void
      83        4613 : strCleanHash(Heap *h, bool rebuild)
      84             : {
      85        4613 :         stridx_t newhash[GDK_STRHASHTABLE];
      86        4613 :         size_t pad, pos;
      87        4613 :         BUN off, strhash;
      88        4613 :         const char *s;
      89             : 
      90        4613 :         (void) rebuild;
      91        4613 :         if (!h->cleanhash)
      92         712 :                 return;
      93        3901 :         if (h->size < GDK_STRHASHTABLE * sizeof(stridx_t) &&
      94           0 :             HEAPextend(h, GDK_STRHASHTABLE * sizeof(stridx_t) + BATTINY * GDK_VARALIGN, true) != GDK_SUCCEED) {
      95           0 :                 GDKclrerr();
      96           0 :                 if (h->size > 0)
      97           0 :                         memset(h->base, 0, h->size);
      98           0 :                 return;
      99             :         }
     100             : 
     101             :         /* rebuild hash table for double elimination
     102             :          *
     103             :          * If appending strings to the BAT was aborted, if the heap
     104             :          * was memory mapped, the hash in the string heap may well be
     105             :          * incorrect.  Therefore we don't trust it when we read in a
     106             :          * string heap and we rebuild the complete table (it is small,
     107             :          * so this won't take any time at all).
     108             :          * Note that we will only do this the first time the heap is
     109             :          * loaded, and only for heaps that existed when the server was
     110             :          * started. */
     111        3901 :         memset(newhash, 0, sizeof(newhash));
     112        3901 :         pos = GDK_STRHASHSIZE;
     113      362364 :         while (pos < h->free) {
     114      358723 :                 pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1));
     115      358723 :                 if (pad < sizeof(stridx_t))
     116      307279 :                         pad += GDK_VARALIGN;
     117      358723 :                 pos += pad;
     118      358723 :                 if (pos >= GDK_ELIMLIMIT)
     119             :                         break;
     120      358463 :                 s = h->base + pos;
     121      358463 :                 strhash = strHash(s);
     122      358463 :                 off = strhash & GDK_STRHASHMASK;
     123      358463 :                 newhash[off] = (stridx_t) (pos - sizeof(stridx_t));
     124      358463 :                 pos += strlen(s) + 1;
     125             :         }
     126             :         /* only set dirty flag if the hash table actually changed */
     127        3901 :         if (memcmp(newhash, h->base, sizeof(newhash)) != 0) {
     128         197 :                 memcpy(h->base, newhash, sizeof(newhash));
     129         197 :                 if (h->storage == STORE_MMAP) {
     130          31 :                         if (!(ATOMIC_GET(&GDKdebug) & NOSYNCMASK))
     131           0 :                                 (void) MT_msync(h->base, GDK_STRHASHSIZE);
     132             :                 } else
     133         166 :                         h->dirty = true;
     134             :         }
     135             : #ifndef NDEBUG
     136        3901 :         if (GDK_ELIMDOUBLES(h)) {
     137             :                 pos = GDK_STRHASHSIZE;
     138      248273 :                 while (pos < h->free) {
     139      244632 :                         pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1));
     140      244632 :                         if (pad < sizeof(stridx_t))
     141      204326 :                                 pad += GDK_VARALIGN;
     142      244632 :                         pos += pad;
     143      244632 :                         s = h->base + pos;
     144      244632 :                         assert(strLocate(h, s) != 0);
     145      244632 :                         pos += strlen(s) + 1;
     146             :                 }
     147             :         }
     148             : #endif
     149        3901 :         h->cleanhash = false;
     150             : }
     151             : 
     152             : /*
     153             :  * The strPut routine. The routine strLocate can be used to identify
     154             :  * the location of a string in the heap if it exists. Otherwise it
     155             :  * returns (var_t) -2 (-1 is reserved for error).
     156             :  */
     157             : var_t
     158      438248 : strLocate(Heap *h, const char *v)
     159             : {
     160      438248 :         stridx_t *ref, *next;
     161             : 
     162             :         /* search hash-table, if double-elimination is still in place */
     163      438248 :         BUN off;
     164      438248 :         if (h->free == 0) {
     165             :                 /* empty, so there are no strings */
     166             :                 return (var_t) -2;
     167             :         }
     168             : 
     169      438248 :         off = strHash(v);
     170      438248 :         off &= GDK_STRHASHMASK;
     171             : 
     172             :         /* should only use strLocate iff fully double eliminated */
     173      438248 :         assert(GDK_ELIMBASE(h->free) == 0);
     174             : 
     175             :         /* search the linked list */
     176      486964 :         for (ref = ((stridx_t *) h->base) + off; *ref; ref = next) {
     177      484049 :                 next = (stridx_t *) (h->base + *ref);
     178      484049 :                 if (strcmp(v, (str) (next + 1)) == 0)
     179      435333 :                         return (var_t) ((sizeof(stridx_t) + *ref));     /* found */
     180             :         }
     181             :         return (var_t) -2;
     182             : }
     183             : 
     184             : var_t
     185    91639964 : strPut(BAT *b, var_t *dst, const void *V)
     186             : {
     187    91639964 :         const char *v = V;
     188    91639964 :         Heap *h = b->tvheap;
     189    91639964 :         size_t pad;
     190    91639964 :         size_t pos, len = strlen(v) + 1;
     191    91639964 :         stridx_t *bucket;
     192    91639964 :         BUN off;
     193             : 
     194    91639964 :         if (h->free == 0) {
     195      225482 :                 if (h->size < GDK_STRHASHTABLE * sizeof(stridx_t) + BATTINY * GDK_VARALIGN) {
     196           0 :                         if (HEAPgrow(&b->tvheap, GDK_STRHASHTABLE * sizeof(stridx_t) + BATTINY * GDK_VARALIGN, true) != GDK_SUCCEED) {
     197             :                                 return (var_t) -1;
     198             :                         }
     199           0 :                         h = b->tvheap;
     200             :                 }
     201      225482 :                 h->free = GDK_STRHASHTABLE * sizeof(stridx_t);
     202      225482 :                 h->dirty = true;
     203             : #ifdef NDEBUG
     204             :                 memset(h->base, 0, h->free);
     205             : #else
     206             :                 /* fill should solve initialization problems within valgrind */
     207      225482 :                 memset(h->base, 0, h->size);
     208             : #endif
     209             :         }
     210             : 
     211    91639964 :         off = strHash(v);
     212    91639964 :         off &= GDK_STRHASHMASK;
     213    91639964 :         bucket = ((stridx_t *) h->base) + off;
     214             : 
     215    91639964 :         if (*bucket) {
     216    89149005 :                 assert(*bucket < h->free);
     217             :                 /* the hash list is not empty */
     218    89149005 :                 if (*bucket < GDK_ELIMLIMIT) {
     219             :                         /* small string heap (<64KiB) -- fully double
     220             :                          * eliminated: search the linked list */
     221             :                         const stridx_t *ref = bucket;
     222             : 
     223    43696472 :                         do {
     224    43696472 :                                 pos = *ref + sizeof(stridx_t);
     225    43696472 :                                 assert(pos < h->free);
     226    43696472 :                                 if (strcmp(v, h->base + pos) == 0) {
     227             :                                         /* found */
     228    40202714 :                                         return *dst = (var_t) pos;
     229             :                                 }
     230     3493758 :                                 ref = (stridx_t *) (h->base + *ref);
     231     3493758 :                         } while (*ref);
     232             :                 } else {
     233             :                         /* large string heap (>=64KiB) -- there is no
     234             :                          * linked list, so only look at single
     235             :                          * entry */
     236    47798786 :                         pos = *bucket;
     237    47798786 :                         if (strcmp(v, h->base + pos) == 0) {
     238             :                                 /* already in heap: reuse */
     239      789403 :                                 return *dst = (var_t) pos;
     240             :                         }
     241             :                 }
     242             :         }
     243             :         /* the string was not found in the heap, we need to enter it */
     244             : 
     245             :         /* check that string is correctly encoded UTF-8; there was no
     246             :          * need to do this earlier: if the string was found above, it
     247             :          * must have gone through here in the past */
     248             : #ifndef NDEBUG
     249    50647847 :         if (!checkUTF8(v)) {
     250           0 :                 GDKerror("incorrectly encoded UTF-8\n");
     251           0 :                 return (var_t) -1;
     252             :         }
     253             : #endif
     254             : 
     255    51036050 :         pad = GDK_VARALIGN - (h->free & (GDK_VARALIGN - 1));
     256    51036050 :         if (GDK_ELIMBASE(h->free + pad) == 0) {      /* i.e. h->free+pad < GDK_ELIMLIMIT */
     257     2876149 :                 if (pad < sizeof(stridx_t)) {
     258             :                         /* make room for hash link */
     259     2210282 :                         pad += GDK_VARALIGN;
     260             :                 }
     261    48159901 :         } else if (GDK_ELIMBASE(h->free) != 0) {
     262             :                 /* no extra padding needed when no hash links needed
     263             :                  * (but only when padding doesn't cross duplicate
     264             :                  * elimination boundary) */
     265    48201571 :                 pad = 0;
     266             :         }
     267             : 
     268             :         /* check heap for space (limited to a certain maximum after
     269             :          * which an error is returned) */
     270    51036050 :         if (h->free + pad + len >= h->size) {
     271        9037 :                 size_t newsize = MAX(h->size, 4096);
     272             : 
     273             :                 /* double the heap size until we have enough space */
     274        9068 :                 do {
     275        9068 :                         if (newsize < 4 * 1024 * 1024)
     276        8799 :                                 newsize <<= 1;
     277             :                         else
     278         269 :                                 newsize += 4 * 1024 * 1024;
     279        9068 :                 } while (newsize <= h->free + pad + len);
     280             : 
     281        9037 :                 assert(newsize);
     282             : 
     283        9037 :                 if (h->free + pad + len >= (size_t) VAR_MAX) {
     284           0 :                         GDKerror("string heap gets larger than %zuGiB.\n", (size_t) VAR_MAX >> 30);
     285           0 :                         return (var_t) -1;
     286             :                 }
     287        9037 :                 TRC_DEBUG(HEAP, "HEAPextend in strPut %s %zu %zu\n", h->filename, h->size, newsize);
     288        9037 :                 if (HEAPgrow(&b->tvheap, newsize, true) != GDK_SUCCEED) {
     289             :                         return (var_t) -1;
     290             :                 }
     291        9038 :                 h = b->tvheap;
     292             : 
     293             :                 /* make bucket point into the new heap */
     294        9038 :                 bucket = ((stridx_t *) h->base) + off;
     295             :         }
     296             : 
     297             :         /* insert string */
     298    51036051 :         pos = h->free + pad;
     299    51036051 :         *dst = (var_t) pos;
     300    51036051 :         if (pad > 0)
     301     2870481 :                 memset(h->base + h->free, 0, pad);
     302    51036051 :         memcpy(h->base + pos, v, len);
     303    51036051 :         h->free += pad + len;
     304    51036051 :         h->dirty = true;
     305             : 
     306             :         /* maintain hash table */
     307    51036051 :         if (GDK_ELIMBASE(pos) == 0) {   /* small string heap: link the next pointer */
     308             :                 /* the stridx_t next pointer directly precedes the
     309             :                  * string */
     310     2865899 :                 pos -= sizeof(stridx_t);
     311     2865899 :                 *(stridx_t *) (h->base + pos) = *bucket;
     312             :         }
     313    51036051 :         *bucket = (stridx_t) pos;       /* set bucket to the new string */
     314             : 
     315    51036051 :         return *dst;
     316             : }
     317             : 
     318             : /*
     319             :  * Convert a quote-delimited string to a GDK string value, resolving
     320             :  * backslash escapes and checking that the input is correct UTF-8.
     321             :  */
     322             : 
     323             : #ifdef __GNUC__
     324             : /* __builtin_expect returns its first argument; it is expected to be
     325             :  * equal to the second argument */
     326             : #define unlikely(expr)  __builtin_expect((expr) != 0, 0)
     327             : #define likely(expr)    __builtin_expect((expr) != 0, 1)
     328             : #else
     329             : #define unlikely(expr)  (expr)
     330             : #define likely(expr)    (expr)
     331             : #endif
     332             : 
     333             : ssize_t
     334   320660098 : GDKstrFromStr(unsigned char *restrict dst, const unsigned char *restrict src, ssize_t len, char quote)
     335             : {
     336   320660098 :         unsigned char *p = dst;
     337   320660098 :         const unsigned char *cur = src, *end = src + len;
     338   320660098 :         bool escaped = false;
     339   320660098 :         int mask = 0, n, c, utf8char = 0;
     340             : 
     341   320660098 :         if (len >= 2 && strNil((const char *) src)) {
     342           0 :                 strcpy((char *) dst, str_nil);
     343           0 :                 return 1;
     344             :         }
     345             : 
     346             :         /* copy it in, while performing the correct escapes */
     347             :         /* n is the number of follow-on bytes left in a multi-byte
     348             :          * UTF-8 sequence */
     349  2074720915 :         for (cur = src, n = 0; cur < end || escaped; cur++) {
     350             :                 /* first convert any \ escapes and store value in c */
     351  1754060820 :                 if (escaped) {
     352      547237 :                         switch (*cur) {
     353        3825 :                         case '0':
     354             :                         case '1':
     355             :                         case '2':
     356             :                         case '3':
     357             :                         case '4':
     358             :                         case '5':
     359             :                         case '6':
     360             :                         case '7':
     361             :                                 /* \ with up to three octal digits */
     362        3825 :                                 c = base08(*cur);
     363        3825 :                                 if (num08(cur[1])) {
     364        3825 :                                         cur++;
     365        3825 :                                         c = mult08(c) + base08(*cur);
     366        3825 :                                         if (num08(cur[1])) {
     367        3825 :                                                 if (unlikely(c > 037)) {
     368             :                                                         /* octal
     369             :                                                          * escape
     370             :                                                          * sequence
     371             :                                                          * out of
     372             :                                                          * range */
     373           1 :                                                         GDKerror("not an octal number\n");
     374           1 :                                                         return -1;
     375             :                                                 }
     376        3824 :                                                 cur++;
     377        3824 :                                                 c = mult08(c) + base08(*cur);
     378        3824 :                                                 assert(c >= 0 && c <= 0377);
     379             :                                         }
     380             :                                 }
     381             :                                 break;
     382          57 :                         case 'x':
     383             :                                 /* \x with one or two hexadecimal digits */
     384          57 :                                 if (num16(cur[1])) {
     385          57 :                                         cur++;
     386          57 :                                         c = base16(*cur);
     387          57 :                                         if (num16(cur[1])) {
     388          57 :                                                 cur++;
     389          57 :                                                 c = mult16(c) + base16(*cur);
     390             :                                         }
     391             :                                 } else
     392             :                                         c = 'x';
     393             :                                 break;
     394           0 :                         case 'u':
     395             :                         case 'U':
     396             :                                 /* \u with four hexadecimal digits or
     397             :                                  * \U with eight hexadecimal digits */
     398           0 :                                 if (unlikely(n > 0)) {
     399             :                                         /* not when in the middle of a
     400             :                                          * UTF-8 sequence */
     401           0 :                                         goto notutf8;
     402             :                                 }
     403           0 :                                 c = 0;
     404           0 :                                 for (n = *cur == 'U' ? 8 : 4; n > 0; n--) {
     405           0 :                                         cur++;
     406           0 :                                         if (unlikely(!num16(*cur))) {
     407           0 :                                                 GDKerror("not a Unicode code point escape\n");
     408           0 :                                                 return -1;
     409             :                                         }
     410           0 :                                         c = c << 4 | base16(*cur);
     411             :                                 }
     412             :                                 /* n == 0 now */
     413           0 :                                 if (unlikely(c == 0 || c > 0x10FFFF ||
     414             :                                              (c & 0xFFF800) == 0xD800)) {
     415           0 :                                         GDKerror("illegal Unicode code point\n");
     416           0 :                                         return -1;
     417             :                                 }
     418           0 :                                 if (c < 0x80) {
     419           0 :                                         *p++ = (unsigned char) c;
     420             :                                 } else {
     421           0 :                                         if (c < 0x800) {
     422           0 :                                                 *p++ = 0xC0 | (c >> 6);
     423             :                                         } else {
     424           0 :                                                 if (c < 0x10000) {
     425           0 :                                                         *p++ = 0xE0 | (c >> 12);
     426             :                                                 } else {
     427           0 :                                                         *p++ = 0xF0 | (c >> 18);
     428           0 :                                                         *p++ = 0x80 | ((c >> 12) & 0x3F);
     429             :                                                 }
     430           0 :                                                 *p++ = 0x80 | ((c >> 6) & 0x3F);
     431             :                                         }
     432           0 :                                         *p++ = 0x80 | (c & 0x3F);
     433             :                                 }
     434           0 :                                 escaped = false;
     435           0 :                                 continue;
     436             :                         case 'a':
     437             :                                 c = '\a';
     438             :                                 break;
     439           1 :                         case 'b':
     440           1 :                                 c = '\b';
     441           1 :                                 break;
     442           5 :                         case 'f':
     443           5 :                                 c = '\f';
     444           5 :                                 break;
     445       10916 :                         case 'n':
     446       10916 :                                 c = '\n';
     447       10916 :                                 break;
     448          12 :                         case 'r':
     449          12 :                                 c = '\r';
     450          12 :                                 break;
     451        1999 :                         case 't':
     452        1999 :                                 c = '\t';
     453        1999 :                                 break;
     454           0 :                         case '\0':
     455           0 :                                 c = '\\';
     456           0 :                                 break;
     457      530422 :                         case '\'':
     458             :                         case '\\':
     459             :                                 /* \' and \\ can be handled by the
     460             :                                  * default case */
     461             :                         default:
     462             :                                 /* unrecognized \ escape, just copy
     463             :                                  * the backslashed character */
     464      530422 :                                 c = *cur;
     465      530422 :                                 break;
     466             :                         }
     467             :                         escaped = false;
     468  1753513583 :                 } else if ((c = *cur) == '\\') {
     469      547237 :                         escaped = true;
     470      547237 :                         continue;
     471  1752966346 :                 } else if (c == quote && cur[1] == quote) {
     472        5529 :                         assert(c != 0);
     473        5529 :                         if (unlikely(n > 0))
     474           0 :                                 goto notutf8;
     475        5529 :                         *p++ = quote;
     476        5529 :                         cur++;
     477        5529 :                         continue;
     478             :                 }
     479             : 
     480  1753508053 :                 if (n > 0) {
     481             :                         /* we're still expecting follow-up bytes in a
     482             :                          * UTF-8 sequence */
     483           0 :                         if (unlikely((c & 0xC0) != 0x80)) {
     484             :                                 /* incorrect UTF-8 sequence: byte is
     485             :                                  * not 10xxxxxx */
     486           0 :                                 goto notutf8;
     487             :                         }
     488           0 :                         utf8char = (utf8char << 6) | (c & 0x3F);
     489           0 :                         n--;
     490           0 :                         if (n == 0) {
     491             :                                 /* this was the last byte in the sequence */
     492       26690 :                                 if (unlikely((utf8char & mask) == 0)) {
     493             :                                         /* incorrect UTF-8 sequence:
     494             :                                          * not shortest possible */
     495           0 :                                         goto notutf8;
     496             :                                 }
     497       26690 :                                 if (unlikely(utf8char > 0x10FFFF)) {
     498             :                                         /* incorrect UTF-8 sequence:
     499             :                                          * value too large */
     500           0 :                                         goto notutf8;
     501             :                                 }
     502       26690 :                                 if (unlikely((utf8char & 0x1FFF800) == 0xD800)) {
     503             :                                         /* incorrect UTF-8 sequence:
     504             :                                          * low or high surrogate
     505             :                                          * encoded as UTF-8 */
     506           0 :                                         goto notutf8;
     507             :                                 }
     508             :                         }
     509  1775080736 :                 } else if ((c & 0x80) == 0) {
     510             :                         ;
     511       26692 :                 } else if ((c & 0xE0) == 0xC0) {
     512        1762 :                         n = 1;
     513        1762 :                         mask = 0x000780;
     514        1762 :                         utf8char = c & 0x1F;
     515       24930 :                 } else if ((c & 0xF0) == 0xE0) {
     516       24916 :                         n = 2;
     517       24916 :                         mask = 0x00F800;
     518       24916 :                         utf8char = c & 0x0F;
     519          14 :                 } else if ((c & 0xF8) == 0xF0) {
     520          12 :                         n = 3;
     521          12 :                         mask = 0x1F0000;
     522          12 :                         utf8char = c & 0x07;
     523             :                 } else {
     524             :                         /* incorrect UTF-8 sequence */
     525           2 :                         goto notutf8;
     526             :                 }
     527  1753508051 :                 *p++ = c;
     528             :         }
     529   320660095 :         if (unlikely(n > 0)) {
     530             :                 /* incomplete UTF-8 sequence */
     531           0 :                 goto notutf8;
     532             :         }
     533   320660095 :         *p++ = 0;
     534   320660095 :         return len;
     535           2 :   notutf8:
     536           2 :         GDKerror("not a proper UTF-8 sequence\n");
     537           2 :         return -1;
     538             : }
     539             : 
     540             : ssize_t
     541    28549937 : strFromStr(const char *restrict src, size_t *restrict len, char **restrict dst, bool external)
     542             : {
     543    28549937 :         const char *cur = src, *start = NULL;
     544    28549937 :         size_t l = 1;
     545    28549937 :         bool escaped = false;
     546             : 
     547    28549937 :         if (!external) {
     548    28549889 :                 size_t sz = strLen(src);
     549    28549889 :                 atommem(sz);
     550    28568608 :                 return (ssize_t) strcpy_len(*dst, src, sz);
     551             :         }
     552             : 
     553          48 :         if (strNil(src)) {
     554           0 :                 atommem(2);
     555           0 :                 strcpy(*dst, str_nil);
     556           0 :                 return 1;
     557             :         }
     558             : 
     559          48 :         while (GDKisspace(*cur))
     560           0 :                 cur++;
     561          48 :         if (*cur != '"') {
     562           0 :                 if (strncmp(cur, "nil", 3) == 0) {
     563           0 :                         atommem(2);
     564           0 :                         strcpy(*dst, str_nil);
     565           0 :                         return (ssize_t) (cur - src) + 3;
     566             :                 }
     567           0 :                 GDKerror("not a quoted string\n");
     568           0 :                 return -1;
     569             :         }
     570             : 
     571             :         /* scout the string to find out its length and whether it was
     572             :          * properly quoted */
     573         186 :         for (start = ++cur; *cur != '"' || escaped; cur++) {
     574         138 :                 if (*cur == 0) {
     575           0 :                         GDKerror("no closing quotes\n");
     576           0 :                         return -1;
     577         138 :                 } else if (*cur == '\\' && !escaped) {
     578             :                         escaped = true;
     579             :                 } else {
     580         129 :                         escaped = false;
     581         129 :                         l++;
     582             :                 }
     583             :         }
     584             : 
     585             :         /* alloc new memory */
     586          48 :         if (*dst == NULL || *len < l) {
     587          48 :                 GDKfree(*dst);
     588          48 :                 *dst = GDKmalloc(*len = l);
     589          48 :                 if (*dst == NULL) {
     590           0 :                         *len = 0;
     591           0 :                         return -1;
     592             :                 }
     593             :         }
     594             : 
     595          48 :         return GDKstrFromStr((unsigned char *) *dst,
     596             :                              (const unsigned char *) start,
     597             :                              (ssize_t) (cur - start),
     598             :                              '\0');
     599             : }
     600             : 
     601             : /*
     602             :  * Convert a GDK string value to something printable.
     603             :  */
     /* all but control characters (in range 0 to 31) and DEL */
     #define printable_chr(ch)       ((' ' <= (ch) && (ch) <= '~') || ((ch) & 0x80) != 0)

     /*
      * Return an upper bound on the number of bytes, not counting the
      * terminating NUL, that escapedStr() writes for src when called with
      * the same sep1, sep2 and quote.
      *
      * src   - NUL-terminated string to be escaped
      * sep1  - optional separator string (may be NULL)
      * sep2  - optional second separator string (may be NULL)
      * quote - quote character that needs a backslash
      *
      * Bytes are classified in the same order escapedStr() uses, control
      * characters first.  A separator or quote that starts with a control
      * character is written by escapedStr() as an escape of up to four
      * bytes, so it must not be counted as a two-byte backslash pair.
      */
     size_t
     escapedStrlen(const char *restrict src, const char *sep1, const char *sep2, int quote)
     {
             size_t end, sz = 0;
             size_t sep1len, sep2len;

             sep1len = sep1 ? strlen(sep1) : 0;
             sep2len = sep2 ? strlen(sep2) : 0;
             for (end = 0; src[end]; end++) {
                     if (src[end] == (char) '\302' &&
                         0200 <= ((int) src[end + 1] & 0377) &&
                         ((int) src[end + 1] & 0377) <= 0237) {
                             /* Unicode control character (code point range
                              * U-00000080 through U-0000009F) encoded in
                              * UTF-8 */
                             /* for the first one of the two UTF-8 bytes we
                              * count a width of 7 and for the second one
                              * at least 1, together that's at least 8, i.e.
                              * the width of two backslash-escaped octal
                              * coded characters */
                             sz += 7;
                     } else if (!printable_chr(src[end])) {
                             /* backslash plus three octal digits; this is
                              * more than needed for \t, \n, \r and \f */
                             sz += 4;
                     } else if (src[end] == '\\'
                                || src[end] == quote
                                || (sep1len && strncmp(src + end, sep1, sep1len) == 0)
                                || (sep2len && strncmp(src + end, sep2, sep2len) == 0)) {
                             /* backslash plus the character itself */
                             sz += 2;
                     } else {
                             sz++;
                     }
             }
             return sz;
     }
     639             : 
     /*
      * Write a backslash-escaped copy of src into dst and return the
      * number of bytes written, not counting the terminating NUL.
      *
      * dst    - output buffer
      * src    - NUL-terminated input
      * dstlen - capacity of dst in bytes
      * sep1   - optional separator string to escape (may be NULL)
      * sep2   - optional second separator string to escape (may be NULL)
      * quote  - quote character to escape
      *
      * Non-printable bytes and both bytes of a UTF-8 encoded code point in
      * the range U+0080..U+009F are written as \t, \n, \r, \f or a
      * three-digit octal escape.  A backslash, the quote character and the
      * first character of a separator occurrence get a backslash in front.
      * Everything else is copied unchanged.
      *
      * Room in dst is only checked once per input byte, before any of the
      * one to four output bytes are stored, so the caller has to supply a
      * buffer that is large enough; the assert at the end checks this.
      */
     size_t
     escapedStr(char *restrict dst, const char *restrict src, size_t dstlen, const char *sep1, const char *sep2, int quote)
     {
             const size_t n1 = sep1 ? strlen(sep1) : 0;
             const size_t n2 = sep2 ? strlen(sep2) : 0;
             size_t in = 0;
             size_t out = 0;

             while (src[in] && out < dstlen) {
                     const char *at = src + in;

                     /* first byte (\302) of a two-byte C1 control character */
                     int c1_lead = *at == '\302'
                             && 0200 <= (at[1] & 0377)
                             && ((int) at[1] & 0377) <= 0237;
                     /* second byte of such a character */
                     int c1_trail = in > 0
                             && at[-1] == '\302'
                             && 0200 <= (*at & 0377)
                             && (*at & 0377) <= 0237;

                     if (!printable_chr(*at) || c1_lead || c1_trail) {
                             dst[out++] = '\\';
                             if (*at == '\t') {
                                     dst[out++] = 't';
                             } else if (*at == '\n') {
                                     dst[out++] = 'n';
                             } else if (*at == '\r') {
                                     dst[out++] = 'r';
                             } else if (*at == '\f') {
                                     dst[out++] = 'f';
                             } else {
                                     /* no short form: three octal digits */
                                     snprintf(dst + out, dstlen - out, "%03o", (unsigned char) *at);
                                     out += 3;
                             }
                     } else if (*at == '\\'
                                || *at == quote
                                || (n1 && strncmp(at, sep1, n1) == 0)
                                || (n2 && strncmp(at, sep2, n2) == 0)) {
                             dst[out++] = '\\';
                             dst[out++] = *at;
                     } else {
                             dst[out++] = *at;
                     }
                     in++;
             }
             assert(out < dstlen);
             dst[out] = 0;
             return out;
     }
     689             : 
     /*
      * Convert the string atom src to its textual form in *dst.
      *
      * dst      - in/out: destination buffer
      * len      - in/out: capacity of *dst in bytes
      * src      - string value to convert
      * external - false: src is copied as-is; true: a nil value becomes
      *            the three characters nil, any other value is escaped
      *            with escapedStr() and wrapped in double quotes
      *
      * Returns what strcpy_len() returns for the internal form, 3 for an
      * external nil, and otherwise the length of the quoted text without
      * its terminating NUL.
      *
      * atommem() is defined elsewhere; it is used here on the assumption
      * that it makes *dst at least the requested size and returns -1 from
      * this function on allocation failure.
      */
     ssize_t
     strToStr(char **restrict dst, size_t *restrict len, const char *restrict src, bool external)
     {
             if (!external) {
                     size_t nbytes = strLen(src);
                     atommem(nbytes);
                     return (ssize_t) strcpy_len(*dst, src, nbytes);
             }

             if (strNil(src)) {
                     atommem(4);
                     strcpy(*dst, "nil");
                     return 3;
             }

             /* escaped text plus two quotes plus the terminating NUL */
             size_t esclen = escapedStrlen(src, NULL, NULL, '"');
             atommem(esclen + 3);

             /* the escaped text goes after the opening quote */
             char *out = *dst;
             ssize_t pos = (ssize_t) escapedStr(out + 1, src, *len - 1, NULL, NULL, '"');
             pos++;
             out[0] = '"';
             out[pos++] = '"';
             out[pos] = 0;
             return pos;
     }
     716             : 
     717             : str
     718          96 : strRead(str a, size_t *dstlen, stream *s, size_t cnt)
     719             : {
     720          96 :         int len;
     721             : 
     722          96 :         (void) cnt;
     723          96 :         assert(cnt == 1);
     724          96 :         if (mnstr_readInt(s, &len) != 1 || len < 0)
     725             :                 return NULL;
     726          96 :         if (a == NULL || *dstlen < (size_t) len + 1) {
     727           0 :                 if ((a = GDKrealloc(a, len + 1)) == NULL)
     728             :                         return NULL;
     729           0 :                 *dstlen = len + 1;
     730             :         }
     731          96 :         if (len && mnstr_read(s, a, len, 1) != 1) {
     732           0 :                 GDKfree(a);
     733           0 :                 return NULL;
     734             :         }
     735          96 :         a[len] = 0;
     736          96 :         return a;
     737             : }
     738             : 
     739             : gdk_return
     740          96 : strWrite(const char *a, stream *s, size_t cnt)
     741             : {
     742          96 :         size_t len = strlen(a);
     743             : 
     744          96 :         (void) cnt;
     745          96 :         assert(cnt == 1);
     746          96 :         if (!checkUTF8(a)) {
     747           0 :                 GDKerror("incorrectly encoded UTF-8\n");
     748           0 :                 return GDK_FAIL;
     749             :         }
     750          96 :         if (mnstr_writeInt(s, (int) len) && mnstr_write(s, a, len, 1) == 1)
     751             :                 return GDK_SUCCEED;
     752             :         else
     753           0 :                 return GDK_FAIL;
     754             : }
     755             : 
     756             : static gdk_return
     757          88 : concat_strings(BAT **bnp, ValPtr pt, BAT *b, oid seqb,
     758             :                BUN ngrp, struct canditer *restrict ci,
     759             :                const oid *restrict gids, oid min, oid max, bool skip_nils,
     760             :                BAT *sep, const char *restrict separator, BUN *has_nils)
     761             : {
     762          88 :         oid gid;
     763          88 :         BUN i, p, nils = 0;
     764          88 :         size_t *restrict lengths = NULL, separator_length = 0, next_length;
     765          88 :         str *restrict astrings = NULL;
     766          88 :         BATiter bi, bis = (BATiter) {0};
     767          88 :         BAT *bn = NULL;
     768          88 :         gdk_return rres = GDK_FAIL;
     769             : 
     770          88 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
     771             : 
     772             :         /* exactly one of bnp and pt must be NULL, the other non-NULL */
     773          88 :         assert((bnp == NULL) != (pt == NULL));
     774             :         /* if pt not NULL, only a single group allowed */
     775          88 :         assert(pt == NULL || ngrp == 1);
     776             : 
     777          88 :         if (bnp) {
     778          34 :                 if ((bn = COLnew(min, TYPE_str, ngrp, TRANSIENT)) == NULL)
     779             :                         return GDK_FAIL;
     780          34 :                 *bnp = bn;
     781             :         }
     782             : 
     783          88 :         bi = bat_iterator(b);
     784          88 :         bis = bat_iterator(sep);
     785          88 :         if (separator)
     786          57 :                 separator_length = strlen(separator);
     787             : 
     788          88 :         if (ngrp == 1) {
     789          61 :                 size_t offset = 0, single_length = 0;
     790          61 :                 bool empty = true;
     791             : 
     792          61 :                 if (separator) {
     793          42 :                         assert(sep == NULL);
     794         671 :                         TIMEOUT_LOOP_IDX(i, ci->ncand, qry_ctx) {
     795         545 :                                 p = canditer_next(ci) - seqb;
     796         545 :                                 const char *s = BUNtvar(bi, p);
     797         545 :                                 if (strNil(s)) {
     798          15 :                                         if (!skip_nils) {
     799             :                                                 nils = 1;
     800             :                                                 break;
     801             :                                         }
     802             :                                 } else {
     803         530 :                                         single_length += strlen(s);
     804         530 :                                         if (!empty)
     805         490 :                                                 single_length += separator_length;
     806             :                                         empty = false;
     807             :                                 }
     808             :                         }
     809             :                 } else { /* sep case */
     810          19 :                         assert(sep != NULL);
     811         354 :                         TIMEOUT_LOOP_IDX(i, ci->ncand, qry_ctx) {
     812         297 :                                 p = canditer_next(ci) - seqb;
     813         297 :                                 const char *s = BUNtvar(bi, p);
     814         297 :                                 const char *sl = BUNtvar(bis, p);
     815         297 :                                 if (strNil(s)) {
     816           4 :                                         if (!skip_nils) {
     817             :                                                 nils = 1;
     818             :                                                 break;
     819             :                                         }
     820             :                                 } else {
     821         293 :                                         single_length += strlen(s);
     822         293 :                                         if (!empty) {
     823         274 :                                                 if (strNil(sl)) {
     824          23 :                                                         if (!skip_nils) {
     825             :                                                                 nils = 1;
     826             :                                                                 break;
     827             :                                                         }
     828             :                                                 } else
     829         251 :                                                         single_length += strlen(sl);
     830             :                                         }
     831             :                                         empty = false;
     832             :                                 }
     833             :                         }
     834             :                 }
     835          61 :                 canditer_reset(ci);
     836          61 :                 TIMEOUT_CHECK(qry_ctx, GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx));
     837             : 
     838          61 :                 if (nils == 0 && !empty) {
     839          59 :                         char *single_str = NULL;
     840             : 
     841          59 :                         if ((single_str = GDKmalloc(single_length + 1)) == NULL) {
     842           0 :                                 bat_iterator_end(&bi);
     843           0 :                                 bat_iterator_end(&bis);
     844           0 :                                 BBPreclaim(bn);
     845           0 :                                 return GDK_FAIL;
     846             :                         }
     847          59 :                         empty = true;
     848          59 :                         if (separator) {
     849         652 :                                 TIMEOUT_LOOP_IDX(i, ci->ncand, qry_ctx) {
     850         532 :                                         p = canditer_next(ci) - seqb;
     851         532 :                                         const char *s = BUNtvar(bi, p);
     852         532 :                                         if (strNil(s))
     853           2 :                                                 continue;
     854         530 :                                         if (!empty) {
     855         490 :                                                 memcpy(single_str + offset, separator, separator_length);
     856         490 :                                                 offset += separator_length;
     857             :                                         }
     858         530 :                                         next_length = strlen(s);
     859         530 :                                         memcpy(single_str + offset, s, next_length);
     860         530 :                                         offset += next_length;
     861         530 :                                         empty = false;
     862             :                                 }
     863             :                         } else { /* sep case */
     864          19 :                                 assert(sep != NULL);
     865         354 :                                 TIMEOUT_LOOP_IDX(i, ci->ncand, qry_ctx) {
     866         297 :                                         p = canditer_next(ci) - seqb;
     867         297 :                                         const char *s = BUNtvar(bi, p);
     868         297 :                                         const char *sl = BUNtvar(bis, p);
     869         297 :                                         if (strNil(s))
     870           4 :                                                 continue;
     871         567 :                                         if (!empty && !strNil(sl)) {
     872         251 :                                                 next_length = strlen(sl);
     873         251 :                                                 memcpy(single_str + offset, sl, next_length);
     874         251 :                                                 offset += next_length;
     875             :                                         }
     876         293 :                                         next_length = strlen(s);
     877         293 :                                         memcpy(single_str + offset, s, next_length);
     878         293 :                                         offset += next_length;
     879         293 :                                         empty = false;
     880             :                                 }
     881             :                         }
     882             : 
     883          59 :                         single_str[offset] = '\0';
     884          59 :                         TIMEOUT_CHECK(qry_ctx, do { GDKfree(single_str); GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx); } while (0));
     885          59 :                         if (bn) {
     886           7 :                                 if (BUNappend(bn, single_str, false) != GDK_SUCCEED) {
     887           0 :                                         GDKfree(single_str);
     888           0 :                                         bat_iterator_end(&bi);
     889           0 :                                         bat_iterator_end(&bis);
     890           0 :                                         BBPreclaim(bn);
     891           0 :                                         return GDK_FAIL;
     892             :                                 }
     893             :                         } else {
     894          52 :                                 pt->len = offset + 1;
     895          52 :                                 pt->val.sval = single_str;
     896          52 :                                 single_str = NULL;      /* don't free */
     897             :                         }
     898          66 :                         GDKfree(single_str);
     899           2 :                 } else if (bn) {
     900           0 :                         if (BUNappend(bn, str_nil, false) != GDK_SUCCEED) {
     901           0 :                                 bat_iterator_end(&bi);
     902           0 :                                 bat_iterator_end(&bis);
     903           0 :                                 BBPreclaim(bn);
     904           0 :                                 return GDK_FAIL;
     905             :                         }
     906             :                 } else {
     907           2 :                         if (VALinit(pt, TYPE_str, str_nil) == NULL) {
     908           0 :                                 bat_iterator_end(&bi);
     909           0 :                                 bat_iterator_end(&bis);
     910           0 :                                 return GDK_FAIL;
     911             :                         }
     912             :                 }
     913          61 :                 bat_iterator_end(&bi);
     914          61 :                 bat_iterator_end(&bis);
     915          61 :                 return GDK_SUCCEED;
     916             :         } else {
     917             :                 /* first used to calculate the total length of
     918             :                  * each group, then the total offset */
     919          27 :                 lengths = GDKzalloc(ngrp * sizeof(*lengths));
     920          27 :                 astrings = GDKmalloc(ngrp * sizeof(str));
     921          27 :                 if (lengths == NULL || astrings == NULL) {
     922           0 :                         goto finish;
     923             :                 }
     924             :                 /* at first, set astrings[i] to str_nil, then for each
     925             :                  * non-empty group (even if all strings in the group
     926             :                  * are empty), set to NULL */
     927        2141 :                 for (i = 0; i < ngrp; i++)
     928        2114 :                         astrings[i] = (char *) str_nil;
     929             : 
     930          27 :                 if (separator) {
     931         208 :                         TIMEOUT_LOOP_IDX(p, ci->ncand, qry_ctx) {
     932         163 :                                 i = canditer_next(ci) - seqb;
     933         163 :                                 if (gids[i] >= min && gids[i] <= max) {
     934         163 :                                         gid = gids[i] - min;
     935         163 :                                         if (lengths[gid] == (size_t) -1)
     936           0 :                                                 continue;
     937         163 :                                         const char *s = BUNtvar(bi, i);
     938         326 :                                         if (!strNil(s)) {
     939         155 :                                                 lengths[gid] += strlen(s) + separator_length;
     940         155 :                                                 astrings[gid] = NULL;
     941           8 :                                         } else if (!skip_nils) {
     942           0 :                                                 nils++;
     943           0 :                                                 lengths[gid] = (size_t) -1;
     944           0 :                                                 astrings[gid] = (char *) str_nil;
     945             :                                         }
     946             :                                 }
     947             :                         }
     948             :                 } else { /* sep case */
     949          12 :                         assert(sep != NULL);
     950      999759 :                         TIMEOUT_LOOP_IDX(p, ci->ncand, qry_ctx) {
     951      999663 :                                 i = canditer_next(ci) - seqb;
     952      999663 :                                 if (gids[i] >= min && gids[i] <= max) {
     953      999663 :                                         gid = gids[i] - min;
     954      999663 :                                         if (lengths[gid] == (size_t) -1)
     955           0 :                                                 continue;
     956      999663 :                                         const char *s = BUNtvar(bi, i);
     957      999663 :                                         const char *sl = BUNtvar(bis, i);
     958     1999326 :                                         if (!strNil(s)) {
     959      999340 :                                                 lengths[gid] += strlen(s);
     960     1998680 :                                                 if (!strNil(sl)) {
     961      999209 :                                                         next_length = strlen(sl);
     962      999209 :                                                         lengths[gid] += next_length;
     963             :                                                 }
     964      999340 :                                                 astrings[gid] = NULL;
     965         323 :                                         } else if (!skip_nils) {
     966           0 :                                                 nils++;
     967           0 :                                                 lengths[gid] = (size_t) -1;
     968           0 :                                                 astrings[gid] = (char *) str_nil;
     969             :                                         }
     970             :                                 }
     971             :                         }
     972             :                 }
     973          27 :                 TIMEOUT_CHECK(qry_ctx, GOTO_LABEL_TIMEOUT_HANDLER(finish, qry_ctx));
     974             : 
     975          27 :                 if (separator) {
     976          69 :                         for (i = 0; i < ngrp; i++) {
     977          54 :                                 if (astrings[i] == NULL) {
     978          52 :                                         if ((astrings[i] = GDKmalloc(lengths[i] + 1)) == NULL) {
     979           0 :                                                 goto finish;
     980             :                                         }
     981          52 :                                         astrings[i][0] = 0;
     982          52 :                                         lengths[i] = 0;
     983             :                                 } else
     984           2 :                                         astrings[i] = NULL;
     985             :                         }
     986             :                 } else { /* sep case */
     987          12 :                         assert(sep != NULL);
     988        2072 :                         for (i = 0; i < ngrp; i++) {
     989        2060 :                                 if (astrings[i] == NULL) {
     990        2058 :                                         if ((astrings[i] = GDKmalloc(lengths[i] + 1)) == NULL) {
     991           0 :                                                 goto finish;
     992             :                                         }
     993        2058 :                                         astrings[i][0] = 0;
     994        2058 :                                         lengths[i] = 0;
     995             :                                 } else
     996           2 :                                         astrings[i] = NULL;
     997             :                         }
     998             :                 }
     999          27 :                 canditer_reset(ci);
    1000             : 
    1001          27 :                 if (separator) {
    1002         193 :                         TIMEOUT_LOOP_IDX(p, ci->ncand, qry_ctx) {
    1003         163 :                                 i = canditer_next(ci) - seqb;
    1004         163 :                                 if (gids[i] >= min && gids[i] <= max) {
    1005         163 :                                         gid = gids[i] - min;
    1006         163 :                                         if (astrings[gid]) {
    1007         160 :                                                 const char *s = BUNtvar(bi, i);
    1008         160 :                                                 if (strNil(s))
    1009           5 :                                                         continue;
    1010         155 :                                                 if (astrings[gid][lengths[gid]]) {
    1011         103 :                                                         memcpy(astrings[gid] + lengths[gid], separator, separator_length);
    1012         103 :                                                         lengths[gid] += separator_length;
    1013             :                                                 }
    1014         155 :                                                 next_length = strlen(s);
    1015         155 :                                                 memcpy(astrings[gid] + lengths[gid], s, next_length);
    1016         155 :                                                 lengths[gid] += next_length;
    1017         155 :                                                 astrings[gid][lengths[gid]] = 1;
    1018             :                                         }
    1019             :                                 }
    1020             :                         }
    1021             :                 } else { /* sep case */
    1022          12 :                         assert(sep != NULL);
    1023      999747 :                         TIMEOUT_LOOP_IDX(p, ci->ncand, qry_ctx) {
    1024      999663 :                                 i = canditer_next(ci) - seqb;
    1025      999663 :                                 if (gids[i] >= min && gids[i] <= max) {
    1026      999663 :                                         gid = gids[i] - min;
    1027      999663 :                                         if (astrings[gid]) {
    1028      999342 :                                                 const char *s = BUNtvar(bi, i);
    1029      999342 :                                                 const char *sl = BUNtvar(bis, i);
    1030      999342 :                                                 if (strNil(s))
    1031           2 :                                                         continue;
    1032     1996622 :                                                 if (astrings[gid][lengths[gid]] && !strNil(sl)) {
    1033      997156 :                                                         next_length = strlen(sl);
    1034      997156 :                                                         memcpy(astrings[gid] + lengths[gid], sl, next_length);
    1035      997156 :                                                         lengths[gid] += next_length;
    1036             :                                                 }
    1037      999340 :                                                 next_length = strlen(s);
    1038      999340 :                                                 memcpy(astrings[gid] + lengths[gid], s, next_length);
    1039      999340 :                                                 lengths[gid] += next_length;
    1040      999340 :                                                 astrings[gid][lengths[gid]] = 1;
    1041             :                                         }
    1042             :                                 }
    1043             :                         }
    1044             :                 }
    1045          27 :                 TIMEOUT_CHECK(qry_ctx, GOTO_LABEL_TIMEOUT_HANDLER(finish, qry_ctx));
    1046             : 
    1047        2141 :                 for (i = 0; i < ngrp; i++) {
    1048        2114 :                         if (astrings[i]) {
    1049        2110 :                                 astrings[i][lengths[i]] = '\0';
    1050        2110 :                                 if (BUNappend(bn, astrings[i], false) != GDK_SUCCEED) {
    1051           0 :                                         goto finish;
    1052             :                                 }
    1053           4 :                         } else if (BUNappend(bn, str_nil, false) != GDK_SUCCEED) {
    1054           0 :                                 goto finish;
    1055             :                         }
    1056             :                 }
    1057             :                 rres = GDK_SUCCEED;
    1058             :         }
    1059             : 
    1060          27 :   finish:
    1061          27 :         bat_iterator_end(&bi);
    1062          27 :         bat_iterator_end(&bis);
    1063          27 :         if (has_nils)
    1064          27 :                 *has_nils = nils;
    1065          27 :         GDKfree(lengths);
    1066          27 :         if (astrings) {
    1067        2141 :                 for (i = 0; i < ngrp; i++) {
    1068        2114 :                         if (astrings[i] != str_nil)
    1069        2114 :                                 GDKfree(astrings[i]);
    1070             :                 }
    1071          27 :                 GDKfree(astrings);
    1072             :         }
    1073          27 :         if (rres != GDK_SUCCEED)
    1074           0 :                 BBPreclaim(bn);
    1075             : 
    1076             :         return rres;
    1077             : 
    1078           0 :   bailout:
    1079           0 :         bat_iterator_end(&bi);
    1080           0 :         bat_iterator_end(&bis);
    1081           0 :         BBPreclaim(bn);
    1082             :         return GDK_FAIL;
    1083             : }
    1084             : /* Ungrouped group_concat: concatenate the strings of b visited by the candidate iterator built from b and s into the single value *res. */
    1085             : gdk_return /* GDK_FAIL if GDKstrdup or VALinit fails; in the non-trivial case the result of concat_strings */
    1086          55 : BATstr_group_concat(ValPtr res, BAT *b, BAT *s, BAT *sep, bool skip_nils, /* sep: BAT holding the separators; skip_nils is handed on to concat_strings */
    1087             :                     bool nil_if_empty, const char *restrict separator) /* separator: constant separator; nil_if_empty selects nil instead of "" for the trivial result */
    1088             : {
    1089          55 :         struct canditer ci;
    1090          55 :         gdk_return r = GDK_SUCCEED;
    1091          55 :         bool free_nseparator = false; /* true once nseparator points at a copy owned by this function */
    1092          55 :         char *nseparator = (char *)separator;
    1093             : 
    1094          55 :         assert((nseparator && !sep) || (!nseparator && sep)); /* only one of them must be set */
    1095          55 :         *res = (ValRecord) {.vtype = TYPE_str}; /* reset the output to a string-typed record, all other fields zero */
    1096             : 
    1097          55 :         canditer_init(&ci, b, s);
    1098             : 
    1099          55 :         if (sep && BATcount(sep) == 1) { /* Only one element in sep */
    1100           0 :                 BATiter bi = bat_iterator(sep);
    1101           0 :                 nseparator = GDKstrdup(BUNtvar(bi, 0)); /* copy taken before the iterator is released */
    1102           0 :                 bat_iterator_end(&bi);
    1103           0 :                 if (!nseparator)
    1104           0 :                         return GDK_FAIL; /* GDKstrdup failed */
    1105           0 :                 free_nseparator = true;
    1106           0 :                 sep = NULL; /* continue as if a constant separator had been passed */
    1107             :         }
    1108             : 
    1109          55 :         if (ci.ncand == 0 || (nseparator && strNil(nseparator))) { /* no candidates, or the constant separator is nil: trivial result */
    1110           1 :                 if (VALinit(res, TYPE_str, nil_if_empty ? str_nil : "") == NULL)
    1111           0 :                         r = GDK_FAIL;
    1112           1 :                 if (free_nseparator)
    1113           0 :                         GDKfree(nseparator);
    1114           1 :                 return r;
    1115             :         }
    1116             : /* general case: a single result, so 1 is passed where the grouped variant passes ngrp and NULL where it passes the group ids */
    1117          54 :         r = concat_strings(NULL, res, b, b->hseqbase, 1, &ci, NULL, 0, 0,
    1118             :                               skip_nils, sep, nseparator, NULL);
    1119          54 :         if (free_nseparator)
    1120           0 :                 GDKfree(nseparator);
    1121             :         return r;
    1122             : }
    1123             : /* Grouped group_concat: returns a new BAT with one concatenated string per group of g, or NULL on error. */
    1124             : BAT *
    1125          54 : BATgroupstr_group_concat(BAT *b, BAT *g, BAT *e, BAT *s, BAT *sep, bool skip_nils, /* b: input strings; g, e, s are handed to BATgroupaggrinit; sep: BAT holding the separators */
    1126             :                          const char *restrict separator) /* separator: constant separator, mutually exclusive with sep */
    1127             : {
    1128          54 :         BAT *bn = NULL;
    1129          54 :         oid min, max; /* filled in by BATgroupaggrinit; passed on together with the group ids */
    1130          54 :         BUN ngrp, nils = 0; /* nils is handed to concat_strings but not read afterwards in this function */
    1131          54 :         struct canditer ci;
    1132          54 :         const char *err;
    1133          54 :         gdk_return res;
    1134          54 :         bool free_nseparator = false; /* true once nseparator points at a copy owned by this function */
    1135          54 :         char *nseparator = (char *)separator;
    1136             : 
    1137          54 :         assert((nseparator && !sep) || (!nseparator && sep)); /* only one of them must be set */
    1138          54 :         (void) skip_nils; /* NOTE(review): skip_nils is passed to concat_strings below, so this cast looks redundant */
    1139             : 
    1140          54 :         if ((err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp,
    1141             :                                     &ci)) != NULL) {
    1142           0 :                 GDKerror("%s\n", err);
    1143           0 :                 return NULL;
    1144             :         }
    1145          54 :         if (g == NULL) { /* a grouping BAT is mandatory for this function */
    1146           0 :                 GDKerror("b and g must be aligned\n");
    1147           0 :                 return NULL;
    1148             :         }
    1149             : 
    1150          54 :         if (sep && BATcount(sep) == 1) { /* Only one element in sep */
    1151           0 :                 BATiter bi = bat_iterator(sep);
    1152           0 :                 nseparator = GDKstrdup(BUNtvar(bi, 0)); /* copy taken before the iterator is released */
    1153           0 :                 bat_iterator_end(&bi);
    1154           0 :                 if (!nseparator)
    1155           0 :                         return NULL; /* GDKstrdup failed */
    1156           0 :                 free_nseparator = true;
    1157           0 :                 sep = NULL; /* continue as if a constant separator had been passed */
    1158             :         }
    1159             : 
    1160          54 :         if (ci.ncand == 0 || ngrp == 0 || (nseparator && strNil(nseparator))) {
    1161             :                 /* trivial: no strings to concat, so return bat
    1162             :                  * aligned with g with nil in the tail */
    1163           5 :                 bn = BATconstant(ngrp == 0 ? 0 : min, TYPE_str, str_nil, ngrp, TRANSIENT);
    1164           5 :                 goto done;
    1165             :         }
    1166             : 
    1167          49 :         if (BATtdense(g) || (g->tkey && g->tnonil)) {
    1168             :                 /* trivial: singleton groups, so all results are equal
    1169             :                  * to the inputs (but possibly a different type) */
    1170          15 :                 bn = BATconvert(b, s, TYPE_str, 0, 0, 0);
    1171          15 :                 goto done;
    1172             :         }
    1173             : /* general case: concat_strings receives the group ids from the tail of g and stores the result BAT in bn */
    1174          68 :         res = concat_strings(&bn, NULL, b, b->hseqbase, ngrp, &ci,
    1175          34 :                              (const oid *) Tloc(g, 0), min, max, skip_nils, sep,
    1176             :                              nseparator, &nils);
    1177          34 :         if (res != GDK_SUCCEED)
    1178           0 :                 bn = NULL; /* report failure to the caller as a NULL BAT */
    1179             : 
    1180          34 : done:
    1181          54 :         if (free_nseparator)
    1182           0 :                 GDKfree(nseparator);
    1183          54 :         return bn;
    1184             : }
    1185             : /* Build in single_str the concatenation of the non-nil strings bi[START..END); a separator (the constant one, or the non-nil per-row one from sepi) precedes every string except the first. If the range has no non-nil string, single_str becomes str_nil and has_nils is set. The buffer is reused and only grown; allocation failure jumps to allocation_error. The users in this file reset next_group_length and offset to 0 and empty to true before invoking it. */
    1186             : #define compute_next_single_str(START, END)                             \
    1187             :         do {                                                            \
    1188             :                 for (oid m = START; m < END; m++) { /* pass 1: compute the length of the result */ \
    1189             :                         const char *sb = BUNtvar(bi, m);                \
    1190             :                                                                         \
    1191             :                         if (separator) {                                \
    1192             :                                 if (!strNil(sb)) {                      \
    1193             :                                         next_group_length += strlen(sb); \
    1194             :                                         if (!empty)                     \
    1195             :                                                 next_group_length += separator_length; \
    1196             :                                         empty = false;                  \
    1197             :                                 }                                       \
    1198             :                         } else { /* sep case */                         \
    1199             :                                 assert(sep != NULL);                    \
    1200             :                                 const char *sl = BUNtvar(sepi, m);      \
    1201             :                                                                         \
    1202             :                                 if (!strNil(sb)) {                      \
    1203             :                                         next_group_length += strlen(sb); \
    1204             :                                         if (!empty && !strNil(sl))      \
    1205             :                                                 next_group_length += strlen(sl); \
    1206             :                                         empty = false;                  \
    1207             :                                 }                                       \
    1208             :                         }                                               \
    1209             :                 }                                                       \
    1210             :                 if (empty) { /* no non-nil string in the range: the result is nil */ \
    1211             :                         if (single_str == NULL) { /* reuse the same buffer, resize it when needed */ \
    1212             :                                 max_group_length = 1;                   \
    1213             :                                 if ((single_str = GDKmalloc(max_group_length + 1)) == NULL) \
    1214             :                                         goto allocation_error;          \
    1215             :                         } else if (1 > max_group_length) {           \
    1216             :                                 max_group_length = 1;                   \
    1217             :                                 if ((next_single_str = GDKrealloc(single_str, max_group_length + 1)) == NULL) \
    1218             :                                         goto allocation_error;          \
    1219             :                                 single_str = next_single_str;           \
    1220             :                         }                                               \
    1221             :                         strcpy(single_str, str_nil);                    \
    1222             :                         has_nils = true;                                \
    1223             :                 } else {                                                \
    1224             :                         empty = true; /* pass 2 tracks the first copied string again */ \
    1225             :                         if (single_str == NULL) { /* reuse the same buffer, resize it when needed */ \
    1226             :                                 max_group_length = next_group_length;   \
    1227             :                                 if ((single_str = GDKmalloc(max_group_length + 1)) == NULL) \
    1228             :                                         goto allocation_error;          \
    1229             :                         } else if (next_group_length > max_group_length) { \
    1230             :                                 max_group_length = next_group_length;   \
    1231             :                                 if ((next_single_str = GDKrealloc(single_str, max_group_length + 1)) == NULL) \
    1232             :                                         goto allocation_error;          \
    1233             :                                 single_str = next_single_str;           \
    1234             :                         }                                               \
    1235             :                                                                         \
    1236             :                         for (oid m = START; m < END; m++) { /* pass 2: copy separators and strings */ \
    1237             :                                 const char *sb = BUNtvar(bi, m);        \
    1238             :                                                                         \
    1239             :                                 if (separator) {                        \
    1240             :                                         if (strNil(sb))                 \
    1241             :                                                 continue;               \
    1242             :                                         if (!empty) {                   \
    1243             :                                                 memcpy(single_str + offset, separator, separator_length); \
    1244             :                                                 offset += separator_length; \
    1245             :                                         }                               \
    1246             :                                         next_length = strlen(sb);       \
    1247             :                                         memcpy(single_str + offset, sb, next_length); \
    1248             :                                         offset += next_length;          \
    1249             :                                         empty = false;                  \
    1250             :                                 } else { /* sep case */                 \
    1251             :                                         assert(sep != NULL);            \
    1252             :                                         const char *sl = BUNtvar(sepi, m); \
    1253             :                                                                         \
    1254             :                                         if (strNil(sb))                 \
    1255             :                                                 continue;               \
    1256             :                                         if (!empty && !strNil(sl)) {    \
    1257             :                                                 next_length = strlen(sl); \
    1258             :                                                 memcpy(single_str + offset, sl, next_length); \
    1259             :                                                 offset += next_length;  \
    1260             :                                         }                               \
    1261             :                                         next_length = strlen(sb);       \
    1262             :                                         memcpy(single_str + offset, sb, next_length); \
    1263             :                                         offset += next_length;          \
    1264             :                                         empty = false;                  \
    1265             :                                 }                                       \
    1266             :                         }                                               \
    1267             :                                                                         \
    1268             :                         single_str[offset] = '\0';                      \
    1269             :                 }                                                       \
    1270             : } while (0)
    1271             : /* Frame "unbounded preceding .. current row" for the partition rows [k, i): the full concatenation is built once, then every run of rows (a run ends just before the next row whose op[] entry is set) receives the prefix of that string covering all rows up to the end of its run; runs that have not yet seen a non-nil string receive str_nil. */
    1272             : #define ANALYTICAL_STR_GROUP_CONCAT_UNBOUNDED_TILL_CURRENT_ROW          \
    1273             :         do {                                                            \
    1274             :                 size_t slice_length = 0; /* length of the prefix that belongs to the current run */ \
    1275             :                 next_group_length = next_length = offset = 0;           \
    1276             :                 empty = true;                                           \
    1277             :                 compute_next_single_str(k, i); /* compute the entire string then slice it starting from the beginning */ \
    1278             :                 empty = true; /* reset: compute_next_single_str leaves it false when it produced a string */ \
    1279             :                 for (; k < i;) {                                     \
    1280             :                         const char *nsep;                               \
    1281             :                         oid m = k; /* first row of the current run */  \
    1282             :                         j = k;                                          \
    1283             :                         do { /* extend the run to the next row with op[] set, or to the partition end */ \
    1284             :                                 k++;                                    \
    1285             :                         } while (k < i && !op[k]);                   \
    1286             :                         for (; j < k; j++) {                         \
    1287             :                                 const char *nstr = BUNtvar(bi, j);      \
    1288             :                                 if (!strNil(nstr)) {                    \
    1289             :                                         slice_length += strlen(nstr);   \
    1290             :                                         if (!empty) {                   \
    1291             :                                                 if (separator) {        \
    1292             :                                                         nsep = (const char *) separator; \
    1293             :                                                 } else { /* sep case */ \
    1294             :                                                         assert(sep != NULL); \
    1295             :                                                         nsep = BUNtvar(sepi, j); \
    1296             :                                                 }                       \
    1297             :                                                 if (!strNil(nsep))      \
    1298             :                                                         slice_length += strlen(nsep); \
    1299             :                                         }                               \
    1300             :                                         empty = false;                  \
    1301             :                                 }                                       \
    1302             :                         }                                               \
    1303             :                         if (empty) {                                    \
    1304             :                                 for (j = m; j < k; j++)                      \
    1305             :                                         if (tfastins_nocheckVAR(r, j, str_nil) != GDK_SUCCEED) \
    1306             :                                                 goto allocation_error;  \
    1307             :                                 has_nils = true;                        \
    1308             :                         } else {                                        \
    1309             :                                 char save = single_str[slice_length]; /* byte overwritten by the temporary terminator */ \
    1310             :                                 single_str[slice_length] = '\0';        \
    1311             :                                 for (j = m; j < k; j++)                      \
    1312             :                                         if (tfastins_nocheckVAR(r, j, single_str) != GDK_SUCCEED) \
    1313             :                                                 goto allocation_error;  \
    1314             :                                 single_str[slice_length] = save; /* restore the full string for the following runs */ \
    1315             :                         }                                               \
    1316             :                 }                                                       \
    1317             :         } while (0)
    1318             : /* Frame "all rows": the concatenation of the whole partition [k, i) is built once and that same string is inserted for every row of the partition. */
    1319             : #define ANALYTICAL_STR_GROUP_CONCAT_ALL_ROWS                            \
    1320             :         do {                                                            \
    1321             :                 next_group_length = next_length = offset = 0;           \
    1322             :                 empty = true;                                           \
    1323             :                 compute_next_single_str(k, i);                          \
    1324             :                 for (; k < i; k++)                                   \
    1325             :                         if (tfastins_nocheckVAR(r, k, single_str) != GDK_SUCCEED) \
    1326             :                                 goto allocation_error;                  \
    1327             :         } while (0)
    1328             : /* Frame "current row": every row in [k, i) receives its own input string unchanged; has_nils is set when one of the copied values is nil. */
    1329             : #define ANALYTICAL_STR_GROUP_CONCAT_CURRENT_ROW                         \
    1330             :         do {                                                            \
    1331             :                 for (; k < i; k++) {                                 \
    1332             :                         const char *next = BUNtvar(bi, k);              \
    1333             :                         if (tfastins_nocheckVAR(r, k, next) != GDK_SUCCEED) \
    1334             :                                 goto allocation_error;                  \
    1335             :                         has_nils |= strNil(next);                       \
    1336             :                 }                                                       \
    1337             :         } while (0)
    1338             : /* Any other frame: row k in [k, i) receives the concatenation of the rows start[k] up to but not including end[k], recomputed for every row. */
    1339             : #define ANALYTICAL_STR_GROUP_CONCAT_OTHERS                              \
    1340             :         do {                                                            \
    1341             :                 for (; k < i; k++) {                                 \
    1342             :                         next_group_length = next_length = offset = 0;   \
    1343             :                         empty = true;                                   \
    1344             :                         compute_next_single_str(start[k], end[k]);      \
    1345             :                         if (tfastins_nocheckVAR(r, k, single_str) != GDK_SUCCEED) \
    1346             :                                 goto allocation_error;                  \
    1347             :                 }                                                       \
    1348             :         } while (0)
    1349             : /* Partition driver: when a partition BAT p is given, IMP runs at every i whose np[i] entry is set, and then once more with i = cnt for the last partition; without p, IMP runs once over all rows. The IMP macros in this file consume the rows [k, i) and leave k equal to i. */
    1350             : #define ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(IMP)     \
    1351             :         do {                                            \
    1352             :                 if (p) {                                \
    1353             :                         for (; i < cnt; i++) {               \
    1354             :                                 if (np[i])              \
    1355             :                                         IMP;            \
    1356             :                         }                               \
    1357             :                 }                                       \
    1358             :                 i = cnt; /* close the final (or only) partition */ \
    1359             :                 IMP;                                    \
    1360             :         } while (0)
    1361             : 
    1362             : gdk_return
    1363          53 : GDKanalytical_str_group_concat(BAT *r, BAT *p, BAT *o, BAT *b, BAT *sep, BAT *s, BAT *e, const char *restrict separator, int frame_type)
    1364             : {
                               /* Analytical string group-concat: input strings come from b,
                                * the result BAT is r.
                                *
                                * What the visible code does:
                                *  - opens iterators on p, o, b, sep, s and e;
                                *  - rejects anything but TYPE_str for b, r and (when given) sep;
                                *  - picks the separator source: either the BAT sep (which must
                                *    have as many values as b) or the constant string separator;
                                *    a sep BAT holding a single value is turned into a constant;
                                *  - dispatches on frame_type to one of the
                                *    ANALYTICAL_STR_GROUP_CONCAT_* macros;
                                *  - finally sets the count of r to the count of b and derives
                                *    r->tnil / r->tnonil from has_nils.
                                *
                                * Returns GDK_SUCCEED on success; GDK_FAIL on a type mismatch,
                                * via the allocation_error label, or for frame_type 4 (not
                                * implemented).
                                *
                                * NOTE(review): the ANALYTICAL_STR_GROUP_CONCAT_* macros are
                                * defined outside this chunk.  They presumably use the locals
                                * declared below by name (and jump to allocation_error), so do
                                * not rename locals or labels here without checking the macro
                                * bodies -- confirm. */
    1365          53 :         bool has_nils = false, empty; /* has_nils ends up in r->tnil / r->tnonil; empty is not touched by the visible code (presumably used by the macros) */
    1366          53 :         BATiter pi = bat_iterator(p);
    1367          53 :         BATiter oi = bat_iterator(o);
    1368          53 :         BATiter bi = bat_iterator(b);
    1369          53 :         BATiter sepi = bat_iterator(sep);
    1370          53 :         BATiter si = bat_iterator(s);
    1371          53 :         BATiter ei = bat_iterator(e);
    1372          53 :         oid i = 0, j = 0, k = 0, cnt = bi.count, *restrict start = si.base, *restrict end = ei.base; /* cnt: number of values in b; start/end: raw oid arrays of s and e (presumably per-row frame bounds -- TODO confirm) */
    1373          53 :         bit *np = pi.base, *op = oi.base; /* raw bit arrays of p and o (presumably partition / ordering boundary flags -- TODO confirm) */
    1374          53 :         str single_str = NULL, next_single_str; /* single_str is released with GDKfree() on the success and allocation_error paths */
    1375          53 :         size_t separator_length = 0, next_group_length, max_group_length = 0, next_length, offset;
    1376             : 
    1377          53 :         assert((sep && !separator && bi.count == sepi.count) || (!sep && separator)); /* exactly one separator source: a sep BAT as long as b, or a constant separator */
    1378          53 :         if (b->ttype != TYPE_str || r->ttype != TYPE_str || (sep && sep->ttype != TYPE_str)) {
    1379           0 :                 GDKerror("only string type is supported\n");
    1380           0 :                 bat_iterator_end(&pi); /* every iterator opened above is closed before bailing out */
    1381           0 :                 bat_iterator_end(&oi);
    1382           0 :                 bat_iterator_end(&bi);
    1383           0 :                 bat_iterator_end(&sepi);
    1384           0 :                 bat_iterator_end(&si);
    1385           0 :                 bat_iterator_end(&ei);
    1386           0 :                 return GDK_FAIL;
    1387             :         }
    1388          30 :         if (sep && sepi.count == 1) { /* Only one element in sep */
    1389           0 :                 separator = BUNtvar(sepi, 0);
    1390           0 :                 sep = NULL; /* from here on the single value is handled as a constant separator */
    1391             :         }
    1392             : 
    1393          53 :         if (sep == NULL)
    1394          23 :                 separator_length = strlen(separator); /* constant separator: length computed once up front */
    1395             : 
    1396          53 :         if (cnt > 0) { /* the frame macros are skipped entirely when b is empty */
    1397          52 :                 switch (frame_type) {
    1398          29 :                 case 3: /* unbounded until current row */
    1399      153851 :                         ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_UNBOUNDED_TILL_CURRENT_ROW);
    1400             :                         break;
    1401           0 :                 case 4: /* current row until unbounded */
    1402           0 :                         goto notimplemented;
    1403          23 :                 case 5: /* all rows */
    1404         844 :                         ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_ALL_ROWS);
    1405             :                         break;
    1406           0 :                 case 6: /* current row */
    1407           0 :                         ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_CURRENT_ROW);
    1408             :                         break;
    1409           0 :                 default: /* every other frame_type value */
    1410           0 :                         ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_OTHERS);
    1411             :                         break;
    1412             :                 }
    1413             :         }
    1414             : 
    1415          53 :         bat_iterator_end(&pi); /* success path: close iterators, free single_str, then set count and nil flags on r */
    1416          53 :         bat_iterator_end(&oi);
    1417          53 :         bat_iterator_end(&bi);
    1418          53 :         bat_iterator_end(&sepi);
    1419          53 :         bat_iterator_end(&si);
    1420          53 :         bat_iterator_end(&ei);
    1421          53 :         GDKfree(single_str);
    1422          53 :         BATsetcount(r, cnt);
    1423          53 :         r->tnonil = !has_nils;
    1424          53 :         r->tnil = has_nils;
    1425          53 :         return GDK_SUCCEED;
    1426           0 :   allocation_error: /* no goto in the visible code targets this label; presumably reached from inside the macros -- confirm */
    1427           0 :         bat_iterator_end(&pi);
    1428           0 :         bat_iterator_end(&oi);
    1429           0 :         bat_iterator_end(&bi);
    1430           0 :         bat_iterator_end(&sepi);
    1431           0 :         bat_iterator_end(&si);
    1432           0 :         bat_iterator_end(&ei);
    1433           0 :         GDKfree(single_str);
    1434           0 :         return GDK_FAIL;
    1435           0 :   notimplemented: /* reached for frame_type 4; single_str is not freed here -- at that visible jump it still holds its initial NULL */
    1436           0 :         bat_iterator_end(&pi);
    1437           0 :         bat_iterator_end(&oi);
    1438           0 :         bat_iterator_end(&bi);
    1439           0 :         bat_iterator_end(&sepi);
    1440           0 :         bat_iterator_end(&si);
    1441           0 :         bat_iterator_end(&ei);
    1442           0 :         GDKerror("str_group_concat not yet implemented for current row until unbounded case\n");
    1443           0 :         return GDK_FAIL;
    1444             : }

Generated by: LCOV version 1.14