Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * N.J. Nes, M.L. Kersten
15 : * The String Module
16 : * Strings can be created in many ways. Already in the built-in
17 : * operations each atom can be cast to a string using the str(atom)
 * mil command. The string module gives the possibility of
 * constructing a string as a substring of a given string (s). There
 * are two such construction functions. The first is the substring
 * from some position (offset) until the end of the string. The second
 * starts again at the given offset position but only copies count
 * number of bytes. The functions fail when the position and count
24 : * fall out of bounds. A negative position indicates that the position
25 : * is computed from the end of the source string.
26 : *
27 : * The strings can be compared using the "=" and "!=" operators.
28 : *
29 : * The operator "+" concatenates a string and an atom. The atom will
30 : * be converted to a string using the atom to string c function. The
31 : * string and the result of the conversion are concatenated to form a
32 : * new string. This string is returned.
33 : *
34 : * The length function returns the length of the string. The length is
35 : * the number of characters in the string. The maximum string length
36 : * handled by the kernel is 32-bits long.
37 : *
38 : * chrAt() returns the character at position index in the string
39 : * s. The function will fail when the index is out of range. The range
40 : * is from 0 to length(s)-1.
41 : *
42 : * The startsWith and endsWith functions test if the string s starts
43 : * with or ends with the given prefix or suffix.
44 : *
45 : * The toLower and toUpper functions cast the string to lower or upper
46 : * case characters.
47 : *
48 : * The search(str,chr) function searches for the first occurrence of a
49 : * character from the beginning of the string. The search(chr,str)
50 : * searches for the last occurrence (or first from the end of the
51 : * string). The last search function locates the position of first
52 : * occurrence of the string s2 in string s. All search functions
53 : * return -1 when the search failed. Otherwise the position is
54 : * returned.
55 : *
56 : * All string functions fail when an incorrect string (NULL pointer)
57 : * is given. In the current implementation, a fail is signaled by
58 : * returning nil, since this facilitates the use of the string module
59 : * in bulk operations.
60 : *
61 : * All functions in the module have now been converted to
62 : * Unicode. Internally, we use UTF-8 to store strings as Unicode in
63 : * zero-terminated byte-sequences.
64 : */
65 : #include "monetdb_config.h"
66 : #include "str.h"
67 : #include <string.h>
68 : #include "mal_interpreter.h"
69 : #include "mutf8.h"
70 :
71 : #define UTF8_assert(s) assert(checkUTF8(s))
72 :
/* Count the code points of `s' that start before `end'.
 * Returns -1 when `s' lies beyond `end'.  Only lead bytes are counted,
 * so the result is only meaningful for correctly encoded UTF-8. */
static inline int
UTF8_strpos(const char *s, const char *end)
{
	UTF8_assert(s);

	if (s > end)
		return -1;

	int count = 0;
	for (const char *cur = s; cur < end; cur++) {
		/* every byte that is not a 10xxxxxx continuation byte opens
		 * a new code point */
		if ((*cur & 0xC0) != 0x80)
			count++;
	}
	return count;
}
91 :
/* Return a pointer to the first byte of the pos'th (0-based) code
 * point of s, or to the terminating NUL when s holds fewer code
 * points.  A pos of zero or less yields the first lead byte found. */
static inline char *
UTF8_strtail(const char *s, int pos)
{
	UTF8_assert(s);
	for (; *s != '\0'; s++) {
		if ((*s & 0xC0) == 0x80)
			continue;			/* continuation byte, not a start */
		if (pos <= 0)
			break;
		pos--;
	}
	return (char *) s;
}
108 :
109 : /* copy n Unicode codepoints from s to dst, return pointer to new end */
110 : static inline str
111 215 : UTF8_strncpy(char *restrict dst, const char *restrict s, int n)
112 : {
113 215 : UTF8_assert(s);
114 1449 : while (*s && n) {
115 1234 : if ((*s & 0xF8) == 0xF0) {
116 : /* 4 byte UTF-8 sequence */
117 0 : *dst++ = *s++;
118 0 : *dst++ = *s++;
119 0 : *dst++ = *s++;
120 0 : *dst++ = *s++;
121 1234 : } else if ((*s & 0xF0) == 0xE0) {
122 : /* 3 byte UTF-8 sequence */
123 6 : *dst++ = *s++;
124 6 : *dst++ = *s++;
125 6 : *dst++ = *s++;
126 1228 : } else if ((*s & 0xE0) == 0xC0) {
127 : /* 2 byte UTF-8 sequence */
128 0 : *dst++ = *s++;
129 0 : *dst++ = *s++;
130 : } else {
131 : /* 1 byte UTF-8 "sequence" */
132 1228 : *dst++ = *s++;
133 : }
134 1234 : n--;
135 : }
136 215 : *dst = '\0';
137 215 : return dst;
138 : }
139 :
/* Return the number of Unicode code points in s.  The caller must
 * guarantee that s is not nil. */
int
UTF8_strlen(const char *s)
{
	UTF8_assert(s);
	assert(!strNil(s));

	size_t ncodepoints = 0;

	for (; *s != '\0'; s++) {
		/* count lead bytes only; this is only correct for properly
		 * encoded UTF-8 */
		if ((*s & 0xC0) != 0x80)
			ncodepoints++;
	}
	assert(ncodepoints < INT_MAX);
	return (int) ncodepoints;
}
157 :
/* Return the length of s in bytes as an int.  The caller must
 * guarantee that s is not nil. */
int
str_strlen(const char *s)
{
	UTF8_assert(s);
	assert(!strNil(s));

	size_t nbytes = strlen(s);

	return (int) nbytes;
}
167 :
168 : /* return the display width of s */
169 : int
170 4650528 : UTF8_strwidth(const char *S)
171 : {
172 4650528 : if (strNil(S))
173 505049 : return int_nil;
174 :
175 4145479 : const uint8_t *s = (const uint8_t *) S;
176 4145479 : int len = 0;
177 :
178 142420798 : for (uint32_t state = 0, codepoint = 0; *s; s++) {
179 138275319 : switch (decode(&state, &codepoint, (uint8_t) *s)) {
180 138266092 : case UTF8_ACCEPT: {
181 138266092 : int n = charwidth(codepoint);
182 138266092 : if (n >= 0)
183 138262268 : len += n;
184 : else
185 3824 : len++; /* assume width 1 if unprintable */
186 : break;
187 : }
188 : default:
189 : break;
190 : case UTF8_REJECT:
191 0 : assert(0);
192 : }
193 : }
194 4145479 : return len;
195 : }
196 :
197 : /*
198 : * Here you find the wrappers around the version 4 library code
199 : * It also contains the direct implementation of the string
200 : * matching support routines.
201 : */
202 : #include "mal_exception.h"
203 :
/*
 * SQL LIKE matching: returns true when the whole of s matches pat.
 *
 * In pat, '_' matches exactly one (UTF-8 encoded) character and '%'
 * matches any sequence of characters, including the empty one.  If
 * esc is not NULL, the byte *esc makes the following pattern byte
 * match literally.
 *
 * s, pat and (when given) esc must be NUL-terminated; s is expected
 * to be valid UTF-8.
 */
static bool
STRlike(const char *s, const char *pat, const char *esc)
{
	const char *t = s, *p;

	for (p = pat; *p && *t; p++) {
		if (esc && *p == *esc) {
			/* escaped byte: must match literally; a dangling escape
			 * at the end of the pattern compares NUL against a
			 * non-NUL subject byte and therefore fails */
			p++;
			if (*p != *t)
				return false;
			t++;
		} else if (*p == '_') {
			/* consume one complete code point, not just one byte */
			t++;
			while ((*t & 0xC0) == 0x80)
				t++;
		} else if (*p == '%') {
			/* a run of '%' is equivalent to a single one */
			do {
				p++;
			} while (*p == '%');
			if (*p == 0)
				return true;	/* tail is acceptable */
			/* try the rest of the pattern at every character start */
			for (; *t; t++) {
				if ((*t & 0xC0) != 0x80 && STRlike(t, p, esc))
					return true;
			}
			return false;
		} else if (*p == *t) {
			t++;
		} else {
			return false;
		}
	}
	/* a single trailing '%' always matched the empty remainder, even
	 * when '%' doubles as the escape character; keep that */
	if (*p == '%' && p[1] == 0)
		return true;
	/* any longer run of unescaped '%' matches the empty remainder too */
	if (esc == NULL || *esc != '%') {
		while (*p == '%')
			p++;
	}
	return *t == 0 && *p == 0;
}
242 :
243 : static str
244 0 : STRlikewrap3(bit *ret, const char *const *s, const char *const *pat, const char *const *esc)
245 : {
246 0 : if (strNil(*s) || strNil(*pat) || strNil(*esc))
247 0 : *ret = bit_nil;
248 : else
249 0 : *ret = (bit) STRlike(*s, *pat, *esc);
250 0 : return MAL_SUCCEED;
251 : }
252 :
253 : static str
254 0 : STRlikewrap(bit *ret, const char *const *s, const char *const *pat)
255 : {
256 0 : if (strNil(*s) || strNil(*pat))
257 0 : *ret = bit_nil;
258 : else
259 0 : *ret = (bit) STRlike(*s, *pat, NULL);
260 0 : return MAL_SUCCEED;
261 : }
262 :
263 : static str
264 0 : STRtostr(str *res, const char *const *src)
265 : {
266 0 : if (*src == 0)
267 0 : *res = GDKstrdup(str_nil);
268 : else
269 0 : *res = GDKstrdup(*src);
270 0 : if (*res == NULL)
271 0 : throw(MAL, "str.str", SQLSTATE(HY013) MAL_MALLOC_FAIL);
272 : return MAL_SUCCEED;
273 : }
274 :
275 : static str
276 97 : STRLength(int *res, const char *const *arg1)
277 : {
278 97 : const char *s = *arg1;
279 :
280 194 : *res = strNil(s) ? int_nil : UTF8_strlen(s);
281 97 : return MAL_SUCCEED;
282 : }
283 :
284 : static str
285 3 : STRBytes(int *res, const char *const *arg1)
286 : {
287 3 : const char *s = *arg1;
288 :
289 6 : *res = strNil(s) ? int_nil : str_strlen(s);
290 3 : return MAL_SUCCEED;
291 : }
292 :
293 : str
294 3847 : str_tail(str *buf, size_t *buflen, const char *s, int off)
295 : {
296 3847 : if (off < 0) {
297 1 : off += UTF8_strlen(s);
298 1 : if (off < 0)
299 : off = 0;
300 : }
301 3847 : const char *tail = UTF8_strtail(s, off);
302 3843 : size_t nextlen = strlen(tail) + 1;
303 3843 : CHECK_STR_BUFFER_LENGTH(buf, buflen, nextlen, "str.tail");
304 3843 : strcpy(*buf, tail);
305 3843 : return MAL_SUCCEED;
306 : }
307 :
308 : static str
309 1 : STRTail(str *res, const char *const *arg1, const int *offset)
310 : {
311 1 : str buf = NULL, msg = MAL_SUCCEED;
312 1 : const char *s = *arg1;
313 1 : int off = *offset;
314 :
315 2 : if (strNil(s) || is_int_nil(off)) {
316 0 : *res = GDKstrdup(str_nil);
317 : } else {
318 1 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
319 :
320 1 : *res = NULL;
321 1 : if (!(buf = GDKmalloc(buflen)))
322 0 : throw(MAL, "str.tail", SQLSTATE(HY013) MAL_MALLOC_FAIL);
323 1 : if ((msg = str_tail(&buf, &buflen, s, off)) != MAL_SUCCEED) {
324 0 : GDKfree(buf);
325 0 : return msg;
326 : }
327 1 : *res = GDKstrdup(buf);
328 : }
329 :
330 1 : GDKfree(buf);
331 1 : if (!*res)
332 0 : msg = createException(MAL, "str.tail", SQLSTATE(HY013) MAL_MALLOC_FAIL);
333 : return msg;
334 : }
335 :
336 : /* copy the substring s[off:off+l] into *buf, replacing *buf with a
337 : * freshly allocated buffer if the substring doesn't fit; off is 0
338 : * based, and both off and l count in Unicode codepoints (i.e. not
339 : * bytes); if off < 0, off counts from the end of the string */
340 : str
341 3806074 : str_Sub_String(str *buf, size_t *buflen, const char *s, int off, int l)
342 : {
343 3806074 : size_t len;
344 :
345 3806074 : if (off < 0) {
346 4 : off += UTF8_strlen(s);
347 4 : if (off < 0) {
348 3 : l += off;
349 3 : off = 0;
350 : }
351 : }
352 : /* here, off >= 0 */
353 3806074 : if (l < 0) {
354 1040 : strcpy(*buf, "");
355 1040 : return MAL_SUCCEED;
356 : }
357 3805034 : s = UTF8_strtail(s, off);
358 3792837 : len = (size_t) (UTF8_strtail(s, l) - s + 1);
359 3814689 : CHECK_STR_BUFFER_LENGTH(buf, buflen, len, "str.substring");
360 3814689 : strcpy_len(*buf, s, len);
361 3814689 : return MAL_SUCCEED;
362 : }
363 :
364 : static str
365 4 : STRSubString(str *res, const char *const *arg1, const int *offset, const int *length)
366 : {
367 4 : str buf = NULL, msg = MAL_SUCCEED;
368 4 : const char *s = *arg1;
369 4 : int off = *offset, len = *length;
370 :
371 7 : if (strNil(s) || is_int_nil(off) || is_int_nil(len)) {
372 1 : *res = GDKstrdup(str_nil);
373 : } else {
374 3 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
375 :
376 3 : *res = NULL;
377 3 : if (!(buf = GDKmalloc(buflen)))
378 0 : throw(MAL, "str.substring", SQLSTATE(HY013) MAL_MALLOC_FAIL);
379 3 : if ((msg = str_Sub_String(&buf, &buflen, s, off, len)) != MAL_SUCCEED) {
380 0 : GDKfree(buf);
381 0 : return msg;
382 : }
383 3 : *res = GDKstrdup(buf);
384 : }
385 :
386 4 : GDKfree(buf);
387 4 : if (!*res)
388 0 : msg = createException(MAL, "str.substring",
389 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
390 : return msg;
391 : }
392 :
393 : str
394 4 : str_from_wchr(str *buf, size_t *buflen, int c)
395 : {
396 4 : CHECK_STR_BUFFER_LENGTH(buf, buflen, 5, "str.unicode");
397 4 : str s = *buf;
398 4 : UTF8_PUTCHAR(c, s);
399 4 : *s = 0;
400 4 : return MAL_SUCCEED;
401 0 : illegal:
402 0 : throw(MAL, "str.unicode", SQLSTATE(42000) "Illegal Unicode code point");
403 : }
404 :
405 : static str
406 2 : STRFromWChr(str *res, const int *c)
407 : {
408 2 : str buf = NULL, msg = MAL_SUCCEED;
409 2 : int cc = *c;
410 :
411 2 : if (is_int_nil(cc)) {
412 0 : *res = GDKstrdup(str_nil);
413 : } else {
414 2 : size_t buflen = MAX(strlen(str_nil) + 1, 8);
415 :
416 2 : *res = NULL;
417 2 : if (!(buf = GDKmalloc(buflen)))
418 0 : throw(MAL, "str.unicode", SQLSTATE(HY013) MAL_MALLOC_FAIL);
419 2 : if ((msg = str_from_wchr(&buf, &buflen, cc)) != MAL_SUCCEED) {
420 0 : GDKfree(buf);
421 0 : return msg;
422 : }
423 2 : *res = GDKstrdup(buf);
424 : }
425 :
426 2 : GDKfree(buf);
427 2 : if (!*res)
428 0 : msg = createException(MAL, "str.unicode",
429 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
430 : return msg;
431 : }
432 :
433 : /* return the Unicode code point of arg1 at position at */
434 : str
435 31 : str_wchr_at(int *res, const char *s, int at)
436 : {
437 : /* 64bit: should have lng arg */
438 60 : if (strNil(s) || is_int_nil(at) || at < 0) {
439 2 : *res = int_nil;
440 2 : return MAL_SUCCEED;
441 : }
442 29 : s = UTF8_strtail(s, at);
443 29 : if (s == NULL || *s == 0) {
444 6 : *res = int_nil;
445 6 : return MAL_SUCCEED;
446 : }
447 23 : uint32_t state = 0, codepoint;
448 25 : while (*s) {
449 25 : if (decode(&state, &codepoint, (uint8_t) *s) == UTF8_ACCEPT) {
450 23 : *res = codepoint;
451 23 : return MAL_SUCCEED;
452 : }
453 2 : s++;
454 : }
455 0 : throw(MAL, "str.unicodeAt", SQLSTATE(42000) "Illegal Unicode code point");
456 : }
457 :
458 : static str
459 0 : STRWChrAt(int *res, const char *const *arg1, const int *at)
460 : {
461 0 : return str_wchr_at(res, *arg1, *at);
462 : }
463 :
464 : static inline str
465 103977 : doStrConvert(str *res, const char *arg1, gdk_return (*func)(char **restrict, size_t *restrict, const char *restrict))
466 : {
467 103977 : str buf = NULL, msg = MAL_SUCCEED;
468 :
469 103977 : if (strNil(arg1)) {
470 647 : *res = GDKstrdup(str_nil);
471 : } else {
472 103330 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
473 :
474 103330 : *res = NULL;
475 103330 : if (!(buf = GDKmalloc(buflen)))
476 0 : throw(MAL, "str.lower", SQLSTATE(HY013) MAL_MALLOC_FAIL);
477 103329 : if ((*func)(&buf, &buflen, arg1) != GDK_SUCCEED) {
478 0 : GDKfree(buf);
479 0 : throw(MAL, "str.lower", GDK_EXCEPTION);
480 : }
481 103330 : *res = GDKstrdup(buf);
482 : }
483 :
484 103977 : GDKfree(buf);
485 103977 : if (!*res)
486 0 : msg = createException(MAL, "str.lower",
487 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
488 : return msg;
489 : }
490 :
491 : static inline str
492 2035 : STRlower(str *res, const char *const *arg1)
493 : {
494 1947 : return doStrConvert(res, *arg1, GDKtolower);
495 : }
496 :
497 : static inline str
498 101942 : STRupper(str *res, const char *const *arg1)
499 : {
500 101942 : return doStrConvert(res, *arg1, GDKtoupper);
501 : }
502 :
503 : static inline str
504 0 : STRcasefold(str *res, const char *const *arg1)
505 : {
506 0 : return doStrConvert(res, *arg1, GDKcasefold);
507 : }
508 :
/* Compare the first plen bytes of s and prefix with strncmp();
 * the result is 0 exactly when they are equal, i.e. (with plen the
 * byte length of prefix) when s starts with prefix. */
int
str_is_prefix(const char *s, const char *prefix, int plen)
{
	size_t nbytes = (size_t) plen;

	return strncmp(s, prefix, nbytes);
}
515 :
/* Case-insensitive counterpart of str_is_prefix(): returns the result
 * of GDKstrncasecmp() with no limit (SIZE_MAX) on s and a limit of
 * plen on prefix. */
int
str_is_iprefix(const char *s, const char *prefix, int plen)
{
	int cmp = GDKstrncasecmp(s, prefix, SIZE_MAX, plen);

	return cmp;
}
521 :
/* Return 0 when s ends with suffix, non-zero otherwise; sul is the
 * length of suffix in bytes.  A string shorter than the suffix
 * yields -1. */
int
str_is_suffix(const char *s, const char *suffix, int sul)
{
	int sl = str_strlen(s);

	if (sl >= sul)
		return strcmp(s + (sl - sul), suffix);
	return -1;
}
532 :
/* Case-insensitive endswith check: returns 0 when s ends with suffix
 * (compared with GDKstrcasecmp), 1 otherwise.  sul is not used. */
int
str_is_isuffix(const char *s, const char *suffix, int sul)
{
	const char *tail = s + strlen(s);
	const char *sf = suffix;

	(void) sul;
	/* note that the uppercase and lowercase forms of a character aren't
	 * necessarily the same length in their UTF-8 encodings, so step
	 * back one code point in s for every code point in suffix */
	while (*sf != '\0' && tail > s) {
		if ((*sf & 0xC0) != 0x80) {
			do {
				tail--;
			} while ((*tail & 0xC0) == 0x80);
		}
		sf++;
	}
	/* skip what is left of a partially walked multi-byte sequence */
	while ((*sf & 0xC0) == 0x80)
		sf++;
	/* suffix not exhausted: s holds fewer code points than suffix */
	if (*sf != 0)
		return 1;
	return GDKstrcasecmp(tail, suffix) != 0;
}
553 :
/* Return 0 when n occurs in h and 1 when it does not; nlen is not
 * used. */
int
str_contains(const char *h, const char *n, int nlen)
{
	(void) nlen;

	const char *hit = strstr(h, n);

	return hit == NULL;
}
560 :
/* Case-insensitive counterpart of str_contains(): 0 when
 * GDKstrcasestr() finds n in h, 1 otherwise; nlen is not used. */
int
str_icontains(const char *h, const char *n, int nlen)
{
	(void) nlen;

	const char *hit = GDKstrcasestr(h, n);

	return hit == NULL;
}
567 :
568 : static str
569 4 : STRstartswith(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
570 : {
571 4 : (void) cntxt;
572 4 : (void) mb;
573 :
574 4 : bit *r = getArgReference_bit(stk, pci, 0);
575 4 : const char *s1 = *getArgReference_str(stk, pci, 1);
576 4 : const char *s2 = *getArgReference_str(stk, pci, 2);
577 4 : bit icase = pci->argc == 4 && *getArgReference_bit(stk, pci, 3);
578 :
579 8 : if (strNil(s1) || strNil(s2)) {
580 0 : *r = bit_nil;
581 : } else {
582 4 : int s2_len = str_strlen(s2);
583 8 : *r = icase ?
584 2 : str_is_iprefix(s1, s2, s2_len) == 0 :
585 2 : str_is_prefix(s1, s2, s2_len) == 0;
586 : }
587 4 : return MAL_SUCCEED;
588 : }
589 :
590 : static str
591 2 : STRendswith(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
592 : {
593 2 : (void) cntxt;
594 2 : (void) mb;
595 :
596 2 : bit *r = getArgReference_bit(stk, pci, 0);
597 2 : const char *s1 = *getArgReference_str(stk, pci, 1);
598 2 : const char *s2 = *getArgReference_str(stk, pci, 2);
599 2 : bit icase = pci->argc == 4 && *getArgReference_bit(stk, pci, 3);
600 :
601 4 : if (strNil(s1) || strNil(s2)) {
602 0 : *r = bit_nil;
603 : } else {
604 2 : int s2_len = str_strlen(s2);
605 4 : *r = icase ?
606 2 : str_is_isuffix(s1, s2, s2_len) == 0 :
607 0 : str_is_suffix(s1, s2, s2_len) == 0;
608 : }
609 2 : return MAL_SUCCEED;
610 : }
611 :
612 : /* returns whether haystack contains needle */
613 : static str
614 2 : STRcontains(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
615 : {
616 2 : (void) cntxt;
617 2 : (void) mb;
618 :
619 2 : bit *r = getArgReference_bit(stk, pci, 0);
620 2 : const char *s1 = *getArgReference_str(stk, pci, 1);
621 2 : const char *s2 = *getArgReference_str(stk, pci, 2);
622 2 : bit icase = pci->argc == 4 && *getArgReference_bit(stk, pci, 3);
623 :
624 4 : if (strNil(s1) || strNil(s2)) {
625 0 : *r = bit_nil;
626 : } else {
627 2 : int s2_len = str_strlen(s2);
628 4 : *r = icase ?
629 2 : str_icontains(s1, s2, s2_len) == 0 :
630 0 : str_contains(s1, s2, s2_len) == 0;
631 : }
632 2 : return MAL_SUCCEED;
633 : }
634 :
/* Return the code point position of the first occurrence of needle
 * in haystack, or -1 when there is none. */
int
str_search(const char *haystack, const char *needle)
{
	const char *hit = strstr(haystack, needle);

	return hit != NULL ? UTF8_strpos(haystack, hit) : -1;
}
644 :
/* Case-insensitive counterpart of str_search(): the code point
 * position of the first match found by GDKstrcasestr(), or -1. */
int
str_isearch(const char *haystack, const char *needle)
{
	const char *hit = GDKstrcasestr(haystack, needle);

	return hit != NULL ? UTF8_strpos(haystack, hit) : -1;
}
654 :
655 : /* find first occurrence of needle in haystack */
656 : static str
657 0 : STRstr_search(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
658 : {
659 0 : (void) cntxt;
660 0 : (void) mb;
661 0 : bit *res = getArgReference(stk, pci, 0);
662 0 : const char *haystack = *getArgReference_str(stk, pci, 1);
663 0 : const char *needle = *getArgReference_str(stk, pci, 2);
664 0 : bit icase = pci->argc == 4 && *getArgReference_bit(stk, pci, 3);
665 :
666 0 : if (strNil(haystack) || strNil(needle)) {
667 0 : *res = bit_nil;
668 : } else {
669 0 : *res = icase ?
670 0 : str_isearch(haystack, needle) :
671 0 : str_search(haystack, needle);
672 : }
673 0 : return MAL_SUCCEED;
674 : }
675 :
676 : int
677 0 : str_reverse_str_search(const char *haystack, const char *needle)
678 : {
679 0 : int nulen = UTF8_strlen(needle);
680 0 : size_t nlen = strlen(needle);
681 :
682 0 : for (int pos = str_strlen(haystack) - 1; pos >= 0; pos--) {
683 0 : if ((haystack[pos] & 0xC0) != 0x80) {
684 0 : if (nulen > 0)
685 0 : nulen--;
686 0 : else if (strncmp(haystack + pos, needle, nlen) == 0)
687 0 : return pos;
688 : }
689 : }
690 : return -1;
691 : }
692 :
693 : int
694 0 : str_reverse_str_isearch(const char *haystack, const char *needle)
695 : {
696 0 : int nulen = UTF8_strlen(needle);
697 0 : size_t nlen = strlen(needle);
698 :
699 0 : for (int pos = str_strlen(haystack) - 1; pos >= 0; pos--) {
700 0 : if ((haystack[pos] & 0xC0) != 0x80) {
701 0 : if (nulen > 0)
702 0 : nulen--;
703 0 : else if (GDKstrncasecmp(haystack + pos, needle, SIZE_MAX, nlen) == 0)
704 0 : return pos;
705 : }
706 : }
707 : return -1;
708 : }
709 :
710 : /* find last occurrence of arg2 in arg1 */
711 : static str
712 0 : STRrevstr_search(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
713 : {
714 0 : (void) cntxt;
715 0 : (void) mb;
716 0 : int *res = getArgReference_int(stk, pci, 0);
717 0 : const str haystack = *getArgReference_str(stk, pci, 1);
718 0 : const str needle = *getArgReference_str(stk, pci, 2);
719 0 : bit icase = pci->argc == 4 && *getArgReference_bit(stk, pci, 3);
720 :
721 0 : if (strNil(haystack) || strNil(needle)) {
722 0 : *res = bit_nil;
723 : } else {
724 0 : *res = icase ?
725 0 : str_reverse_str_isearch(haystack, needle) :
726 0 : str_reverse_str_search(haystack, needle);
727 : }
728 0 : return MAL_SUCCEED;
729 : }
730 :
731 : str
732 148 : str_splitpart(str *buf, size_t *buflen, const char *s, const char *s2, int f)
733 : {
734 148 : size_t len;
735 148 : char *p = NULL;
736 :
737 148 : if (f <= 0)
738 4 : throw(MAL, "str.splitpart",
739 : SQLSTATE(42000) "field position must be greater than zero");
740 :
741 144 : len = strlen(s2);
742 144 : if (len) {
743 153 : while ((p = strstr(s, s2)) != NULL && f > 1) {
744 13 : s = p + len;
745 13 : f--;
746 : }
747 : }
748 :
749 144 : if (f != 1) {
750 12 : strcpy(*buf, "");
751 12 : return MAL_SUCCEED;
752 : }
753 :
754 132 : if (p == NULL) {
755 10 : len = strlen(s);
756 : } else {
757 122 : len = (size_t) (p - s);
758 : }
759 :
760 132 : len++;
761 132 : CHECK_STR_BUFFER_LENGTH(buf, buflen, len, "str.splitpart");
762 132 : strcpy_len(*buf, s, len);
763 132 : return MAL_SUCCEED;
764 : }
765 :
766 : static str
767 23 : STRsplitpart(str *res, const char *const *haystack, const char *const *needle, const int *field)
768 : {
769 23 : str buf = NULL, msg = MAL_SUCCEED;
770 23 : const char *s = *haystack, *s2 = *needle;
771 23 : int f = *field;
772 :
773 69 : if (strNil(s) || strNil(s2) || is_int_nil(f)) {
774 0 : *res = GDKstrdup(str_nil);
775 : } else {
776 23 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
777 :
778 23 : *res = NULL;
779 23 : if (!(buf = GDKmalloc(buflen)))
780 4 : throw(MAL, "str.splitpart", SQLSTATE(HY013) MAL_MALLOC_FAIL);
781 23 : if ((msg = str_splitpart(&buf, &buflen, s, s2, f)) != MAL_SUCCEED) {
782 4 : GDKfree(buf);
783 4 : return msg;
784 : }
785 19 : *res = GDKstrdup(buf);
786 : }
787 :
788 19 : GDKfree(buf);
789 19 : if (!*res)
790 0 : msg = createException(MAL, "str.splitpart",
791 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
792 : return msg;
793 : }
794 :
795 : /* returns number of bytes to remove from left to strip the codepoints in rm */
796 : static size_t
797 341 : lstrip(const char *s, size_t len, const uint32_t *rm, size_t nrm)
798 : {
799 341 : uint32_t state = 0, codepoint;
800 341 : size_t skip = 0;
801 :
802 507 : for (size_t n = 0; n < len;) {
803 492 : if (decode(&state, &codepoint, (uint8_t) s[n++]) == UTF8_ACCEPT) {
804 : size_t i;
805 7739 : for (i = 0; i < nrm; i++) {
806 7412 : if (rm[i] == codepoint) {
807 : break;
808 : }
809 : }
810 469 : if (i == nrm)
811 327 : return skip;
812 : skip = n;
813 : }
814 : }
815 : return skip;
816 : }
817 :
/* Return the length in bytes that remains of s (len bytes long) after
 * stripping all trailing code points that occur in the list rm of nrm
 * code points. */
static size_t
rstrip(const char *s, size_t len, const uint32_t *rm, size_t nrm)
{
	uint32_t c;
	size_t n;

	for (;;) {
		if (len == 0)
			break;
		/* c is the last code point, n its encoded size in bytes */
		UTF8_LASTCHAR(c, n, s, len);
		assert(n > 0 && n <= len);

		bool member = false;

		for (size_t k = 0; k < nrm; k++) {
			if (rm[k] == c) {
				member = true;
				break;
			}
		}
		if (!member)
			break;
		len -= n;
	}
	return len;
}
840 :
/* Code points that the strip/trim functions treat as white space. */
const uint32_t whitespace[] = {
	/* ASCII white space */
	0x0020,						/* space */
	0x0009,						/* tab (character tabulation) */
	0x000A,						/* line feed */
	0x000D,						/* carriage return */
	0x000C,						/* form feed */
	0x000B,						/* vertical tab (line tabulation) */
	/* code points with the Unicode Zs (space separator) property */
	0x00A0,						/* no-break space */
	0x1680,						/* ogham space mark */
	0x2000,						/* en quad */
	0x2001,						/* em quad */
	0x2002,						/* en space */
	0x2003,						/* em space */
	0x2004,						/* three-per-em space */
	0x2005,						/* four-per-em space */
	0x2006,						/* six-per-em space */
	0x2007,						/* figure space */
	0x2008,						/* punctuation space */
	0x2009,						/* thin space */
	0x200A,						/* hair space */
	0x202F,						/* narrow no-break space */
	0x205F,						/* medium mathematical space */
	0x3000,						/* ideographic space */
	/* code point with the Unicode Zl (line separator) property */
	0x2028,						/* line separator */
	/* code point with the Unicode Zp (paragraph separator) property */
	0x2029,						/* paragraph separator */
};

/* number of entries in whitespace[] */
#define NSPACES (sizeof(whitespace) / sizeof(whitespace[0]))
873 :
874 : str
875 279 : str_strip(str *buf, size_t *buflen, const char *s)
876 : {
877 279 : size_t len = strlen(s);
878 279 : size_t n = lstrip(s, len, whitespace, NSPACES);
879 279 : s += n;
880 279 : len -= n;
881 279 : n = rstrip(s, len, whitespace, NSPACES);
882 :
883 278 : n++;
884 278 : CHECK_STR_BUFFER_LENGTH(buf, buflen, n, "str.strip");
885 278 : strcpy_len(*buf, s, n);
886 278 : return MAL_SUCCEED;
887 : }
888 :
889 : /* remove all whitespace from either side of arg1 */
890 : static str
891 8 : STRStrip(str *res, const char *const *arg1)
892 : {
893 8 : str buf = NULL, msg = MAL_SUCCEED;
894 8 : const char *s = *arg1;
895 :
896 8 : if (strNil(s)) {
897 0 : *res = GDKstrdup(str_nil);
898 : } else {
899 8 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
900 :
901 8 : *res = NULL;
902 8 : if (!(buf = GDKmalloc(buflen)))
903 0 : throw(MAL, "str.strip", SQLSTATE(HY013) MAL_MALLOC_FAIL);
904 8 : if ((msg = str_strip(&buf, &buflen, s)) != MAL_SUCCEED) {
905 0 : GDKfree(buf);
906 0 : return msg;
907 : }
908 8 : *res = GDKstrdup(buf);
909 : }
910 :
911 8 : GDKfree(buf);
912 8 : if (!*res)
913 0 : msg = createException(MAL, "str.strip",
914 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
915 : return msg;
916 : }
917 :
918 : str
919 27 : str_ltrim(str *buf, size_t *buflen, const char *s)
920 : {
921 27 : size_t len = strlen(s);
922 27 : size_t n = lstrip(s, len, whitespace, NSPACES);
923 27 : size_t nallocate = len - n + 1;
924 :
925 27 : CHECK_STR_BUFFER_LENGTH(buf, buflen, nallocate, "str.ltrim");
926 27 : strcpy_len(*buf, s + n, nallocate);
927 27 : return MAL_SUCCEED;
928 : }
929 :
930 : /* remove all whitespace from the start (left) of arg1 */
931 : static str
932 19 : STRLtrim(str *res, const char *const *arg1)
933 : {
934 19 : str buf = NULL, msg = MAL_SUCCEED;
935 19 : const char *s = *arg1;
936 :
937 19 : if (strNil(s)) {
938 0 : *res = GDKstrdup(str_nil);
939 : } else {
940 19 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
941 :
942 19 : *res = NULL;
943 19 : if (!(buf = GDKmalloc(buflen)))
944 0 : throw(MAL, "str.ltrim", SQLSTATE(HY013) MAL_MALLOC_FAIL);
945 19 : if ((msg = str_ltrim(&buf, &buflen, s)) != MAL_SUCCEED) {
946 0 : GDKfree(buf);
947 0 : return msg;
948 : }
949 19 : *res = GDKstrdup(buf);
950 : }
951 :
952 19 : GDKfree(buf);
953 19 : if (!*res)
954 0 : msg = createException(MAL, "str.ltrim",
955 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
956 : return msg;
957 : }
958 :
959 : str
960 96 : str_rtrim(str *buf, size_t *buflen, const char *s)
961 : {
962 96 : size_t len = strlen(s);
963 96 : size_t n = rstrip(s, len, whitespace, NSPACES);
964 :
965 96 : n++;
966 96 : CHECK_STR_BUFFER_LENGTH(buf, buflen, n, "str.rtrim");
967 96 : strcpy_len(*buf, s, n);
968 96 : return MAL_SUCCEED;
969 : }
970 :
971 : /* remove all whitespace from the end (right) of arg1 */
972 : static str
973 6 : STRRtrim(str *res, const char *const *arg1)
974 : {
975 6 : str buf = NULL, msg = MAL_SUCCEED;
976 6 : const char *s = *arg1;
977 :
978 6 : if (strNil(s)) {
979 0 : *res = GDKstrdup(str_nil);
980 : } else {
981 6 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
982 :
983 6 : *res = NULL;
984 6 : if (!(buf = GDKmalloc(buflen)))
985 0 : throw(MAL, "str.rtrim", SQLSTATE(HY013) MAL_MALLOC_FAIL);
986 6 : if ((msg = str_rtrim(&buf, &buflen, s)) != MAL_SUCCEED) {
987 0 : GDKfree(buf);
988 0 : return msg;
989 : }
990 6 : *res = GDKstrdup(buf);
991 : }
992 :
993 6 : GDKfree(buf);
994 6 : if (!*res)
995 0 : msg = createException(MAL, "str.rtrim",
996 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
997 : return msg;
998 : }
999 :
1000 : /* return a list of codepoints in s */
1001 : static str
1002 49 : trimchars(str *buf, size_t *buflen, size_t *n, const char *s, size_t len_s,
1003 : const char *malfunc)
1004 : {
1005 49 : size_t len = 0, nlen = len_s * sizeof(int);
1006 49 : uint32_t *cbuf;
1007 :
1008 49 : assert(s);
1009 49 : CHECK_STR_BUFFER_LENGTH(buf, buflen, nlen, malfunc);
1010 49 : cbuf = *(uint32_t **) buf;
1011 :
1012 49 : uint32_t state = 0;
1013 49 : uint32_t codepoint;
1014 288 : while (*s) {
1015 239 : if (decode(&state, &codepoint, (uint8_t) *s) == UTF8_ACCEPT) {
1016 180 : cbuf[len++] = codepoint;
1017 : }
1018 239 : s++;
1019 : }
1020 49 : if (state != UTF8_ACCEPT)
1021 0 : throw(MAL, malfunc, SQLSTATE(42000) "Illegal Unicode code point");
1022 49 : *n = len;
1023 49 : return MAL_SUCCEED;
1024 : }
1025 :
1026 : str
1027 23 : str_strip2(str *buf, size_t *buflen, const char *s, const char *s2)
1028 : {
1029 23 : str msg = MAL_SUCCEED;
1030 23 : size_t len, n, n2, n3;
1031 :
1032 23 : if ((n2 = strlen(s2)) == 0) {
1033 0 : len = strlen(s) + 1;
1034 0 : CHECK_STR_BUFFER_LENGTH(buf, buflen, len, "str.strip2");
1035 0 : strcpy(*buf, s);
1036 0 : return MAL_SUCCEED;
1037 : } else {
1038 23 : if ((msg = trimchars(buf, buflen, &n3, s2, n2, "str.strip2")) != MAL_SUCCEED)
1039 : return msg;
1040 23 : len = strlen(s);
1041 23 : n = lstrip(s, len, *(uint32_t **) buf, n3);
1042 23 : s += n;
1043 23 : len -= n;
1044 23 : n = rstrip(s, len, *(uint32_t **) buf, n3);
1045 :
1046 23 : n++;
1047 23 : CHECK_STR_BUFFER_LENGTH(buf, buflen, n, "str.strip2");
1048 23 : strcpy_len(*buf, s, n);
1049 23 : return MAL_SUCCEED;
1050 : }
1051 : }
1052 :
1053 : /* remove the longest string containing only characters from arg2 from
1054 : * either side of arg1 */
1055 : static str
1056 20 : STRStrip2(str *res, const char *const *arg1, const char *const *arg2)
1057 : {
1058 20 : str buf = NULL, msg = MAL_SUCCEED;
1059 20 : const char *s = *arg1, *s2 = *arg2;
1060 :
1061 38 : if (strNil(s) || strNil(s2)) {
1062 3 : *res = GDKstrdup(str_nil);
1063 : } else {
1064 17 : size_t buflen = INITIAL_STR_BUFFER_LENGTH * sizeof(int);
1065 :
1066 17 : *res = NULL;
1067 17 : if (!(buf = GDKmalloc(buflen)))
1068 0 : throw(MAL, "str.strip2", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1069 17 : if ((msg = str_strip2(&buf, &buflen, s, s2)) != MAL_SUCCEED) {
1070 0 : GDKfree(buf);
1071 0 : return msg;
1072 : }
1073 17 : *res = GDKstrdup(buf);
1074 : }
1075 :
1076 20 : GDKfree(buf);
1077 20 : if (!*res)
1078 0 : msg = createException(MAL, "str.strip2",
1079 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1080 : return msg;
1081 : }
1082 :
1083 : str
1084 14 : str_ltrim2(str *buf, size_t *buflen, const char *s, const char *s2)
1085 : {
1086 14 : str msg = MAL_SUCCEED;
1087 14 : size_t len, n, n2, n3, nallocate;
1088 :
1089 14 : if ((n2 = strlen(s2)) == 0) {
1090 1 : len = strlen(s) + 1;
1091 1 : CHECK_STR_BUFFER_LENGTH(buf, buflen, len, "str.ltrim2");
1092 1 : strcpy(*buf, s);
1093 1 : return MAL_SUCCEED;
1094 : } else {
1095 13 : if ((msg = trimchars(buf, buflen, &n3, s2, n2, "str.ltrim2")) != MAL_SUCCEED)
1096 : return msg;
1097 13 : len = strlen(s);
1098 13 : n = lstrip(s, len, *(uint32_t **) buf, n3);
1099 13 : nallocate = len - n + 1;
1100 :
1101 13 : CHECK_STR_BUFFER_LENGTH(buf, buflen, nallocate, "str.ltrim2");
1102 13 : strcpy_len(*buf, s + n, nallocate);
1103 13 : return MAL_SUCCEED;
1104 : }
1105 : }
1106 :
1107 : /* remove the longest string containing only characters from arg2 from
1108 : * the start (left) of arg1 */
1109 : static str
1110 8 : STRLtrim2(str *res, const char *const *arg1, const char *const *arg2)
1111 : {
1112 8 : str buf = NULL, msg = MAL_SUCCEED;
1113 8 : const char *s = *arg1, *s2 = *arg2;
1114 :
1115 16 : if (strNil(s) || strNil(s2)) {
1116 0 : *res = GDKstrdup(str_nil);
1117 : } else {
1118 8 : size_t buflen = INITIAL_STR_BUFFER_LENGTH * sizeof(int);
1119 :
1120 8 : *res = NULL;
1121 8 : if (!(buf = GDKmalloc(buflen)))
1122 0 : throw(MAL, "str.ltrim2", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1123 8 : if ((msg = str_ltrim2(&buf, &buflen, s, s2)) != MAL_SUCCEED) {
1124 0 : GDKfree(buf);
1125 0 : return msg;
1126 : }
1127 8 : *res = GDKstrdup(buf);
1128 : }
1129 :
1130 8 : GDKfree(buf);
1131 8 : if (!*res)
1132 0 : msg = createException(MAL, "str.ltrim2",
1133 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1134 : return msg;
1135 : }
1136 :
1137 : str
1138 15 : str_rtrim2(str *buf, size_t *buflen, const char *s, const char *s2)
1139 : {
1140 15 : str msg = MAL_SUCCEED;
1141 15 : size_t len, n, n2, n3;
1142 :
1143 15 : if ((n2 = strlen(s2)) == 0) {
1144 2 : len = strlen(s) + 1;
1145 2 : CHECK_STR_BUFFER_LENGTH(buf, buflen, len, "str.rtrim2");
1146 2 : strcpy(*buf, s);
1147 2 : return MAL_SUCCEED;
1148 : } else {
1149 13 : if ((msg = trimchars(buf, buflen, &n3, s2, n2, "str.ltrim2")) != MAL_SUCCEED)
1150 : return msg;
1151 13 : len = strlen(s);
1152 13 : n = rstrip(s, len, *(uint32_t **) buf, n3);
1153 13 : n++;
1154 :
1155 13 : CHECK_STR_BUFFER_LENGTH(buf, buflen, n, "str.rtrim2");
1156 13 : strcpy_len(*buf, s, n);
1157 13 : return MAL_SUCCEED;
1158 : }
1159 : }
1160 :
1161 : /* remove the longest string containing only characters from arg2 from
1162 : * the end (right) of arg1 */
1163 : static str
1164 9 : STRRtrim2(str *res, const char *const *arg1, const char *const *arg2)
1165 : {
1166 9 : str buf = NULL, msg = MAL_SUCCEED;
1167 9 : const char *s = *arg1, *s2 = *arg2;
1168 :
1169 18 : if (strNil(s) || strNil(s2)) {
1170 0 : *res = GDKstrdup(str_nil);
1171 : } else {
1172 9 : size_t buflen = INITIAL_STR_BUFFER_LENGTH * sizeof(int);
1173 :
1174 9 : *res = NULL;
1175 9 : if (!(buf = GDKmalloc(buflen)))
1176 0 : throw(MAL, "str.rtrim2", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1177 9 : if ((msg = str_rtrim2(&buf, &buflen, s, s2)) != MAL_SUCCEED) {
1178 0 : GDKfree(buf);
1179 0 : return msg;
1180 : }
1181 9 : *res = GDKstrdup(buf);
1182 : }
1183 :
1184 9 : GDKfree(buf);
1185 9 : if (!*res)
1186 0 : msg = createException(MAL, "str.rtrim2",
1187 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1188 : return msg;
1189 : }
1190 :
1191 : static str
1192 60 : pad(str *buf, size_t *buflen, const char *s, const char *pad, int len, int left,
1193 : const char *malfunc)
1194 : {
1195 60 : size_t slen, padlen, repeats, residual, i, nlen;
1196 60 : char *res;
1197 :
1198 60 : if (len < 0)
1199 : len = 0;
1200 :
1201 60 : slen = (size_t) UTF8_strlen(s);
1202 60 : if (slen > (size_t) len) {
1203 : /* truncate */
1204 20 : pad = UTF8_strtail(s, len);
1205 20 : slen = pad - s + 1;
1206 :
1207 20 : CHECK_STR_BUFFER_LENGTH(buf, buflen, slen, malfunc);
1208 20 : strcpy_len(*buf, s, slen);
1209 20 : return MAL_SUCCEED;
1210 : }
1211 :
1212 40 : padlen = (size_t) UTF8_strlen(pad);
1213 40 : if (slen == (size_t) len || padlen == 0) {
1214 : /* nothing to do (no padding if there is no pad string) */
1215 0 : slen = strlen(s) + 1;
1216 0 : CHECK_STR_BUFFER_LENGTH(buf, buflen, slen, malfunc);
1217 0 : strcpy(*buf, s);
1218 0 : return MAL_SUCCEED;
1219 : }
1220 :
1221 40 : repeats = ((size_t) len - slen) / padlen;
1222 40 : residual = ((size_t) len - slen) % padlen;
1223 40 : if (residual > 0)
1224 20 : residual = (size_t) (UTF8_strtail(pad, (int) residual) - pad);
1225 40 : padlen = strlen(pad);
1226 40 : slen = strlen(s);
1227 :
1228 40 : nlen = slen + repeats * padlen + residual + 1;
1229 40 : CHECK_STR_BUFFER_LENGTH(buf, buflen, nlen, malfunc);
1230 40 : res = *buf;
1231 40 : if (left) {
1232 87 : for (i = 0; i < repeats; i++)
1233 67 : memcpy(res + i * padlen, pad, padlen);
1234 20 : if (residual > 0)
1235 10 : memcpy(res + repeats * padlen, pad, residual);
1236 20 : if (slen > 0)
1237 20 : memcpy(res + repeats * padlen + residual, s, slen);
1238 : } else {
1239 20 : if (slen > 0)
1240 20 : memcpy(res, s, slen);
1241 87 : for (i = 0; i < repeats; i++)
1242 67 : memcpy(res + slen + i * padlen, pad, padlen);
1243 20 : if (residual > 0)
1244 10 : memcpy(res + slen + repeats * padlen, pad, residual);
1245 : }
1246 40 : res[repeats * padlen + residual + slen] = 0;
1247 40 : return MAL_SUCCEED;
1248 : }
1249 :
1250 : str
1251 8 : str_lpad(str *buf, size_t *buflen, const char *s, int len)
1252 : {
1253 4 : return pad(buf, buflen, s, " ", len, 1, "str.lpad");
1254 : }
1255 :
1256 : /* Fill up 'arg1' to length 'len' by prepending whitespaces.
1257 : * If 'arg1' is already longer than 'len', then it's truncated on the right
1258 : * (NB: this is the PostgreSQL definition).
1259 : *
1260 : * Example: lpad('hi', 5)
1261 : * Result: ' hi'
1262 : */
1263 : static str
1264 4 : STRLpad(str *res, const char *const *arg1, const int *len)
1265 : {
1266 4 : str buf = NULL, msg = MAL_SUCCEED;
1267 4 : const char *s = *arg1;
1268 4 : int l = *len;
1269 :
1270 8 : if (strNil(s) || is_int_nil(l)) {
1271 0 : *res = GDKstrdup(str_nil);
1272 : } else {
1273 4 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
1274 :
1275 4 : *res = NULL;
1276 4 : if (!(buf = GDKmalloc(buflen)))
1277 0 : throw(MAL, "str.lpad", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1278 4 : if ((msg = str_lpad(&buf, &buflen, s, l)) != MAL_SUCCEED) {
1279 0 : GDKfree(buf);
1280 0 : return msg;
1281 : }
1282 4 : *res = GDKstrdup(buf);
1283 : }
1284 :
1285 4 : GDKfree(buf);
1286 4 : if (!*res)
1287 0 : msg = createException(MAL, "str.lpad", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1288 : return msg;
1289 : }
1290 :
1291 : str
1292 8 : str_rpad(str *buf, size_t *buflen, const char *s, int len)
1293 : {
1294 4 : return pad(buf, buflen, s, " ", len, 0, "str.lpad");
1295 : }
1296 :
1297 : /* Fill up 'arg1' to length 'len' by appending whitespaces.
1298 : * If 'arg1' is already longer than 'len', then it's truncated (on the right)
1299 : * (NB: this is the PostgreSQL definition).
1300 : *
1301 : * Example: rpad('hi', 5)
1302 : * Result: 'hi '
1303 : */
1304 : static str
1305 4 : STRRpad(str *res, const char *const *arg1, const int *len)
1306 : {
1307 4 : str buf = NULL, msg = MAL_SUCCEED;
1308 4 : const char *s = *arg1;
1309 4 : int l = *len;
1310 :
1311 8 : if (strNil(s) || is_int_nil(l)) {
1312 0 : *res = GDKstrdup(str_nil);
1313 : } else {
1314 4 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
1315 :
1316 4 : *res = NULL;
1317 4 : if (!(buf = GDKmalloc(buflen)))
1318 0 : throw(MAL, "str.rpad", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1319 4 : if ((msg = str_rpad(&buf, &buflen, s, l)) != MAL_SUCCEED) {
1320 0 : GDKfree(buf);
1321 0 : return msg;
1322 : }
1323 4 : *res = GDKstrdup(buf);
1324 : }
1325 :
1326 4 : GDKfree(buf);
1327 4 : if (!*res)
1328 0 : msg = createException(MAL, "str.rpad", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1329 : return msg;
1330 : }
1331 :
1332 : str
1333 22 : str_lpad3(str *buf, size_t *buflen, const char *s, int len, const char *s2)
1334 : {
1335 16 : return pad(buf, buflen, s, s2, len, 1, "str.lpad2");
1336 : }
1337 :
1338 : /* Fill up 'arg1' to length 'len' by prepending characters from 'arg2'
1339 : * If 'arg1' is already longer than 'len', then it's truncated on the right
1340 : * (NB: this is the PostgreSQL definition).
1341 : *
1342 : * Example: lpad('hi', 5, 'xy')
1343 : * Result: xyxhi
1344 : */
1345 : static str
1346 6 : STRLpad3(str *res, const char *const *arg1, const int *len, const char *const *arg2)
1347 : {
1348 6 : str buf = NULL, msg = MAL_SUCCEED;
1349 6 : const char *s = *arg1, *s2 = *arg2;
1350 6 : int l = *len;
1351 :
1352 18 : if (strNil(s) || strNil(s2) || is_int_nil(l)) {
1353 0 : *res = GDKstrdup(str_nil);
1354 : } else {
1355 6 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
1356 :
1357 6 : *res = NULL;
1358 6 : if (!(buf = GDKmalloc(buflen)))
1359 0 : throw(MAL, "str.lpad2", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1360 6 : if ((msg = str_lpad3(&buf, &buflen, s, l, s2)) != MAL_SUCCEED) {
1361 0 : GDKfree(buf);
1362 0 : return msg;
1363 : }
1364 6 : *res = GDKstrdup(buf);
1365 : }
1366 :
1367 6 : GDKfree(buf);
1368 6 : if (!*res)
1369 0 : msg = createException(MAL, "str.lpad2",
1370 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1371 : return msg;
1372 : }
1373 :
1374 : str
1375 22 : str_rpad3(str *buf, size_t *buflen, const char *s, int len, const char *s2)
1376 : {
1377 16 : return pad(buf, buflen, s, s2, len, 0, "str.rpad2");
1378 : }
1379 :
1380 : /* Fill up 'arg1' to length 'len' by appending characters from 'arg2'
1381 : * If 'arg1' is already longer than 'len', then it's truncated (on the right)
1382 : * (NB: this is the PostgreSQL definition).
1383 : *
1384 : * Example: rpad('hi', 5, 'xy')
1385 : * Result: hixyx
1386 : */
1387 : static str
1388 6 : STRRpad3(str *res, const char *const *arg1, const int *len, const char *const *arg2)
1389 : {
1390 6 : str buf = NULL, msg = MAL_SUCCEED;
1391 6 : const char *s = *arg1, *s2 = *arg2;
1392 6 : int l = *len;
1393 :
1394 18 : if (strNil(s) || strNil(s2) || is_int_nil(l)) {
1395 0 : *res = GDKstrdup(str_nil);
1396 : } else {
1397 6 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
1398 :
1399 6 : *res = NULL;
1400 6 : if (!(buf = GDKmalloc(buflen)))
1401 0 : throw(MAL, "str.rpad2", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1402 6 : if ((msg = str_rpad3(&buf, &buflen, s, l, s2)) != MAL_SUCCEED) {
1403 0 : GDKfree(buf);
1404 0 : return msg;
1405 : }
1406 6 : *res = GDKstrdup(buf);
1407 : }
1408 :
1409 6 : GDKfree(buf);
1410 6 : if (!*res)
1411 0 : msg = createException(MAL, "str.rpad2",
1412 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1413 : return msg;
1414 : }
1415 :
1416 : str
1417 116470 : str_substitute(str *buf, size_t *buflen, const char *s, const char *src,
1418 : const char *dst, bit repeat)
1419 : {
1420 116470 : size_t lsrc = strlen(src), ldst = strlen(dst), n, l = strlen(s);
1421 116470 : char *b, *fnd;
1422 116470 : const char *pfnd;
1423 :
1424 116470 : if (!lsrc || !l) { /* s/src is an empty string, there's nothing to substitute */
1425 7 : l++;
1426 7 : CHECK_STR_BUFFER_LENGTH(buf, buflen, l, "str.substitute");
1427 7 : strcpy(*buf, s);
1428 7 : return MAL_SUCCEED;
1429 : }
1430 :
1431 116463 : n = l + ldst;
1432 116463 : if (repeat && ldst > lsrc)
1433 77305 : n = (ldst * l) / lsrc; /* max length */
1434 :
1435 116463 : n++;
1436 116463 : CHECK_STR_BUFFER_LENGTH(buf, buflen, n, "str.substitute");
1437 116463 : b = *buf;
1438 116463 : pfnd = s;
1439 125243 : do {
1440 125243 : fnd = strstr(pfnd, src);
1441 125243 : if (fnd == NULL)
1442 : break;
1443 8778 : n = fnd - pfnd;
1444 8778 : if (n > 0) {
1445 7478 : strcpy_len(b, pfnd, n + 1);
1446 7478 : b += n;
1447 : }
1448 8778 : if (ldst > 0) {
1449 711 : strcpy_len(b, dst, ldst + 1);
1450 713 : b += ldst;
1451 : }
1452 8780 : if (*fnd == 0)
1453 : break;
1454 8780 : pfnd = fnd + lsrc;
1455 8780 : } while (repeat);
1456 116465 : strcpy(b, pfnd);
1457 116465 : return MAL_SUCCEED;
1458 : }
1459 :
1460 : static str
1461 196 : STRSubstitute(str *res, const char *const *arg1, const char *const *arg2, const char *const *arg3,
1462 : const bit *g)
1463 : {
1464 196 : str buf = NULL, msg = MAL_SUCCEED;
1465 196 : const char *s = *arg1, *s2 = *arg2, *s3 = *arg3;
1466 :
1467 587 : if (strNil(s) || strNil(s2) || strNil(s3)) {
1468 2 : *res = GDKstrdup(str_nil);
1469 : } else {
1470 194 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
1471 :
1472 194 : *res = NULL;
1473 194 : if (!(buf = GDKmalloc(buflen)))
1474 0 : throw(MAL, "str.substitute", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1475 194 : if ((msg = str_substitute(&buf, &buflen, s, s2, s3, *g)) != MAL_SUCCEED) {
1476 0 : GDKfree(buf);
1477 0 : return msg;
1478 : }
1479 194 : *res = GDKstrdup(buf);
1480 : }
1481 :
1482 196 : GDKfree(buf);
1483 196 : if (!*res)
1484 0 : msg = createException(MAL, "str.substitute",
1485 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1486 : return msg;
1487 : }
1488 :
1489 : static str
1490 9 : STRascii(int *ret, const char *const *s)
1491 : {
1492 9 : return str_wchr_at(ret, *s, 0);
1493 : }
1494 :
1495 : str
1496 3841 : str_substring_tail(str *buf, size_t *buflen, const char *s, int start)
1497 : {
1498 3841 : if (start < 1)
1499 : start = 1;
1500 3841 : start--;
1501 3835 : return str_tail(buf, buflen, s, start);
1502 : }
1503 :
1504 : static str
1505 6 : STRsubstringTail(str *res, const char *const *arg1, const int *start)
1506 : {
1507 6 : str buf = NULL, msg = MAL_SUCCEED;
1508 6 : const char *s = *arg1;
1509 6 : int st = *start;
1510 :
1511 12 : if (strNil(s) || is_int_nil(st)) {
1512 0 : *res = GDKstrdup(str_nil);
1513 : } else {
1514 6 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
1515 :
1516 6 : *res = NULL;
1517 6 : if (!(buf = GDKmalloc(buflen)))
1518 0 : throw(MAL, "str.substringTail", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1519 6 : if ((msg = str_substring_tail(&buf, &buflen, s, st)) != MAL_SUCCEED) {
1520 0 : GDKfree(buf);
1521 0 : return msg;
1522 : }
1523 6 : *res = GDKstrdup(buf);
1524 : }
1525 :
1526 6 : GDKfree(buf);
1527 6 : if (!*res)
1528 0 : msg = createException(MAL, "str.substringTail",
1529 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1530 : return msg;
1531 : }
1532 :
1533 : str
1534 3815642 : str_sub_string(str *buf, size_t *buflen, const char *s, int start, int l)
1535 : {
1536 3815642 : if (start < 1)
1537 : start = 1;
1538 3815642 : start--;
1539 3815621 : return str_Sub_String(buf, buflen, s, start, l);
1540 : }
1541 :
1542 : static str
1543 24 : STRsubstring(str *res, const char *const *arg1, const int *start, const int *ll)
1544 : {
1545 24 : str buf = NULL, msg = MAL_SUCCEED;
1546 24 : const char *s = *arg1;
1547 24 : int st = *start, l = *ll;
1548 :
1549 48 : if (strNil(s) || is_int_nil(st) || is_int_nil(l)) {
1550 3 : *res = GDKstrdup(str_nil);
1551 : } else {
1552 21 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
1553 :
1554 21 : *res = NULL;
1555 21 : if (!(buf = GDKmalloc(buflen)))
1556 0 : throw(MAL, "str.substring", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1557 21 : if ((msg = str_sub_string(&buf, &buflen, s, st, l)) != MAL_SUCCEED) {
1558 0 : GDKfree(buf);
1559 0 : return msg;
1560 : }
1561 21 : *res = GDKstrdup(buf);
1562 : }
1563 :
1564 24 : GDKfree(buf);
1565 24 : if (!*res)
1566 0 : msg = createException(MAL, "str.substring",
1567 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1568 : return msg;
1569 : }
1570 :
1571 : static str
1572 20 : STRprefix(str *res, const char *const *arg1, const int *ll)
1573 : {
1574 20 : str buf = NULL, msg = MAL_SUCCEED;
1575 20 : const char *s = *arg1;
1576 20 : int l = *ll;
1577 :
1578 40 : if (strNil(s) || is_int_nil(l)) {
1579 0 : *res = GDKstrdup(str_nil);
1580 : } else {
1581 20 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
1582 :
1583 20 : *res = NULL;
1584 20 : if (!(buf = GDKmalloc(buflen)))
1585 0 : throw(MAL, "str.prefix", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1586 20 : if ((msg = str_Sub_String(&buf, &buflen, s, 0, l)) != MAL_SUCCEED) {
1587 0 : GDKfree(buf);
1588 0 : return msg;
1589 : }
1590 20 : *res = GDKstrdup(buf);
1591 : }
1592 :
1593 20 : GDKfree(buf);
1594 20 : if (!*res)
1595 0 : msg = createException(MAL, "str.prefix",
1596 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1597 : return msg;
1598 : }
1599 :
1600 : str
1601 9 : str_suffix(str *buf, size_t *buflen, const char *s, int l)
1602 : {
1603 9 : int start = (int) (strlen(s) - l);
1604 9 : return str_Sub_String(buf, buflen, s, start, l);
1605 : }
1606 :
1607 : static str
1608 5 : STRsuffix(str *res, const char *const *arg1, const int *ll)
1609 : {
1610 5 : str buf = NULL, msg = MAL_SUCCEED;
1611 5 : const char *s = *arg1;
1612 5 : int l = *ll;
1613 :
1614 10 : if (strNil(s) || is_int_nil(l)) {
1615 0 : *res = GDKstrdup(str_nil);
1616 : } else {
1617 5 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
1618 :
1619 5 : *res = NULL;
1620 5 : if (!(buf = GDKmalloc(buflen)))
1621 0 : throw(MAL, "str.suffix", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1622 5 : if ((msg = str_suffix(&buf, &buflen, s, l)) != MAL_SUCCEED) {
1623 0 : GDKfree(buf);
1624 0 : return msg;
1625 : }
1626 5 : *res = GDKstrdup(buf);
1627 : }
1628 :
1629 5 : GDKfree(buf);
1630 5 : if (!*res)
1631 0 : msg = createException(MAL, "str.suffix",
1632 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1633 : return msg;
1634 : }
1635 :
/* Search 'needle' in 'haystack', starting at the 1-based character
 * position 'start' (values <= 0 are treated as 1).  Returns the result
 * of str_search() shifted by the start position, or 0 when str_search()
 * reports a negative value (no match). */
int
str_locate2(const char *needle, const char *haystack, int start)
{
	const int off = start > 0 ? start : 1;
	const char *from = UTF8_strtail(haystack, off - 1);
	const int pos = str_search(from, needle);

	if (pos < 0)
		return 0;
	return pos + off;
}
1647 :
1648 : static str
1649 28758 : STRlocate3(int *ret, const char *const *needle, const char *const *haystack, const int *start)
1650 : {
1651 28758 : const char *s = *needle, *s2 = *haystack;
1652 28758 : int st = *start;
1653 :
1654 57583 : *ret = (strNil(s) || strNil(s2) || is_int_nil(st)) ?
1655 28758 : int_nil :
1656 67 : str_locate2(s, s2, st);
1657 28758 : return MAL_SUCCEED;
1658 : }
1659 :
1660 : static str
1661 16 : STRlocate(int *ret, const char *const *needle, const char *const *haystack)
1662 : {
1663 16 : const char *s = *needle, *s2 = *haystack;
1664 :
1665 45 : *ret = (strNil(s) || strNil(s2)) ? int_nil : str_locate2(s, s2, 1);
1666 16 : return MAL_SUCCEED;
1667 : }
1668 :
1669 : str
1670 222 : str_insert(str *buf, size_t *buflen, const char *s, int strt, int l,
1671 : const char *s2)
1672 : {
1673 222 : str v;
1674 222 : int l1 = UTF8_strlen(s);
1675 222 : size_t nextlen;
1676 :
1677 222 : if (l < 0)
1678 0 : throw(MAL, "str.insert",
1679 : SQLSTATE(42000)
1680 : "The number of characters for insert function must be non negative");
1681 222 : if (strt < 0) {
1682 0 : if (-strt <= l1)
1683 0 : strt = l1 + strt;
1684 : else
1685 : strt = 0;
1686 : }
1687 222 : if (strt > l1)
1688 : strt = l1;
1689 :
1690 222 : nextlen = strlen(s) + strlen(s2) + 1;
1691 222 : CHECK_STR_BUFFER_LENGTH(buf, buflen, nextlen, "str.insert");
1692 222 : v = *buf;
1693 222 : if (strt > 0)
1694 215 : v = UTF8_strncpy(v, s, strt);
1695 222 : strcpy(v, s2);
1696 222 : if (strt + l < l1)
1697 10 : strcat(v, UTF8_strtail(s, strt + l));
1698 : return MAL_SUCCEED;
1699 : }
1700 :
1701 : static str
1702 224 : STRinsert(str *res, const char *const *input, const int *start, const int *nchars,
1703 : const char *const *input2)
1704 : {
1705 224 : str buf = NULL, msg = MAL_SUCCEED;
1706 224 : const char *s = *input, *s2 = *input2;
1707 224 : int st = *start, n = *nchars;
1708 :
1709 447 : if (strNil(s) || is_int_nil(st) || is_int_nil(n) || strNil(s2)) {
1710 2 : *res = GDKstrdup(str_nil);
1711 : } else {
1712 222 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
1713 :
1714 222 : *res = NULL;
1715 222 : if (!(buf = GDKmalloc(buflen)))
1716 0 : throw(MAL, "str.insert", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1717 222 : if ((msg = str_insert(&buf, &buflen, s, st, n, s2)) != MAL_SUCCEED) {
1718 0 : GDKfree(buf);
1719 0 : return msg;
1720 : }
1721 222 : *res = GDKstrdup(buf);
1722 : }
1723 :
1724 224 : GDKfree(buf);
1725 224 : if (!*res)
1726 0 : msg = createException(MAL, "str.insert",
1727 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1728 : return msg;
1729 : }
1730 :
1731 : static str
1732 196 : STRreplace(str *ret, const char *const *s1, const char *const *s2, const char *const *s3)
1733 : {
1734 196 : bit flag = TRUE;
1735 196 : return STRSubstitute(ret, s1, s2, s3, &flag);
1736 : }
1737 :
1738 : str
1739 15 : str_repeat(str *buf, size_t *buflen, const char *s, int c)
1740 : {
1741 15 : size_t l = strlen(s), nextlen;
1742 :
1743 15 : if (l >= INT_MAX)
1744 0 : throw(MAL, "str.repeat", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1745 15 : nextlen = (size_t) c *l + 1;
1746 :
1747 15 : CHECK_STR_BUFFER_LENGTH(buf, buflen, nextlen, "str.repeat");
1748 15 : str t = *buf;
1749 15 : *t = 0;
1750 160043 : for (int i = c; i > 0; i--, t += l)
1751 160028 : strcpy(t, s);
1752 : return MAL_SUCCEED;
1753 : }
1754 :
1755 : static str
1756 11 : STRrepeat(str *res, const char *const *arg1, const int *c)
1757 : {
1758 11 : str buf = NULL, msg = MAL_SUCCEED;
1759 11 : const char *s = *arg1;
1760 11 : int cc = *c;
1761 :
1762 21 : if (strNil(s) || is_int_nil(cc) || cc < 0) {
1763 1 : *res = GDKstrdup(str_nil);
1764 : } else {
1765 10 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
1766 :
1767 10 : *res = NULL;
1768 10 : if (!(buf = GDKmalloc(buflen)))
1769 0 : throw(MAL, "str.repeat", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1770 10 : if ((msg = str_repeat(&buf, &buflen, s, cc)) != MAL_SUCCEED) {
1771 0 : GDKfree(buf);
1772 0 : return msg;
1773 : }
1774 10 : *res = GDKstrdup(buf);
1775 : }
1776 :
1777 11 : GDKfree(buf);
1778 11 : if (!*res)
1779 0 : msg = createException(MAL, "str.repeat",
1780 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1781 : return msg;
1782 : }
1783 :
1784 : static str
1785 1 : STRspace(str *res, const int *ll)
1786 : {
1787 1 : str buf = NULL, msg = MAL_SUCCEED;
1788 1 : int l = *ll;
1789 :
1790 1 : if (is_int_nil(l) || l < 0) {
1791 0 : *res = GDKstrdup(str_nil);
1792 : } else {
1793 1 : const char space[] = " ", *s = space;
1794 1 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
1795 :
1796 1 : *res = NULL;
1797 1 : if (!(buf = GDKmalloc(buflen)))
1798 0 : throw(MAL, "str.space", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1799 1 : if ((msg = str_repeat(&buf, &buflen, s, l)) != MAL_SUCCEED) {
1800 0 : GDKfree(buf);
1801 0 : return msg;
1802 : }
1803 1 : *res = GDKstrdup(buf);
1804 : }
1805 :
1806 1 : GDKfree(buf);
1807 1 : if (!*res)
1808 0 : msg = createException(MAL, "str.space",
1809 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1810 : return msg;
1811 : }
1812 :
1813 : static str
1814 4 : STRasciify(str *r, const char *const *s)
1815 : {
1816 4 : char *buf = NULL;
1817 4 : size_t buflen = 0;
1818 4 : if (GDKasciify(&buf, &buflen, *s) != GDK_SUCCEED)
1819 0 : throw(MAL, "str.asciify", GDK_EXCEPTION);
1820 4 : *r = buf;
1821 4 : return MAL_SUCCEED;
1822 : }
1823 :
1824 : static inline void
1825 206 : BBPnreclaim(int nargs, ...)
1826 : {
1827 206 : va_list valist;
1828 206 : va_start(valist, nargs);
1829 875 : for (int i = 0; i < nargs; i++) {
1830 669 : BAT *b = va_arg(valist, BAT *);
1831 1069 : BBPreclaim(b);
1832 : }
1833 206 : va_end(valist);
1834 206 : }
1835 :
/* Timeout handler: invokes TIMEOUT_ERROR() for query context 'qc' with
 * the current source location, then stores a GDK_EXCEPTION-based
 * exception in 'msg'.  Expects 'msg' and 'fname' to be in scope at the
 * point of use. */
#define HANDLE_TIMEOUT(qc)												\
	do {																\
		TIMEOUT_ERROR(qc, __FILE__, __func__, __LINE__);				\
		msg = createException(MAL, fname, GDK_EXCEPTION);				\
	} while (0)
1841 :
/* Candidate scan: for every candidate produced by 'canditer_next', look
 * up the string value as 'v' and append the candidate oid to 'vals' when
 * TEST holds.  TEST may refer to 'v'.
 *
 * Expects these names in scope at the point of use: b, bi, bn, ci,
 * qry_ctx, rcnt and vals.  The loop runs under TIMEOUT_LOOP. */
#define scanloop(TEST, canditer_next)									\
	do {																\
		const oid off = b->hseqbase;									\
		TIMEOUT_LOOP(ci.ncand, qry_ctx) {								\
			oid o = canditer_next(&ci);									\
			const char *restrict v = BUNtvar(bi, o - off);				\
			assert(rcnt < BATcapacity(bn));								\
			if (TEST) {													\
				vals[rcnt] = o;											\
				rcnt++;													\
			}															\
		}																\
	} while (0)
1853 :
1854 : static str
1855 57 : STRselect(MalStkPtr stk, InstrPtr pci,
1856 : int (*str_icmp)(const char *, const char *, int),
1857 : int (*str_cmp)(const char *, const char *, int),
1858 : const char *fname)
1859 : {
1860 57 : str msg = MAL_SUCCEED;
1861 :
1862 57 : bat *r_id = getArgReference_bat(stk, pci, 0);
1863 57 : bat b_id = *getArgReference_bat(stk, pci, 1);
1864 57 : bat cb_id = *getArgReference_bat(stk, pci, 2);
1865 57 : const char *key = *getArgReference_str(stk, pci, 3);
1866 57 : bit icase = pci->argc != 5;
1867 57 : bit anti = pci->argc == 5 ? *getArgReference_bit(stk, pci, 4) :
1868 26 : *getArgReference_bit(stk, pci, 5);
1869 :
1870 57 : BAT *b, *cb = NULL, *bn = NULL, *old_s = NULL;;
1871 57 : BUN rcnt = 0;
1872 57 : struct canditer ci;
1873 57 : bool with_strimps = false,
1874 57 : with_strimps_anti = false;
1875 :
1876 57 : if (!(b = BATdescriptor(b_id)))
1877 0 : throw(MAL, fname, SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1878 :
1879 57 : if (!is_bat_nil(cb_id) && !(cb = BATdescriptor(cb_id))) {
1880 0 : BBPreclaim(b);
1881 0 : throw(MAL, fname, SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1882 : }
1883 :
1884 57 : assert(ATOMstorage(b->ttype) == TYPE_str);
1885 :
1886 57 : if (BAThasstrimps(b)) {
1887 4 : BAT *tmp_s;
1888 4 : if (STRMPcreate(b, NULL) == GDK_SUCCEED && (tmp_s = STRMPfilter(b, cb, key, anti)) != NULL) {
1889 4 : old_s = cb;
1890 4 : cb = tmp_s;
1891 4 : if (!anti)
1892 : with_strimps = true;
1893 : else
1894 0 : with_strimps_anti = true;
1895 : } else {
1896 : /* strimps failed, continue without */
1897 0 : GDKclrerr();
1898 : }
1899 : }
1900 :
1901 110 : MT_thread_setalgorithm(with_strimps ?
1902 53 : "string_select: strcmp function using strimps" :
1903 : (with_strimps_anti ?
1904 : "string_select: strcmp function using strimps anti"
1905 : : "string_select: strcmp function with no accelerator"));
1906 :
1907 57 : canditer_init(&ci, b, cb);
1908 57 : if (!(bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT))) {
1909 0 : BBPnreclaim(2, b, cb);
1910 0 : throw(MAL, fname, SQLSTATE(HY013) MAL_MALLOC_FAIL);
1911 : }
1912 :
1913 114 : if (!strNil(key)) {
1914 57 : BATiter bi = bat_iterator(b);
1915 57 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
1916 57 : if (icase)
1917 26 : str_cmp = str_icmp;
1918 57 : oid *vals = Tloc(bn, 0);
1919 57 : const int klen = str_strlen(key);
1920 57 : if (ci.tpe == cand_dense) {
1921 57 : if (with_strimps_anti)
1922 0 : scanloop(strNil(v) || str_cmp(v, key, klen) == 0, canditer_next_dense);
1923 57 : else if (anti)
1924 0 : scanloop(!strNil(v) && str_cmp(v, key, klen) != 0, canditer_next_dense);
1925 : else
1926 2914 : scanloop(!strNil(v) && str_cmp(v, key, klen) == 0, canditer_next_dense);
1927 : } else {
1928 0 : if (with_strimps_anti)
1929 0 : scanloop(strNil(v) || str_cmp(v, key, klen) == 0, canditer_next);
1930 0 : else if (anti)
1931 0 : scanloop(!strNil(v) && str_cmp(v, key, klen) != 0, canditer_next);
1932 : else
1933 0 : scanloop(!strNil(v) && str_cmp(v, key, klen) == 0, canditer_next);
1934 : }
1935 57 : bat_iterator_end(&bi);
1936 57 : TIMEOUT_CHECK(qry_ctx, HANDLE_TIMEOUT(qry_ctx));
1937 :
1938 0 : if (!msg) {
1939 57 : BATsetcount(bn, rcnt);
1940 57 : bn->tsorted = true;
1941 57 : bn->trevsorted = bn->batCount <= 1;
1942 57 : bn->tkey = true;
1943 57 : bn->tnil = false;
1944 57 : bn->tnonil = true;
1945 114 : bn->tseqbase = rcnt == 0 ?
1946 57 : 0 : rcnt == 1 ?
1947 23 : *(const oid *) Tloc(bn, 0) : rcnt == ci.ncand && ci.tpe == cand_dense ? ci.hseq : oid_nil;
1948 :
1949 57 : if (with_strimps_anti) {
1950 0 : BAT *rev;
1951 0 : if (old_s) {
1952 0 : rev = BATdiffcand(old_s, bn);
1953 : #ifndef NDEBUG
1954 0 : BAT *is = BATintersectcand(old_s, bn);
1955 0 : if (is) {
1956 0 : assert(is->batCount == bn->batCount);
1957 0 : BBPreclaim(is);
1958 : }
1959 0 : assert(rev->batCount == old_s->batCount - bn->batCount);
1960 : #endif
1961 : } else
1962 0 : rev = BATnegcands(0, b->batCount, bn);
1963 :
1964 0 : BBPreclaim(bn);
1965 0 : bn = rev;
1966 0 : if (bn == NULL)
1967 0 : msg = createException(MAL, fname, SQLSTATE(HY013) MAL_MALLOC_FAIL);
1968 : }
1969 : }
1970 : }
1971 :
1972 57 : if (bn && !msg) {
1973 57 : *r_id = bn->batCacheid;
1974 57 : BBPkeepref(bn);
1975 : } else {
1976 0 : BBPreclaim(bn);
1977 : }
1978 :
1979 57 : BBPnreclaim(3, b, cb, old_s);
1980 57 : return msg;
1981 : }
1982 :
1983 : /**
1984 : * @r_id: result oid
1985 : * @b_id: input bat oid
1986 : * @cb_id: input bat candidates oid
1987 : * @key: input string
1988 : * @icase: ignore case
1989 : * @anti: anti join
1990 : */
1991 : static str
1992 18 : STRstartswithselect(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
1993 : {
1994 18 : (void) cntxt;
1995 18 : (void) mb;
1996 18 : return STRselect(stk, pci,
1997 : str_is_iprefix, str_is_prefix, "str.startswithselect");
1998 : }
1999 :
2000 : /**
2001 : * @r_id: result oid
2002 : * @b_id: input bat oid
2003 : * @cb_id: input bat candidates oid
2004 : * @key: input string
2005 : * @icase: ignore case
2006 : * @anti: anti join
2007 : */
2008 : static str
2009 14 : STRendswithselect(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
2010 : {
2011 14 : (void) cntxt;
2012 14 : (void) mb;
2013 14 : return STRselect(stk, pci,
2014 : str_is_isuffix, str_is_suffix, "str.endswithselect");
2015 : }
2016 :
2017 : /**
2018 : * @r_id: result oid
2019 : * @b_id: input bat oid
2020 : * @cb_id: input bat candidates oid
2021 : * @key: input string
2022 : * @icase: ignore case
2023 : * @anti: anti join
2024 : */
2025 : static str
2026 25 : STRcontainsselect(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
2027 : {
2028 25 : (void) cntxt;
2029 25 : (void) mb;
2030 25 : return STRselect(stk, pci,
2031 : str_icontains, str_contains, "str.containsselect");
2032 : }
2033 :
/* APPEND(b, o): store oid 'o' at index b->batCount of the oid array at
 * b->theap->base and increment the count.  No capacity check is done
 * here.  'b' is expanded twice, so it must be free of side effects; it
 * is parenthesized so that any pointer expression may be passed.
 *
 * VALUE(s, x): address of the x-th string of side 's', computed from the
 * variables <s>vars, <s>vals and <s>i that must be in scope. */
#define APPEND(b, o)	(((oid *) (b)->theap->base)[(b)->batCount++] = (o))
#define VALUE(s, x)		(s##vars + VarHeapVal(s##vals, (x), s##i.width))
2036 :
/* Set the tail properties of BAT pointer 'B' to: no nils, key, sorted
 * in both directions, sequence base 0 (used for empty results).  'B' is
 * parenthesized so that any pointer expression may be passed; it is
 * expanded several times and must be free of side effects. */
#define set_empty_bat_props(B)				\
	do {									\
		(B)->tnil = false;					\
		(B)->tnonil = true;					\
		(B)->tkey = true;					\
		(B)->tsorted = true;				\
		(B)->trevsorted = true;				\
		(B)->tseqbase = 0;					\
	} while (0)
2046 :
/* Nested-loop join body.  For every non-nil right-hand value 'vr'
 * (outer loop over the candidates of r/cr) all left-hand candidates of
 * l/cl are visited; a pair is emitted when STR_CMP evaluates to 0.
 * STR_LEN is evaluated once per right-hand value and stored in
 * 'vr_len'.  When 'with_strimps' is set, the left candidates are first
 * narrowed with STRMPfilter(); if that yields nothing usable the plain
 * candidate list is scanned.
 *
 * Matching left oids are appended to 'rl' and, when 'rr' is non-NULL,
 * the corresponding right oids to 'rr'; both BATs are extended on
 * demand and their sortedness/key/seqbase properties are downgraded as
 * pairs are added.
 *
 * Expects in scope: l, cl, lci, lo, lbase, vl, r, cr, rci, ro, rbase,
 * vr, vr_len, rl, rr, lastl, matches, rskipped, newcap, with_strimps,
 * anti, qry_ctx, counter, msg, fname, the variables used by VALUE(),
 * and a label 'exit' that is jumped to on timeout or allocation
 * failure. */
#define CONTAINS_JOIN_LOOP(STR_CMP, STR_LEN) \
	do { \
		canditer_init(&rci, r, cr); \
		for (BUN ridx = 0; ridx < rci.ncand; ridx++) { \
			BAT *filtered_sl = NULL; \
			GDK_CHECK_TIMEOUT(qry_ctx, counter, GOTO_LABEL_TIMEOUT_HANDLER(exit, qry_ctx)); \
			ro = canditer_next(&rci); \
			vr = VALUE(r, ro - rbase); \
			matches = 0; \
			if (!strNil(vr)) { \
				vr_len = STR_LEN; \
				if (with_strimps) \
					filtered_sl = STRMPfilter(l, cl, vr, anti); \
				if (filtered_sl) \
					canditer_init(&lci, l, filtered_sl); \
				else \
					canditer_init(&lci, l, cl); \
				for (BUN lidx = 0; lidx < lci.ncand; lidx++) { \
					lo = canditer_next(&lci); \
					vl = VALUE(l, lo - lbase); \
					if (strNil(vl)) \
						continue; \
					if (STR_CMP) \
						continue; \
					if (BATcount(rl) == BATcapacity(rl)) { \
						newcap = BATgrows(rl); \
						BATsetcount(rl, BATcount(rl)); \
						if (rr) \
							BATsetcount(rr, BATcount(rr)); \
						if (BATextend(rl, newcap) != GDK_SUCCEED || \
							(rr && BATextend(rr, newcap) != GDK_SUCCEED)) { \
							msg = createException(MAL, fname, SQLSTATE(HY013) MAL_MALLOC_FAIL); \
							/* filtered_sl is local to this iteration: */ \
							/* release it here, 'exit' cannot reach it */ \
							BBPreclaim(filtered_sl); \
							goto exit; \
						} \
						assert(!rr || BATcapacity(rl) == BATcapacity(rr)); \
					} \
					if (BATcount(rl) > 0) { \
						if (lastl + 1 != lo) \
							rl->tseqbase = oid_nil; \
						if (matches == 0) { \
							if (rr) \
								rr->trevsorted = false; \
							if (lastl > lo) { \
								rl->tsorted = false; \
								rl->tkey = false; \
							} else if (lastl < lo) { \
								rl->trevsorted = false; \
							} else { \
								rl->tkey = false; \
							} \
						} \
					} \
					APPEND(rl, lo); \
					if (rr) \
						APPEND(rr, ro); \
					lastl = lo; \
					matches++; \
				} \
				BBPreclaim(filtered_sl); \
			} \
			if (rr) { \
				if (matches > 1) { \
					rr->tkey = false; \
					rr->tseqbase = oid_nil; \
					rl->trevsorted = false; \
				} else if (matches == 0) { \
					rskipped = BATcount(rr) > 0; \
				} else if (rskipped) { \
					rr->tseqbase = oid_nil; \
				} \
			} else if (matches > 1) { \
				rl->trevsorted = false; \
			} \
		} \
	} while (0)
2122 :
/*
 * Nested-loop join body.  For every non-nil right value the whole left
 * candidate list is scanned; each non-nil left value for which STR_CMP
 * evaluates to true appends `lo` to rl and, when rr is present, `ro`
 * to rr.
 *
 * STR_CMP  match predicate (true means "emit this pair")
 * STR_LEN  expression stored in vr_len once per right value
 * FNAME    function name used in the out-of-memory exception
 *
 * The macro works on the caller's variables: l, r, cl, cr, rl, rr, lci,
 * rci, lo, ro, lbase, rbase, vl, vr, vr_len, matches, newcap, last_lo,
 * rskipped, msg, qry_ctx, counter, and it jumps to the caller's `exit`
 * label on timeout or allocation failure.  The sortedness, key and
 * denseness flags of rl/rr are downgraded while appending.
 */
#define STR_JOIN_NESTED_LOOP(STR_CMP, STR_LEN, FNAME) \
	do { \
		canditer_init(&rci, r, cr); \
		for (BUN r_pos = 0; r_pos < rci.ncand; r_pos++) { \
			GDK_CHECK_TIMEOUT(qry_ctx, counter, GOTO_LABEL_TIMEOUT_HANDLER(exit, qry_ctx)); \
			ro = canditer_next(&rci); \
			vr = VALUE(r, ro - rbase); \
			matches = 0; \
			if (!strNil(vr)) { \
				vr_len = STR_LEN; \
				canditer_init(&lci, l, cl); \
				for (BUN l_pos = 0; l_pos < lci.ncand; l_pos++) { \
					lo = canditer_next(&lci); \
					vl = VALUE(l, lo - lbase); \
					if (strNil(vl) || !(STR_CMP)) \
						continue; \
					/* both outputs grow in lock-step when rl is full */ \
					if (BATcount(rl) == BATcapacity(rl)) { \
						newcap = BATgrows(rl); \
						BATsetcount(rl, BATcount(rl)); \
						if (rr) \
							BATsetcount(rr, BATcount(rr)); \
						if (BATextend(rl, newcap) != GDK_SUCCEED || \
							(rr && BATextend(rr, newcap) != GDK_SUCCEED)) { \
							msg = createException(MAL, FNAME, SQLSTATE(HY013) MAL_MALLOC_FAIL); \
							goto exit; \
						} \
						assert(!rr || BATcapacity(rl) == BATcapacity(rr)); \
					} \
					if (BATcount(rl) > 0) { \
						if (lo != last_lo + 1) \
							rl->tseqbase = oid_nil; \
						/* first match of a new right value */ \
						if (matches == 0) { \
							if (rr) \
								rr->trevsorted = false; \
							if (lo > last_lo) { \
								rl->trevsorted = false; \
							} else { \
								rl->tkey = false; \
								if (lo < last_lo) \
									rl->tsorted = false; \
							} \
						} \
					} \
					APPEND(rl, lo); \
					if (rr) \
						APPEND(rr, ro); \
					last_lo = lo; \
					matches++; \
				} \
			} \
			if (matches > 1) { \
				rl->trevsorted = false; \
				if (rr) { \
					rr->tkey = false; \
					rr->tseqbase = oid_nil; \
				} \
			} else if (rr) { \
				if (matches == 0) \
					rskipped = BATcount(rr) > 0; \
				else if (rskipped) \
					rr->tseqbase = oid_nil; \
			} \
		} \
	} while (0)
2191 :
/*
 * Merge-style join body over two sorted inputs (sorted_l / sorted_r with
 * candidate lists sorted_cl / sorted_cr).
 *
 * STR_CMP  three-way comparison of vl against vr: negative skips the
 *          left value and advances the left start index lx, positive
 *          ends the left scan for this right value, zero is a match
 * STR_LEN  expression stored in vr_len once per right value
 * FNAME    function name used in the out-of-memory exception
 *
 * Leading nil values on both sides are skipped first.  Matches append
 * `lo` to rl and, when rr is present, `ro` to rr.  The macro works on
 * the caller's variables (lci, rci, lo, ro, lbase, rbase, vl, vr,
 * vr_len, cmp, lx, rx, n, matches, newcap, last_lo, rskipped, msg,
 * qry_ctx, counter, rl, rr) and jumps to the caller's `exit` label on
 * timeout or allocation failure.
 *
 * NOTE(review): the inner scan uses canditer_next_dense() on lci, which
 * looks like it expects a dense left candidate list -- confirm.
 */
#define STARTSWITH_SORTED_LOOP(STR_CMP, STR_LEN, FNAME) \
	do { \
		canditer_init(&rci, sorted_r, sorted_cr); \
		canditer_init(&lci, sorted_l, sorted_cl); \
		/* find the first non-nil left value; lx is its index */ \
		for (lx = 0; lx < lci.ncand; lx++) { \
			lo = canditer_next(&lci); \
			vl = VALUE(l, lo - lbase); \
			if (!strNil(vl)) \
				break; \
		} \
		/* find the first non-nil right value and rewind onto it */ \
		for (rx = 0; rx < rci.ncand; rx++) { \
			ro = canditer_next(&rci); \
			vr = VALUE(r, ro - rbase); \
			if (!strNil(vr)) { \
				canditer_setidx(&rci, rx); \
				break; \
			} \
		} \
		for (; rx < rci.ncand; rx++) { \
			GDK_CHECK_TIMEOUT(qry_ctx, counter, GOTO_LABEL_TIMEOUT_HANDLER(exit, qry_ctx)); \
			ro = canditer_next(&rci); \
			vr = VALUE(r, ro - rbase); \
			vr_len = STR_LEN; \
			matches = 0; \
			canditer_setidx(&lci, lx); \
			for (n = lx; n < lci.ncand; n++) { \
				lo = canditer_next_dense(&lci); \
				vl = VALUE(l, lo - lbase); \
				cmp = STR_CMP; \
				if (cmp > 0) \
					break; \
				if (cmp < 0) { \
					lx++; \
					continue; \
				} \
				/* both outputs grow in lock-step when rl is full */ \
				if (BATcount(rl) == BATcapacity(rl)) { \
					newcap = BATgrows(rl); \
					BATsetcount(rl, BATcount(rl)); \
					if (rr) \
						BATsetcount(rr, BATcount(rr)); \
					if (BATextend(rl, newcap) != GDK_SUCCEED || \
						(rr && BATextend(rr, newcap) != GDK_SUCCEED)) { \
						msg = createException(MAL, FNAME, SQLSTATE(HY013) MAL_MALLOC_FAIL); \
						goto exit; \
					} \
					assert(!rr || BATcapacity(rl) == BATcapacity(rr)); \
				} \
				if (BATcount(rl) > 0) { \
					if (lo != last_lo + 1) \
						rl->tseqbase = oid_nil; \
					/* first match of a new right value */ \
					if (matches == 0) { \
						if (rr) \
							rr->trevsorted = false; \
						if (lo > last_lo) { \
							rl->trevsorted = false; \
						} else { \
							rl->tkey = false; \
							if (lo < last_lo) \
								rl->tsorted = false; \
						} \
					} \
				} \
				APPEND(rl, lo); \
				if (rr) \
					APPEND(rr, ro); \
				last_lo = lo; \
				matches++; \
			} \
			if (matches > 1) { \
				rl->trevsorted = false; \
				if (rr) { \
					rr->tkey = false; \
					rr->tseqbase = oid_nil; \
				} \
			} else if (rr) { \
				if (matches == 0) \
					rskipped = BATcount(rr) > 0; \
				else if (rskipped) \
					rr->tseqbase = oid_nil; \
			} \
		} \
	} while (0)
2275 :
2276 : static void
2277 562 : do_strrev(char *dst, const char *src, size_t len)
2278 : {
2279 562 : dst[len] = 0;
2280 562 : if (strNil(src)) {
2281 8 : assert(len == strlen(str_nil));
2282 8 : strcpy(dst, str_nil);
2283 8 : return;
2284 : }
2285 4502 : while (*src) {
2286 3948 : if ((*src & 0xF8) == 0xF0) {
2287 0 : assert(len >= 4);
2288 0 : dst[len - 4] = *src++;
2289 0 : assert((*src & 0xC0) == 0x80);
2290 0 : dst[len - 3] = *src++;
2291 0 : assert((*src & 0xC0) == 0x80);
2292 0 : dst[len - 2] = *src++;
2293 0 : assert((*src & 0xC0) == 0x80);
2294 0 : dst[len - 1] = *src++;
2295 0 : len -= 4;
2296 3948 : } else if ((*src & 0xF0) == 0xE0) {
2297 0 : assert(len >= 3);
2298 0 : dst[len - 3] = *src++;
2299 0 : assert((*src & 0xC0) == 0x80);
2300 0 : dst[len - 2] = *src++;
2301 0 : assert((*src & 0xC0) == 0x80);
2302 0 : dst[len - 1] = *src++;
2303 0 : len -= 3;
2304 3948 : } else if ((*src & 0xE0) == 0xC0) {
2305 0 : assert(len >= 2);
2306 0 : dst[len - 2] = *src++;
2307 0 : assert((*src & 0xC0) == 0x80);
2308 0 : dst[len - 1] = *src++;
2309 0 : len -= 2;
2310 : } else {
2311 3948 : assert(len >= 1);
2312 3948 : assert((*src & 0x80) == 0);
2313 3948 : dst[--len] = *src++;
2314 : }
2315 : }
2316 554 : assert(len == 0);
2317 : }
2318 :
2319 : static BAT *
2320 28 : batstr_strrev(BAT *b)
2321 : {
2322 28 : BAT *bn = NULL;
2323 28 : BATiter bi;
2324 28 : BUN p, q;
2325 28 : const char *src;
2326 28 : size_t len;
2327 28 : char *dst;
2328 28 : size_t dstlen;
2329 :
2330 28 : dstlen = 1024;
2331 28 : dst = GDKmalloc(dstlen);
2332 28 : if (dst == NULL)
2333 : return NULL;
2334 :
2335 28 : assert(b->ttype == TYPE_str);
2336 :
2337 28 : bn = COLnew(b->hseqbase, TYPE_str, BATcount(b), TRANSIENT);
2338 28 : if (bn == NULL) {
2339 0 : GDKfree(dst);
2340 0 : return NULL;
2341 : }
2342 :
2343 28 : bi = bat_iterator(b);
2344 590 : BATloop(b, p, q) {
2345 562 : src = (const char *) BUNtail(bi, p);
2346 562 : len = strlen(src);
2347 562 : if (len >= dstlen) {
2348 0 : char *ndst;
2349 0 : dstlen = len + 1024;
2350 0 : ndst = GDKrealloc(dst, dstlen);
2351 0 : if (ndst == NULL) {
2352 0 : bat_iterator_end(&bi);
2353 0 : BBPreclaim(bn);
2354 0 : GDKfree(dst);
2355 0 : return NULL;
2356 : }
2357 : dst = ndst;
2358 : }
2359 562 : do_strrev(dst, src, len);
2360 561 : if (BUNappend(bn, dst, false) != GDK_SUCCEED) {
2361 0 : bat_iterator_end(&bi);
2362 0 : BBPreclaim(bn);
2363 0 : GDKfree(dst);
2364 0 : return NULL;
2365 : }
2366 : }
2367 :
2368 28 : bat_iterator_end(&bi);
2369 28 : GDKfree(dst);
2370 28 : return bn;
2371 : }
2372 :
2373 : static BAT *
2374 26 : batstr_strlower(BAT *b)
2375 : {
2376 26 : BAT *bn = NULL;
2377 26 : BATiter bi;
2378 26 : BUN p, q;
2379 :
2380 26 : assert(b->ttype == TYPE_str);
2381 :
2382 26 : bn = COLnew(b->hseqbase, TYPE_str, BATcount(b), TRANSIENT);
2383 26 : if (bn == NULL)
2384 : return NULL;
2385 :
2386 26 : bi = bat_iterator(b);
2387 114 : BATloop(b, p, q) {
2388 88 : const char *vb = BUNtail(bi, p);
2389 88 : char *vb_low = NULL;
2390 88 : if (STRlower(&vb_low, &vb)) {
2391 0 : bat_iterator_end(&bi);
2392 0 : BBPreclaim(bn);
2393 0 : return NULL;
2394 : }
2395 88 : if (BUNappend(bn, vb_low, false) != GDK_SUCCEED) {
2396 0 : GDKfree(vb_low);
2397 0 : bat_iterator_end(&bi);
2398 0 : BBPreclaim(bn);
2399 0 : return NULL;
2400 : }
2401 88 : GDKfree(vb_low);
2402 : }
2403 26 : bat_iterator_end(&bi);
2404 26 : return bn;
2405 : }
2406 :
2407 : static str
2408 14 : str_join_nested(BAT *rl, BAT *rr, BAT *l, BAT *r, BAT *cl, BAT *cr,
2409 : bit anti, int (*str_cmp)(const char *, const char *, int),
2410 : const char *fname)
2411 : {
2412 14 : str msg = MAL_SUCCEED;
2413 :
2414 14 : size_t counter = 0;
2415 14 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
2416 :
2417 14 : TRC_DEBUG(ALGO,
2418 : "(%s, %s, l=%s#" BUNFMT "[%s]%s%s,"
2419 : "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
2420 : "sr=%s#" BUNFMT "%s%s)\n",
2421 : fname, "nested loop",
2422 : BATgetId(l), BATcount(l), ATOMname(l->ttype),
2423 : l->tsorted ? "-sorted" : "",
2424 : l->trevsorted ? "-revsorted" : "",
2425 : BATgetId(r), BATcount(r), ATOMname(r->ttype),
2426 : r->tsorted ? "-sorted" : "",
2427 : r->trevsorted ? "-revsorted" : "",
2428 : cl ? BATgetId(cl) : "NULL", cl ? BATcount(cl) : 0,
2429 : cl && cl->tsorted ? "-sorted" : "",
2430 : cl && cl->trevsorted ? "-revsorted" : "",
2431 : cr ? BATgetId(cr) : "NULL", cr ? BATcount(cr) : 0,
2432 : cr && cr->tsorted ? "-sorted" : "",
2433 : cr && cr->trevsorted ? "-revsorted" : "");
2434 :
2435 42 : assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
2436 14 : assert(ATOMtype(l->ttype) == TYPE_str);
2437 :
2438 14 : BATiter li = bat_iterator(l);
2439 14 : BATiter ri = bat_iterator(r);
2440 14 : assert(ri.vh && r->ttype);
2441 :
2442 14 : struct canditer lci, rci;
2443 14 : oid lbase = l->hseqbase,
2444 14 : rbase = r->hseqbase,
2445 14 : lo, ro, last_lo = 0;
2446 14 : const char *lvals = (const char *) li.base,
2447 14 : *rvals = (const char *) ri.base,
2448 14 : *lvars = li.vh->base,
2449 14 : *rvars = ri.vh->base,
2450 : *vl, *vr;
2451 14 : BUN matches, newcap;
2452 14 : int rskipped = 0, vr_len = 0;
2453 :
2454 14 : if (anti)
2455 0 : STR_JOIN_NESTED_LOOP((str_cmp(vl, vr, vr_len) != 0), str_strlen(vr), fname);
2456 : else
2457 734 : STR_JOIN_NESTED_LOOP((str_cmp(vl, vr, vr_len) == 0), str_strlen(vr), fname);
2458 :
2459 14 : assert(!rr || BATcount(rl) == BATcount(rr));
2460 14 : BATsetcount(rl, BATcount(rl));
2461 14 : if (rr)
2462 14 : BATsetcount(rr, BATcount(rr));
2463 :
2464 14 : if (BATcount(rl) > 0) {
2465 13 : if (BATtdense(rl))
2466 2 : rl->tseqbase = ((oid *) rl->theap->base)[0];
2467 13 : if (rr && BATtdense(rr))
2468 4 : rr->tseqbase = ((oid *) rr->theap->base)[0];
2469 : } else {
2470 1 : rl->tseqbase = 0;
2471 1 : if (rr)
2472 1 : rr->tseqbase = 0;
2473 : }
2474 :
2475 14 : TRC_DEBUG(ALGO,
2476 : "(%s, l=%s,r=%s)=(%s#" BUNFMT "%s%s,%s#" BUNFMT "%s%s\n",
2477 : fname,
2478 : BATgetId(l), BATgetId(r), BATgetId(rl), BATcount(rl),
2479 : rl->tsorted ? "-sorted" : "",
2480 : rl->trevsorted ? "-revsorted" : "",
2481 : rr ? BATgetId(rr) : NULL, rr ? BATcount(rr) : 0,
2482 : rr && rr->tsorted ? "-sorted" : "",
2483 : rr && rr->trevsorted ? "-revsorted" : "");
2484 :
2485 14 : exit:
2486 14 : bat_iterator_end(&li);
2487 14 : bat_iterator_end(&ri);
2488 14 : return msg;
2489 : }
2490 :
2491 : static str
2492 34 : contains_join(BAT *rl, BAT *rr, BAT *l, BAT *r, BAT *cl, BAT *cr, bit anti,
2493 : int (*str_cmp)(const char *, const char *, int),
2494 : const char *fname)
2495 : {
2496 34 : str msg = MAL_SUCCEED;
2497 :
2498 34 : size_t counter = 0;
2499 34 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
2500 :
2501 34 : TRC_DEBUG(ALGO,
2502 : "(%s, l=%s#" BUNFMT "[%s]%s%s,"
2503 : "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
2504 : "sr=%s#" BUNFMT "%s%s)\n",
2505 : fname,
2506 : BATgetId(l), BATcount(l), ATOMname(l->ttype),
2507 : l->tsorted ? "-sorted" : "",
2508 : l->trevsorted ? "-revsorted" : "",
2509 : BATgetId(r), BATcount(r), ATOMname(r->ttype),
2510 : r->tsorted ? "-sorted" : "",
2511 : r->trevsorted ? "-revsorted" : "",
2512 : cl ? BATgetId(cl) : "NULL", cl ? BATcount(cl) : 0,
2513 : cl && cl->tsorted ? "-sorted" : "",
2514 : cl && cl->trevsorted ? "-revsorted" : "",
2515 : cr ? BATgetId(cr) : "NULL", cr ? BATcount(cr) : 0,
2516 : cr && cr->tsorted ? "-sorted" : "",
2517 : cr && cr->trevsorted ? "-revsorted" : "");
2518 :
2519 34 : bool with_strimps = false;
2520 :
2521 34 : if (BAThasstrimps(l)) {
2522 8 : with_strimps = true;
2523 8 : if (STRMPcreate(l, NULL) != GDK_SUCCEED) {
2524 0 : GDKclrerr();
2525 0 : with_strimps = false;
2526 : }
2527 : }
2528 :
2529 102 : assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
2530 34 : assert(ATOMtype(l->ttype) == TYPE_str);
2531 :
2532 34 : BATiter li = bat_iterator(l);
2533 34 : BATiter ri = bat_iterator(r);
2534 34 : assert(ri.vh && r->ttype);
2535 :
2536 34 : struct canditer lci, rci;
2537 34 : oid lbase = l->hseqbase,
2538 34 : rbase = r->hseqbase,
2539 34 : lo, ro, lastl = 0;
2540 34 : const char *lvals = (const char *) li.base,
2541 34 : *rvals = (const char *) ri.base,
2542 34 : *lvars = li.vh->base,
2543 34 : *rvars = ri.vh->base,
2544 : *vl, *vr;
2545 34 : int rskipped = 0, vr_len = 0;
2546 34 : BUN matches, newcap;
2547 :
2548 34 : if (anti)
2549 0 : CONTAINS_JOIN_LOOP(str_cmp(vl, vr, vr_len) == 0, str_strlen(vr));
2550 : else
2551 30906 : CONTAINS_JOIN_LOOP(str_cmp(vl, vr, vr_len) != 0, str_strlen(vr));
2552 :
2553 34 : assert(!rr || BATcount(rl) == BATcount(rr));
2554 34 : BATsetcount(rl, BATcount(rl));
2555 34 : if (rr)
2556 34 : BATsetcount(rr, BATcount(rr));
2557 34 : if (BATcount(rl) > 0) {
2558 31 : if (BATtdense(rl))
2559 9 : rl->tseqbase = ((oid *) rl->theap->base)[0];
2560 31 : if (rr && BATtdense(rr))
2561 7 : rr->tseqbase = ((oid *) rr->theap->base)[0];
2562 : } else {
2563 3 : rl->tseqbase = 0;
2564 3 : if (rr)
2565 3 : rr->tseqbase = 0;
2566 : }
2567 :
2568 34 : TRC_DEBUG(ALGO,
2569 : "(%s, l=%s,r=%s)=(%s#" BUNFMT "%s%s,%s#" BUNFMT "%s%s\n",
2570 : fname,
2571 : BATgetId(l), BATgetId(r), BATgetId(rl), BATcount(rl),
2572 : rl->tsorted ? "-sorted" : "",
2573 : rl->trevsorted ? "-revsorted" : "",
2574 : rr ? BATgetId(rr) : NULL, rr ? BATcount(rr) : 0,
2575 : rr && rr->tsorted ? "-sorted" : "",
2576 : rr && rr->trevsorted ? "-revsorted" : "");
2577 34 : exit:
2578 34 : bat_iterator_end(&li);
2579 34 : bat_iterator_end(&ri);
2580 34 : return msg;
2581 : }
2582 :
2583 : static str
2584 30 : startswith_join(BAT **rl_ptr, BAT **rr_ptr, BAT *l, BAT *r, BAT *cl, BAT *cr,
2585 : bit anti, int (*str_cmp)(const char *, const char *, int),
2586 : const char *fname)
2587 : {
2588 30 : str msg = MAL_SUCCEED;
2589 30 : gdk_return rc;
2590 :
2591 30 : size_t counter = 0;
2592 30 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
2593 :
2594 30 : assert(*rl_ptr && *rr_ptr);
2595 :
2596 30 : BAT *sorted_l = NULL, *sorted_r = NULL,
2597 30 : *sorted_cl = NULL, *sorted_cr = NULL,
2598 30 : *ord_sorted_l = NULL, *ord_sorted_r = NULL,
2599 30 : *proj_rl = NULL, *proj_rr = NULL,
2600 30 : *rl = *rl_ptr, *rr = *rr_ptr;
2601 :
2602 30 : TRC_DEBUG(ALGO,
2603 : "(%s, %s, l=%s#" BUNFMT "[%s]%s%s,"
2604 : "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
2605 : "sr=%s#" BUNFMT "%s%s)\n",
2606 : fname, "sorted inputs",
2607 : BATgetId(l), BATcount(l), ATOMname(l->ttype),
2608 : l->tsorted ? "-sorted" : "",
2609 : l->trevsorted ? "-revsorted" : "",
2610 : BATgetId(r), BATcount(r), ATOMname(r->ttype),
2611 : r->tsorted ? "-sorted" : "",
2612 : r->trevsorted ? "-revsorted" : "",
2613 : cl ? BATgetId(cl) : "NULL", cl ? BATcount(cl) : 0,
2614 : cl && cl->tsorted ? "-sorted" : "",
2615 : cl && cl->trevsorted ? "-revsorted" : "",
2616 : cr ? BATgetId(cr) : "NULL", cr ? BATcount(cr) : 0,
2617 : cr && cr->tsorted ? "-sorted" : "",
2618 : cr && cr->trevsorted ? "-revsorted" : "");
2619 :
2620 30 : bool l_sorted = BATordered(l);
2621 30 : bool r_sorted = BATordered(r);
2622 :
2623 30 : if (l_sorted == FALSE) {
2624 26 : rc = BATsort(&sorted_l, &ord_sorted_l, NULL,
2625 : l, NULL, NULL, false, false, false);
2626 26 : if (rc != GDK_SUCCEED) {
2627 0 : throw(MAL, fname, "Sorting left input failed");
2628 : } else {
2629 26 : if (cl) {
2630 0 : rc = BATsort(&sorted_cl, NULL, NULL,
2631 : cl, ord_sorted_l, NULL, false, false, false);
2632 0 : if (rc != GDK_SUCCEED) {
2633 0 : BBPnreclaim(2, sorted_l, ord_sorted_l);
2634 0 : throw(MAL, fname, "Sorting left candidates input failed");
2635 : }
2636 : }
2637 : }
2638 : } else {
2639 4 : sorted_l = l;
2640 4 : sorted_cl = cl;
2641 : }
2642 :
2643 30 : if (r_sorted == FALSE) {
2644 18 : rc = BATsort(&sorted_r, &ord_sorted_r, NULL,
2645 : r, NULL, NULL, false, false, false);
2646 18 : if (rc != GDK_SUCCEED) {
2647 0 : BBPnreclaim(3, sorted_l, ord_sorted_l, sorted_cl);
2648 0 : throw(MAL, fname, "Sorting right input failed");
2649 : } else {
2650 18 : if (cr) {
2651 0 : rc = BATsort(&sorted_cr, NULL, NULL,
2652 : cr, ord_sorted_r, NULL, false, false, false);
2653 0 : if (rc != GDK_SUCCEED) {
2654 0 : BBPnreclaim(5, sorted_l, ord_sorted_l, sorted_cl, sorted_r, ord_sorted_r);
2655 0 : throw(MAL, fname, "Sorting right candidates input failed");
2656 : }
2657 : }
2658 : }
2659 : } else {
2660 12 : sorted_r = r;
2661 12 : sorted_cr = cr;
2662 : }
2663 :
2664 30 : assert(BATordered(sorted_l) && BATordered(sorted_r));
2665 :
2666 30 : BATiter li = bat_iterator(sorted_l);
2667 30 : BATiter ri = bat_iterator(sorted_r);
2668 30 : assert(ri.vh && r->ttype);
2669 :
2670 30 : struct canditer lci, rci;
2671 30 : oid lbase = sorted_l->hseqbase,
2672 30 : rbase = sorted_r->hseqbase,
2673 30 : lo, ro, last_lo = 0;
2674 30 : const char *lvals = (const char *) li.base,
2675 30 : *rvals = (const char *) ri.base,
2676 30 : *lvars = li.vh->base,
2677 30 : *rvars = ri.vh->base,
2678 : *vl, *vr;
2679 30 : BUN matches, newcap, n = 0, rx = 0, lx = 0;
2680 30 : int rskipped = 0, vr_len = 0, cmp = 0;
2681 :
2682 30 : if (anti)
2683 0 : STR_JOIN_NESTED_LOOP(str_cmp(vl, vr, vr_len) != 0, str_strlen(vr), fname);
2684 : else
2685 1303 : STARTSWITH_SORTED_LOOP(str_cmp(vl, vr, vr_len), str_strlen(vr), fname);
2686 :
2687 30 : assert(!rr || BATcount(rl) == BATcount(rr));
2688 30 : BATsetcount(rl, BATcount(rl));
2689 30 : if (rr)
2690 30 : BATsetcount(rr, BATcount(rr));
2691 :
2692 30 : if (BATcount(rl) > 0) {
2693 22 : if (BATtdense(rl))
2694 15 : rl->tseqbase = ((oid *) rl->theap->base)[0];
2695 22 : if (rr && BATtdense(rr))
2696 11 : rr->tseqbase = ((oid *) rr->theap->base)[0];
2697 : } else {
2698 8 : rl->tseqbase = 0;
2699 8 : if (rr)
2700 8 : rr->tseqbase = 0;
2701 : }
2702 :
2703 30 : if (l_sorted == FALSE) {
2704 26 : proj_rl = BATproject(rl, ord_sorted_l);
2705 26 : if (!proj_rl) {
2706 0 : msg = createException(MAL, fname, "Project left pre-sort order failed");
2707 0 : goto exit;
2708 : } else {
2709 26 : BBPreclaim(rl);
2710 26 : *rl_ptr = proj_rl;
2711 : }
2712 : }
2713 :
2714 30 : if (rr && r_sorted == FALSE) {
2715 18 : proj_rr = BATproject(rr, ord_sorted_r);
2716 18 : if (!proj_rr) {
2717 0 : BBPreclaim(proj_rl);
2718 0 : msg = createException(MAL, fname, "Project right pre-sort order failed");
2719 0 : goto exit;
2720 : } else {
2721 18 : BBPreclaim(rr);
2722 18 : *rr_ptr = proj_rr;
2723 : }
2724 : }
2725 :
2726 30 : TRC_DEBUG(ALGO,
2727 : "(%s, l=%s,r=%s)=(%s#" BUNFMT "%s%s,%s#" BUNFMT "%s%s\n",
2728 : fname,
2729 : BATgetId(l), BATgetId(r), BATgetId(rl), BATcount(rl),
2730 : rl->tsorted ? "-sorted" : "",
2731 : rl->trevsorted ? "-revsorted" : "",
2732 : rr ? BATgetId(rr) : NULL, rr ? BATcount(rr) : 0,
2733 : rr && rr->tsorted ? "-sorted" : "",
2734 : rr && rr->trevsorted ? "-revsorted" : "");
2735 :
2736 30 : exit:
2737 30 : if (l_sorted == FALSE)
2738 26 : BBPnreclaim(3, sorted_l, ord_sorted_l, sorted_cl);
2739 :
2740 30 : if (r_sorted == FALSE)
2741 18 : BBPnreclaim(3, sorted_r, ord_sorted_r, sorted_cr);
2742 :
2743 30 : bat_iterator_end(&li);
2744 30 : bat_iterator_end(&ri);
2745 30 : return msg;
2746 : }
2747 :
2748 : static str
2749 78 : STRjoin(bat *rl_id, bat *rr_id, const bat l_id, const bat r_id,
2750 : const bat cl_id, const bat cr_id, const bit anti, bool icase,
2751 : int (*str_cmp)(const char *, const char *, int), const char *fname)
2752 : {
2753 78 : str msg = MAL_SUCCEED;
2754 :
2755 78 : BAT *rl = NULL, *rr = NULL, *l = NULL, *r = NULL, *cl = NULL, *cr = NULL;
2756 :
2757 78 : if (!(l = BATdescriptor(l_id)) || !(r = BATdescriptor(r_id))) {
2758 0 : BBPnreclaim(2, l, r);
2759 0 : throw(MAL, fname, SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
2760 : }
2761 :
2762 78 : if ((cl_id && !is_bat_nil(cl_id) && (cl = BATdescriptor(cl_id)) == NULL) ||
2763 78 : (cr_id && !is_bat_nil(cr_id) && (cr = BATdescriptor(cr_id)) == NULL)) {
2764 0 : BBPnreclaim(4, l, r, cl, cr);
2765 0 : throw(MAL, fname, SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
2766 : }
2767 :
2768 78 : rl = COLnew(0, TYPE_oid, BATcount(l), TRANSIENT);
2769 78 : if (rr_id)
2770 78 : rr = COLnew(0, TYPE_oid, BATcount(l), TRANSIENT);
2771 :
2772 78 : if (!rl || (rr_id && !rr)) {
2773 0 : BBPnreclaim(6, l, r, cl, cr, rl, rr);
2774 0 : throw(MAL, fname, SQLSTATE(HY013) MAL_MALLOC_FAIL);
2775 : }
2776 :
2777 78 : set_empty_bat_props(rl);
2778 78 : if (rr_id)
2779 78 : set_empty_bat_props(rr);
2780 :
2781 234 : assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
2782 78 : assert(ATOMtype(l->ttype) == TYPE_str);
2783 :
2784 78 : BAT *nl = l, *nr = r;
2785 :
2786 78 : if (strcmp(fname, "str.containsjoin") == 0) {
2787 34 : msg = contains_join(rl, rr, l, r, cl, cr, anti, str_cmp, fname);
2788 34 : if (msg) {
2789 0 : BBPnreclaim(6, rl, rr, l, r, cl, cr);
2790 0 : return msg;
2791 : }
2792 : } else {
2793 44 : struct canditer lci, rci;
2794 44 : canditer_init(&lci, l, cl);
2795 44 : canditer_init(&rci, r, cr);
2796 44 : BUN lcnt = lci.ncand, rcnt = rci.ncand;
2797 44 : BUN nl_cost = lci.ncand * rci.ncand,
2798 44 : sorted_cost =
2799 44 : (BUN) floor(0.8 * (lcnt*log2((double)lcnt)
2800 44 : + rcnt*log2((double)rcnt)));
2801 :
2802 44 : if (nl_cost < sorted_cost) {
2803 14 : msg = str_join_nested(rl, rr, nl, nr, cl, cr, anti, str_cmp, fname);
2804 : } else {
2805 30 : BAT *l_low = NULL, *r_low = NULL, *l_rev = NULL, *r_rev = NULL;
2806 30 : if (icase) {
2807 13 : l_low = batstr_strlower(nl);
2808 13 : if (l_low == NULL) {
2809 0 : BBPnreclaim(6, rl, rr, nl, nr, cl, cr);
2810 0 : throw(MAL, fname, "Failed lowering strings of left input");
2811 : }
2812 13 : r_low = batstr_strlower(nr);
2813 13 : if (r_low == NULL) {
2814 0 : BBPnreclaim(7, rl, rr, nl, nr, cl, cr, l_low);
2815 0 : throw(MAL, fname, "Failed lowering strings of right input");
2816 : }
2817 13 : BBPnreclaim(2, nl, nr);
2818 13 : nl = l_low;
2819 13 : nr = r_low;
2820 : }
2821 30 : if (strcmp(fname, "str.endswithjoin") == 0) {
2822 14 : l_rev = batstr_strrev(nl);
2823 14 : if (l_rev == NULL) {
2824 0 : BBPnreclaim(6, rl, rr, nl, nr, cl, cr);
2825 0 : throw(MAL, fname, "Failed reversing strings of left input");
2826 : }
2827 14 : r_rev = batstr_strrev(nr);
2828 14 : if (r_rev == NULL) {
2829 0 : BBPnreclaim(7, rl, rr, nl, nr, cl, cr, l_rev);
2830 0 : throw(MAL, fname, "Failed reversing strings of right input");
2831 : }
2832 14 : BBPnreclaim(2, nl, nr);
2833 14 : nl = l_rev;
2834 14 : nr = r_rev;
2835 : }
2836 30 : msg = startswith_join(&rl, &rr, nl, nr, cl, cr, anti, str_is_prefix, fname);
2837 : }
2838 : }
2839 :
2840 78 : if (!msg) {
2841 78 : *rl_id = rl->batCacheid;
2842 78 : BBPkeepref(rl);
2843 78 : if (rr_id) {
2844 78 : *rr_id = rr->batCacheid;
2845 78 : BBPkeepref(rr);
2846 : }
2847 : } else {
2848 0 : BBPnreclaim(2, rl, rr);
2849 : }
2850 :
2851 78 : BBPnreclaim(4, nl, nr, cl, cr);
2852 78 : return msg;
2853 : }
2854 :
/*
 * Map the MAL arguments of a string-join instruction onto pointers.
 *
 * Layout, as decoded below:
 *   arg 0                     left result (RL_ID)
 *   arg 1                     right result (RR_ID); absent when the
 *                             instruction has a single return value, in
 *                             which case RR_ID is set to NULL
 *   next two                  left and right inputs (L_ID, R_ID)
 *   next one, optional        ignore-case argument (IC_ID); absent, and
 *                             set to NULL, when there are exactly 7
 *                             input arguments
 *   next two                  candidate lists (CL_ID, CR_ID)
 *   two arguments are skipped
 *   next one                  anti flag (ANTI)
 *
 * The anti position is derived from the running index so that it is
 * also correct for the single-return form; for two return values it is
 * 8 without and 9 with the ignore-case argument.
 */
#define STRJOIN_MAPARGS(STK, PCI, RL_ID, RR_ID, L_ID, R_ID, CL_ID, CR_ID, IC_ID, ANTI) \
	do { \
		RL_ID = getArgReference(STK, PCI, 0); \
		RR_ID = (PCI)->retc == 1 ? NULL : getArgReference(STK, PCI, 1); \
		int i = (PCI)->retc == 1 ? 1 : 2; \
		L_ID = getArgReference(STK, PCI, i++); \
		R_ID = getArgReference(STK, PCI, i++); \
		IC_ID = (PCI)->argc - (PCI)->retc == 7 ? \
			NULL : getArgReference(STK, PCI, i++); \
		CL_ID = getArgReference(STK, PCI, i++); \
		CR_ID = getArgReference(STK, PCI, i++); \
		/* i now indexes the first of the two skipped arguments */ \
		ANTI = getArgReference(STK, PCI, i + 2); \
	} while (0)
2869 :
2870 : static inline str
2871 51 : ignorecase(const bat *ic_id, bool *icase, str fname)
2872 : {
2873 51 : BAT *c = NULL;
2874 :
2875 51 : if ((c = BATdescriptor(*ic_id)) == NULL)
2876 0 : throw(MAL, fname, SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
2877 :
2878 51 : BUN cnt = BATcount(c);
2879 51 : if (cnt < 1) {
2880 1 : BBPreclaim(c);
2881 1 : throw(MAL, fname, SQLSTATE(42000) "Missing ignore case value\n");
2882 : }
2883 :
2884 50 : BATiter bi = bat_iterator(c);
2885 50 : *icase = *(bit *) BUNtloc(bi, 0);
2886 50 : for(BUN i = 1; i<cnt; i++) {
2887 0 : if (*icase != *(bit*)BUNtloc(bi, i)) {
2888 0 : bat_iterator_end(&bi);
2889 0 : BBPreclaim(c);
2890 0 : throw(MAL, fname, SQLSTATE(42000) "Multiple ignore case values passed, only one expected\n");
2891 : }
2892 : }
2893 50 : bat_iterator_end(&bi);
2894 50 : BBPreclaim(c);
2895 50 : return MAL_SUCCEED;
2896 : }
2897 :
2898 : /**
2899 : * @rl_id: result left oid
2900 : * @rr_id: result right oid
2901 : * @l_id: left oid
2902 : * @r_id: right oid
2903 : * @cl_id: candidates left oid
2904 : * @cr_id: candidates right oid
2905 : * @ic_id: ignore case oid
2906 : * @anti: anti join oid
2907 : */
2908 : static str
2909 23 : STRstartswithjoin(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
2910 : {
2911 23 : (void)cntxt;
2912 23 : (void)mb;
2913 :
2914 23 : str msg = MAL_SUCCEED;
2915 23 : bat *rl_id = NULL, *rr_id = NULL, *l_id = NULL, *r_id = NULL,
2916 23 : *cl_id = NULL, *cr_id = NULL, *ic_id = NULL;
2917 23 : bit *anti = NULL;
2918 23 : bool icase = false;
2919 :
2920 46 : STRJOIN_MAPARGS(stk, pci, rl_id, rr_id, l_id, r_id, cl_id, cr_id, ic_id, anti);
2921 :
2922 23 : if (pci->argc - pci->retc == 8)
2923 19 : msg = ignorecase(ic_id, &icase, "str.startswithjoin");
2924 :
2925 41 : return msg ? msg : STRjoin(rl_id, rr_id, *l_id, *r_id,
2926 : cl_id ? *cl_id : 0,
2927 : cr_id ? *cr_id : 0,
2928 34 : *anti, icase, icase ? str_is_iprefix : str_is_prefix,
2929 : "str.startswithjoin");
2930 : }
2931 :
2932 : /**
2933 : * @rl_id: result left oid
2934 : * @rr_id: result right oid
2935 : * @l_id: left oid
2936 : * @r_id: right oid
2937 : * @cl_id: candidates left oid
2938 : * @cr_id: candidates right oid
2939 : * @ic_id: ignore case oid
2940 : * @anti: anti join oid
2941 : */
2942 : static str
2943 22 : STRendswithjoin(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
2944 : {
2945 22 : (void) cntxt;
2946 22 : (void) mb;
2947 :
2948 22 : str msg = MAL_SUCCEED;
2949 22 : bat *rl_id = NULL, *rr_id = NULL, *l_id = NULL, *r_id = NULL,
2950 22 : *cl_id = NULL, *cr_id = NULL, *ic_id = NULL;
2951 22 : bit *anti = NULL;
2952 22 : bool icase = false;
2953 :
2954 44 : STRJOIN_MAPARGS(stk, pci, rl_id, rr_id, l_id, r_id, cl_id, cr_id, ic_id, anti);
2955 :
2956 22 : if (pci->argc - pci->retc == 8)
2957 18 : msg = ignorecase(ic_id, &icase, "str.endswithjoin");
2958 :
2959 40 : return msg ? msg : STRjoin(rl_id, rr_id, *l_id, *r_id,
2960 : cl_id ? *cl_id : 0, cr_id ? *cr_id : 0,
2961 34 : *anti, icase, icase ? str_is_isuffix : str_is_suffix,
2962 : "str.endswithjoin");
2963 : }
2964 :
2965 : /**
2966 : * @rl_id: result left oid
2967 : * @rr_id: result right oid
2968 : * @l_id: left oid
2969 : * @r_id: right oid
2970 : * @cl_id: candidates left oid
2971 : * @cr_id: candidates right oid
2972 : * @ic_id: ignore case oid
2973 : * @anti: anti join oid
2974 : */
2975 : static str
2976 34 : STRcontainsjoin(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
2977 : {
2978 34 : (void) cntxt;
2979 34 : (void) mb;
2980 :
2981 34 : str msg = MAL_SUCCEED;
2982 34 : bat *rl_id = NULL, *rr_id = NULL, *l_id = NULL, *r_id = NULL,
2983 34 : *cl_id = NULL, *cr_id = NULL, *ic_id = NULL;
2984 34 : bit *anti = NULL;
2985 34 : bool icase = false;
2986 :
2987 68 : STRJOIN_MAPARGS(stk, pci, rl_id, rr_id, l_id, r_id, cl_id, cr_id, ic_id, anti);
2988 :
2989 34 : if (pci->argc - pci->retc == 8)
2990 14 : msg = ignorecase(ic_id, &icase, "str.containsjoin");
2991 :
2992 48 : return msg ? msg : STRjoin(rl_id, rr_id, *l_id, *r_id,
2993 : cl_id ? *cl_id : 0, cr_id ? *cr_id : 0,
2994 58 : *anti, icase, icase ? str_icontains : str_contains,
2995 : "str.containsjoin");
2996 : }
2997 :
2998 : #include "mel.h"
2999 : mel_func str_init_funcs[] = {
3000 : command("str", "str", STRtostr, false, "Noop routine.", args(1,2, arg("",str),arg("s",str))),
3001 : command("str", "string", STRTail, false, "Return the tail s[offset..n]\nof a string s[0..n].", args(1,3, arg("",str),arg("s",str),arg("offset",int))),
3002 : command("str", "string3", STRSubString, false, "Return substring s[offset..offset+count] of a string s[0..n]", args(1,4, arg("",str),arg("s",str),arg("offset",int),arg("count",int))),
3003 : command("str", "length", STRLength, false, "Return the length of a string.", args(1,2, arg("",int),arg("s",str))),
3004 : command("str", "nbytes", STRBytes, false, "Return the string length in bytes.", args(1,2, arg("",int),arg("s",str))),
3005 : command("str", "unicodeAt", STRWChrAt, false, "get a unicode character\n(as an int) from a string position.", args(1,3, arg("",int),arg("s",str),arg("index",int))),
3006 : command("str", "unicode", STRFromWChr, false, "convert a unicode to a character.", args(1,2, arg("",str),arg("wchar",int))),
3007 : pattern("str", "startswith", STRstartswith, false, "Check if string starts with substring.", args(1,3, arg("",bit),arg("s",str),arg("prefix",str))),
3008 : pattern("str", "startswith", STRstartswith, false, "Check if string starts with substring, icase flag.", args(1,4, arg("",bit),arg("s",str),arg("prefix",str),arg("icase",bit))),
3009 : pattern("str", "endswith", STRendswith, false, "Check if string ends with substring.", args(1,3, arg("",bit),arg("s",str),arg("suffix",str))),
3010 : pattern("str", "endswith", STRendswith, false, "Check if string ends with substring, icase flag.", args(1,4, arg("",bit),arg("s",str),arg("suffix",str),arg("icase",bit))),
3011 : pattern("str", "contains", STRcontains, false, "Check if string haystack contains string needle.", args(1,3, arg("",bit),arg("haystack",str),arg("needle",str))),
3012 : pattern("str", "contains", STRcontains, false, "Check if string haystack contains string needle, icase flag.", args(1,4, arg("",bit),arg("haystack",str),arg("needle",str),arg("icase",bit))),
3013 : command("str", "toLower", STRlower, false, "Convert a string to lower case.", args(1,2, arg("",str),arg("s",str))),
3014 : command("str", "toUpper", STRupper, false, "Convert a string to upper case.", args(1,2, arg("",str),arg("s",str))),
3015 : command("str", "caseFold", STRcasefold, false, "Fold the case of a string.", args(1,2, arg("",str),arg("s",str))),
3016 : pattern("str", "search", STRstr_search, false, "Search for a substring. Returns\nposition, -1 if not found.", args(1,3, arg("",int),arg("s",str),arg("c",str))),
3017 : pattern("str", "search", STRstr_search, false, "Search for a substring, icase flag. Returns\nposition, -1 if not found.", args(1,4, arg("",int),arg("s",str),arg("c",str),arg("icase",bit))),
3018 : pattern("str", "r_search", STRrevstr_search, false, "Reverse search for a substring. Returns\nposition, -1 if not found.", args(1,3, arg("",int),arg("s",str),arg("c",str))),
3019 : pattern("str", "r_search", STRrevstr_search, false, "Reverse search for a substring, icase flag. Returns\nposition, -1 if not found.", args(1,4, arg("",int),arg("s",str),arg("c",str),arg("icase",bit))),
3020 : command("str", "splitpart", STRsplitpart, false, "Split string on delimiter. Returns\ngiven field (counting from one.)", args(1,4, arg("",str),arg("s",str),arg("needle",str),arg("field",int))),
3021 : command("str", "trim", STRStrip, false, "Strip whitespaces around a string.", args(1,2, arg("",str),arg("s",str))),
3022 : command("str", "ltrim", STRLtrim, false, "Strip whitespaces from start of a string.", args(1,2, arg("",str),arg("s",str))),
3023 : command("str", "rtrim", STRRtrim, false, "Strip whitespaces from end of a string.", args(1,2, arg("",str),arg("s",str))),
3024 : command("str", "trim2", STRStrip2, false, "Remove the longest string containing only characters from the second string around the first string.", args(1,3, arg("",str),arg("s",str),arg("s2",str))),
3025 : command("str", "ltrim2", STRLtrim2, false, "Remove the longest string containing only characters from the second string from the start of the first string.", args(1,3, arg("",str),arg("s",str),arg("s2",str))),
3026 : command("str", "rtrim2", STRRtrim2, false, "Remove the longest string containing only characters from the second string from the end of the first string.", args(1,3, arg("",str),arg("s",str),arg("s2",str))),
3027 : command("str", "lpad", STRLpad, false, "Fill up a string to the given length prepending the whitespace character.", args(1,3, arg("",str),arg("s",str),arg("len",int))),
3028 : command("str", "rpad", STRRpad, false, "Fill up a string to the given length appending the whitespace character.", args(1,3, arg("",str),arg("s",str),arg("len",int))),
3029 : command("str", "lpad3", STRLpad3, false, "Fill up the first string to the given length prepending characters of the second string.", args(1,4, arg("",str),arg("s",str),arg("len",int),arg("s2",str))),
3030 : command("str", "rpad3", STRRpad3, false, "Fill up the first string to the given length appending characters of the second string.", args(1,4, arg("",str),arg("s",str),arg("len",int),arg("s2",str))),
3031 : command("str", "substitute", STRSubstitute, false, "Substitute first occurrence of 'src' by\n'dst'. Iff repeated = true this is\nrepeated while 'src' can be found in the\nresult string. In order to prevent\nrecursion and result strings of unlimited\nsize, repeating is only done iff src is\nnot a substring of dst.", args(1,5, arg("",str),arg("s",str),arg("src",str),arg("dst",str),arg("rep",bit))),
3032 : command("str", "like", STRlikewrap, false, "SQL pattern match function", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
3033 : command("str", "like3", STRlikewrap3, false, "SQL pattern match function", args(1,4, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str))),
3034 : command("str", "ascii", STRascii, false, "Return unicode of head of string", args(1,2, arg("",int),arg("s",str))),
3035 : command("str", "substring", STRsubstringTail, false, "Extract the tail of a string", args(1,3, arg("",str),arg("s",str),arg("start",int))),
3036 : command("str", "substring3", STRsubstring, false, "Extract a substring from str starting at start, for length len", args(1,4, arg("",str),arg("s",str),arg("start",int),arg("len",int))),
3037 : command("str", "prefix", STRprefix, false, "Extract the prefix of a given length", args(1,3, arg("",str),arg("s",str),arg("l",int))),
3038 : command("str", "suffix", STRsuffix, false, "Extract the suffix of a given length", args(1,3, arg("",str),arg("s",str),arg("l",int))),
3039 : command("str", "stringleft", STRprefix, false, "", args(1,3, arg("",str),arg("s",str),arg("l",int))),
3040 : command("str", "stringright", STRsuffix, false, "", args(1,3, arg("",str),arg("s",str),arg("l",int))),
3041 : command("str", "locate", STRlocate, false, "Locate the start position of a string", args(1,3, arg("",int),arg("s1",str),arg("s2",str))),
3042 : command("str", "locate3", STRlocate3, false, "Locate the start position of a string", args(1,4, arg("",int),arg("s1",str),arg("s2",str),arg("start",int))),
3043 : command("str", "insert", STRinsert, false, "Insert a string into another", args(1,5, arg("",str),arg("s",str),arg("start",int),arg("l",int),arg("s2",str))),
3044 : command("str", "replace", STRreplace, false, "Insert a string into another", args(1,4, arg("",str),arg("s",str),arg("pat",str),arg("s2",str))),
3045 : command("str", "repeat", STRrepeat, false, "", args(1,3, arg("",str),arg("s2",str),arg("c",int))),
3046 : command("str", "space", STRspace, false, "", args(1,2, arg("",str),arg("l",int))),
3047 : command("str", "asciify", STRasciify, false, "Transform string from UTF8 to ASCII", args(1, 2, arg("out",str), arg("in",str))),
3048 : pattern("str", "startswithselect", STRstartswithselect, false, "Select all head values of the first input BAT for which the\ntail value starts with the given prefix.", args(1,5, batarg("",oid),batarg("b",str),batarg("s",oid),arg("prefix",str),arg("anti",bit))),
3049 : pattern("str", "startswithselect", STRstartswithselect, false, "Select all head values of the first input BAT for which the\ntail value starts with the given prefix + icase.", args(1,6, batarg("",oid),batarg("b",str),batarg("s",oid),arg("prefix",str),arg("caseignore",bit),arg("anti",bit))),
3050 : pattern("str", "endswithselect", STRendswithselect, false, "Select all head values of the first input BAT for which the\ntail value end with the given suffix.", args(1,5, batarg("",oid),batarg("b",str),batarg("s",oid),arg("suffix",str),arg("anti",bit))),
3051 : pattern("str", "endswithselect", STRendswithselect, false, "Select all head values of the first input BAT for which the\ntail value end with the given suffix + icase.", args(1,6, batarg("",oid),batarg("b",str),batarg("s",oid),arg("suffix",str),arg("caseignore",bit),arg("anti",bit))),
3052 : pattern("str", "containsselect", STRcontainsselect, false, "Select all head values of the first input BAT for which the\ntail value contains the given needle.", args(1,5, batarg("",oid),batarg("b",str),batarg("s",oid),arg("needle",str),arg("anti",bit))),
3053 : pattern("str", "containsselect", STRcontainsselect, false, "Select all head values of the first input BAT for which the\ntail value contains the given needle + icase.", args(1,6, batarg("",oid),batarg("b",str),batarg("s",oid),arg("needle",str),arg("caseignore",bit),arg("anti",bit))),
3054 : pattern("str", "startswithjoin", STRstartswithjoin, false, "Join the string bat L with the prefix bat R\nwith optional candidate lists SL and SR\nThe result is two aligned bats with oids of matching rows.", args(2,9, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
3055 : pattern("str", "startswithjoin", STRstartswithjoin, false, "Join the string bat L with the prefix bat R\nwith optional candidate lists SL and SR\nThe result is two aligned bats with oids of matching rows + icase.", args(2,10, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
3056 : pattern("str", "startswithjoin", STRstartswithjoin, false, "The same as STRstartswithjoin, but only produce one output.", args(1,8,batarg("",oid),batarg("l",str),batarg("r",str),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
3057 : pattern("str", "startswithjoin", STRstartswithjoin, false, "The same as STRstartswithjoin, but only produce one output + icase.", args(1,9,batarg("",oid),batarg("l",str),batarg("r",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
3058 : pattern("str", "endswithjoin", STRendswithjoin, false, "Join the string bat L with the suffix bat R\nwith optional candidate lists SL and SR\nThe result is two aligned bats with oids of matching rows.", args(2,9, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
3059 : pattern("str", "endswithjoin", STRendswithjoin, false, "Join the string bat L with the suffix bat R\nwith optional candidate lists SL and SR\nThe result is two aligned bats with oids of matching rows + icase.", args(2,10, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
3060 : pattern("str", "endswithjoin", STRendswithjoin, false, "The same as STRendswithjoin, but only produce one output.", args(1,8,batarg("",oid),batarg("l",str),batarg("r",str),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
3061 : pattern("str", "endswithjoin", STRendswithjoin, false, "The same as STRendswithjoin, but only produce one output + icase.", args(1,9,batarg("",oid),batarg("l",str),batarg("r",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
3062 : pattern("str", "containsjoin", STRcontainsjoin, false, "Join the string bat L with the bat R if L contains the string of R\nwith optional candidate lists SL and SR\nThe result is two aligned bats with oids of matching rows.", args(2,9, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
3063 : pattern("str", "containsjoin", STRcontainsjoin, false, "Join the string bat L with the bat R if L contains the string of R\nwith optional candidate lists SL and SR\nThe result is two aligned bats with oids of matching rows + icase.", args(2,10, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
3064 : pattern("str", "containsjoin", STRcontainsjoin, false, "The same as STRcontainsjoin, but only produce one output.", args(1,8,batarg("",oid),batarg("l",str),batarg("r",str),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
3065 : pattern("str", "containsjoin", STRcontainsjoin, false, "The same as STRcontainsjoin, but only produce one output + icase.", args(1,9,batarg("",oid),batarg("l",str),batarg("r",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
3066 : { .imp=NULL }
3067 : };
3068 : #include "mal_import.h"
/*
 * MSVC only: remove any macro definition of `read` before the #pragma
 * below, which uses the bare token `read` as the section attribute for
 * ".CRT$XCU".
 * NOTE(review): ".CRT$XCU" looks like the MSVC CRT initializer section
 * that LIB_STARTUP_FUNC relies on -- confirm against the macro definition.
 */
3069 : #ifdef _MSC_VER
3070 : #undef read
3071 : #pragma section(".CRT$XCU",read)
3072 : #endif
/*
 * Start-up hook for the "str" module: passes the module name "str" and
 * the str_init_funcs table to mal_module2(); the other three arguments
 * are NULL.
 * NOTE(review): LIB_STARTUP_FUNC is defined outside this chunk
 * (presumably via mal_import.h, included just above); it presumably
 * arranges for init_str_mal to run when the library is loaded -- confirm
 * there. str_init_funcs is presumably the table that ends just above.
 */
3073 324 : LIB_STARTUP_FUNC(init_str_mal)
3074 324 : { mal_module2("str", NULL, str_init_funcs, NULL, NULL); }
|