LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - pcre.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 1010 1289 78.4 %
Date: 2024-04-26 00:35:57 Functions: 50 56 89.3 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : /*
      14             :  * N. Nes
      15             :  * PCRE library interface
      16             :  * The  PCRE library is a set of functions that implement regular
      17             :  * expression pattern matching using the same syntax  and  semantics  as  Perl,
      18             :  * with  just  a  few  differences.  The  current  implementation of PCRE
      19             :  * (release 4.x) corresponds approximately with Perl 5.8, including  support
      20             :  * for  UTF-8  encoded  strings.   However,  this support has to be
      21             :  * explicitly enabled; it is not the default.
      22             :  *
      23             :  * ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
      24             :  */
      25             : #include "monetdb_config.h"
      26             : #include <string.h>
      27             : 
      28             : #include "mal.h"
      29             : #include "mal_client.h"
      30             : #include "mal_interpreter.h"
      31             : #include "mal_exception.h"
      32             : 
      33             : #include <wchar.h>
      34             : #include <wctype.h>
      35             : 
      36             : #ifdef HAVE_LIBPCRE
      37             : #include <pcre.h>
      38             : #ifndef PCRE_STUDY_JIT_COMPILE
      39             : /* old library version on e.g. EPEL 6 */
      40             : #define pcre_free_study(x)              pcre_free(x)
      41             : #define PCRE_STUDY_JIT_COMPILE  0
      42             : #endif
      43             : #define JIT_COMPILE_MIN 1024    /* when to try JIT compilation of patterns */
      44             : 
      45             : #else
      46             : 
      47             : #include <regex.h>
      48             : 
      49             : typedef regex_t pcre;
      50             : #endif
      51             : 
      52             : /* current implementation assumes simple %keyword% [keyw%]* */
      53             : struct RE {
      54             :         char *k;
      55             :         uint32_t *w;
      56             :         bool search:1, atend:1, is_ascii:1, case_ignore:1;
      57             :         size_t len;
      58             :         struct RE *n;
      59             : };
      60             : 
      61             : /* We cannot use strcasecmp and strncasecmp since they work byte for
      62             :  * byte and don't deal with multibyte encodings (such as UTF-8).
      63             :  *
      64             :  * We implement our own conversion from UTF-8 encoding to Unicode code
      65             :  * points which we store in uint32_t.  The reason for this is,
      66             :  * functions like mbsrtowcs are locale-dependent (so we need a UTF-8
      67             :  * locale to use them), and on Windows, wchar_t is only 2 bytes and
      68             :  * therefore cannot hold all Unicode code points.  We do use functions
      69             :  * such as towlower to convert a Unicode code point to its lower-case
      70             :  * equivalent, but again on Windows, if the code point doesn't fit in
      71             :  * 2 bytes, we skip this conversion and compare the unconverted code
      72             :  * points.
      73             :  *
      74             :  * Note, towlower is also locale-dependent, but we don't need a UTF-8
      75             :  * locale in order to use it. */
      76             : 
      77             : /* helper function to convert a UTF-8 multibyte character to a wide
      78             :  * character */
      79             : static size_t
      80         274 : utfc8touc(uint32_t *restrict dest, const char *restrict src)
      81             : {
      82         274 :         if ((src[0] & 0x80) == 0) {
      83         217 :                 *dest = src[0];
      84         217 :                 return src[0] != 0;
      85          57 :         } else if ((src[0] & 0xE0) == 0xC0
      86          40 :                            && (src[1] & 0xC0) == 0x80 && (src[0] & 0x1E) != 0) {
      87          40 :                 *dest = (src[0] & 0x1F) << 6 | (src[1] & 0x3F);
      88          40 :                 return 2;
      89          17 :         } else if ((src[0] & 0xF0) == 0xE0
      90          17 :                            && (src[1] & 0xC0) == 0x80
      91          17 :                            && (src[2] & 0xC0) == 0x80
      92          17 :                            && ((src[0] & 0x0F) != 0 || (src[1] & 0x20) != 0)) {
      93          17 :                 *dest = (src[0] & 0x0F) << 12 | (src[1] & 0x3F) << 6 | (src[2] & 0x3F);
      94          17 :                 return 3;
      95           0 :         } else if ((src[0] & 0xF8) == 0xF0
      96           0 :                            && (src[1] & 0xC0) == 0x80
      97           0 :                            && (src[2] & 0xC0) == 0x80 && (src[3] & 0xC0) == 0x80) {
      98           0 :                 uint32_t c = (src[0] & 0x07) << 18
      99           0 :                                 | (src[1] & 0x3F) << 12
     100           0 :                                 | (src[2] & 0x3F) << 6 | (src[3] & 0x3F);
     101           0 :                 if (c < 0x10000 || c > 0x10FFFF || (c & 0x1FF800) == 0x00D800)
     102             :                         return (size_t) -1;
     103           0 :                 *dest = c;
     104           0 :                 return 4;
     105             :         }
     106             :         return (size_t) -1;
     107             : }
     108             : 
     109             : /* helper function to convert a UTF-8 string to a wide character
     110             :  * string, the wide character string is allocated */
     111             : static uint32_t *
     112          74 : utf8stoucs(const char *src)
     113             : {
     114          74 :         uint32_t *dest;
     115          74 :         size_t i = 0;
     116          74 :         size_t j = 0;
     117             : 
     118             :         /* count how many uint32_t's we need, while also checking for
     119             :          * correctness of the input */
     120         340 :         while (src[j]) {
     121         266 :                 i++;
     122         266 :                 if ((src[j + 0] & 0x80) == 0) {
     123         192 :                         j += 1;
     124          74 :                 } else if ((src[j + 0] & 0xE0) == 0xC0
     125          49 :                                    && (src[j + 1] & 0xC0) == 0x80 && (src[j + 0] & 0x1E) != 0) {
     126          49 :                         j += 2;
     127          25 :                 } else if ((src[j + 0] & 0xF0) == 0xE0
     128          25 :                                    && (src[j + 1] & 0xC0) == 0x80
     129          25 :                                    && (src[j + 2] & 0xC0) == 0x80
     130          25 :                                    && ((src[j + 0] & 0x0F) != 0 || (src[j + 1] & 0x20) != 0)) {
     131          25 :                         j += 3;
     132           0 :                 } else if ((src[j + 0] & 0xF8) == 0xF0
     133           0 :                                    && (src[j + 1] & 0xC0) == 0x80
     134           0 :                                    && (src[j + 2] & 0xC0) == 0x80
     135           0 :                                    && (src[j + 3] & 0xC0) == 0x80) {
     136           0 :                         uint32_t c = (src[j + 0] & 0x07) << 18
     137           0 :                                         | (src[j + 1] & 0x3F) << 12
     138           0 :                                         | (src[j + 2] & 0x3F) << 6 | (src[j + 3] & 0x3F);
     139           0 :                         if (c < 0x10000 || c > 0x10FFFF || (c & 0x1FF800) == 0x00D800)
     140             :                                 return NULL;
     141           0 :                         j += 4;
     142             :                 } else {
     143             :                         return NULL;
     144             :                 }
     145             :         }
     146          74 :         dest = GDKmalloc((i + 1) * sizeof(uint32_t));
     147          74 :         if (dest == NULL)
     148             :                 return NULL;
     149             :         /* go through the source string again, this time we can skip
     150             :          * the correctness tests */
     151             :         i = j = 0;
     152         340 :         while (src[j]) {
     153         266 :                 if ((src[j + 0] & 0x80) == 0) {
     154         192 :                         dest[i++] = src[j + 0];
     155         192 :                         j += 1;
     156          74 :                 } else if ((src[j + 0] & 0xE0) == 0xC0) {
     157          49 :                         dest[i++] = (src[j + 0] & 0x1F) << 6 | (src[j + 1] & 0x3F);
     158          49 :                         j += 2;
     159          25 :                 } else if ((src[j + 0] & 0xF0) == 0xE0) {
     160          25 :                         dest[i++] = (src[j + 0] & 0x0F) << 12
     161          25 :                                         | (src[j + 1] & 0x3F) << 6 | (src[j + 2] & 0x3F);
     162          25 :                         j += 3;
     163           0 :                 } else if ((src[j + 0] & 0xF8) == 0xF0) {
     164           0 :                         dest[i++] = (src[j + 0] & 0x07) << 18
     165           0 :                                         | (src[j + 1] & 0x3F) << 12
     166           0 :                                         | (src[j + 2] & 0x3F) << 6 | (src[j + 3] & 0x3F);
     167           0 :                         j += 4;
     168             :                 }
     169             :         }
     170          74 :         dest[i] = 0;
     171          74 :         return dest;
     172             : }
     173             : 
     174             : static size_t
     175          33 : myucslen(const uint32_t *ucs)
     176             : {
     177          33 :         size_t i = 0;
     178             : 
     179          66 :         while (ucs[i])
     180          33 :                 i++;
     181          33 :         return i;
     182             : }
     183             : 
     184             : static inline bool
     185          14 : mywstrncaseeq(const char *restrict s1, const uint32_t *restrict s2, size_t n2,
     186             :                           bool atend)
     187             : {
     188          14 :         uint32_t c1;
     189             : 
     190          27 :         while (n2 > 0) {
     191          20 :                 size_t nn1 = utfc8touc(&c1, s1);
     192          20 :                 if (nn1 == 0 || nn1 == (size_t) -1)
     193           0 :                         return (*s2 == 0);
     194          20 :                 if (*s2 == 0)
     195             :                         return false;
     196             : #if SIZEOF_WCHAR_T == 2
     197             :                 if (c1 > 0xFFFF || *s2 > 0xFFFF) {
     198             :                         if (c1 != *s2)
     199             :                                 return false;
     200             :                 } else
     201             : #endif
     202          20 :                 if (towlower((wint_t) c1) != towlower((wint_t) * s2))
     203             :                         return false;
     204          13 :                 s1 += nn1;
     205          13 :                 n2--;
     206          13 :                 s2++;
     207             :         }
     208          14 :         return !atend || *s1 == 0;
     209             : }
     210             : 
     211             : static inline int
     212           0 : mystrcasecmp(const char *s1, const char *s2)
     213             : {
     214           0 :         uint32_t c1 = 0, c2 = 0;
     215             : 
     216           0 :         for (;;) {
     217           0 :                 size_t nn1 = utfc8touc(&c1, s1);
     218           0 :                 size_t nn2 = utfc8touc(&c2, s2);
     219           0 :                 if (nn1 == 0 || nn1 == (size_t) -1)
     220           0 :                         return -(nn2 != 0 && nn2 != (size_t) -1);
     221           0 :                 if (nn2 == 0 || nn2 == (size_t) -1)
     222             :                         return 1;
     223             : #if SIZEOF_WCHAR_T == 2
     224             :                 if (c1 > 0xFFFF || c2 > 0xFFFF) {
     225             :                         if (c1 != c2)
     226             :                                 return c1 - c2;
     227             :                 } else
     228             : #endif
     229           0 :                 if (towlower((wint_t) c1) != towlower((wint_t) c2))
     230           0 :                         return towlower((wint_t) c1) - towlower((wint_t) c2);
     231           0 :                 s1 += nn1;
     232           0 :                 s2 += nn2;
     233             :         }
     234             : }
     235             : 
     236             : static inline int
     237          42 : mywstrcasecmp(const char *restrict s1, const uint32_t *restrict s2)
     238             : {
     239          42 :         uint32_t c1 = 0;
     240             : 
     241         330 :         for (;;) {
     242         186 :                 size_t nn1 = utfc8touc(&c1, s1);
     243         186 :                 if (nn1 == 0 || nn1 == (size_t) -1)
     244          22 :                         return -(*s2 != 0);
     245         164 :                 if (*s2 == 0)
     246             :                         return 1;
     247             : #if SIZEOF_WCHAR_T == 2
     248             :                 if (c1 > 0xFFFF || *s2 > 0xFFFF) {
     249             :                         if (c1 != *s2)
     250             :                                 return c1 - *s2;
     251             :                 } else
     252             : #endif
     253         164 :                 if (towlower((wint_t) c1) != towlower((wint_t) * s2))
     254          20 :                         return towlower((wint_t) c1) - towlower((wint_t) * s2);
     255         144 :                 s1 += nn1;
     256         144 :                 s2++;
     257             :         }
     258             : }
     259             : 
     260             : static inline const char *
     261          33 : mywstrcasestr(const char *restrict haystack, const uint32_t *restrict wneedle,
     262             :                           bool atend)
     263             : {
     264          33 :         size_t nlen = myucslen(wneedle);
     265             : 
     266          33 :         if (nlen == 0)
     267           0 :                 return atend ? haystack + strlen(haystack) : haystack;
     268             : 
     269          86 :         while (*haystack) {
     270             :                 size_t i;
     271             :                 size_t h;
     272             :                 size_t step = 0;
     273          83 :                 for (i = h = 0; i < nlen; i++) {
     274          68 :                         uint32_t c = 0;
     275          68 :                         size_t j = utfc8touc(&c, haystack + h);
     276          68 :                         if (j == 0 || j == (size_t) -1)
     277           0 :                                 return NULL;
     278          68 :                         if (i == 0) {
     279          68 :                                 step = j;
     280             :                         }
     281             : #if SIZEOF_WCHAR_T == 2
     282             :                         if (c > 0xFFFF || wneedle[i] > 0xFFFF) {
     283             :                                 if (c != wneedle[i])
     284             :                                         break;
     285             :                         } else
     286             : #endif
     287          68 :                         if (towlower((wint_t) c) != towlower((wint_t) wneedle[i]))
     288             :                                 break;
     289          15 :                         h += j;
     290             :                 }
     291          68 :                 if (i == nlen && (!atend || haystack[h] == 0))
     292          15 :                         return haystack;
     293          53 :                 haystack += step;
     294             :         }
     295             :         return NULL;
     296             : }
     297             : 
     298             : /* returns true if the pattern does not contain an unescaped `_'
     299             :  * (single character match); `%' (any sequence match) may appear
     300             :  * anywhere.  A NULL pattern is never simple. */
     301             : static inline bool
     302        8626 : re_simple(const char *pat, unsigned char esc)
     303             : {
     304        8626 :         bool escaped = false;
     305             : 
     306        8626 :         if (pat == 0)
     307             :                 return false;
     308        8626 :         if (*pat == '%') {
     309        7494 :                 pat++;
     310             :         }
     311       61516 :         while (*pat) {
     312       53645 :                 if (escaped) {
     313             :                         escaped = false;
     314       53485 :                 } else if ((unsigned char) *pat == esc) {
     315             :                         escaped = true;
     316       53321 :                 } else if (*pat == '_') {
     317             :                         return false;
     318             :                 }
     319       52890 :                 pat++;
     320             :         }
     321             :         return true;
     322             : }
     323             : 
     324             : static inline bool
     325        9550 : re_is_pattern_properly_escaped(const char *pat, unsigned char esc)
     326             : {
     327        9550 :         bool escaped = false;
     328             : 
     329        9550 :         if (pat == 0)
     330             :                 return true;
     331       79106 :         while (*pat) {
     332       69556 :                 if (escaped) {
     333             :                         escaped = false;
     334       69374 :                 } else if ((unsigned char) *pat == esc) {
     335       69556 :                         escaped = true;
     336             :                 }
     337       69556 :                 pat++;
     338             :         }
     339        9550 :         return escaped ? false : true;
     340             : }
     341             : 
     342             : /* returns true if the pattern does not contain wildcard
     343             :  * characters ('%' or '_') and no character is escaped
     344             :  */
     345             : static inline bool
     346        9558 : is_strcmpable(const char *pat, const char *esc)
     347             : {
     348        9558 :         if (pat[strcspn(pat, "%_")])
     349             :                 return false;
     350        1941 :         return strlen(esc) == 0 || strNil(esc) || strstr(pat, esc) == NULL;
     351             : }
     352             : 
     353             : /* Compare two strings ignoring ASCII case (only 'A'-'Z' are folded).
     354             :  * When both strings are lower case this function returns the same result as strcmp.
     355             :  */
     356             : static int
     357         601 : istrcmp(const char *s1, const char *s2)
     358             : {
     359         601 :         char c1, c2;
     360         601 :         const char *p1, *p2;
     361        1000 :         for (p1 = s1, p2 = s2; *p1 && *p2; p1++, p2++) {
     362         530 :                 c1 = *p1;
     363         530 :                 c2 = *p2;
     364             : 
     365         530 :                 if ('A' <= c1 && c1 <= 'Z')
     366          19 :                         c1 += 'a' - 'A';
     367             : 
     368         530 :                 if ('A' <= c2 && c2 <= 'Z')
     369          72 :                         c2 += 'a' - 'A';
     370             : 
     371         530 :                 if (c1 != c2)
     372         131 :                         return (c1 - c2);
     373             :         }
     374             : 
     375         470 :         if (*p1 != *p2)
     376         397 :                 return *p1 - *p2;
     377             : 
     378             :         return 0;
     379             : }
     380             : 
     381             : /* Compare at most len characters of two strings ignoring
     382             :  * case. When both strings are lowercase this function
     383             :  * returns the same result as strncmp.
     384             :  */
     385             : static int
     386          16 : istrncmp(const char *s1, const char *s2, size_t len)
     387             : {
     388          16 :         char c1, c2;
     389          16 :         const char *p1, *p2;
     390          16 :         size_t n = 0;
     391             : 
     392          32 :         for (p1 = s1, p2 = s2; *p1 && *p2 && (n < len); p1++, p2++, n++) {
     393          16 :                 c1 = *p1;
     394          16 :                 c2 = *p2;
     395             : 
     396          16 :                 if ('A' <= c1 && c1 <= 'Z')
     397           4 :                         c1 += 'a' - 'A';
     398             : 
     399          16 :                 if ('A' <= c2 && c2 <= 'Z')
     400           0 :                         c2 += 'a' - 'A';
     401             : 
     402          16 :                 if (c1 != c2)
     403           0 :                         return c1 - c2;
     404             :         }
     405             : 
     406          16 :         if (*p1 != *p2 && n < len)
     407           0 :                 return *p1 - *p2;
     408             : 
     409             :         return 0;
     410             : }
     411             : 
     412             : 
     413             : /* Find the first occurrence of the substring needle in
     414             :  * haystack ignoring case.
     415             :  *
     416             :  * NOTE: This function assumes that the needle is already
     417             :  * lowercase.
     418             :  */
     419             : static const char *
     420        6204 : istrstr(const char *haystack, const char *needle)
     421             : {
     422        6204 :         const char *ph;
     423        6204 :         const char *pn;
     424        6204 :         const char *p1;
     425        6204 :         bool match = true;
     426             : 
     427      227049 :         for (ph = haystack; *ph; ph++) {
     428      270770 :                 match = true;
     429      270770 :                 for (pn = needle, p1 = ph; *pn && *p1; pn++, p1++) {
     430      268593 :                         char c1 = *pn;
     431      268593 :                         char c2 = ('A' <= *p1 && *p1 <= 'Z') ? *p1 - 'A' + 'a' : *p1;
     432      268593 :                         if (c1 != c2) {
     433             :                                 match = false;
     434             :                                 break;
     435             :                         }
     436             :                 }
     437             : 
     438             :                 /* We reached the end of the haystack, but we still have characters in
     439             :                  * needle. None of the future iterations will match.
     440             :                  */
     441      223021 :                 if (*p1 == 0 && *pn != 0) {
     442             :                         break;
     443             :                 }
     444             : 
     445      223021 :                 if (match) {
     446        2176 :                         return ph;
     447             :                 }
     448             :         }
     449             :         return NULL;
     450             : }
     451             : 
     452             : /* Match regular expression by comparing bytes.
     453             :  *
     454             :  * This is faster than re_match_ignore, because it does not
     455             :  * need to decode characters. This function should be used
     456             :  * in all cases except when we need to perform UTF-8
     457             :  * comparisons ignoring case.
     458             :  *
     459             :  * TODO: The name of the function is no longer accurate and
     460             :  * needs to change.
     461             :  */
     462             : static inline bool
     463      135083 : re_match_no_ignore(const char *restrict s, const struct RE *restrict pattern)
     464             : {
     465      135083 :         const struct RE *r;
     466      135083 :         size_t l;
     467             : 
     468      184916 :         for (r = pattern; r; r = r->n) {
     469      135834 :                 if (*r->k == 0 && (r->search || *s == 0))
     470             :                         return true;
     471      122028 :                 if (!*s ||
     472             :                         (r->search
     473      121955 :                          ? (r->atend
     474      108567 :                                 ? (r->case_ignore
     475        4997 :                                    ? (l = strlen(s)) < r->len || istrcmp(s + l - r->len, r->k) != 0
     476        4913 :                                    : (l = strlen(s)) < r->len || strcmp(s + l - r->len, r->k) != 0)
     477      103570 :                                 : (r->case_ignore ? (s = istrstr(s, r->k)) == NULL
     478       97464 :                                    : (s = strstr(s, r->k)) == NULL))
     479             :                          : (r->atend
     480       13388 :                                 ? (r->case_ignore ? istrcmp(s, r->k) != 0
     481          93 :                                    : strcmp(s, r->k) != 0)
     482       13295 :                                 : (r->case_ignore ? istrncmp(s, r->k, r->len) != 0
     483       13279 :                                    : strncmp(s, r->k, r->len) != 0))))
     484             :                         return false;
     485       49833 :                 s += r->len;
     486             :         }
     487             :         return true;
     488             : }
     489             : 
     490             : /* Match a regular expression by comparing wide characters.
     491             :  *
     492             :  * This needs to be used when we need to perform a
     493             :  * case-ignoring comparison involving UTF-8 characters.
     494             :  */
     495             : static inline bool
     496          44 : re_match_ignore(const char *restrict s, const struct RE *restrict pattern)
     497             : {
     498          44 :         const struct RE *r;
     499             : 
     500             :         /* Since the pattern is ascii, do the cheaper comparison */
     501          44 :         if (pattern->is_ascii) {
     502           0 :                 return re_match_no_ignore(s, pattern);
     503             :         }
     504             : 
     505          66 :         for (r = pattern; r; r = r->n) {
     506          47 :                 if (*r->w == 0 && (r->search || *s == 0))
     507             :                         return true;
     508          47 :                 if (!*s ||
     509             :                         (r->search
     510          47 :                          ? (s = mywstrcasestr(s, r->w, r->atend)) == NULL
     511          14 :                          : !mywstrncaseeq(s, r->w, r->len, r->atend)))
     512             :                         return false;
     513          22 :                 s += r->len;
     514             :         }
     515             :         return true;
     516             : }
     517             : 
     518             : static void
     519        7894 : re_destroy(struct RE *p)
     520             : {
     521        7894 :         if (p) {
     522        7894 :                 GDKfree(p->k);
     523        7898 :                 GDKfree(p->w);
     524        8029 :                 do {
     525        8029 :                         struct RE *n = p->n;
     526             : 
     527        8029 :                         GDKfree(p);
     528        8031 :                         p = n;
     529        8031 :                 } while (p);
     530             :         }
     531        7900 : }
     532             : 
     533             : /* Create a linked list of RE structures.  Depending on the
     534             :  * caseignore and the ascii_pattern flags, either the w field
     535             :  * (if caseignore == true && ascii_pattern == false) or the k
     536             :  * field (in every other case) is used.  In the first structure
     537             :  * that field is allocated, whereas in all subsequent
     538             :  * structures it points into the allocated buffer of
     539             :  * the first.
     540             :  */
     541             : static struct RE *
     542        7891 : re_create(const char *pat, bool caseignore, bool ascii_pattern, uint32_t esc)
     543             : {
     544        7891 :         struct RE *r = GDKmalloc(sizeof(struct RE)), *n = r;
     545        7897 :         bool escaped = false;
     546             : 
     547        7897 :         if (r == NULL)
     548             :                 return NULL;
     549        7897 :         *r = (struct RE) {.atend = true };
     550             : 
     551       14997 :         while (esc != '%' && *pat == '%') {
     552        7100 :                 pat++;                                  /* skip % */
     553        7100 :                 r->search = true;
     554             :         }
     555        7897 :         if (caseignore && !ascii_pattern) {
     556          36 :                 uint32_t *wp;
     557          36 :                 uint32_t *wq;
     558          36 :                 wp = utf8stoucs(pat);
     559          36 :                 if (wp == NULL) {
     560           0 :                         GDKfree(r);
     561           0 :                         return NULL;
     562             :                 }
     563          36 :                 r->w = wp;
     564          36 :                 wq = wp;
     565         112 :                 while (*wp) {
     566          76 :                         if (escaped) {
     567           0 :                                 *wq++ = *wp;
     568           0 :                                 n->len++;
     569           0 :                                 escaped = false;
     570          76 :                         } else if (*wp == esc) {
     571             :                                 escaped = true;
     572          76 :                         } else if (*wp == '%') {
     573          28 :                                 n->atend = false;
     574          28 :                                 while (wp[1] == '%')
     575           0 :                                         wp++;
     576          28 :                                 if (wp[1]) {
     577           4 :                                         n = n->n = GDKmalloc(sizeof(struct RE));
     578           4 :                                         if (n == NULL)
     579           0 :                                                 goto bailout;
     580           4 :                                         *n = (struct RE) {
     581             :                                                 .search = true,
     582             :                                                 .atend = true,
     583           4 :                                                 .w = wp + 1,
     584             :                                         };
     585             :                                 }
     586          28 :                                 *wq = 0;
     587          28 :                                 wq = wp + 1;
     588             :                         } else {
     589          48 :                                 *wq++ = *wp;
     590          48 :                                 n->len++;
     591             :                         }
     592          76 :                         wp++;
     593             :                 }
     594          36 :                 *wq = 0;
     595             :         } else {
     596        7861 :                 char *p, *q;
     597        7861 :                 if ((p = GDKstrdup(pat)) == NULL) {
     598           0 :                         GDKfree(r);
     599           0 :                         return NULL;
     600             :                 }
     601        7856 :                 if (ascii_pattern)
     602        7853 :                         n->is_ascii = true;
     603        7856 :                 if (caseignore)
     604          94 :                         n->case_ignore = true;
     605             : 
     606          94 :                 if (ascii_pattern && caseignore) {
     607         991 :                         for (q = p; *q != 0; q++) {
     608         896 :                                 if ('A' <= *q && *q <= 'Z')
     609          21 :                                         *q += 'a' - 'A';
     610             :                         }
     611             :                 }
     612             : 
     613        7856 :                 r->k = p;
     614        7856 :                 q = p;
     615       58761 :                 while (*p) {
     616       50905 :                         if (escaped) {
     617         158 :                                 *q++ = *p;
     618         158 :                                 n->len++;
     619         158 :                                 escaped = false;
     620       50747 :                         } else if ((unsigned char) *p == esc) {
     621             :                                 escaped = true;
     622       50589 :                         } else if (*p == '%') {
     623        7507 :                                 n->atend = false;
     624        7563 :                                 while (p[1] == '%')
     625          56 :                                         p++;
     626        7507 :                                 if (p[1]) {
     627         127 :                                         n = n->n = GDKmalloc(sizeof(struct RE));
     628         127 :                                         if (n == NULL)
     629           0 :                                                 goto bailout;
     630         127 :                                         *n = (struct RE) {
     631             :                                                 .search = true,
     632             :                                                 .atend = true,
     633         127 :                                                 .k = p + 1
     634             :                                         };
     635         127 :                                         if (ascii_pattern) {
     636         124 :                                                 n->is_ascii = true;
     637             :                                         }
     638         127 :                                         if (caseignore) {
     639          25 :                                                 n->case_ignore = true;
     640             :                                         }
     641             :                                 }
     642        7507 :                                 *q = 0;
     643        7507 :                                 q = p + 1;
     644             :                         } else {
     645       43082 :                                 char c = *p;
     646       43082 :                                 if (ascii_pattern && caseignore && 'A' <= c && c <= 'Z') {
     647           0 :                                         c += 'a' - 'A';
     648             :                                 }
     649       43082 :                                 *q++ = c;
     650       43082 :                                 n->len++;
     651             :                         }
     652       50905 :                         p++;
     653             :                 }
     654        7856 :                 *q = 0;
     655             :         }
     656             :         return r;
     657           0 :   bailout:
     658           0 :         re_destroy(r);
     659           0 :         return NULL;
     660             : }
     661             : 
     662             : #ifdef HAVE_LIBPCRE
     663             : static str
     664          25 : pcre_compile_wrap(pcre **res, const char *pattern, bit insensitive)
     665             : {
     666          25 :         pcre *r;
     667          25 :         const char *err_p = NULL;
     668          25 :         int errpos = 0;
     669          25 :         int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE;
     670          25 :         if (insensitive)
     671           0 :                 options |= PCRE_CASELESS;
     672             : 
     673          25 :         if ((r = pcre_compile(pattern, options, &err_p, &errpos, NULL)) == NULL) {
     674           0 :                 throw(MAL, "pcre.compile", OPERATION_FAILED
     675             :                           " with\n'%s'\nat %d in\n'%s'.\n", err_p, errpos, pattern);
     676             :         }
     677          25 :         *res = r;
     678          25 :         return MAL_SUCCEED;
     679             : }
     680             : #endif
     681             : 
     682             : /* maximum number of back references and quoted \ or $ in replacement string */
     683             : #define MAX_NR_REFS             20
     684             : 
     685             : struct backref {
     686             :         int idx;
     687             :         int start;
     688             :         int end;
     689             : };
     690             : 
     691             : #ifdef HAVE_LIBPCRE
     692             : /* fill in parameter backrefs (length maxrefs) with information about
     693             :  * back references in the replacement string; a back reference is a
     694             :  * dollar or backslash followed by a number */
     695             : static int
     696          78 : parse_replacement(const char *replacement, int len_replacement,
     697             :                                   struct backref *backrefs, int maxrefs)
     698             : {
     699          78 :         int nbackrefs = 0;
     700             : 
     701         126 :         for (int i = 0; i < len_replacement && nbackrefs < maxrefs; i++) {
     702          48 :                 if (replacement[i] == '$' || replacement[i] == '\\') {
     703           6 :                         char *endptr;
     704           6 :                         backrefs[nbackrefs].idx = strtol(replacement + i + 1, &endptr, 10);
     705           6 :                         if (endptr > replacement + i + 1) {
     706           6 :                                 int k = (int) (endptr - (replacement + i + 1));
     707           6 :                                 backrefs[nbackrefs].start = i;
     708           6 :                                 backrefs[nbackrefs].end = i + k + 1;
     709           6 :                                 nbackrefs++;
     710           0 :                         } else if (replacement[i] == replacement[i + 1]) {
     711             :                                 /* doubled $ or \, we must copy just one to the output */
     712           0 :                                 backrefs[nbackrefs].idx = INT_MAX;      /* impossible value > 0 */
     713           0 :                                 backrefs[nbackrefs].start = i;
     714           0 :                                 backrefs[nbackrefs].end = i + 1;
     715           0 :                                 i++;                    /* don't look at second $ or \ again */
     716           0 :                                 nbackrefs++;
     717             :                         }
     718             :                         /* else: $ or \ followed by something we don't recognize,
     719             :                          * so just leave it */
     720             :                 }
     721             :         }
     722          78 :         return nbackrefs;
     723             : }
     724             : 
     725             : static char *
     726       27892 : single_replace(pcre *pcre_code, pcre_extra *extra,
     727             :                            const char *origin_str, int len_origin_str,
     728             :                            int exec_options, int *ovector, int ovecsize,
     729             :                            const char *replacement, int len_replacement,
     730             :                            struct backref *backrefs, int nbackrefs,
     731             :                            bool global, char *result, int *max_result)
     732             : {
     733       27892 :         int offset = 0;
     734       27892 :         int len_result = 0;
     735      104375 :         int addlen;
     736      104375 :         char *tmp;
     737             : 
     738      104375 :         do {
     739      104375 :                 int j = pcre_exec(pcre_code, extra, origin_str, len_origin_str, offset,
     740             :                                                   exec_options, ovector, ovecsize);
     741      104471 :                 if (j <= 0)
     742             :                         break;
     743       78653 :                 addlen = ovector[0] - offset + (nbackrefs == 0 ? len_replacement : 0);
     744       78653 :                 if (len_result + addlen >= *max_result) {
     745        6840 :                         tmp = GDKrealloc(result, len_result + addlen + 1);
     746        6840 :                         if (tmp == NULL) {
     747           0 :                                 GDKfree(result);
     748           0 :                                 return NULL;
     749             :                         }
     750        6840 :                         result = tmp;
     751        6840 :                         *max_result = len_result + addlen + 1;
     752             :                 }
     753       78653 :                 if (ovector[0] > offset) {
     754       76482 :                         strncpy(result + len_result, origin_str + offset,
     755       76482 :                                         ovector[0] - offset);
     756       76482 :                         len_result += ovector[0] - offset;
     757             :                 }
     758       78653 :                 if (nbackrefs == 0) {
     759       76486 :                         strncpy(result + len_result, replacement, len_replacement);
     760       76486 :                         len_result += len_replacement;
     761             :                 } else {
     762             :                         int prevend = 0;
     763        4334 :                         for (int i = 0; i < nbackrefs; i++) {
     764        2167 :                                 int off, len;
     765        2167 :                                 if (backrefs[i].idx >= ovecsize / 3) {
     766             :                                         /* out of bounds, replace with empty string */
     767             :                                         off = 0;
     768             :                                         len = 0;
     769             :                                 } else {
     770        2167 :                                         off = ovector[backrefs[i].idx * 2];
     771        2167 :                                         len = ovector[backrefs[i].idx * 2 + 1] - off;
     772             :                                 }
     773        2167 :                                 addlen = backrefs[i].start - prevend + len;
     774        2167 :                                 if (len_result + addlen >= *max_result) {
     775          21 :                                         tmp = GDKrealloc(result, len_result + addlen + 1);
     776          21 :                                         if (tmp == NULL) {
     777           0 :                                                 GDKfree(result);
     778           0 :                                                 return NULL;
     779             :                                         }
     780          21 :                                         result = tmp;
     781          21 :                                         *max_result = len_result + addlen + 1;
     782             :                                 }
     783        2167 :                                 if (backrefs[i].start > prevend) {
     784           2 :                                         strncpy(result + len_result, replacement + prevend,
     785           2 :                                                         backrefs[i].start - prevend);
     786           2 :                                         len_result += backrefs[i].start - prevend;
     787             :                                 }
     788        2167 :                                 if (len > 0) {
     789        2167 :                                         strncpy(result + len_result, origin_str + off, len);
     790        2167 :                                         len_result += len;
     791             :                                 }
     792        2167 :                                 prevend = backrefs[i].end;
     793             :                         }
     794             :                         /* copy rest of replacement string (after last backref) */
     795        2167 :                         addlen = len_replacement - prevend;
     796        2167 :                         if (addlen > 0) {
     797           2 :                                 if (len_result + addlen >= *max_result) {
     798           1 :                                         tmp = GDKrealloc(result, len_result + addlen + 1);
     799           1 :                                         if (tmp == NULL) {
     800           0 :                                                 GDKfree(result);
     801           0 :                                                 return NULL;
     802             :                                         }
     803           1 :                                         result = tmp;
     804           1 :                                         *max_result = len_result + addlen + 1;
     805             :                                 }
     806           2 :                                 strncpy(result + len_result, replacement + prevend, addlen);
     807           2 :                                 len_result += addlen;
     808             :                         }
     809             :                 }
     810       78653 :                 offset = ovector[1];
     811       78653 :         } while (offset < len_origin_str && global);
     812       27988 :         if (offset < len_origin_str) {
     813       25768 :                 addlen = len_origin_str - offset;
     814       25768 :                 if (len_result + addlen >= *max_result) {
     815         367 :                         tmp = GDKrealloc(result, len_result + addlen + 1);
     816         368 :                         if (tmp == NULL) {
     817           0 :                                 GDKfree(result);
     818           0 :                                 return NULL;
     819             :                         }
     820         368 :                         result = tmp;
     821         368 :                         *max_result = len_result + addlen + 1;
     822             :                 }
     823       25769 :                 strncpy(result + len_result, origin_str + offset, addlen);
     824       25769 :                 len_result += addlen;
     825             :         }
     826             :         /* null terminate string */
     827       27989 :         result[len_result] = '\0';
     828       27989 :         return result;
     829             : }
     830             : #endif
     831             : 
     832             : static str
     833          10 : pcre_replace(str *res, const char *origin_str, const char *pattern,
     834             :                          const char *replacement, const char *flags, bool global)
     835             : {
     836             : #ifdef HAVE_LIBPCRE
     837          10 :         const char *err_p = NULL;
     838          10 :         pcre *pcre_code = NULL;
     839          10 :         pcre_extra *extra;
     840          10 :         char *tmpres;
     841          10 :         int max_result;
     842          10 :         int i, errpos = 0;
     843          10 :         int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
     844          10 :         int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
     845          10 :         int *ovector, ovecsize;
     846          10 :         int len_origin_str = (int) strlen(origin_str);
     847          10 :         int len_replacement = (int) strlen(replacement);
     848          10 :         struct backref backrefs[MAX_NR_REFS];
     849          10 :         int nbackrefs = 0;
     850             : 
     851          14 :         while (*flags) {
     852           4 :                 switch (*flags) {
     853             :                 case 'e':
     854             :                         exec_options &= ~PCRE_NOTEMPTY;
     855             :                         break;
     856           1 :                 case 'i':
     857           1 :                         compile_options |= PCRE_CASELESS;
     858           1 :                         break;
     859           1 :                 case 'm':
     860           1 :                         compile_options |= PCRE_MULTILINE;
     861           1 :                         break;
     862           1 :                 case 's':
     863           1 :                         compile_options |= PCRE_DOTALL;
     864           1 :                         break;
     865           1 :                 case 'x':
     866           1 :                         compile_options |= PCRE_EXTENDED;
     867           1 :                         break;
     868           0 :                 default:
     869           0 :                         throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     870             :                                   ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
     871             :                                   *flags);
     872             :                 }
     873           4 :                 flags++;
     874             :         }
     875             : 
     876          10 :         if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
     877           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     878             :                           OPERATION_FAILED
     879             :                           ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
     880             :                           pattern, errpos, err_p);
     881             :         }
     882             : 
     883             :         /* Since the compiled pattern is going to be used several times, it is
     884             :          * worth spending more time analyzing it in order to speed up the time
     885             :          * taken for matching.
     886             :          */
     887          10 :         extra = pcre_study(pcre_code, 0, &err_p);
     888          10 :         if (err_p != NULL) {
     889           0 :                 pcre_free(pcre_code);
     890           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     891             :                           OPERATION_FAILED
     892             :                           ": pcre study of pattern (%s) failed with '%s'.\n", pattern,
     893             :                           err_p);
     894             :         }
     895          10 :         pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
     896          10 :         ovecsize = (i + 1) * 3;
     897          10 :         if ((ovector = (int *) GDKmalloc(sizeof(int) * ovecsize)) == NULL) {
     898           0 :                 pcre_free_study(extra);
     899           0 :                 pcre_free(pcre_code);
     900           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     901             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     902             :         }
     903             : 
     904             :         /* identify back references in the replacement string */
     905          10 :         nbackrefs = parse_replacement(replacement, len_replacement,
     906             :                                                                   backrefs, MAX_NR_REFS);
     907             : 
     908          10 :         max_result = len_origin_str + 1;
     909          10 :         tmpres = GDKmalloc(max_result);
     910          10 :         if (tmpres == NULL) {
     911           0 :                 GDKfree(ovector);
     912           0 :                 pcre_free_study(extra);
     913           0 :                 pcre_free(pcre_code);
     914           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     915             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     916             :         }
     917             : 
     918          10 :         tmpres = single_replace(pcre_code, extra, origin_str, len_origin_str,
     919             :                                                         exec_options, ovector, ovecsize, replacement,
     920             :                                                         len_replacement, backrefs, nbackrefs, global,
     921             :                                                         tmpres, &max_result);
     922          10 :         GDKfree(ovector);
     923          10 :         pcre_free_study(extra);
     924          10 :         pcre_free(pcre_code);
     925          10 :         if (tmpres == NULL)
     926           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     927             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     928             : 
     929          10 :         *res = tmpres;
     930          10 :         return MAL_SUCCEED;
     931             : #else
     932             :         (void) res;
     933             :         (void) origin_str;
     934             :         (void) pattern;
     935             :         (void) replacement;
     936             :         (void) flags;
     937             :         (void) global;
     938             :         throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     939             :                   "Database was compiled without PCRE support.");
     940             : #endif
     941             : }
     942             : 
     943             : static str
     944          70 : pcre_replace_bat(BAT **res, BAT *origin_strs, const char *pattern,
     945             :                                  const char *replacement, const char *flags, bool global)
     946             : {
     947             : #ifdef HAVE_LIBPCRE
     948          70 :         const char *err_p = NULL;
     949          70 :         char *tmpres;
     950          70 :         int i, errpos = 0;
     951          70 :         int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
     952          70 :         int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
     953          70 :         pcre *pcre_code = NULL;
     954          70 :         pcre_extra *extra;
     955          70 :         BAT *tmpbat;
     956          70 :         BUN p, q;
     957          70 :         int *ovector, ovecsize;
     958          70 :         int len_replacement = (int) strlen(replacement);
     959          70 :         struct backref backrefs[MAX_NR_REFS];
     960          70 :         int nbackrefs = 0;
     961          70 :         const char *origin_str;
     962          70 :         int max_dest_size = 0;
     963             : 
     964          90 :         while (*flags) {
     965          20 :                 switch (*flags) {
     966             :                 case 'e':
     967             :                         exec_options &= ~PCRE_NOTEMPTY;
     968             :                         break;
     969           5 :                 case 'i':
     970           5 :                         compile_options |= PCRE_CASELESS;
     971           5 :                         break;
     972          10 :                 case 'm':
     973          10 :                         compile_options |= PCRE_MULTILINE;
     974          10 :                         break;
     975           5 :                 case 's':
     976           5 :                         compile_options |= PCRE_DOTALL;
     977           5 :                         break;
     978           0 :                 case 'x':
     979           0 :                         compile_options |= PCRE_EXTENDED;
     980           0 :                         break;
     981           0 :                 default:
     982           0 :                         throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     983             :                                   ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
     984             :                                   *flags);
     985             :                 }
     986          20 :                 flags++;
     987             :         }
     988             : 
     989          70 :         if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
     990           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     991             :                           OPERATION_FAILED
     992             :                           ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
     993             :                           pattern, errpos, err_p);
     994             :         }
     995             : 
     996             :         /* Since the compiled pattern is going to be used several times,
     997             :          * it is worth spending more time analyzing it in order to speed
     998             :          * up the time taken for matching.
     999             :          */
    1000         138 :         extra = pcre_study(pcre_code,
    1001          69 :                                            BATcount(origin_strs) >
    1002             :                                            JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &err_p);
    1003          69 :         if (err_p != NULL) {
    1004           0 :                 pcre_free(pcre_code);
    1005           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
    1006             :                           OPERATION_FAILED);
    1007             :         }
    1008          69 :         pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
    1009          67 :         ovecsize = (i + 1) * 3;
    1010          67 :         if ((ovector = (int *) GDKzalloc(sizeof(int) * ovecsize)) == NULL) {
    1011           0 :                 pcre_free_study(extra);
    1012           0 :                 pcre_free(pcre_code);
    1013           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
    1014             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1015             :         }
    1016             : 
    1017             :         /* identify back references in the replacement string */
    1018          70 :         nbackrefs = parse_replacement(replacement, len_replacement,
    1019             :                                                                   backrefs, MAX_NR_REFS);
    1020             : 
    1021          68 :         tmpbat = COLnew(origin_strs->hseqbase, TYPE_str, BATcount(origin_strs),
    1022             :                                         TRANSIENT);
    1023             : 
    1024             :         /* the buffer for all destination strings is allocated only once,
    1025             :          * and extended when needed */
    1026          70 :         max_dest_size = len_replacement + 1;
    1027          70 :         tmpres = GDKmalloc(max_dest_size);
    1028          70 :         if (tmpbat == NULL || tmpres == NULL) {
    1029           0 :                 pcre_free_study(extra);
    1030           0 :                 pcre_free(pcre_code);
    1031           0 :                 GDKfree(ovector);
    1032           0 :                 BBPreclaim(tmpbat);
    1033           0 :                 GDKfree(tmpres);
    1034           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
    1035             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1036             :         }
    1037          70 :         BATiter origin_strsi = bat_iterator(origin_strs);
    1038       28092 :         BATloop(origin_strs, p, q) {
    1039       28023 :                 origin_str = BUNtvar(origin_strsi, p);
    1040       56003 :                 tmpres = single_replace(pcre_code, extra, origin_str,
    1041       28045 :                                                                 (int) strlen(origin_str), exec_options,
    1042             :                                                                 ovector, ovecsize, replacement,
    1043             :                                                                 len_replacement, backrefs, nbackrefs, global,
    1044             :                                                                 tmpres, &max_dest_size);
    1045       27958 :                 if (tmpres == NULL || BUNappend(tmpbat, tmpres, false) != GDK_SUCCEED) {
    1046           0 :                         bat_iterator_end(&origin_strsi);
    1047           0 :                         pcre_free_study(extra);
    1048           0 :                         pcre_free(pcre_code);
    1049           0 :                         GDKfree(ovector);
    1050           0 :                         GDKfree(tmpres);
    1051           0 :                         BBPreclaim(tmpbat);
    1052           0 :                         throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
    1053             :                                   SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1054             :                 }
    1055             :         }
    1056          69 :         bat_iterator_end(&origin_strsi);
    1057          70 :         pcre_free_study(extra);
    1058          70 :         pcre_free(pcre_code);
    1059          70 :         GDKfree(ovector);
    1060          70 :         GDKfree(tmpres);
    1061          69 :         *res = tmpbat;
    1062          69 :         return MAL_SUCCEED;
    1063             : #else
    1064             :         (void) res;
    1065             :         (void) origin_strs;
    1066             :         (void) pattern;
    1067             :         (void) replacement;
    1068             :         (void) flags;
    1069             :         (void) global;
    1070             :         throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
    1071             :                   "Database was compiled without PCRE support.");
    1072             : #endif
    1073             : }
    1074             : 
    1075             : static str
    1076          74 : pcre_match_with_flags(bit *ret, const char *val, const char *pat,
    1077             :                                           const char *flags)
    1078             : {
    1079          74 :         int pos;
    1080             : #ifdef HAVE_LIBPCRE
    1081          74 :         const char *err_p = NULL;
    1082          74 :         int errpos = 0;
    1083          74 :         int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
    1084          74 :         pcre *re;
    1085             : #else
    1086             :         int options = REG_NOSUB;
    1087             :         regex_t re;
    1088             :         int errcode;
    1089             :         int retval;
    1090             : #endif
    1091             : 
    1092         148 :         while (*flags) {
    1093          74 :                 switch (*flags) {
    1094           0 :                 case 'i':
    1095             : #ifdef HAVE_LIBPCRE
    1096           0 :                         options |= PCRE_CASELESS;
    1097             : #else
    1098             :                         options |= REG_ICASE;
    1099             : #endif
    1100           0 :                         break;
    1101           0 :                 case 'm':
    1102             : #ifdef HAVE_LIBPCRE
    1103           0 :                         options |= PCRE_MULTILINE;
    1104             : #else
    1105             :                         options |= REG_NEWLINE;
    1106             : #endif
    1107           0 :                         break;
    1108             : #ifdef HAVE_LIBPCRE
    1109          74 :                 case 's':
    1110          74 :                         options |= PCRE_DOTALL;
    1111          74 :                         break;
    1112             : #endif
    1113           0 :                 case 'x':
    1114             : #ifdef HAVE_LIBPCRE
    1115           0 :                         options |= PCRE_EXTENDED;
    1116             : #else
    1117             :                         options |= REG_EXTENDED;
    1118             : #endif
    1119           0 :                         break;
    1120           0 :                 default:
    1121           0 :                         throw(MAL, "pcre.match", ILLEGAL_ARGUMENT
    1122             :                                   ": unsupported flag character '%c'\n", *flags);
    1123             :                 }
    1124          74 :                 flags++;
    1125             :         }
    1126          74 :         if (strNil(val)) {
    1127           0 :                 *ret = FALSE;
    1128           0 :                 return MAL_SUCCEED;
    1129             :         }
    1130             : 
    1131             : #ifdef HAVE_LIBPCRE
    1132          74 :         if ((re = pcre_compile(pat, options, &err_p, &errpos, NULL)) == NULL)
    1133             : #else
    1134             :         if ((errcode = regcomp(&re, pat, options)) != 0)
    1135             : #endif
    1136             :         {
    1137           0 :                 throw(MAL, "pcre.match", OPERATION_FAILED
    1138             :                           ": compilation of regular expression (%s) failed "
    1139             : #ifdef HAVE_LIBPCRE
    1140             :                           "at %d with '%s'", pat, errpos, err_p
    1141             : #else
    1142             :                           , pat
    1143             : #endif
    1144             :                                 );
    1145             :         }
    1146             : #ifdef HAVE_LIBPCRE
    1147          74 :         pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, PCRE_NO_UTF8_CHECK,
    1148             :                                         NULL, 0);
    1149          74 :         pcre_free(re);
    1150             : #else
    1151             :         retval = regexec(&re, val, (size_t) 0, NULL, 0);
    1152             :         pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
    1153             :         regfree(&re);
    1154             : #endif
    1155          74 :         if (pos >= 0)
    1156          10 :                 *ret = TRUE;
    1157          64 :         else if (pos == -1)
    1158          64 :                 *ret = FALSE;
    1159             :         else
    1160           0 :                 throw(MAL, "pcre.match", OPERATION_FAILED
    1161             :                           ": matching of regular expression (%s) failed with %d", pat, pos);
    1162             :         return MAL_SUCCEED;
    1163             : }
    1164             : 
    1165             : #ifdef HAVE_LIBPCRE
    1166             : /* special characters in PCRE that need to be escaped */
    1167             : static const char *pcre_specials = ".+?*()[]{}|^$\\";
    1168             : #else
    1169             : /* special characters in POSIX basic regular expressions that need to
    1170             :  * be escaped */
    1171             : static const char *pcre_specials = "^.[$()|*+?{\\";
    1172             : #endif
    1173             : 
    1174             : /* change SQL LIKE pattern into PCRE pattern */
    1175             : static str
    1176         753 : sql2pcre(str *r, const char *pat, const char *esc_str)
    1177             : {
    1178         753 :         int escaped = 0;
    1179         753 :         int hasWildcard = 0;
    1180         753 :         char *ppat;
    1181        1505 :         int esc = strNil(esc_str) ? 0 : esc_str[0];     /* should change to utf8_convert() */
    1182         753 :         int specials;
    1183         753 :         int c;
    1184             : 
    1185         753 :         if (strlen(esc_str) > 1)
    1186           0 :                 throw(MAL, "pcre.sql2pcre",
    1187             :                           SQLSTATE(22019) ILLEGAL_ARGUMENT
    1188             :                           ": ESCAPE string must have length 1");
    1189         753 :         if (pat == NULL)
    1190           0 :                 throw(MAL, "pcre.sql2pcre",
    1191             :                           SQLSTATE(22019) ILLEGAL_ARGUMENT
    1192             :                           ": (I)LIKE pattern must not be NULL");
    1193         753 :         ppat = GDKmalloc(strlen(pat) * 3 +
    1194             :                                          3 /* 3 = "^'the translated regexp'$0" */ );
    1195         753 :         if (ppat == NULL)
    1196           0 :                 throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1197             : 
    1198         753 :         *r = ppat;
    1199             :         /* The escape character can be a char which is special in a PCRE
    1200             :          * expression.  If the user used the "+" char as escape and has "++"
    1201             :          * in their pattern, then replacing this with "+" is not correct and
    1202             :          * should be "\+" instead. */
    1203         753 :         specials = (esc && strchr(pcre_specials, esc) != NULL);
    1204             : 
    1205         753 :         *ppat++ = '^';
    1206        7264 :         while ((c = *pat++) != 0) {
    1207        6511 :                 if (c == esc) {
    1208          13 :                         if (escaped) {
    1209           0 :                                 if (specials) { /* change ++ into \+ */
    1210           0 :                                         *ppat++ = esc;
    1211             :                                 } else {                /* do not escape simple escape symbols */
    1212           0 :                                         ppat[-1] = esc; /* overwrite backslash */
    1213             :                                 }
    1214             :                                 escaped = 0;
    1215             :                         } else {
    1216          13 :                                 *ppat++ = '\\';
    1217          13 :                                 escaped = 1;
    1218             :                         }
    1219             :                         hasWildcard = 1;
    1220        6498 :                 } else if (strchr(pcre_specials, c) != NULL) {
    1221             :                         /* escape PCRE special chars, avoid double backslash if the
    1222             :                          * user uses an invalid escape sequence */
    1223          36 :                         if (!escaped)
    1224          36 :                                 *ppat++ = '\\';
    1225          36 :                         *ppat++ = c;
    1226          36 :                         hasWildcard = 1;
    1227          36 :                         escaped = 0;
    1228        6462 :                 } else if (c == '%' && !escaped) {
    1229         909 :                         *ppat++ = '.';
    1230         909 :                         *ppat++ = '*';
    1231         909 :                         *ppat++ = '?';
    1232         909 :                         hasWildcard = 1;
    1233             :                         /* collapse multiple %, but only if it isn't the escape */
    1234         909 :                         if (esc != '%')
    1235         909 :                                 while (*pat == '%')
    1236           0 :                                         pat++;
    1237        5553 :                 } else if (c == '_' && !escaped) {
    1238         901 :                         *ppat++ = '.';
    1239         901 :                         hasWildcard = 1;
    1240             :                 } else {
    1241        4652 :                         if (escaped) {
    1242          13 :                                 ppat[-1] = c;   /* overwrite backslash of invalid escape */
    1243             :                         } else {
    1244        4639 :                                 *ppat++ = c;
    1245             :                         }
    1246             :                         escaped = 0;
    1247             :                 }
    1248             :         }
    1249             :         /* no wildcard or escape character at end of string */
    1250         753 :         if (!hasWildcard || escaped) {
    1251           0 :                 GDKfree(*r);
    1252           0 :                 *r = NULL;
    1253           0 :                 if (escaped)
    1254           0 :                         throw(MAL, "pcre.sql2pcre",
    1255             :                                   SQLSTATE(22019) ILLEGAL_ARGUMENT
    1256             :                                   ": (I)LIKE pattern must not end with escape character");
    1257           0 :                 *r = GDKstrdup(str_nil);
    1258           0 :                 if (*r == NULL)
    1259           0 :                         throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1260             :         } else {
    1261         753 :                 *ppat++ = '$';
    1262         753 :                 *ppat = 0;
    1263             :         }
    1264             :         return MAL_SUCCEED;
    1265             : }
    1266             : 
    1267             : #ifdef HAVE_LIBPCRE
    1268             : /* change SQL PATINDEX pattern into PCRE pattern */
    1269             : static str
    1270          25 : pat2pcre(str *r, const char *pat)
    1271             : {
    1272          25 :         size_t len = strlen(pat);
    1273          25 :         char *ppat = GDKmalloc(len * 2 + 3 /* 3 = "^'the translated regexp'$0" */ );
    1274          25 :         int start = 0;
    1275             : 
    1276          25 :         if (ppat == NULL)
    1277           0 :                 throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1278          25 :         *r = ppat;
    1279          77 :         while (*pat) {
    1280          52 :                 int c = *pat++;
    1281             : 
    1282          52 :                 if (strchr(pcre_specials, c) != NULL) {
    1283          17 :                         *ppat++ = '\\';
    1284          17 :                         *ppat++ = c;
    1285          35 :                 } else if (c == '%') {
    1286           3 :                         if (start && *pat) {
    1287           0 :                                 *ppat++ = '.';
    1288           0 :                                 *ppat++ = '*';
    1289             :                         }
    1290           3 :                         start++;
    1291          32 :                 } else if (c == '_') {
    1292           0 :                         *ppat++ = '.';
    1293             :                 } else {
    1294          32 :                         *ppat++ = c;
    1295             :                 }
    1296             :         }
    1297          25 :         *ppat = 0;
    1298          25 :         return MAL_SUCCEED;
    1299             : }
    1300             : #endif
    1301             : 
    1302             : /*
    1303             :  * @+ Wrapping
    1304             :  */
    1305             : 
    1306             : static str
    1307          10 : PCREreplace_wrap(str *res, const str *or, const str *pat, const str *repl,
    1308             :                                  const str *flags)
    1309             : {
    1310          10 :         return pcre_replace(res, *or, *pat, *repl, *flags, true);
    1311             : }
    1312             : 
    1313             : static str
    1314           0 : PCREreplacefirst_wrap(str *res, const str *or, const str *pat, const str *repl,
    1315             :                                           const str *flags)
    1316             : {
    1317           0 :         return pcre_replace(res, *or, *pat, *repl, *flags, false);
    1318             : }
    1319             : 
    1320             : static str
    1321          70 : PCREreplace_bat_wrap(bat *res, const bat *bid, const str *pat, const str *repl,
    1322             :                                          const str *flags)
    1323             : {
    1324          70 :         BAT *b, *bn = NULL;
    1325          70 :         str msg;
    1326          70 :         if ((b = BATdescriptor(*bid)) == NULL)
    1327           0 :                 throw(MAL, "batpcre.replace", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1328             : 
    1329          70 :         msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, true);
    1330          69 :         if (msg == MAL_SUCCEED) {
    1331          70 :                 *res = bn->batCacheid;
    1332          70 :                 BBPkeepref(bn);
    1333             :         }
    1334          69 :         BBPunfix(b->batCacheid);
    1335          69 :         return msg;
    1336             : }
    1337             : 
    1338             : static str
    1339           0 : PCREreplacefirst_bat_wrap(bat *res, const bat *bid, const str *pat,
    1340             :                                                   const str *repl, const str *flags)
    1341             : {
    1342           0 :         BAT *b, *bn = NULL;
    1343           0 :         str msg;
    1344           0 :         if ((b = BATdescriptor(*bid)) == NULL)
    1345           0 :                 throw(MAL, "batpcre.replace_first", RUNTIME_OBJECT_MISSING);
    1346             : 
    1347           0 :         msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, false);
    1348           0 :         if (msg == MAL_SUCCEED) {
    1349           0 :                 *res = bn->batCacheid;
    1350           0 :                 BBPkeepref(bn);
    1351             :         }
    1352           0 :         BBPunfix(b->batCacheid);
    1353           0 :         return msg;
    1354             : }
    1355             : 
    1356             : static str
    1357          74 : PCREmatch(bit *ret, const str *val, const str *pat)
    1358             : {
    1359           4 :         return pcre_match_with_flags(ret, *val, *pat,
    1360             : #ifdef HAVE_LIBPCRE
    1361             :                                                                  "s"
    1362             : #else
    1363             :                                                                  "x"
    1364             : #endif
    1365             :                         );
    1366             : }
    1367             : 
    1368             : static str
    1369           0 : PCREimatch(bit *ret, const str *val, const str *pat)
    1370             : {
    1371           0 :         return pcre_match_with_flags(ret, *val, *pat, "i"
    1372             : #ifndef HAVE_LIBPCRE
    1373             :                                                                  "x"
    1374             : #endif
    1375             :                         );
    1376             : }
    1377             : 
    1378             : static str
    1379          25 : PCREindex(int *res, const pcre *pattern, const str *s)
    1380             : {
    1381             : #ifdef HAVE_LIBPCRE
    1382          25 :         int v[3];
    1383             : 
    1384          25 :         v[0] = v[1] = *res = 0;
    1385          25 :         if (pcre_exec(pattern, NULL, *s, (int) strlen(*s), 0,
    1386             :                                   PCRE_NO_UTF8_CHECK, v, 3) >= 0) {
    1387          23 :                 *res = v[1];
    1388             :         }
    1389          25 :         return MAL_SUCCEED;
    1390             : #else
    1391             :         (void) res;
    1392             :         (void) pattern;
    1393             :         (void) s;
    1394             :         throw(MAL, "pcre.index", "Database was compiled without PCRE support.");
    1395             : #endif
    1396             : }
    1397             : 
    1398             : static str
    1399          27 : PCREpatindex(int *ret, const str *pat, const str *val)
    1400             : {
    1401             : #ifdef HAVE_LIBPCRE
    1402          27 :         pcre *re = NULL;
    1403          27 :         char *ppat = NULL, *msg;
    1404             : 
    1405          53 :         if (strNil(*pat) || strNil(*val)) {
    1406           2 :                 *ret = int_nil;
    1407           2 :                 return MAL_SUCCEED;
    1408             :         }
    1409             : 
    1410          25 :         if ((msg = pat2pcre(&ppat, *pat)) != MAL_SUCCEED)
    1411             :                 return msg;
    1412          25 :         if ((msg = pcre_compile_wrap(&re, ppat, FALSE)) != MAL_SUCCEED) {
    1413           0 :                 GDKfree(ppat);
    1414           0 :                 return msg;
    1415             :         }
    1416          25 :         GDKfree(ppat);
    1417          25 :         msg = PCREindex(ret, re, val);
    1418          25 :         pcre_free(re);
    1419          25 :         return msg;
    1420             : #else
    1421             :         (void) ret;
    1422             :         (void) pat;
    1423             :         (void) val;
    1424             :         throw(MAL, "pcre.patindex", "Database was compiled without PCRE support.");
    1425             : #endif
    1426             : }
    1427             : 
    1428             : static str
    1429           0 : PCREquote(str *ret, const str *val)
    1430             : {
    1431           0 :         char *p;
    1432           0 :         const char *s = *val;
    1433             : 
    1434           0 :         *ret = p = GDKmalloc(strlen(s) * 2 + 1);        /* certainly long enough */
    1435           0 :         if (p == NULL)
    1436           0 :                 throw(MAL, "pcre.quote", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1437             :         /* quote all non-alphanumeric ASCII characters (i.e. leave
    1438             :            non-ASCII and alphanumeric alone) */
    1439           0 :         while (*s) {
    1440           0 :                 if (!((*s & 0x80) != 0 ||
    1441           0 :                           ('a' <= *s && *s <= 'z') ||
    1442           0 :                           ('A' <= *s && *s <= 'Z') || isdigit((unsigned char) *s)))
    1443           0 :                         *p++ = '\\';
    1444           0 :                 *p++ = *s++;
    1445             :         }
    1446           0 :         *p = 0;
    1447           0 :         return MAL_SUCCEED;
    1448             : }
    1449             : 
    1450             : static str
    1451           0 : PCREsql2pcre(str *ret, const str *pat, const str *esc)
    1452             : {
    1453           0 :         return sql2pcre(ret, *pat, *esc);
    1454             : }
    1455             : 
    1456             : static bool
    1457       10018 : is_ascii_str(const char *pat)
    1458             : {
    1459       10018 :         size_t len = strlen(pat);
    1460       78267 :         for (size_t i = 0; i < len; i++) {
    1461       68937 :                 if (pat[i] & 0x80)
    1462             :                         return false;
    1463             :         }
    1464             : 
    1465             :         return true;
    1466             : }
    1467             : 
    1468             : static inline str
    1469       10045 : choose_like_path(char **ppat, bool *use_re, bool *use_strcmp, bool *empty,
    1470             :                                  bool *ascii_pattern, const char *pat, const char *esc)
    1471             : {
    1472       10045 :         str res = MAL_SUCCEED;
    1473       10045 :         *use_re = false;
    1474       10045 :         *use_strcmp = false;
    1475       10045 :         *empty = false;
    1476             : 
    1477             : 
    1478       10045 :         *ascii_pattern = is_ascii_str(pat);
    1479             : 
    1480       19618 :         if (strNil(pat) || strNil(esc)) {
    1481         472 :                 *empty = true;
    1482             :         } else {
    1483        9573 :                 if (!re_is_pattern_properly_escaped(pat, (unsigned char) *esc))
    1484           5 :                         throw(MAL, "pcre.sql2pcre",
    1485             :                                   SQLSTATE(22019) ILLEGAL_ARGUMENT
    1486             :                                   ": (I)LIKE pattern must not end with escape character");
    1487        9551 :                 if (is_strcmpable(pat, esc)) {
    1488         923 :                         *use_re = true;
    1489         923 :                         *use_strcmp = true;
    1490        8628 :                 } else if (re_simple(pat, (unsigned char) *esc)) {
    1491        7889 :                         *use_re = true;
    1492             :                 } else {
    1493         755 :                         if ((res = sql2pcre(ppat, pat, esc)) != MAL_SUCCEED)
    1494             :                                 return res;
    1495         756 :                         if (strNil(*ppat)) {
    1496           0 :                                 GDKfree(*ppat);
    1497           0 :                                 *ppat = NULL;
    1498           0 :                                 *use_re = true;
    1499           0 :                                 *use_strcmp = true;
    1500             :                         }
    1501             :                 }
    1502             :         }
    1503             :         return res;
    1504             : }
    1505             : 
    1506             : static str
    1507         234 : PCRElike_imp(bit *ret, const str *s, const str *pat, const str *esc,
    1508             :                          const bit *isens)
    1509             : {
    1510         234 :         str res = MAL_SUCCEED;
    1511         234 :         char *ppat = NULL;
    1512         234 :         bool use_re = false, use_strcmp = false, empty = false, ascii_pattern = false;
    1513         234 :         struct RE *re = NULL;
    1514             : 
    1515         234 :         if ((res = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, &ascii_pattern,
    1516             :                                                                 *pat, *esc)) != MAL_SUCCEED)
    1517             :                 return res;
    1518             : 
    1519         459 :         MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp ?
    1520         225 :                                                    "pcrelike: pattern matching using strcmp" : use_re ?
    1521             :                                                    "pcrelike: pattern matching using RE" :
    1522             :                                                    "pcrelike: pattern matching using pcre");
    1523             : 
    1524         468 :         if (strNil(*s) || empty) {
    1525           0 :                 *ret = bit_nil;
    1526         234 :         } else if (use_re) {
    1527         164 :                 if (use_strcmp) {
    1528           9 :                         *ret = *isens ? (ascii_pattern
    1529           2 :                                                          ? istrcmp(*s, *pat) == 0
    1530           0 :                                                          : mystrcasecmp(*s, *pat) == 0)
    1531           7 :                                 : strcmp(*s, *pat) == 0;
    1532             :                 } else {
    1533         155 :                         if (!(re = re_create(*pat, *isens, ascii_pattern, (unsigned char) **esc)))
    1534           0 :                                 res = createException(MAL, "pcre.like4",
    1535             :                                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1536             :                         else
    1537         310 :                                 *ret = (*isens && !re->is_ascii)
    1538           0 :                                         ? re_match_ignore(*s, re)
    1539         155 :                                         : re_match_no_ignore(*s, re);
    1540             :                 }
    1541             :         } else {
    1542          70 :                 res = *isens ? PCREimatch(ret, s, &ppat) : PCREmatch(ret, s, &ppat);
    1543             :         }
    1544             : 
    1545         164 :         if (re)
    1546         155 :                 re_destroy(re);
    1547         234 :         GDKfree(ppat);
    1548         234 :         return res;
    1549             : }
    1550             : 
    1551             : static str
    1552         234 : PCRElike(bit *ret, const str *s, const str *pat, const str *esc,
    1553             :                  const bit *isens)
    1554             : {
    1555         229 :         return PCRElike_imp(ret, s, pat, esc, isens);
    1556             : }
    1557             : 
    1558             : static str
    1559           5 : PCREnotlike(bit *ret, const str *s, const str *pat, const str *esc,
    1560             :                         const bit *isens)
    1561             : {
    1562           5 :         str tmp;
    1563           5 :         bit r;
    1564             : 
    1565           5 :         rethrow("str.not_like", tmp, PCRElike(&r, s, pat, esc, isens));
    1566           5 :         *ret = r == bit_nil ? bit_nil : !r;
    1567           5 :         return MAL_SUCCEED;
    1568             : }
    1569             : 
    1570             : static inline str
    1571        8655 : re_like_build(struct RE **re, uint32_t **wpat, const char *pat, bool caseignore,
    1572             :                           bool use_strcmp, bool ascii_pattern, uint32_t esc)
    1573             : {
    1574        8655 :         if (!use_strcmp) {
    1575        7740 :                 if (!(*re = re_create(pat, caseignore, ascii_pattern, esc)))
    1576           0 :                         return createException(MAL, "pcre.re_like_build",
    1577             :                                                                    SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1578         915 :         } else if (caseignore && !ascii_pattern) {
    1579          38 :                 if (!(*wpat = utf8stoucs(pat)))
    1580           0 :                         return createException(MAL, "pcre.re_like_build",
    1581             :                                                                    SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1582             :         }
    1583             :         return MAL_SUCCEED;
    1584             : }
    1585             : 
    1586             : #define proj_scanloop(TEST)     \
    1587             :         do {                                    \
    1588             :                 if (strNil(s))          \
    1589             :                         return bit_nil; \
    1590             :                 else                            \
    1591             :                         return TEST;    \
    1592             :         } while (0)
    1593             : 
    1594             : static inline bit
    1595        5034 : re_like_proj_apply(const char *s, const struct RE *restrict re,
    1596             :                                    const uint32_t *restrict wpat, const char *pat,
    1597             :                                    bool caseignore, bool anti, bool use_strcmp, bool is_ascii)
    1598             : {
    1599        5034 :         if (use_strcmp) {
    1600        1126 :                 if (caseignore) {
    1601         498 :                         if (is_ascii) {
    1602         479 :                                 if (anti)
    1603         874 :                                         proj_scanloop(istrcmp(s, pat) != 0);
    1604             :                                 else
    1605          84 :                                         proj_scanloop(istrcmp(s, pat) == 0);
    1606             :                         } else {
    1607          19 :                                 if (anti)
    1608          28 :                                         proj_scanloop(mywstrcasecmp(s, wpat) != 0);
    1609             :                                 else
    1610          10 :                                         proj_scanloop(mywstrcasecmp(s, wpat) == 0);
    1611             :                         }
    1612             :                 } else {
    1613         628 :                         if (anti)
    1614         606 :                                 proj_scanloop(strcmp(s, pat) != 0);
    1615             :                         else
    1616         650 :                                 proj_scanloop(strcmp(s, pat) == 0);
    1617             :                 }
    1618             :         } else {
    1619             :                 /* Use re_match_ignore only if the pattern is UTF-8
    1620             :                  * and we need to ignore case
    1621             :                  */
    1622        3908 :                 if (caseignore && !is_ascii) {
    1623           3 :                         if (anti)
    1624           6 :                                 proj_scanloop(!re_match_ignore(s, re));
    1625             :                         else
    1626           0 :                                 proj_scanloop(re_match_ignore(s, re));
    1627             :                 } else {
    1628        3905 :                         if (anti)
    1629         174 :                                 proj_scanloop(!re_match_no_ignore(s, re));
    1630             :                         else
    1631        7636 :                                 proj_scanloop(re_match_no_ignore(s, re));
    1632             :                 }
    1633             :         }
    1634             : }
    1635             : 
    1636             : static inline void
    1637        9010 : re_like_clean(struct RE **re, uint32_t **wpat)
    1638             : {
    1639        9010 :         if (*re) {
    1640        7742 :                 re_destroy(*re);
    1641        7745 :                 *re = NULL;
    1642             :         }
    1643        9013 :         if (*wpat) {
    1644          38 :                 GDKfree(*wpat);
    1645          38 :                 *wpat = NULL;
    1646             :         }
    1647        9013 : }
    1648             : 
    1649             : #ifdef HAVE_LIBPCRE
    1650             : static inline str
    1651         687 : pcre_like_build(pcre **res, pcre_extra **ex, const char *ppat, bool caseignore,
    1652             :                                 BUN count)
    1653             : {
    1654         687 :         const char *err_p = NULL;
    1655         687 :         int errpos = 0;
    1656         687 :         int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE | PCRE_DOTALL;
    1657         687 :         int pcrestopt = count > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0;
    1658             : 
    1659         687 :         *res = NULL;
    1660         687 :         *ex = NULL;
    1661             : 
    1662         687 :         if (caseignore) {
    1663          22 :                 options |= PCRE_CASELESS;
    1664             :         }
    1665         687 :         if ((*res = pcre_compile(ppat, options, &err_p, &errpos, NULL)) == NULL)
    1666           0 :                 return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
    1667             :                                                            ": compilation of regular expression (%s) failed"
    1668             :                                                            " at %d with '%s'", ppat, errpos, err_p);
    1669         676 :         *ex = pcre_study(*res, pcrestopt, &err_p);
    1670         678 :         if (err_p != NULL)
    1671           0 :                 return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
    1672             :                                                            ": pcre study of pattern (%s) "
    1673             :                                                            "failed with '%s'", ppat, err_p);
    1674             :         return MAL_SUCCEED;
    1675             : }
    1676             : #else
    1677             : static inline str
    1678             : pcre_like_build(regex_t *res, void *ex, const char *ppat, bool caseignore,
    1679             :                                 BUN count)
    1680             : {
    1681             :         int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED;
    1682             :         int errcode;
    1683             : 
    1684             :         *res = (regex_t) {
    1685             :         0};
    1686             :         (void) count;
    1687             : 
    1688             :         if (caseignore) {
    1689             :                 options |= REG_ICASE;
    1690             :         }
    1691             :         if ((errcode = regcomp(res, ppat, options)) != 0)
    1692             :                 return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
    1693             :                                                            ": compilation of regular expression (%s) failed",
    1694             :                                                            ppat);
    1695             :         (void) ex;
    1696             :         return MAL_SUCCEED;
    1697             : }
    1698             : #endif
    1699             : 
    1700             : #define PCRE_LIKE_BODY(LOOP_BODY, RES1, RES2) \
    1701             :         do { \
    1702             :                 LOOP_BODY  \
    1703             :                 if (strNil(s))          \
    1704             :                         *ret = bit_nil; \
    1705             :                 else if (pos >= 0) \
    1706             :                         *ret = RES1; \
    1707             :                 else if (pos == -1) \
    1708             :                         *ret = RES2; \
    1709             :                 else \
    1710             :                         return createException(MAL, "pcre.match", OPERATION_FAILED ": matching of regular expression (%s) failed with %d", ppat, pos); \
    1711             :         } while(0)
    1712             : 
    1713             : static inline str
    1714        1120 : pcre_like_apply(bit *ret, const char *s,
    1715             : #ifdef HAVE_LIBPCRE
    1716             :                                 const pcre *re, const pcre_extra *ex
    1717             : #else
    1718             :                                 regex_t re, void *ex
    1719             : #endif
    1720             :                                 , const char *ppat, bool anti)
    1721             : {
    1722        1120 :         int pos;
    1723             : 
    1724             : #ifdef HAVE_LIBPCRE
    1725             : #define LOOP_BODY       \
    1726             :         pos = pcre_exec(re, ex, s, (int) strlen(s), 0, PCRE_NO_UTF8_CHECK, NULL, 0);
    1727             : #else
    1728             : #define LOOP_BODY       \
    1729             :         int retval = regexec(&re, s, (size_t) 0, NULL, 0); \
    1730             :         (void) ex; \
    1731             :         pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
    1732             : #endif
    1733             : 
    1734        1120 :         if (anti)
    1735          43 :                 PCRE_LIKE_BODY(LOOP_BODY, FALSE, TRUE);
    1736             :         else
    1737        1077 :                 PCRE_LIKE_BODY(LOOP_BODY, TRUE, FALSE);
    1738             : 
    1739             :         return MAL_SUCCEED;
    1740             : }
    1741             : 
    1742             : static inline void
    1743        1636 : pcre_clean(
    1744             : #ifdef HAVE_LIBPCRE
    1745             :                           pcre **re, pcre_extra **ex)
    1746             : {
    1747        1636 :         if (*re)
    1748         683 :                 pcre_free(*re);
    1749        1638 :         if (*ex)
    1750         685 :                 pcre_free_study(*ex);
    1751        1635 :         *re = NULL;
    1752        1635 :         *ex = NULL;
    1753             : #else
    1754             :                           regex_t *re, void *ex)
    1755             : {
    1756             :         regfree(re);
    1757             :         *re = (regex_t) {
    1758             :         0};
    1759             :         (void) ex;
    1760             : #endif
    1761        1635 : }
    1762             : 
    1763             : static str
    1764        1041 : BATPCRElike_imp(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci,
    1765             :                                 const str *esc, const bit *isens, const bit *not)
    1766             : {
    1767        1041 :         str msg = MAL_SUCCEED;
    1768        1041 :         BAT *b = NULL, *pbn = NULL, *bn = NULL;
    1769        1041 :         char *ppat = NULL;
    1770        1041 :         const char *input = NULL;
    1771        1041 :         bool use_re = false,
    1772        1041 :                 use_strcmp = false,
    1773        1041 :                 empty = false,
    1774        1041 :                 isensitive = (bool) *isens,
    1775        1041 :                 anti = (bool) *not,
    1776        1041 :                 has_nil = false,
    1777        1041 :                 ascii_pattern = false,
    1778        1041 :                 input_is_a_bat = isaBatType(getArgType(mb, pci, 1)),
    1779        1041 :                 pattern_is_a_bat = isaBatType(getArgType(mb, pci, 2));
    1780        1041 :         bat *r = getArgReference_bat(stk, pci, 0);
    1781        1041 :         BUN q = 0;
    1782        1041 :         bit *restrict ret = NULL;
    1783             : #ifdef HAVE_LIBPCRE
    1784        1041 :         pcre *re = NULL;
    1785        1041 :         pcre_extra *ex = NULL;
    1786             : #else
    1787             :         regex_t re = (regex_t) { 0 };
    1788             :         void *ex = NULL;
    1789             : #endif
    1790        1041 :         struct RE *re_simple = NULL;
    1791        1041 :         uint32_t *wpat = NULL;
    1792        1041 :         BATiter bi = (BATiter) { 0 }, pi;
    1793             : 
    1794        1041 :         (void) cntxt;
    1795        1041 :         if (input_is_a_bat) {
    1796        1043 :                 bat *bid = getArgReference_bat(stk, pci, 1);
    1797        1043 :                 if (!(b = BATdescriptor(*bid))) {
    1798           0 :                         msg = createException(MAL, "batalgebra.batpcrelike3",
    1799             :                                                                   SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1800           0 :                         goto bailout;
    1801             :                 }
    1802             :         }
    1803        1047 :         if (pattern_is_a_bat) {
    1804         112 :                 bat *pb = getArgReference_bat(stk, pci, 2);
    1805         112 :                 if (!(pbn = BATdescriptor(*pb))) {
    1806           0 :                         msg = createException(MAL, "batalgebra.batpcrelike3",
    1807             :                                                                   SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1808           0 :                         goto bailout;
    1809             :                 }
    1810             :         }
    1811        1047 :         assert((!b || ATOMstorage(b->ttype) == TYPE_str)
    1812             :                    && (!pbn || ATOMstorage(pbn->ttype) == TYPE_str));
    1813             : 
    1814        1047 :         q = BATcount(b ? b : pbn);
    1815        1047 :         if (!(bn = COLnew(b ? b->hseqbase : pbn->hseqbase, TYPE_bit, q, TRANSIENT))) {
    1816           0 :                 msg = createException(MAL, "batalgebra.batpcrelike3",
    1817             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1818           0 :                 goto bailout;
    1819             :         }
    1820        1042 :         ret = (bit *) Tloc(bn, 0);
    1821             : 
    1822        1042 :         if (pattern_is_a_bat) {
    1823         111 :                 pi = bat_iterator(pbn);
    1824         112 :                 if (b)
    1825         112 :                         bi = bat_iterator(b);
    1826             :                 else
    1827           0 :                         input = *getArgReference_str(stk, pci, 1);
    1828             : 
    1829        1188 :                 for (BUN p = 0; p < q; p++) {
    1830        1076 :                         const char *next_input = b ? BUNtvar(bi, p) : input,
    1831        1074 :                                 *np = BUNtvar(pi, p);
    1832             : 
    1833        1072 :                         if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty,
    1834             :                                                                                 &ascii_pattern, np, *esc)) != MAL_SUCCEED) {
    1835           0 :                                 bat_iterator_end(&pi);
    1836           0 :                                 if (b)
    1837           0 :                                         bat_iterator_end(&bi);
    1838           0 :                                 goto bailout;
    1839             :                         }
    1840             : 
    1841        1091 :                         if (use_re) {
    1842         639 :                                 if ((msg = re_like_build(&re_simple, &wpat, np, isensitive,
    1843             :                                                                                  use_strcmp, ascii_pattern,
    1844         639 :                                                                                  (unsigned char) **esc)) != MAL_SUCCEED) {
    1845           0 :                                         bat_iterator_end(&pi);
    1846           0 :                                         if (b)
    1847           0 :                                                 bat_iterator_end(&bi);
    1848           0 :                                         goto bailout;
    1849             :                                 }
    1850         639 :                                 ret[p] = re_like_proj_apply(next_input, re_simple, wpat, np,
    1851             :                                                                                         isensitive, anti, use_strcmp,
    1852             :                                                                                         ascii_pattern);
    1853         639 :                                 re_like_clean(&re_simple, &wpat);
    1854         452 :                         } else if (empty) {
    1855         446 :                                 ret[p] = bit_nil;
    1856             :                         } else {
    1857           6 :                                 if ((msg = pcre_like_build(&re, &ex, ppat, isensitive, 1)) != MAL_SUCCEED) {
    1858           0 :                                         bat_iterator_end(&pi);
    1859           0 :                                         if (b)
    1860           0 :                                                 bat_iterator_end(&bi);
    1861           0 :                                         goto bailout;
    1862             :                                 }
    1863           6 :                                 if ((msg = pcre_like_apply(&(ret[p]), next_input, re, ex, ppat, anti)) != MAL_SUCCEED) {
    1864           0 :                                         bat_iterator_end(&pi);
    1865           0 :                                         if (b)
    1866           0 :                                                 bat_iterator_end(&bi);
    1867           0 :                                         goto bailout;
    1868             :                                 }
    1869           6 :                                 pcre_clean(&re, &ex);
    1870             :                         }
    1871        1091 :                         has_nil |= is_bit_nil(ret[p]);
    1872        1091 :                         GDKfree(ppat);
    1873        1076 :                         ppat = NULL;
    1874             :                 }
    1875         112 :                 bat_iterator_end(&pi);
    1876         112 :                 if (b)
    1877         112 :                         bat_iterator_end(&bi);
    1878             :         } else {
    1879         931 :                 const char *pat = *getArgReference_str(stk, pci, 2);
    1880         931 :                 if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty,
    1881             :                                                                         &ascii_pattern, pat, *esc)) != MAL_SUCCEED)
    1882           5 :                         goto bailout;
    1883             : 
    1884         926 :                 bi = bat_iterator(b);
    1885        1787 :                 MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp
    1886             :                                                            ? "pcrelike: pattern matching using strcmp" :
    1887         857 :                                                            use_re ? "pcrelike: pattern matching using RE" :
    1888             :                                                            "pcrelike: pattern matching using pcre");
    1889             : 
    1890         932 :                 if (use_re) {
    1891         701 :                         if ((msg = re_like_build(&re_simple, &wpat, pat, isensitive, use_strcmp,
    1892         701 :                                                                          ascii_pattern, (unsigned char) **esc)) != MAL_SUCCEED) {
    1893           0 :                                 bat_iterator_end(&bi);
    1894           0 :                                 goto bailout;
    1895             :                         }
    1896        5097 :                         for (BUN p = 0; p < q; p++) {
    1897        4397 :                                 const char *s = BUNtvar(bi, p);
    1898        4399 :                                 ret[p] = re_like_proj_apply(s, re_simple, wpat, pat, isensitive,
    1899             :                                                                                         anti, use_strcmp, ascii_pattern);
    1900        4396 :                                 has_nil |= is_bit_nil(ret[p]);
    1901             :                         }
    1902         231 :                 } else if (empty) {
    1903          40 :                         for (BUN p = 0; p < q; p++)
    1904          23 :                                 ret[p] = bit_nil;
    1905             :                         has_nil = true;
    1906             :                 } else {
    1907         214 :                         if ((msg = pcre_like_build(&re, &ex, ppat, isensitive, q)) != MAL_SUCCEED) {
    1908           0 :                                 bat_iterator_end(&bi);
    1909           0 :                                 goto bailout;
    1910             :                         }
    1911        1328 :                         for (BUN p = 0; p < q; p++) {
    1912        1115 :                                 const char *s = BUNtvar(bi, p);
    1913        1117 :                                 if ((msg = pcre_like_apply(&(ret[p]), s, re, ex, ppat, anti)) != MAL_SUCCEED) {
    1914           0 :                                         bat_iterator_end(&bi);
    1915           0 :                                         goto bailout;
    1916             :                                 }
    1917        1119 :                                 has_nil |= is_bit_nil(ret[p]);
    1918             :                         }
    1919             :                 }
    1920         930 :                 bat_iterator_end(&bi);
    1921             :         }
    1922             : 
    1923        1049 :   bailout:
    1924        1049 :         GDKfree(ppat);
    1925        1044 :         re_like_clean(&re_simple, &wpat);
    1926        1046 :         pcre_clean(&re, &ex);
    1927        1043 :         if (bn && !msg) {
    1928        1038 :                 BATsetcount(bn, q);
    1929        1043 :                 bn->tnil = has_nil;
    1930        1043 :                 bn->tnonil = !has_nil;
    1931        1043 :                 bn->tkey = BATcount(bn) <= 1;
    1932        1043 :                 bn->tsorted = BATcount(bn) <= 1;
    1933        1043 :                 bn->trevsorted = BATcount(bn) <= 1;
    1934        1043 :                 *r = bn->batCacheid;
    1935        1043 :                 BBPkeepref(bn);
    1936           5 :         } else if (bn)
    1937           5 :                 BBPreclaim(bn);
    1938        1045 :         BBPreclaim(b);
    1939        1049 :         BBPreclaim(pbn);
    1940        1048 :         return msg;
    1941             : }
    1942             : 
    1943             : static str
    1944         887 : BATPCRElike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
    1945             : {
    1946         887 :         const str *esc = getArgReference_str(stk, pci, 3);
    1947         887 :         const bit *ci = getArgReference_bit(stk, pci, 4);
    1948         887 :         bit no = FALSE;
    1949             : 
    1950         887 :         return BATPCRElike_imp(cntxt, mb, stk, pci, esc, ci, &no);
    1951             : }
    1952             : 
    1953             : static str
    1954         158 : BATPCREnotlike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
    1955             : {
    1956         158 :         const str *esc = getArgReference_str(stk, pci, 3);
    1957         158 :         const bit *ci = getArgReference_bit(stk, pci, 4);
    1958         158 :         bit yes = TRUE;
    1959             : 
    1960         158 :         return BATPCRElike_imp(cntxt, mb, stk, pci, esc, ci, &yes);
    1961             : }
    1962             : 
    1963             : /* scan select loop with or without candidates */
    1964             : #define pcrescanloop(TEST, KEEP_NULLS)                                                                  \
    1965             :         do {                                                                                                                            \
    1966             :                 TRC_DEBUG(ALGO,                                                                                                 \
    1967             :                                   "PCREselect(b=%s#"BUNFMT",anti=%d): "                                     \
    1968             :                                   "scanselect %s\n", BATgetId(b), BATcount(b),                        \
    1969             :                                   anti, #TEST);                                                                                 \
    1970             :                 if (!s || BATtdense(s)) {                                                                               \
    1971             :                         for (; p < q; p++) {                                                                         \
    1972             :                                 GDK_CHECK_TIMEOUT(qry_ctx, counter,                                             \
    1973             :                                                                   GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
    1974             :                                 const char *restrict v = BUNtvar(bi, p - off);                  \
    1975             :                                 if ((TEST) || ((KEEP_NULLS) && strNil(v)))                              \
    1976             :                                         vals[cnt++] = p;                                                                        \
    1977             :                         }                                                                                                                       \
    1978             :                 } else {                                                                                                                \
    1979             :                         for (; p < ncands; p++) {                                                                    \
    1980             :                                 GDK_CHECK_TIMEOUT(qry_ctx, counter,                                             \
    1981             :                                                                   GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
    1982             :                                 oid o = canditer_next(ci);                                                              \
    1983             :                                 const char *restrict v = BUNtvar(bi, o - off);                  \
    1984             :                                 if ((TEST) || ((KEEP_NULLS) && strNil(v)))                              \
    1985             :                                         vals[cnt++] = o;                                                                        \
    1986             :                         }                                                                                                                       \
    1987             :                 }                                                                                                                               \
    1988             :         } while (0)
    1989             : 
    1990             : #ifdef HAVE_LIBPCRE
    1991             : #define PCRE_LIKESELECT_BODY (pcre_exec(re, ex, v, (int) strlen(v), 0, PCRE_NO_UTF8_CHECK, NULL, 0) >= 0)
    1992             : #else
    1993             : #define PCRE_LIKESELECT_BODY (regexec(&re, v, (size_t) 0, NULL, 0) != REG_NOMATCH)
    1994             : #endif
    1995             : 
    1996             : static str
    1997         458 : pcre_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q,
    1998             :                                 BUN *rcnt, const char *pat, bool caseignore, bool anti,
    1999             :                                 bool keep_nulls)
    2000             : {
    2001             : #ifdef HAVE_LIBPCRE
    2002         458 :         pcre *re = NULL;
    2003         458 :         pcre_extra *ex = NULL;
    2004             : #else
    2005             :         regex_t re = (regex_t) { 0 };
    2006             :         void *ex = NULL;
    2007             : #endif
    2008         458 :         BATiter bi = bat_iterator(b);
    2009         460 :         BUN cnt = 0, ncands = ci->ncand;
    2010         460 :         oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
    2011         460 :         str msg = MAL_SUCCEED;
    2012             : 
    2013         460 :         size_t counter = 0;
    2014         460 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
    2015             : 
    2016         461 :         if ((msg = pcre_like_build(&re, &ex, pat, caseignore, ci->ncand)) != MAL_SUCCEED)
    2017           0 :                 goto bailout;
    2018             : 
    2019         456 :         if (anti)
    2020           0 :                 pcrescanloop(!strNil(v) && !PCRE_LIKESELECT_BODY, keep_nulls);
    2021             :         else
    2022       37477 :                 pcrescanloop(!strNil(v) && PCRE_LIKESELECT_BODY, keep_nulls);
    2023             : 
    2024           4 :   bailout:
    2025         455 :         bat_iterator_end(&bi);
    2026         459 :         pcre_clean(&re, &ex);
    2027         458 :         *rcnt = cnt;
    2028         458 :         return msg;
    2029             : }
    2030             : 
    2031             : static str
    2032        7162 : re_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q,
    2033             :                           BUN *rcnt, const char *pat, bool caseignore, bool anti,
    2034             :                           bool use_strcmp, uint32_t esc, bool keep_nulls,
    2035             :                           bool ascii_pattern)
    2036             : {
    2037        7162 :         BATiter bi = bat_iterator(b);
    2038        7199 :         BUN cnt = 0, ncands = ci->ncand;
    2039        7199 :         oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
    2040        7199 :         struct RE *re = NULL;
    2041        7199 :         uint32_t *wpat = NULL;
    2042        7199 :         str msg = MAL_SUCCEED;
    2043             : 
    2044        7199 :         size_t counter = 0;
    2045        7199 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
    2046             : 
    2047        7199 :         if ((msg = re_like_build(&re, &wpat, pat, caseignore, use_strcmp, ascii_pattern,
    2048             :                                                          esc)) != MAL_SUCCEED)
    2049           0 :                 goto bailout;
    2050             : 
    2051        7188 :         if (use_strcmp) {
    2052         114 :                 if (caseignore) {
    2053          31 :                         if (ascii_pattern) {
    2054          15 :                                 if (anti)
    2055          59 :                                         pcrescanloop(!strNil(v)
    2056             :                                                                  && istrcmp(v, pat) != 0, keep_nulls);
    2057             :                                 else
    2058          19 :                                         pcrescanloop(!strNil(v)
    2059             :                                                                  && istrcmp(v, pat) == 0, keep_nulls);
    2060             :                         } else {
    2061          16 :                                 if (anti)
    2062           0 :                                         pcrescanloop(!strNil(v)
    2063             :                                                                  && mywstrcasecmp(v, wpat) != 0, keep_nulls);
    2064             :                                 else
    2065          52 :                                         pcrescanloop(!strNil(v)
    2066             :                                                                  && mywstrcasecmp(v, wpat) == 0, keep_nulls);
    2067             :                         }
    2068             :                 } else {
    2069          83 :                         if (anti)
    2070          62 :                                 pcrescanloop(!strNil(v) && strcmp(v, pat) != 0, keep_nulls);
    2071             :                         else
    2072        9863 :                                 pcrescanloop(!strNil(v) && strcmp(v, pat) == 0, keep_nulls);
    2073             :                 }
    2074             :         } else {
    2075        7074 :                 if (caseignore) {
    2076             :                         /* ascii_pattern == true is encoded in re */
    2077         100 :                         if (anti) {
    2078           1 :                                 if (ascii_pattern)
    2079          42 :                                         pcrescanloop(!strNil(v)
    2080             :                                                                  && !re_match_no_ignore(v, re), keep_nulls);
    2081             :                                 else
    2082           0 :                                         pcrescanloop(!strNil(v)
    2083             :                                                                  && !re_match_ignore(v, re), keep_nulls);
    2084             :                         } else {
    2085          99 :                                 if (ascii_pattern)
    2086        6226 :                                         pcrescanloop(!strNil(v)
    2087             :                                                                  && re_match_no_ignore(v, re), keep_nulls);
    2088             :                                 else
    2089         104 :                                         pcrescanloop(!strNil(v)
    2090             :                                                                  && re_match_ignore(v, re), keep_nulls);
    2091             :                         }
    2092             :                 } else {
    2093        6974 :                         if (anti)
    2094       37389 :                                 pcrescanloop(!strNil(v)
    2095             :                                                          && !re_match_no_ignore(v, re), keep_nulls);
    2096             :                         else
    2097      105183 :                                 pcrescanloop(!strNil(v)
    2098             :                                                          && re_match_no_ignore(v, re), keep_nulls);
    2099             :                 }
    2100             :         }
    2101             : 
    2102          87 :   bailout:
    2103        7185 :         bat_iterator_end(&bi);
    2104        7200 :         re_like_clean(&re, &wpat);
    2105        7200 :         *rcnt = cnt;
    2106        7200 :         return msg;
    2107             : }
    2108             : 
                       /*
                        * PCRElikeselect -- LIKE / NOT LIKE selection over a string BAT.
                        *
                        * Scans BAT *bid (optionally restricted by candidate list *sid) for
                        * values that match -- or, with *anti set, do not match -- the LIKE
                        * pattern *pat with escape string *esc, and returns the qualifying
                        * OIDs as a new BAT.
                        *
                        *   ret         out: cache id of the result BAT; written only on success.
                        *   bid         in:  BAT to scan; its storage type is asserted to be str.
                        *   sid         in:  candidate list; NULL or bat nil means "none".
                        *   pat, esc    in:  pattern and escape string, given to choose_like_path().
                        *   caseignore  in:  forwarded unchanged to the scan routine.
                        *   anti        in:  ask for the NOT LIKE result.
                        *
                        * Returns MAL_SUCCEED or an exception string: HY002 when a BAT
                        * descriptor cannot be obtained, HY013 when the result BAT cannot be
                        * allocated, or whatever choose_like_path(), re_likeselect() or
                        * pcre_likeselect() report.
                        *
                        * All exits go through `bailout`, which releases b, s, old_s and ppat.
                        *
                        * NOTE(review): two of the algorithm labels passed to
                        * MT_thread_setalgorithm() below are misspelled ("patterm matching",
                        * "pattermatching").  They are runtime strings, so they are left as is.
                        */
    2109             : static str
    2110        7642 : PCRElikeselect(bat *ret, const bat *bid, const bat *sid, const str *pat,
    2111             :                            const str *esc, const bit *caseignore, const bit *anti)
    2112             : {
    2113        7642 :         BAT *b, *s = NULL, *bn = NULL, *old_s = NULL;
    2114        7642 :         str msg = MAL_SUCCEED;
    2115        7642 :         char *ppat = NULL;
    2116        7642 :         bool use_re = false,
    2117        7642 :                 use_strcmp = false,
    2118        7642 :                 empty = false,
    2119        7642 :                 ascii_pattern = false;
    2120        7642 :         bool with_strimps = false;
    2121        7642 :         bool with_strimps_anti = false;
    2122        7642 :         BUN p = 0, q = 0, rcnt = 0;
    2123        7642 :         struct canditer ci;
    2124             : 
    2125        7642 :         if ((b = BATdescriptor(*bid)) == NULL) {
    2126           0 :                 msg = createException(MAL, "algebra.likeselect",
    2127             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    2128           0 :                 goto bailout;
    2129             :         }
    2130        7661 :         if (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL) {
    2131           0 :                 msg = createException(MAL, "algebra.likeselect",
    2132             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    2133           0 :                 goto bailout;
    2134             :         }
    2135             : 
    2136        7662 :         assert(ATOMstorage(b->ttype) == TYPE_str);
    2137             : 
                               /* choose_like_path() fills in ppat and the strategy flags
                                * (use_re, use_strcmp, empty, ascii_pattern) from the pattern and
                                * escape string; ppat is freed at bailout. */
    2138        7662 :         if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, &ascii_pattern,
    2139             :                                                                 *pat, *esc)) != MAL_SUCCEED)
    2140           0 :                 goto bailout;
    2141             : 
                               /* When choose_like_path() sets `empty`, no scan is done: the
                                * result is an empty dense BAT. */
    2142        7634 :         if (empty) {
    2143           0 :                 if (!(bn = BATdense(0, 0, 0)))
    2144           0 :                         msg = createException(MAL, "algebra.likeselect",
    2145             :                                                                   SQLSTATE(HY013) MAL_MALLOC_FAIL);
    2146             : 
    2147           0 :                 goto bailout;
    2148             :         }
    2149             :         /* Since the strimp pre-filtering of a LIKE query produces a superset of the actual result the complement of that
    2150             :          * set will necessarily reject some of the matching entries in the NOT LIKE query.
    2151             :          *
    2152             :          * In this case we run the PCRElikeselect as a LIKE query with strimps and return the complement of the result,
    2153             :          * taking extra care to not return NULLs. This currently means that we do not run strimps for NOT LIKE queries if
    2154             :          * the BAT contains NULLs.
    2155             :          */
    2156        7634 :         if (BAThasstrimps(b)) {
    2157          48 :                 if (STRMPcreate(b, NULL) == GDK_SUCCEED) {
    2158          48 :                         BAT *tmp_s = STRMPfilter(b, s, *pat, *anti);
    2159          48 :                         if (tmp_s) {
                                               /* The filtered list replaces s for the scan.  The
                                                * caller's list is kept in old_s: it is needed to
                                                * build the complement in the anti case and is
                                                * released at bailout. */
    2160          48 :                                 old_s = s;
    2161          48 :                                 s = tmp_s;
    2162          48 :                                 if (!*anti)
    2163             :                                         with_strimps = true;
    2164             :                                 else
    2165           0 :                                         with_strimps_anti = true;
    2166             :                         }
    2167             :                 } else {                                /* If we cannot filter with the strimp just continue normally */
    2168           0 :                         GDKclrerr();
    2169             :                 }
    2170             :         }
    2171             : 
    2172             : 
                               /* Record which matcher (strcmp / RE / pcre) and which strimp
                                * mode is used as the thread's algorithm label. */
    2173        7656 :         MT_thread_setalgorithm(use_strcmp
    2174        7656 :                                                    ? (with_strimps ?
    2175             :                                                           "pcrelike: pattern matching using strcmp with strimps"
    2176             :                                                           : (with_strimps_anti ?
    2177             :                                                                  "pcrelike: pattern matching using strcmp with strimps anti"
    2178        7656 :                                                                  : "pcrelike: pattern matching using strcmp")) :
    2179        7540 :                                                    use_re ? (with_strimps ?
    2180             :                                                                          "pcrelike: pattern matching using RE with strimps"
    2181             :                                                                          : (with_strimps_anti ?
    2182             :                                                                                 "pcrelike: patterm matching using RE with strimps anti"
    2183             :                                                                                 :
    2184             :                                                                                 "pcrelike: pattern matching using RE"))
    2185             :                                                    : (with_strimps ?
    2186             :                                                           "pcrelike: pattern matching using pcre with strimps"
    2187             :                                                           : (with_strimps_anti ?
    2188             :                                                                  "pcrelike: pattermatching using pcre with strimps anti"
    2189             :                                                                  : "pcrelike: pattern matching using pcre")));
    2190             : 
                               /* The result can hold at most one OID per candidate, so it is
                                * allocated with capacity ci.ncand. */
    2191        7660 :         canditer_init(&ci, b, s);
    2192        7659 :         if (!(bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT))) {
    2193           0 :                 msg = createException(MAL, "algebra.likeselect",
    2194             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    2195           0 :                 goto bailout;
    2196             :         }
    2197             : 
                               /* With no candidate list, or a dense one, p and q are set to
                                * the candidate OID range clipped to b's own OIDs
                                * [hseqbase, hseqbase + count).  With a non-dense list they stay
                                * 0.  Both are handed to the scan routine below. */
    2198        7640 :         if (!s || BATtdense(s)) {
    2199        2429 :                 if (s) {
    2200        5128 :                         assert(BATtdense(s));
    2201        5128 :                         p = (BUN) s->tseqbase;
    2202        5128 :                         q = p + BATcount(s);
    2203        5128 :                         if ((oid) p < b->hseqbase)
    2204             :                                 p = b->hseqbase;
    2205        5128 :                         if ((oid) q > b->hseqbase + BATcount(b))
    2206             :                                 q = b->hseqbase + BATcount(b);
    2207             :                 } else {
    2208        2429 :                         p = b->hseqbase;
    2209        2429 :                         q = BATcount(b) + b->hseqbase;
    2210             :                 }
    2211             :         }
    2212             : 
                               /* When the strimp filter ran in anti mode the scan itself is a
                                * plain LIKE: the anti argument is forced to false and
                                * with_strimps_anti is passed as a separate flag.  The
                                * complement is built afterwards.
                                * NOTE(review): that separate flag presumably is the scan
                                * routines' keep_nulls parameter -- confirm against their
                                * signatures, which are outside this view. */
    2213        7640 :         if (use_re) {
    2214        7182 :                 msg = re_likeselect(bn, b, s, &ci, p, q, &rcnt, *pat, *caseignore, *anti
    2215        1584 :                                                         && !with_strimps_anti, use_strcmp,
    2216        7182 :                                                         (unsigned char) **esc, with_strimps_anti,
    2217             :                                                         ascii_pattern);
    2218             :         } else {
    2219         458 :                 msg = pcre_likeselect(bn, b, s, &ci, p, q, &rcnt, ppat, *caseignore,
    2220         458 :                                                           *anti && !with_strimps_anti, with_strimps_anti);
    2221             :         }
    2222             : 
    2223        7646 :         if (!msg) {                                     /* set some properties */
                                       /* The result is marked sorted, key and nil-free.
                                        * tseqbase is 0 for an empty result, the single OID for
                                        * one row, b->hseqbase when rcnt equals b's count, and
                                        * oid_nil otherwise. */
    2224        7646 :                 BATsetcount(bn, rcnt);
    2225        7649 :                 bn->tsorted = true;
    2226        7649 :                 bn->trevsorted = bn->batCount <= 1;
    2227        7649 :                 bn->tkey = true;
    2228        7649 :                 bn->tnil = false;
    2229        7649 :                 bn->tnonil = true;
    2230        7649 :                 bn->tseqbase = rcnt == 0 ? 0 : rcnt == 1 ? *(const oid *) Tloc(bn, 0) : rcnt == b->batCount ? b->hseqbase : oid_nil;
    2231        7649 :                 if (with_strimps_anti) {
    2232             :                         /* Reverse the result taking into account the original candidate list. */
    2233             :                         // BAT *rev = BATdiffcand(BATdense(b->hseqbase, 0, b->batCount), bn);
                                               /* With an original candidate list the complement
                                                * is old_s minus bn; without one it comes from
                                                * BATnegcands(b->batCount, bn).  Debug builds also
                                                * check that bn is contained in old_s and that the
                                                * complement has the expected size.
                                                *
                                                * NOTE(review): rev is not checked for NULL.  If
                                                * BATdiffcand()/BATnegcands() can fail (not visible
                                                * here), the assert on rev->batCount dereferences
                                                * NULL in debug builds, and in release builds bn
                                                * becomes NULL while msg stays MAL_SUCCEED, so the
                                                * function returns success without writing *ret. */
    2234           0 :                         BAT *rev;
    2235           0 :                         if (old_s) {
    2236           0 :                                 rev = BATdiffcand(old_s, bn);
    2237             : #ifndef NDEBUG
    2238           0 :                                 BAT *is = BATintersectcand(old_s, bn);
    2239           0 :                                 if (is) {
    2240           0 :                                         assert(is->batCount == bn->batCount);
    2241           0 :                                         BBPreclaim(is);
    2242             :                                 }
    2243           0 :                                 assert(rev->batCount == old_s->batCount - bn->batCount);
    2244             : #endif
    2245             :                         }
    2246             : 
    2247             :                         else
    2248           0 :                                 rev = BATnegcands(b->batCount, bn);
    2249             :                         /* BAT *rev = BATnegcands(b->batCount, bn); */
    2250           0 :                         BBPunfix(bn->batCacheid);
    2251           0 :                         bn = rev;
    2252             :                 }
    2253             :         }
    2254             : 
    2255             : 
                         /* Common exit.  Release the inputs and the pattern copy.  The result is
                          * handed to the caller through *ret only when bn exists and no error
                          * was raised; otherwise any partial result is reclaimed. */
    2256        7649 :   bailout:
    2257        7649 :         BBPreclaim(b);
    2258        7655 :         BBPreclaim(s);
    2259        7654 :         BBPreclaim(old_s);
    2260        7653 :         GDKfree(ppat);
    2261        7647 :         if (bn && !msg) {
    2262        7647 :                 *ret = bn->batCacheid;
    2263        7647 :                 BBPkeepref(bn);
    2264           0 :         } else if (bn)
    2265           0 :                 BBPreclaim(bn);
    2266        7653 :         return msg;
    2267             : }
    2268             : 
                       /*
                        * APPEND(b, o): store oid `o` at index b->batCount of b's heap, viewed
                        * as an oid array, and increment b->batCount.  The macro does no
                        * capacity check, so room must have been made beforehand.  `b` is
                        * expanded twice and is not parenthesized, so pass a plain BAT pointer
                        * variable.
                        *
                        * VALUE(s, x): token-pastes `s` onto the names <s>vars, <s>vals and <s>i
                        * (e.g. VALUE(l, x) reads lvars, lvals and li.width) and yields
                        * <s>vars + VarHeapVal(<s>vals, x, <s>i.width).  Those three names must
                        * exist at the point of use.
                        */
    2269             : #define APPEND(b, o)    (((oid *) b->theap->base)[b->batCount++] = (o))
    2270             : #define VALUE(s, x)             (s##vars + VarHeapVal(s##vals, (x), s##i.width))
    2271             : 
                       /*
                        * PCRE_EXEC / PCRE_EXEC_COND: run the compiled pattern against the
                        * string `vl` and classify the outcome.  Both macros use names from
                        * the expansion site: vl, retval, pcrere and (with libpcre) pcreex.
                        *
                        * With HAVE_LIBPCRE, PCRE_EXEC calls pcre_exec() on the whole of vl
                        * (length strlen(vl), start offset 0) with PCRE_NO_UTF8_CHECK and no
                        * output vector.  PCRE_EXEC_COND is true for any negative result.
                        * NOTE(review): per the PCRE API a negative result covers "no match"
                        * as well as real errors, so errors are treated like non-matches --
                        * confirm that is intended.
                        *
                        * Without libpcre, PCRE_EXEC calls regexec() requesting no
                        * sub-matches.  PCRE_EXEC_COND is true for REG_NOMATCH or REG_ENOSYS.
                        */
    2272             : #ifdef HAVE_LIBPCRE
    2273             : #define PCRE_EXEC \
    2274             :         do { \
    2275             :                 retval = pcre_exec(pcrere, pcreex, vl, (int) strlen(vl), 0, PCRE_NO_UTF8_CHECK, NULL, 0); \
    2276             :         } while (0)
    2277             : #define PCRE_EXEC_COND (retval < 0)
    2278             : #else
    2279             : #define PCRE_EXEC \
    2280             :         do { \
    2281             :                 retval = regexec(&pcrere, vl, (size_t) 0, NULL, 0); \
    2282             :         } while (0)
    2283             : #define PCRE_EXEC_COND (retval == REG_NOMATCH || retval == REG_ENOSYS)
    2284             : #endif
    2285             : 
    2286             : /* nested loop implementation for PCRE join */
                       /*
                        * pcre_join_loop(STRCMP, RE_MATCH, PCRE_COND)
                        *
                        * Expands to the whole join loop.  It relies on names from the
                        * expansion site: the iterators lci and rci, the outputs r1 and r2 (r2
                        * may be NULL), the matcher state re, wpat, pcrere, pcreex and pcrepat,
                        * the strategy flags, msg, qry_ctx/counter, and a `bailout` label.
                        *
                        * Each argument is an expression that is true when the current
                        * (vl, vr) pair must be SKIPPED:
                        *   STRCMP     used when use_re and use_strcmp are both set;
                        *   RE_MATCH   used when use_re is set and use_strcmp is not;
                        *   PCRE_COND  evaluated after PCRE_EXEC when use_re is not set.
                        *
                        * Outer loop, once per right candidate: check the query timeout,
                        * fetch the right value vr as the pattern, let choose_like_path() pick
                        * the strategy, and build the matching state with re_like_build() or
                        * pcre_like_build().  Nothing is built or scanned when `empty` is set.
                        *
                        * Inner loop, once per left candidate (lci is reset first): nil left
                        * values are skipped, the selected skip condition is tested, and a
                        * surviving pair is appended as lo to r1 and, if r2 is non-NULL, ro to
                        * r2.  When r1 is full, both outputs are grown to BATgrows(r1) with
                        * BATextend(); failure raises an HY013 "pcre.join" exception.
                        *
                        * Property upkeep: tseqbase, tsorted, trevsorted and tkey of r1 and r2
                        * are downgraded as appended OIDs break density, order or uniqueness.
                        * nl counts matches for the current right value.  rskipped records
                        * that a right value produced no match after r2 already had entries.
                        *
                        * re_like_clean() and pcre_clean() release the per-pattern state after
                        * each inner loop.
                        * NOTE(review): the error exits jump to `bailout` with pcrepat, re,
                        * wpat and pcrere possibly still allocated.  Cleanup there is outside
                        * this view -- verify it exists.
                        */
    2287             : #define pcre_join_loop(STRCMP, RE_MATCH, PCRE_COND)                                             \
    2288             :         do {                                                                                                                            \
    2289             :                 for (BUN ridx = 0; ridx < rci.ncand; ridx++) {                                       \
    2290             :                         GDK_CHECK_TIMEOUT(qry_ctx, counter,                                                     \
    2291             :                                                           GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
    2292             :                         ro = canditer_next(&rci);                                                                   \
    2293             :                         vr = VALUE(r, ro - rbase);                                                                      \
    2294             :                         nl = 0;                                                                                                         \
    2295             :                         ascii_pattern = use_re = use_strcmp = empty = false;            \
    2296             :                         if ((msg = choose_like_path(&pcrepat, &use_re, &use_strcmp, &empty, &ascii_pattern, vr, esc))) \
    2297             :                                 goto bailout;                                                                                   \
    2298             :                         if (!empty) {                                                                                           \
    2299             :                                 if (use_re) {                                                                                   \
    2300             :                                         if ((msg = re_like_build(&re, &wpat, vr, caseignore, use_strcmp, ascii_pattern, (unsigned char) *esc)) != MAL_SUCCEED) \
    2301             :                                                 goto bailout;                                                                   \
    2302             :                                 } else if (pcrepat) {                                                                   \
    2303             :                                         if ((msg = pcre_like_build(&pcrere, &pcreex, pcrepat, caseignore, lci.ncand)) != MAL_SUCCEED) \
    2304             :                                                 goto bailout;                                                                   \
    2305             :                                         GDKfree(pcrepat);                                                                       \
    2306             :                                         pcrepat = NULL;                                                                         \
    2307             :                                 }                                                                                                               \
    2308             :                                 canditer_reset(&lci);                                                                       \
    2309             :                                 for (BUN lidx = 0; lidx < lci.ncand; lidx++) {                       \
    2310             :                                         lo = canditer_next(&lci);                                                   \
    2311             :                                         vl = VALUE(l, lo - lbase);                                                      \
    2312             :                                         if (strNil(vl)) {                                                                       \
    2313             :                                                 continue;                                                                               \
    2314             :                                         } else if (use_re) {                                                            \
    2315             :                                                 if (use_strcmp) {                                                               \
    2316             :                                                         if (STRCMP)                                                                     \
    2317             :                                                                 continue;                                                               \
    2318             :                                                 } else {                                                                                \
    2319             :                                                         assert(re);                                                                     \
    2320             :                                                         if (RE_MATCH)                                                           \
    2321             :                                                                 continue;                                                               \
    2322             :                                                 }                                                                                               \
    2323             :                                         } else {                                                                                        \
    2324             :                                                 int retval;                                                                             \
    2325             :                                                 PCRE_EXEC;                                                                              \
    2326             :                                                 if (PCRE_COND)                                                                  \
    2327             :                                                         continue;                                                                       \
    2328             :                                         }                                                                                                       \
    2329             :                                         if (BATcount(r1) == BATcapacity(r1)) {                          \
    2330             :                                                 newcap = BATgrows(r1);                                                  \
    2331             :                                                 BATsetcount(r1, BATcount(r1));                                  \
    2332             :                                                 if (r2)                                                                                 \
    2333             :                                                         BATsetcount(r2, BATcount(r2));                          \
    2334             :                                                 if (BATextend(r1, newcap) != GDK_SUCCEED || (r2 && BATextend(r2, newcap) != GDK_SUCCEED)) { \
    2335             :                                                         msg = createException(MAL, "pcre.join", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
    2336             :                                                         goto bailout;                                                           \
    2337             :                                                 }                                                                                               \
    2338             :                                                 assert(!r2 || BATcapacity(r1) == BATcapacity(r2)); \
    2339             :                                         }                                                                                                       \
    2340             :                                         if (BATcount(r1) > 0) {                                                              \
    2341             :                                                 if (lastl + 1 != lo)                                                    \
    2342             :                                                         r1->tseqbase = oid_nil;                                              \
    2343             :                                                 if (nl == 0) {                                                                  \
    2344             :                                                         if (r2)                                                                         \
    2345             :                                                                 r2->trevsorted = false;                                      \
    2346             :                                                         if (lastl > lo) {                                                    \
    2347             :                                                                 r1->tsorted = false;                                 \
    2348             :                                                                 r1->tkey = false;                                            \
    2349             :                                                         } else if (lastl < lo) {                                     \
    2350             :                                                                 r1->trevsorted = false;                                      \
    2351             :                                                         } else {                                                                        \
    2352             :                                                                 r1->tkey = false;                                            \
    2353             :                                                         }                                                                                       \
    2354             :                                                 }                                                                                               \
    2355             :                                         }                                                                                                       \
    2356             :                                         APPEND(r1, lo);                                                                         \
    2357             :                                         if (r2)                                                                                         \
    2358             :                                                 APPEND(r2, ro);                                                                 \
    2359             :                                         lastl = lo;                                                                                     \
    2360             :                                         nl++;                                                                                           \
    2361             :                                 }                                                                                                               \
    2362             :                                 re_like_clean(&re, &wpat);                                                              \
    2363             :                                 pcre_clean(&pcrere, &pcreex);                                                   \
    2364             :                         }                                                                                                                       \
    2365             :                         if (r2) {                                                                                                       \
    2366             :                                 if (nl > 1) {                                                                                        \
    2367             :                                         r2->tkey = false;                                                                    \
    2368             :                                         r2->tseqbase = oid_nil;                                                              \
    2369             :                                         r1->trevsorted = false;                                                              \
    2370             :                                 } else if (nl == 0) {                                                                   \
    2371             :                                         rskipped = BATcount(r2) > 0;                                         \
    2372             :                                 } else if (rskipped) {                                                                  \
    2373             :                                         r2->tseqbase = oid_nil;                                                              \
    2374             :                                 }                                                                                                               \
    2375             :                         } else if (nl > 1) {                                                                         \
    2376             :                                 r1->trevsorted = false;                                                                      \
    2377             :                         }                                                                                                                       \
    2378             :                 }                                                                                                                               \
    2379             :         } while (0)
    2380             : 
    2381             : static char *
    2382          59 : pcrejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, const char *esc,
    2383             :                  bit caseignore, bit anti)
    2384             : {
    2385          59 :         struct canditer lci, rci;
    2386          59 :         const char *lvals, *rvals, *lvars, *rvars, *vl, *vr;
    2387          59 :         int rskipped = 0;                       /* whether we skipped values in r */
    2388          59 :         oid lbase, rbase, lo, ro, lastl = 0;    /* last value inserted into r1 */
    2389          59 :         BUN nl, newcap;
    2390          59 :         char *pcrepat = NULL, *msg = MAL_SUCCEED;
    2391          59 :         struct RE *re = NULL;
    2392          59 :         bool use_re = false,
    2393          59 :                 use_strcmp = false,
    2394          59 :                 empty = false,
    2395          59 :                 ascii_pattern = false;
    2396          59 :         uint32_t *wpat = NULL;
    2397             : #ifdef HAVE_LIBPCRE
    2398          59 :         pcre *pcrere = NULL;
    2399          59 :         pcre_extra *pcreex = NULL;
    2400             : #else
    2401             :         regex_t pcrere = (regex_t) { 0 };
    2402             :         void *pcreex = NULL;
    2403             : #endif
    2404             : 
    2405          59 :         size_t counter = 0;
    2406          59 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
    2407             : 
    2408          59 :         TRC_DEBUG(ALGO,
    2409             :                           "pcrejoin(l=%s#" BUNFMT "[%s]%s%s,"
    2410             :                           "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
    2411             :                           "sr=%s#" BUNFMT "%s%s)\n",
    2412             :                           BATgetId(l), BATcount(l), ATOMname(l->ttype),
    2413             :                           l->tsorted ? "-sorted" : "",
    2414             :                           l->trevsorted ? "-revsorted" : "",
    2415             :                           BATgetId(r), BATcount(r), ATOMname(r->ttype),
    2416             :                           r->tsorted ? "-sorted" : "",
    2417             :                           r->trevsorted ? "-revsorted" : "",
    2418             :                           sl ? BATgetId(sl) : "NULL", sl ? BATcount(sl) : 0,
    2419             :                           sl && sl->tsorted ? "-sorted" : "",
    2420             :                           sl && sl->trevsorted ? "-revsorted" : "",
    2421             :                           sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
    2422             :                           sr && sr->tsorted ? "-sorted" : "",
    2423             :                           sr && sr->trevsorted ? "-revsorted" : "");
    2424             : 
    2425         177 :         assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
    2426          59 :         assert(ATOMtype(l->ttype) == TYPE_str);
    2427             : 
    2428          59 :         canditer_init(&lci, l, sl);
    2429          59 :         canditer_init(&rci, r, sr);
    2430             : 
    2431          59 :         BATiter li = bat_iterator(l);
    2432          59 :         BATiter ri = bat_iterator(r);
    2433          59 :         lbase = l->hseqbase;
    2434          59 :         rbase = r->hseqbase;
    2435          59 :         lvals = (const char *) li.base;
    2436          59 :         rvals = (const char *) ri.base;
    2437          59 :         assert(ri.vh && r->ttype);
    2438          59 :         lvars = li.vh->base;
    2439          59 :         rvars = ri.vh->base;
    2440             : 
    2441          59 :         r1->tkey = true;
    2442          59 :         r1->tsorted = true;
    2443          59 :         r1->trevsorted = true;
    2444          59 :         r1->tnil = false;
    2445          59 :         r1->tnonil = true;
    2446          59 :         if (r2) {
    2447          43 :                 r2->tkey = true;
    2448          43 :                 r2->tsorted = true;
    2449          43 :                 r2->trevsorted = true;
    2450          43 :                 r2->tnil = false;
    2451          43 :                 r2->tnonil = true;
    2452             :         }
    2453             : 
    2454          59 :         if (anti) {
    2455          35 :                 if (caseignore) {
    2456         127 :                         pcre_join_loop(ascii_pattern ? istrcmp(vl, vr) == 0 : mywstrcasecmp(vl, wpat) == 0,
    2457             :                                                    re_match_ignore(vl, re), !PCRE_EXEC_COND);
    2458             :                 } else {
    2459         327 :                         pcre_join_loop(strcmp(vl, vr) == 0, re_match_no_ignore(vl, re), !PCRE_EXEC_COND);
    2460             :                 }
    2461             :         } else {
    2462          24 :                 if (caseignore) {
    2463           5 :                         pcre_join_loop(ascii_pattern ? istrcmp(vl, vr) != 0 : mywstrcasecmp(vl, wpat) != 0,
    2464             :                                                    !re_match_ignore(vl, re), PCRE_EXEC_COND);
    2465             :                 } else {
    2466         387 :                         pcre_join_loop(strcmp(vl, vr) != 0, !re_match_no_ignore(vl, re), PCRE_EXEC_COND);
    2467             :                 }
    2468             :         }
    2469          57 :         bat_iterator_end(&li);
    2470          58 :         bat_iterator_end(&ri);
    2471             : 
    2472          58 :         assert(!r2 || BATcount(r1) == BATcount(r2));
    2473             :         /* also set other bits of heap to correct value to indicate size */
    2474          58 :         BATsetcount(r1, BATcount(r1));
    2475          59 :         if (r2)
    2476          43 :                 BATsetcount(r2, BATcount(r2));
    2477          59 :         if (BATcount(r1) > 0) {
    2478          44 :                 if (BATtdense(r1))
    2479          14 :                         r1->tseqbase = ((oid *) r1->theap->base)[0];
    2480          44 :                 if (r2 && BATtdense(r2))
    2481          33 :                         r2->tseqbase = ((oid *) r2->theap->base)[0];
    2482             :         } else {
    2483          15 :                 r1->tseqbase = 0;
    2484          15 :                 if (r2)
    2485           9 :                         r2->tseqbase = 0;
    2486             :         }
    2487          42 :         if (r2)
    2488          43 :                 TRC_DEBUG(ALGO,
    2489             :                                   "pcrejoin(l=%s,r=%s)=(%s#" BUNFMT "%s%s,%s#" BUNFMT "%s%s\n",
    2490             :                                   BATgetId(l), BATgetId(r),
    2491             :                                   BATgetId(r1), BATcount(r1),
    2492             :                                   r1->tsorted ? "-sorted" : "",
    2493             :                                   r1->trevsorted ? "-revsorted" : "",
    2494             :                                   BATgetId(r2), BATcount(r2),
    2495             :                                   r2->tsorted ? "-sorted" : "",
    2496             :                                   r2->trevsorted ? "-revsorted" : "");
    2497             :         else
    2498          16 :                 TRC_DEBUG(ALGO,
    2499             :                                   "pcrejoin(l=%s,r=%s)=(%s#" BUNFMT "%s%s\n",
    2500             :                                   BATgetId(l), BATgetId(r),
    2501             :                                   BATgetId(r1), BATcount(r1),
    2502             :                                   r1->tsorted ? "-sorted" : "",
    2503             :                                   r1->trevsorted ? "-revsorted" : "");
    2504             :         return MAL_SUCCEED;
    2505             : 
    2506           0 :   bailout:
    2507           0 :         bat_iterator_end(&li);
    2508           0 :         bat_iterator_end(&ri);
    2509           0 :         GDKfree(pcrepat);
    2510           0 :         re_like_clean(&re, &wpat);
    2511           0 :         pcre_clean(&pcrere, &pcreex);
    2512           0 :         assert(msg != MAL_SUCCEED);
    2513             :         return msg;
    2514             : }
    2515             : 
                       /*
                        * Shared implementation behind the algebra.likejoin entry points
                        * (LIKEjoin and LIKEjoin1 both forward to it).
                        *
                        * r1, r2     receive the bat ids of the result BATs; r2 may be NULL,
                        *            in which case only one result BAT is created.
                        * lid, rid   ids of the two input BATs passed to pcrejoin().
                        * slid, srid ids of optional candidate lists; a BAT is only looked up
                        *            when the id is not nil according to is_bat_nil().
                        * elid       id of a BAT that must contain exactly one value, read as
                        *            the escape string.
                        * ciid       id of a BAT that must contain exactly one bit, read as
                        *            the case-ignore flag.
                        * anti       passed through unchanged to pcrejoin().
                        *
                        * Returns MAL_SUCCEED on success.  On any failure every BAT obtained
                        * or created here is handed to BBPreclaim() and either the message
                        * already stored in msg or a RUNTIME_OBJECT_MISSING exception is
                        * returned.
                        */
    2516             : static str
    2517          55 : PCREjoin(bat *r1, bat *r2, bat lid, bat rid, bat slid, bat srid, bat elid,
    2518             :                  bat ciid, bit anti)
    2519             : {
    2520          55 :         BAT *left = NULL, *right = NULL, *escape = NULL, *caseignore = NULL,
    2521          55 :                 *candleft = NULL, *candright = NULL;
    2522          55 :         BAT *result1 = NULL, *result2 = NULL;
    2523          55 :         char *msg = MAL_SUCCEED;
    2524          55 :         const char *esc = "";
    2525          55 :         bit ci;
    2526          55 :         BATiter bi;
    2527             : 
                               /* Look up all input BATs.  msg is still unset on these exits, so
                                * the fail path reports RUNTIME_OBJECT_MISSING for them. */
    2528          55 :         if ((left = BATdescriptor(lid)) == NULL)
    2529           0 :                 goto fail;
    2530          59 :         if ((right = BATdescriptor(rid)) == NULL)
    2531           0 :                 goto fail;
    2532          59 :         if ((escape = BATdescriptor(elid)) == NULL)
    2533           0 :                 goto fail;
    2534          59 :         if ((caseignore = BATdescriptor(ciid)) == NULL)
    2535           0 :                 goto fail;
    2536          59 :         if (!is_bat_nil(slid) && (candleft = BATdescriptor(slid)) == NULL)
    2537           0 :                 goto fail;
    2538          59 :         if (!is_bat_nil(srid) && (candright = BATdescriptor(srid)) == NULL)
    2539           0 :                 goto fail;
                               /* Both oid result BATs are created with BATcount(left) as their
                                * size argument; the second one only when the caller wants it. */
    2540          59 :         result1 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
    2541          58 :         if (r2)
    2542          43 :                 result2 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
    2543          57 :         if (!result1 || (r2 && !result2)) {
    2544           0 :                 msg = createException(MAL, "pcre.join",
    2545             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    2546           0 :                 goto fail;
    2547             :         }
                               /* Initial properties of the freshly created results. */
    2548          57 :         result1->tnil = false;
    2549          57 :         result1->tnonil = true;
    2550          57 :         result1->tkey = true;
    2551          57 :         result1->tsorted = true;
    2552          57 :         result1->trevsorted = true;
    2553          57 :         result1->tseqbase = 0;
    2554          57 :         if (r2) {
    2555          42 :                 result2->tnil = false;
    2556          42 :                 result2->tnonil = true;
    2557          42 :                 result2->tkey = true;
    2558          42 :                 result2->tsorted = true;
    2559          42 :                 result2->trevsorted = true;
    2560          42 :                 result2->tseqbase = 0;
    2561             :         }
                               /* Only a single escape value and a single case-ignore flag are
                                * supported for the whole join. */
    2562          57 :         if (BATcount(escape) != 1) {
    2563           0 :                 msg = createException(MAL, "pcre.join",
    2564             :                                                           SQLSTATE(42000)
    2565             :                                                           "At the moment, only one value is allowed for the escape input at pcre join");
    2566           0 :                 goto fail;
    2567             :         }
    2568          57 :         if (BATcount(caseignore) != 1) {
    2569           0 :                 msg = createException(MAL, "pcre.join",
    2570             :                                                           SQLSTATE(42000)
    2571             :                                                           "At the moment, only one value is allowed for the case ignore input at pcre join");
    2572           0 :                 goto fail;
    2573             :         }
                               /* The flag is copied by value, so its iterator can be ended
                                * right away. */
    2574          57 :         bi = bat_iterator(caseignore);
    2575          59 :         ci = *(bit *) BUNtloc(bi, 0);
    2576          59 :         bat_iterator_end(&bi);
                               /* esc is obtained through the iterator on escape, which is kept
                                * open until pcrejoin() has returned (presumably because esc
                                * points into the BAT's storage -- confirm). */
    2577          59 :         bi = bat_iterator(escape);
    2578          59 :         esc = BUNtvar(bi, 0);
    2579          59 :         msg = pcrejoin(result1, result2, left, right, candleft, candright, esc, ci,
    2580             :                                    anti);
    2581          59 :         bat_iterator_end(&bi);
    2582          59 :         if (msg)
    2583           0 :                 goto fail;
                               /* Publish the result ids and keep the result BATs alive for the
                                * caller, then release the inputs. */
    2584          59 :         *r1 = result1->batCacheid;
    2585          59 :         BBPkeepref(result1);
    2586          59 :         if (r2) {
    2587          43 :                 *r2 = result2->batCacheid;
    2588          43 :                 BBPkeepref(result2);
    2589             :         }
    2590          59 :         BBPunfix(left->batCacheid);
    2591          59 :         BBPunfix(right->batCacheid);
    2592          59 :         BBPreclaim(escape);
    2593          59 :         BBPreclaim(caseignore);
                               /* candleft/candright are still NULL when no candidate list was
                                * given; BBPreclaim() is called on them regardless. */
    2594          59 :         BBPreclaim(candleft);
    2595          59 :         BBPreclaim(candright);
    2596             :         return MAL_SUCCEED;
    2597             : 
                         /* Common error exit: release whatever was acquired (pointers not yet
                          * assigned are still NULL) and return the pending message, or a
                          * generic "object missing" error when no message was set. */
    2598           0 :   fail:
    2599           0 :         BBPreclaim(left);
    2600           0 :         BBPreclaim(right);
    2601           0 :         BBPreclaim(escape);
    2602           0 :         BBPreclaim(caseignore);
    2603           0 :         BBPreclaim(candleft);
    2604           0 :         BBPreclaim(candright);
    2605           0 :         BBPreclaim(result1);
    2606           0 :         BBPreclaim(result2);
    2607           0 :         if (msg)
    2608             :                 return msg;
    2609           0 :         throw(MAL, "pcre.join", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    2610             : }
    2611             : 
                       /*
                        * Two-output entry point registered as algebra.likejoin.
                        *
                        * Dereferences the MAL argument pointers and forwards them to
                        * PCREjoin().  nil_matches and estimate are accepted but ignored.
                        *
                        * NOTE(review): a NULL slid/srid pointer is mapped to bat id 0, while
                        * PCREjoin() decides whether a candidate list is present with
                        * is_bat_nil().  Unless 0 counts as nil there, a NULL pointer would
                        * lead to a BATdescriptor(0) lookup instead of "no candidate list";
                        * bat_nil may have been intended -- verify against the callers.
                        */
    2612             : static str
    2613          39 : LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *elid,
    2614             :                  const bat *cid, const bat *slid, const bat *srid,
    2615             :                  const bit *nil_matches, const lng *estimate, const bit *anti)
    2616             : {
    2617          39 :         (void) nil_matches;
    2618          39 :         (void) estimate;
    2619          39 :         return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
    2620          39 :                                         *elid, *cid, *anti);
    2621             : }
    2622             : 
                       /*
                        * Single-output entry point registered as algebra.likejoin.
                        *
                        * Identical to the two-output variant except that NULL is passed as
                        * the second result pointer, so PCREjoin() creates and returns only
                        * one result BAT.  nil_matches and estimate are accepted but ignored.
                        *
                        * NOTE(review): a NULL slid/srid pointer is mapped to bat id 0, while
                        * PCREjoin() checks is_bat_nil() to detect an absent candidate list;
                        * bat_nil may have been intended -- verify against the callers.
                        */
    2623             : static str
    2624          16 : LIKEjoin1(bat *r1, const bat *lid, const bat *rid, const bat *elid,
    2625             :                   const bat *cid, const bat *slid, const bat *srid,
    2626             :                   const bit *nil_matches, const lng *estimate, const bit *anti)
    2627             : {
    2628          16 :         (void) nil_matches;
    2629          16 :         (void) estimate;
    2630          16 :         return PCREjoin(r1, NULL, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
    2631          16 :                                         *elid, *cid, *anti);
    2632             : }
    2633             : 
    2634             : #include "mel.h"
                       /*
                        * Atom table handed to mal_module() by init_pcre_mal: a single atom
                        * named "pcre", followed by the { .cmp=NULL } entry that ends the
                        * table.
                        */
    2635             : mel_atom pcre_init_atoms[] = {
    2636             :  { .name="pcre", },  { .cmp=NULL }
    2637             : };
    2638             : mel_func pcre_init_funcs[] = {
                        /*
                         * Function table handed to mal_module() by init_pcre_mal.  Each
                         * entry gives the MAL module name, the MAL function name, the C
                         * implementation, a help text and the argument list; the
                         * { .imp=NULL } entry ends the table.
                         *
                         * NOTE(review): the two numbers in args(...) appear to be the
                         * number of results and the total number of arguments (compare
                         * args(2,11,...) for LIKEjoin with args(1,10,...) for LIKEjoin1)
                         * -- confirm against mel.h.
                         */
    2639             :  command("pcre", "index", PCREindex, false, "match a pattern, return matched position (or 0 when not found)", args(1,3, arg("",int),arg("pat",pcre),arg("s",str))),
    2640             :  command("pcre", "match", PCREmatch, false, "Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
    2641             :  command("pcre", "imatch", PCREimatch, false, "Caseless Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
    2642             :  command("pcre", "patindex", PCREpatindex, false, "Location of the first POSIX pattern matching against a string", args(1,3, arg("",int),arg("pat",str),arg("s",str))),
                        /* NOTE(review): the two help texts below first list the flags
                         * 'i', 'm', 's' and 'x' but then also describe 'e' and never
                         * describe 'x'; the wording may need to be aligned. */
    2643             :  command("pcre", "replace", PCREreplace_wrap, false, "Replace _all_ matches of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
    2644             :  command("pcre", "replace_first", PCREreplacefirst_wrap, false, "Replace _the first_ match of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
    2645             :  command("pcre", "pcre_quote", PCREquote, false, "Return a PCRE pattern string that matches the argument exactly.", args(1,2, arg("",str),arg("s",str))),
    2646             :  command("pcre", "sql2pcre", PCREsql2pcre, false, "Convert a SQL like pattern with the given escape character into a PCRE pattern.", args(1,3, arg("",str),arg("pat",str),arg("esc",str))),
                        /* str.replace shares its implementation with pcre.replace. */
    2647             :  command("str", "replace", PCREreplace_wrap, false, "", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
    2648             :  command("batpcre", "replace", PCREreplace_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
    2649             :  command("batpcre", "replace_first", PCREreplacefirst_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
                        /* Scalar LIKE / NOT LIKE. */
    2650             :  command("algebra", "like", PCRElike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2651             :  command("algebra", "not_like", PCREnotlike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
                        /* Bulk LIKE / NOT LIKE: one registration for each combination of
                         * BAT and scalar for the string and pattern arguments in which at
                         * least one of the two is a BAT. */
    2652             :  pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2653             :  pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2654             :  pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2655             :  pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2656             :  pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2657             :  pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2658             :  command("algebra", "likeselect", PCRElikeselect, false, "Select all head values of the first input BAT for which the\ntail value is \"like\" the given (SQL-style) pattern and for\nwhich the head value occurs in the tail of the second input\nBAT.\nInput is a dense-headed BAT, output is a dense-headed BAT with in\nthe tail the head value of the input BAT for which the\nrelationship holds.  The output BAT is sorted on the tail value.", args(1,7, batarg("",oid),batarg("b",str),batarg("s",oid),arg("pat",str),arg("esc",str),arg("caseignore",bit),arg("anti",bit))),
                        /* NOTE(review): the likejoin help text says "doing a case sensitive
                         * match" although the signature takes a caseignore BAT, and the
                         * single-output help text refers to "LIKEjoin_esc", a name that
                         * does not occur in the visible part of this file. */
    2659             :  command("algebra", "likejoin", LIKEjoin, false, "Join the string bat L with the pattern bat R\nwith optional candidate lists SL and SR using pattern escape string ESC\nand doing a case sensitive match.\nThe result is two aligned bats with oids of matching rows.", args(2,11, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
    2660             :  command("algebra", "likejoin", LIKEjoin1, false, "The same as LIKEjoin_esc, but only produce one output", args(1,10,batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
    2661             :  { .imp=NULL }
    2662             : };
    2663             : #include "mal_import.h"
    2664             : #ifdef _MSC_VER
                       /* On MSVC, drop any macro definition of `read` so the word can be used
                        * as the attribute in the #pragma section line below.
                        * NOTE(review): presumably the .CRT$XCU section is what
                        * LIB_STARTUP_FUNC relies on with this compiler -- confirm. */
    2665             : #undef read
    2666             : #pragma section(".CRT$XCU",read)
    2667             : #endif
                       /* Startup hook: registers the "pcre" module together with its atom and
                        * function tables through mal_module(). */
    2668         334 : LIB_STARTUP_FUNC(init_pcre_mal)
    2669         334 : { mal_module("pcre", pcre_init_atoms, pcre_init_funcs); }

Generated by: LCOV version 1.14