LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - pcre.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 1043 1304 80.0 %
Date: 2024-04-25 20:03:45 Functions: 52 56 92.9 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : /*
      14             :  * N. Nes
      15             :  * PCRE library interface
      16             :  * The  PCRE library is a set of functions that implement regular
      17             :  * expression pattern matching using the same syntax  and  semantics  as  Perl,
      18             :  * with  just  a  few  differences.  The  current  implementation of PCRE
      19             :  * (release 4.x) corresponds approximately with Perl 5.8, including  support
      20             :  * for  UTF-8  encoded  strings.   However,  this support has to be
      21             :  * explicitly enabled; it is not the default.
      22             :  *
      23             :  * ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
      24             :  */
      25             : #include "monetdb_config.h"
      26             : #include <string.h>
      27             : 
      28             : #include "mal.h"
      29             : #include "mal_client.h"
      30             : #include "mal_interpreter.h"
      31             : #include "mal_exception.h"
      32             : 
      33             : #include <wchar.h>
      34             : #include <wctype.h>
      35             : 
      36             : #ifdef HAVE_LIBPCRE
      37             : #include <pcre.h>
      38             : #ifndef PCRE_STUDY_JIT_COMPILE
      39             : /* old library version on e.g. EPEL 6 */
      40             : #define pcre_free_study(x)              pcre_free(x)
      41             : #define PCRE_STUDY_JIT_COMPILE  0
      42             : #endif
      43             : #define JIT_COMPILE_MIN 1024    /* when to try JIT compilation of patterns */
      44             : 
      45             : #else
      46             : 
      47             : #include <regex.h>
      48             : 
      49             : typedef regex_t pcre;
      50             : #endif
      51             : 
/* One segment of a simple LIKE pattern; segments are chained through n.
 * The current implementation assumes simple %keyword% [keyw%]* patterns. */
struct RE {
        /* segment as a byte string; compared with strcmp/strstr and the
         * istr* helpers in re_match_no_ignore */
        char *k;
        /* segment as an array of code points terminated by 0; compared
         * with the myw* helpers in re_match_ignore */
        uint32_t *w;
        /* search: the segment may start anywhere in the remaining input
         * (otherwise it must match at the current position);
         * atend: the match must reach the end of the input;
         * is_ascii: tested on the head node by re_match_ignore to switch
         * to the byte-wise matcher;
         * case_ignore: makes re_match_no_ignore use the istr* helpers */
        bool search:1, atend:1, is_ascii:1, case_ignore:1;
        /* length of the segment: used as a byte count together with k,
         * and incremented once per code point by re_create when w is
         * filled in */
        size_t len;
        /* next segment, NULL for the last one */
        struct RE *n;
};
      60             : 
      61             : /* We cannot use strcasecmp and strncasecmp since they work byte for
      62             :  * byte and don't deal with multibyte encodings (such as UTF-8).
      63             :  *
      64             :  * We implement our own conversion from UTF-8 encoding to Unicode code
      65             :  * points which we store in uint32_t.  The reason for this is,
      66             :  * functions like mbsrtowcs are locale-dependent (so we need a UTF-8
      67             :  * locale to use them), and on Windows, wchar_t is only 2 bytes and
      68             :  * therefore cannot hold all Unicode code points.  We do use functions
      69             :  * such as towlower to convert a Unicode code point to its lower-case
      70             :  * equivalent, but again on Windows, if the code point doesn't fit in
      71             :  * 2 bytes, we skip this conversion and compare the unconverted code
      72             :  * points.
      73             :  *
      74             :  * Note, towlower is also locale-dependent, but we don't need a UTF-8
      75             :  * locale in order to use it. */
      76             : 
      77             : /* helper function to convert a UTF-8 multibyte character to a wide
      78             :  * character */
      79             : static size_t
      80         275 : utfc8touc(uint32_t *restrict dest, const char *restrict src)
      81             : {
      82         275 :         if ((src[0] & 0x80) == 0) {
      83         218 :                 *dest = src[0];
      84         218 :                 return src[0] != 0;
      85          57 :         } else if ((src[0] & 0xE0) == 0xC0
      86          40 :                            && (src[1] & 0xC0) == 0x80 && (src[0] & 0x1E) != 0) {
      87          40 :                 *dest = (src[0] & 0x1F) << 6 | (src[1] & 0x3F);
      88          40 :                 return 2;
      89          17 :         } else if ((src[0] & 0xF0) == 0xE0
      90          17 :                            && (src[1] & 0xC0) == 0x80
      91          17 :                            && (src[2] & 0xC0) == 0x80
      92          17 :                            && ((src[0] & 0x0F) != 0 || (src[1] & 0x20) != 0)) {
      93          17 :                 *dest = (src[0] & 0x0F) << 12 | (src[1] & 0x3F) << 6 | (src[2] & 0x3F);
      94          17 :                 return 3;
      95           0 :         } else if ((src[0] & 0xF8) == 0xF0
      96           0 :                            && (src[1] & 0xC0) == 0x80
      97           0 :                            && (src[2] & 0xC0) == 0x80 && (src[3] & 0xC0) == 0x80) {
      98           0 :                 uint32_t c = (src[0] & 0x07) << 18
      99           0 :                                 | (src[1] & 0x3F) << 12
     100           0 :                                 | (src[2] & 0x3F) << 6 | (src[3] & 0x3F);
     101           0 :                 if (c < 0x10000 || c > 0x10FFFF || (c & 0x1FF800) == 0x00D800)
     102             :                         return (size_t) -1;
     103           0 :                 *dest = c;
     104           0 :                 return 4;
     105             :         }
     106             :         return (size_t) -1;
     107             : }
     108             : 
     109             : /* helper function to convert a UTF-8 string to a wide character
     110             :  * string, the wide character string is allocated */
     111             : static uint32_t *
     112          49 : utf8stoucs(const char *src)
     113             : {
     114          49 :         uint32_t *dest;
     115          49 :         size_t i = 0;
     116          49 :         size_t j = 0;
     117             : 
     118             :         /* count how many uint32_t's we need, while also checking for
     119             :          * correctness of the input */
     120         263 :         while (src[j]) {
     121         214 :                 i++;
     122         214 :                 if ((src[j + 0] & 0x80) == 0) {
     123         165 :                         j += 1;
     124          49 :                 } else if ((src[j + 0] & 0xE0) == 0xC0
     125          24 :                                    && (src[j + 1] & 0xC0) == 0x80 && (src[j + 0] & 0x1E) != 0) {
     126          24 :                         j += 2;
     127          25 :                 } else if ((src[j + 0] & 0xF0) == 0xE0
     128          25 :                                    && (src[j + 1] & 0xC0) == 0x80
     129          25 :                                    && (src[j + 2] & 0xC0) == 0x80
     130          25 :                                    && ((src[j + 0] & 0x0F) != 0 || (src[j + 1] & 0x20) != 0)) {
     131          25 :                         j += 3;
     132           0 :                 } else if ((src[j + 0] & 0xF8) == 0xF0
     133           0 :                                    && (src[j + 1] & 0xC0) == 0x80
     134           0 :                                    && (src[j + 2] & 0xC0) == 0x80
     135           0 :                                    && (src[j + 3] & 0xC0) == 0x80) {
     136           0 :                         uint32_t c = (src[j + 0] & 0x07) << 18
     137           0 :                                         | (src[j + 1] & 0x3F) << 12
     138           0 :                                         | (src[j + 2] & 0x3F) << 6 | (src[j + 3] & 0x3F);
     139           0 :                         if (c < 0x10000 || c > 0x10FFFF || (c & 0x1FF800) == 0x00D800)
     140             :                                 return NULL;
     141           0 :                         j += 4;
     142             :                 } else {
     143             :                         return NULL;
     144             :                 }
     145             :         }
     146          49 :         dest = GDKmalloc((i + 1) * sizeof(uint32_t));
     147          49 :         if (dest == NULL)
     148             :                 return NULL;
     149             :         /* go through the source string again, this time we can skip
     150             :          * the correctness tests */
     151             :         i = j = 0;
     152         263 :         while (src[j]) {
     153         214 :                 if ((src[j + 0] & 0x80) == 0) {
     154         165 :                         dest[i++] = src[j + 0];
     155         165 :                         j += 1;
     156          49 :                 } else if ((src[j + 0] & 0xE0) == 0xC0) {
     157          24 :                         dest[i++] = (src[j + 0] & 0x1F) << 6 | (src[j + 1] & 0x3F);
     158          24 :                         j += 2;
     159          25 :                 } else if ((src[j + 0] & 0xF0) == 0xE0) {
     160          25 :                         dest[i++] = (src[j + 0] & 0x0F) << 12
     161          25 :                                         | (src[j + 1] & 0x3F) << 6 | (src[j + 2] & 0x3F);
     162          25 :                         j += 3;
     163           0 :                 } else if ((src[j + 0] & 0xF8) == 0xF0) {
     164           0 :                         dest[i++] = (src[j + 0] & 0x07) << 18
     165           0 :                                         | (src[j + 1] & 0x3F) << 12
     166           0 :                                         | (src[j + 2] & 0x3F) << 6 | (src[j + 3] & 0x3F);
     167           0 :                         j += 4;
     168             :                 }
     169             :         }
     170          49 :         dest[i] = 0;
     171          49 :         return dest;
     172             : }
     173             : 
     174             : static size_t
     175          33 : myucslen(const uint32_t *ucs)
     176             : {
     177          33 :         size_t i = 0;
     178             : 
     179          66 :         while (ucs[i])
     180          33 :                 i++;
     181          33 :         return i;
     182             : }
     183             : 
     184             : static inline bool
     185          14 : mywstrncaseeq(const char *restrict s1, const uint32_t *restrict s2, size_t n2,
     186             :                           bool atend)
     187             : {
     188          14 :         uint32_t c1;
     189             : 
     190          27 :         while (n2 > 0) {
     191          20 :                 size_t nn1 = utfc8touc(&c1, s1);
     192          20 :                 if (nn1 == 0 || nn1 == (size_t) -1)
     193           0 :                         return (*s2 == 0);
     194          20 :                 if (*s2 == 0)
     195             :                         return false;
     196             : #if SIZEOF_WCHAR_T == 2
     197             :                 if (c1 > 0xFFFF || *s2 > 0xFFFF) {
     198             :                         if (c1 != *s2)
     199             :                                 return false;
     200             :                 } else
     201             : #endif
     202          20 :                 if (towlower((wint_t) c1) != towlower((wint_t) * s2))
     203             :                         return false;
     204          13 :                 s1 += nn1;
     205          13 :                 n2--;
     206          13 :                 s2++;
     207             :         }
     208          14 :         return !atend || *s1 == 0;
     209             : }
     210             : 
     211             : static inline int
     212           1 : mystrcasecmp(const char *s1, const char *s2)
     213             : {
     214           1 :         uint32_t c1 = 0, c2 = 0;
     215             : 
     216           1 :         for (;;) {
     217           1 :                 size_t nn1 = utfc8touc(&c1, s1);
     218           1 :                 size_t nn2 = utfc8touc(&c2, s2);
     219           1 :                 if (nn1 == 0 || nn1 == (size_t) -1)
     220           0 :                         return -(nn2 != 0 && nn2 != (size_t) -1);
     221           1 :                 if (nn2 == 0 || nn2 == (size_t) -1)
     222             :                         return 1;
     223             : #if SIZEOF_WCHAR_T == 2
     224             :                 if (c1 > 0xFFFF || c2 > 0xFFFF) {
     225             :                         if (c1 != c2)
     226             :                                 return c1 - c2;
     227             :                 } else
     228             : #endif
     229           1 :                 if (towlower((wint_t) c1) != towlower((wint_t) c2))
     230           1 :                         return towlower((wint_t) c1) - towlower((wint_t) c2);
     231           0 :                 s1 += nn1;
     232           0 :                 s2 += nn2;
     233             :         }
     234             : }
     235             : 
     236             : static inline int
     237          41 : mywstrcasecmp(const char *restrict s1, const uint32_t *restrict s2)
     238             : {
     239          41 :         uint32_t c1 = 0;
     240             : 
     241         329 :         for (;;) {
     242         185 :                 size_t nn1 = utfc8touc(&c1, s1);
     243         185 :                 if (nn1 == 0 || nn1 == (size_t) -1)
     244          22 :                         return -(*s2 != 0);
     245         163 :                 if (*s2 == 0)
     246             :                         return 1;
     247             : #if SIZEOF_WCHAR_T == 2
     248             :                 if (c1 > 0xFFFF || *s2 > 0xFFFF) {
     249             :                         if (c1 != *s2)
     250             :                                 return c1 - *s2;
     251             :                 } else
     252             : #endif
     253         163 :                 if (towlower((wint_t) c1) != towlower((wint_t) * s2))
     254          19 :                         return towlower((wint_t) c1) - towlower((wint_t) * s2);
     255         144 :                 s1 += nn1;
     256         144 :                 s2++;
     257             :         }
     258             : }
     259             : 
     260             : static inline const char *
     261          33 : mywstrcasestr(const char *restrict haystack, const uint32_t *restrict wneedle,
     262             :                           bool atend)
     263             : {
     264          33 :         size_t nlen = myucslen(wneedle);
     265             : 
     266          33 :         if (nlen == 0)
     267           0 :                 return atend ? haystack + strlen(haystack) : haystack;
     268             : 
     269          86 :         while (*haystack) {
     270             :                 size_t i;
     271             :                 size_t h;
     272             :                 size_t step = 0;
     273          83 :                 for (i = h = 0; i < nlen; i++) {
     274          68 :                         uint32_t c = 0;
     275          68 :                         size_t j = utfc8touc(&c, haystack + h);
     276          68 :                         if (j == 0 || j == (size_t) -1)
     277           0 :                                 return NULL;
     278          68 :                         if (i == 0) {
     279          68 :                                 step = j;
     280             :                         }
     281             : #if SIZEOF_WCHAR_T == 2
     282             :                         if (c > 0xFFFF || wneedle[i] > 0xFFFF) {
     283             :                                 if (c != wneedle[i])
     284             :                                         break;
     285             :                         } else
     286             : #endif
     287          68 :                         if (towlower((wint_t) c) != towlower((wint_t) wneedle[i]))
     288             :                                 break;
     289          15 :                         h += j;
     290             :                 }
     291          68 :                 if (i == nlen && (!atend || haystack[h] == 0))
     292          15 :                         return haystack;
     293          53 :                 haystack += step;
     294             :         }
     295             :         return NULL;
     296             : }
     297             : 
     298             : /* returns true if the pattern does not contain unescaped `_' (single
     299             :  * character match) and ends with unescaped `%' (any sequence
     300             :  * match) */
     301             : static inline bool
     302        6216 : re_simple(const char *pat, unsigned char esc)
     303             : {
     304        6216 :         bool escaped = false;
     305             : 
     306        6216 :         if (pat == 0)
     307             :                 return false;
     308        6216 :         if (*pat == '%') {
     309        5444 :                 pat++;
     310             :         }
     311       45538 :         while (*pat) {
     312       39701 :                 if (escaped) {
     313             :                         escaped = false;
     314       39558 :                 } else if ((unsigned char) *pat == esc) {
     315             :                         escaped = true;
     316       39415 :                 } else if (*pat == '_') {
     317             :                         return false;
     318             :                 }
     319       39322 :                 pat++;
     320             :         }
     321             :         return true;
     322             : }
     323             : 
     324             : static inline bool
     325        7084 : re_is_pattern_properly_escaped(const char *pat, unsigned char esc)
     326             : {
     327        7084 :         bool escaped = false;
     328             : 
     329        7084 :         if (pat == 0)
     330             :                 return true;
     331       57898 :         while (*pat) {
     332       50814 :                 if (escaped) {
     333             :                         escaped = false;
     334       50662 :                 } else if ((unsigned char) *pat == esc) {
     335       50814 :                         escaped = true;
     336             :                 }
     337       50814 :                 pat++;
     338             :         }
     339        7084 :         return escaped ? false : true;
     340             : }
     341             : 
     342             : /* returns true if the pattern does not contain wildcard
     343             :  * characters ('%' or '_') and no character is escaped
     344             :  */
     345             : static inline bool
     346        7081 : is_strcmpable(const char *pat, const char *esc)
     347             : {
     348        7081 :         if (pat[strcspn(pat, "%_")])
     349             :                 return false;
     350        1794 :         return strlen(esc) == 0 || strNil(esc) || strstr(pat, esc) == NULL;
     351             : }
     352             : 
     353             : /* Compare two strings ignoring case. When both strings are
     354             :  * lower case this function returns the same result as strcmp.
     355             :  */
     356             : static int
     357         871 : istrcmp(const char *s1, const char *s2)
     358             : {
     359         871 :         char c1, c2;
     360         871 :         const char *p1, *p2;
     361        1298 :         for (p1 = s1, p2 = s2; *p1 && *p2; p1++, p2++) {
     362        1157 :                 c1 = *p1;
     363        1157 :                 c2 = *p2;
     364             : 
     365        1157 :                 if ('A' <= c1 && c1 <= 'Z')
     366         649 :                         c1 += 'a' - 'A';
     367             : 
     368        1157 :                 if ('A' <= c2 && c2 <= 'Z')
     369         697 :                         c2 += 'a' - 'A';
     370             : 
     371        1157 :                 if (c1 != c2)
     372         730 :                         return (c1 - c2);
     373             :         }
     374             : 
     375         141 :         if (*p1 != *p2)
     376          67 :                 return *p1 - *p2;
     377             : 
     378             :         return 0;
     379             : }
     380             : 
     381             : /* Compare at most len characters of two strings ignoring
     382             :  * case. When both strings are lowercase this function
     383             :  * returns the same result as strncmp.
     384             :  */
     385             : static int
     386          16 : istrncmp(const char *s1, const char *s2, size_t len)
     387             : {
     388          16 :         char c1, c2;
     389          16 :         const char *p1, *p2;
     390          16 :         size_t n = 0;
     391             : 
     392          32 :         for (p1 = s1, p2 = s2; *p1 && *p2 && (n < len); p1++, p2++, n++) {
     393          16 :                 c1 = *p1;
     394          16 :                 c2 = *p2;
     395             : 
     396          16 :                 if ('A' <= c1 && c1 <= 'Z')
     397           4 :                         c1 += 'a' - 'A';
     398             : 
     399          16 :                 if ('A' <= c2 && c2 <= 'Z')
     400           0 :                         c2 += 'a' - 'A';
     401             : 
     402          16 :                 if (c1 != c2)
     403           0 :                         return c1 - c2;
     404             :         }
     405             : 
     406          16 :         if (*p1 != *p2 && n < len)
     407           0 :                 return *p1 - *p2;
     408             : 
     409             :         return 0;
     410             : }
     411             : 
     412             : 
     413             : /* Find the first occurence of the substring needle in
     414             :  * haystack ignoring case.
     415             :  *
     416             :  * NOTE: This function assumes that the needle is already
     417             :  * lowercase.
     418             :  */
     419             : static const char *
     420        6746 : istrstr(const char *haystack, const char *needle)
     421             : {
     422        6746 :         const char *ph;
     423        6746 :         const char *pn;
     424        6746 :         const char *p1;
     425        6746 :         bool match = true;
     426             : 
     427      316015 :         for (ph = haystack; *ph; ph++) {
     428      373952 :                 match = true;
     429      373952 :                 for (pn = needle, p1 = ph; *pn && *p1; pn++, p1++) {
     430      371844 :                         char c1 = *pn;
     431      371844 :                         char c2 = ('A' <= *p1 && *p1 <= 'Z') ? *p1 - 'A' + 'a' : *p1;
     432      371844 :                         if (c1 != c2) {
     433             :                                 match = false;
     434             :                                 break;
     435             :                         }
     436             :                 }
     437             : 
     438             :                 /* We reached the end of the haystack, but we still have characters in
     439             :                  * needle. None of the future iterations will match.
     440             :                  */
     441      311377 :                 if (*p1 == 0 && *pn != 0) {
     442             :                         break;
     443             :                 }
     444             : 
     445      311377 :                 if (match) {
     446        2108 :                         return ph;
     447             :                 }
     448             :         }
     449             :         return NULL;
     450             : }
     451             : 
     452             : /* Match regular expression by comparing bytes.
     453             :  *
     454             :  * This is faster than re_match_ignore, because it does not
     455             :  * need to decode characters. This function should be used
     456             :  * in all cases except when we need to perform UTF-8
     457             :  * comparisons ignoring case.
     458             :  *
     459             :  * TODO: The name of the function is no longer accurate and
     460             :  * needs to change.
     461             :  */
     462             : static inline bool
     463      189629 : re_match_no_ignore(const char *restrict s, const struct RE *restrict pattern)
     464             : {
     465      189629 :         const struct RE *r;
     466      189629 :         size_t l;
     467             : 
     468      252293 :         for (r = pattern; r; r = r->n) {
     469      190431 :                 if (*r->k == 0 && (r->search || *s == 0))
     470             :                         return true;
     471      169010 :                 if (!*s ||
     472             :                         (r->search
     473      168937 :                          ? (r->atend
     474      155444 :                                 ? (r->case_ignore
     475        5970 :                                    ? (l = strlen(s)) < r->len || istrcmp(s + l - r->len, r->k) != 0
     476        5886 :                                    : (l = strlen(s)) < r->len || strcmp(s + l - r->len, r->k) != 0)
     477      149474 :                                 : (r->case_ignore ? (s = istrstr(s, r->k)) == NULL
     478      142741 :                                    : (s = strstr(s, r->k)) == NULL))
     479             :                          : (r->atend
     480       13493 :                                 ? (r->case_ignore ? istrcmp(s, r->k) != 0
     481          95 :                                    : strcmp(s, r->k) != 0)
     482       13398 :                                 : (r->case_ignore ? istrncmp(s, r->k, r->len) != 0
     483       13382 :                                    : strncmp(s, r->k, r->len) != 0))))
     484             :                         return false;
     485       62664 :                 s += r->len;
     486             :         }
     487             :         return true;
     488             : }
     489             : 
     490             : /* Match a regular expression by comparing wide characters.
     491             :  *
     492             :  * This needs to be used when we need to perform a
     493             :  * case-ignoring comparions involving UTF-8 characters.
     494             :  */
     495             : static inline bool
     496          44 : re_match_ignore(const char *restrict s, const struct RE *restrict pattern)
     497             : {
     498          44 :         const struct RE *r;
     499             : 
     500             :         /* Since the pattern is ascii, do the cheaper comparison */
     501          44 :         if (pattern->is_ascii) {
     502           0 :                 return re_match_no_ignore(s, pattern);
     503             :         }
     504             : 
     505          66 :         for (r = pattern; r; r = r->n) {
     506          47 :                 if (*r->w == 0 && (r->search || *s == 0))
     507             :                         return true;
     508          47 :                 if (!*s ||
     509             :                         (r->search
     510          47 :                          ? (s = mywstrcasestr(s, r->w, r->atend)) == NULL
     511          14 :                          : !mywstrncaseeq(s, r->w, r->len, r->atend)))
     512             :                         return false;
     513          22 :                 s += r->len;
     514             :         }
     515             :         return true;
     516             : }
     517             : 
     518             : static void
     519        5836 : re_destroy(struct RE *p)
     520             : {
     521        5836 :         if (p) {
     522        5836 :                 GDKfree(p->k);
     523        5837 :                 GDKfree(p->w);
     524        5931 :                 do {
     525        5931 :                         struct RE *n = p->n;
     526             : 
     527        5931 :                         GDKfree(p);
     528        5931 :                         p = n;
     529        5931 :                 } while (p);
     530             :         }
     531        5837 : }
     532             : 
     533             : /* Create a linked list of RE structures.  Depending on the
     534             :  * caseignore and the ascii_pattern flags, the w
     535             :  * (if caseignore == true && ascii_pattern == false) or the k
     536             :  * (in every other case) field is used.  These in the first
     537             :  * structure are allocated, whereas in all subsequent
     538             :  * structures the fields point into the allocated buffer of
     539             :  * the first.
     540             :  */
     541             : static struct RE *
     542        5837 : re_create(const char *pat, bool caseignore, bool ascii_pattern, uint32_t esc)
     543             : {
     544        5837 :         struct RE *r = GDKmalloc(sizeof(struct RE)), *n = r;
     545        5837 :         bool escaped = false;
     546             : 
     547        5837 :         if (r == NULL)
     548             :                 return NULL;
     549        5837 :         *r = (struct RE) {.atend = true };
     550             : 
     551       11165 :         while (esc != '%' && *pat == '%') {
     552        5328 :                 pat++;                                  /* skip % */
     553        5328 :                 r->search = true;
     554             :         }
     555        5837 :         if (caseignore && !ascii_pattern) {
     556          20 :                 uint32_t *wp;
     557          20 :                 uint32_t *wq;
     558          20 :                 wp = utf8stoucs(pat);
     559          20 :                 if (wp == NULL) {
     560           0 :                         GDKfree(r);
     561           0 :                         return NULL;
     562             :                 }
     563          20 :                 r->w = wp;
     564          20 :                 wq = wp;
     565          68 :                 while (*wp) {
     566          48 :                         if (escaped) {
     567           0 :                                 *wq++ = *wp;
     568           0 :                                 n->len++;
     569           0 :                                 escaped = false;
     570          48 :                         } else if (*wp == esc) {
     571             :                                 escaped = true;
     572          48 :                         } else if (*wp == '%') {
     573          16 :                                 n->atend = false;
     574          16 :                                 while (wp[1] == '%')
     575           0 :                                         wp++;
     576          16 :                                 if (wp[1]) {
     577           4 :                                         n = n->n = GDKmalloc(sizeof(struct RE));
     578           4 :                                         if (n == NULL)
     579           0 :                                                 goto bailout;
     580           4 :                                         *n = (struct RE) {
     581             :                                                 .search = true,
     582             :                                                 .atend = true,
     583           4 :                                                 .w = wp + 1,
     584             :                                         };
     585             :                                 }
     586          16 :                                 *wq = 0;
     587          16 :                                 wq = wp + 1;
     588             :                         } else {
     589          32 :                                 *wq++ = *wp;
     590          32 :                                 n->len++;
     591             :                         }
     592          48 :                         wp++;
     593             :                 }
     594          20 :                 *wq = 0;
     595             :         } else {
     596        5817 :                 char *p, *q;
     597        5817 :                 if ((p = GDKstrdup(pat)) == NULL) {
     598           0 :                         GDKfree(r);
     599           0 :                         return NULL;
     600             :                 }
     601        5817 :                 if (ascii_pattern)
     602        5814 :                         n->is_ascii = true;
     603        5817 :                 if (caseignore)
     604          62 :                         n->case_ignore = true;
     605             : 
     606          62 :                 if (ascii_pattern && caseignore) {
     607         586 :                         for (q = p; *q != 0; q++) {
     608         524 :                                 if ('A' <= *q && *q <= 'Z')
     609          21 :                                         *q += 'a' - 'A';
     610             :                         }
     611             :                 }
     612             : 
     613        5817 :                 r->k = p;
     614        5817 :                 q = p;
     615       44175 :                 while (*p) {
     616       38358 :                         if (escaped) {
     617         136 :                                 *q++ = *p;
     618         136 :                                 n->len++;
     619         136 :                                 escaped = false;
     620       38222 :                         } else if ((unsigned char) *p == esc) {
     621             :                                 escaped = true;
     622       38086 :                         } else if (*p == '%') {
     623        5575 :                                 n->atend = false;
     624        5603 :                                 while (p[1] == '%')
     625          28 :                                         p++;
     626        5575 :                                 if (p[1]) {
     627          90 :                                         n = n->n = GDKmalloc(sizeof(struct RE));
     628          90 :                                         if (n == NULL)
     629           0 :                                                 goto bailout;
     630          90 :                                         *n = (struct RE) {
     631             :                                                 .search = true,
     632             :                                                 .atend = true,
     633          90 :                                                 .k = p + 1
     634             :                                         };
     635          90 :                                         if (ascii_pattern) {
     636          87 :                                                 n->is_ascii = true;
     637             :                                         }
     638          90 :                                         if (caseignore) {
     639          16 :                                                 n->case_ignore = true;
     640             :                                         }
     641             :                                 }
     642        5575 :                                 *q = 0;
     643        5575 :                                 q = p + 1;
     644             :                         } else {
     645       32511 :                                 char c = *p;
     646       32511 :                                 if (ascii_pattern && caseignore && 'A' <= c && c <= 'Z') {
     647           0 :                                         c += 'a' - 'A';
     648             :                                 }
     649       32511 :                                 *q++ = c;
     650       32511 :                                 n->len++;
     651             :                         }
     652       38358 :                         p++;
     653             :                 }
     654        5817 :                 *q = 0;
     655             :         }
     656             :         return r;
     657           0 :   bailout:
     658           0 :         re_destroy(r);
     659           0 :         return NULL;
     660             : }
     661             : 
     662             : #ifdef HAVE_LIBPCRE
     663             : static str
     664          25 : pcre_compile_wrap(pcre **res, const char *pattern, bit insensitive)
     665             : {
     666          25 :         pcre *r;
     667          25 :         const char *err_p = NULL;
     668          25 :         int errpos = 0;
     669          25 :         int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE;
     670          25 :         if (insensitive)
     671           0 :                 options |= PCRE_CASELESS;
     672             : 
     673          25 :         if ((r = pcre_compile(pattern, options, &err_p, &errpos, NULL)) == NULL) {
     674           0 :                 throw(MAL, "pcre.compile", OPERATION_FAILED
     675             :                           " with\n'%s'\nat %d in\n'%s'.\n", err_p, errpos, pattern);
     676             :         }
     677          25 :         *res = r;
     678          25 :         return MAL_SUCCEED;
     679             : }
     680             : #endif
     681             : 
/* maximum number of back references and quoted \ or $ in replacement string */
#define MAX_NR_REFS             20

/* One special sequence found in a replacement string by
 * parse_replacement(): either a back reference ($N or \N) or a doubled
 * $ or \. */
struct backref {
        int idx;                /* number following the $ or \; INT_MAX for a doubled $ or \ */
        int start;              /* offset in the replacement string of the leading $ or \ */
        int end;                /* offset in the replacement string where copying resumes */
};
     690             : 
     691             : #ifdef HAVE_LIBPCRE
     692             : /* fill in parameter backrefs (length maxrefs) with information about
     693             :  * back references in the replacement string; a back reference is a
     694             :  * dollar or backslash followed by a number */
     695             : static int
     696          60 : parse_replacement(const char *replacement, int len_replacement,
     697             :                                   struct backref *backrefs, int maxrefs)
     698             : {
     699          60 :         int nbackrefs = 0;
     700             : 
     701         108 :         for (int i = 0; i < len_replacement && nbackrefs < maxrefs; i++) {
     702          48 :                 if (replacement[i] == '$' || replacement[i] == '\\') {
     703           6 :                         char *endptr;
     704           6 :                         backrefs[nbackrefs].idx = strtol(replacement + i + 1, &endptr, 10);
     705           6 :                         if (endptr > replacement + i + 1) {
     706           6 :                                 int k = (int) (endptr - (replacement + i + 1));
     707           6 :                                 backrefs[nbackrefs].start = i;
     708           6 :                                 backrefs[nbackrefs].end = i + k + 1;
     709           6 :                                 nbackrefs++;
     710           0 :                         } else if (replacement[i] == replacement[i + 1]) {
     711             :                                 /* doubled $ or \, we must copy just one to the output */
     712           0 :                                 backrefs[nbackrefs].idx = INT_MAX;      /* impossible value > 0 */
     713           0 :                                 backrefs[nbackrefs].start = i;
     714           0 :                                 backrefs[nbackrefs].end = i + 1;
     715           0 :                                 i++;                    /* don't look at second $ or \ again */
     716           0 :                                 nbackrefs++;
     717             :                         }
     718             :                         /* else: $ or \ followed by something we don't recognize,
     719             :                          * so just leave it */
     720             :                 }
     721             :         }
     722          60 :         return nbackrefs;
     723             : }
     724             : 
     725             : static char *
     726       30269 : single_replace(pcre *pcre_code, pcre_extra *extra,
     727             :                            const char *origin_str, int len_origin_str,
     728             :                            int exec_options, int *ovector, int ovecsize,
     729             :                            const char *replacement, int len_replacement,
     730             :                            struct backref *backrefs, int nbackrefs,
     731             :                            bool global, char *result, int *max_result)
     732             : {
     733       30269 :         int offset = 0;
     734       30269 :         int len_result = 0;
     735      108913 :         int addlen;
     736      108913 :         char *tmp;
     737             : 
     738      108913 :         do {
     739      108913 :                 int j = pcre_exec(pcre_code, extra, origin_str, len_origin_str, offset,
     740             :                                                   exec_options, ovector, ovecsize);
     741      108968 :                 if (j <= 0)
     742             :                         break;
     743       80729 :                 addlen = ovector[0] - offset + (nbackrefs == 0 ? len_replacement : 0);
     744       80729 :                 if (len_result + addlen >= *max_result) {
     745        6892 :                         tmp = GDKrealloc(result, len_result + addlen + 1);
     746        6892 :                         if (tmp == NULL) {
     747           0 :                                 GDKfree(result);
     748           0 :                                 return NULL;
     749             :                         }
     750        6892 :                         result = tmp;
     751        6892 :                         *max_result = len_result + addlen + 1;
     752             :                 }
     753       80729 :                 if (ovector[0] > offset) {
     754       78643 :                         strncpy(result + len_result, origin_str + offset,
     755       78643 :                                         ovector[0] - offset);
     756       78643 :                         len_result += ovector[0] - offset;
     757             :                 }
     758       80729 :                 if (nbackrefs == 0) {
     759       78647 :                         strncpy(result + len_result, replacement, len_replacement);
     760       78647 :                         len_result += len_replacement;
     761             :                 } else {
     762             :                         int prevend = 0;
     763        4164 :                         for (int i = 0; i < nbackrefs; i++) {
     764        2082 :                                 int off, len;
     765        2082 :                                 if (backrefs[i].idx >= ovecsize / 3) {
     766             :                                         /* out of bounds, replace with empty string */
     767             :                                         off = 0;
     768             :                                         len = 0;
     769             :                                 } else {
     770        2082 :                                         off = ovector[backrefs[i].idx * 2];
     771        2082 :                                         len = ovector[backrefs[i].idx * 2 + 1] - off;
     772             :                                 }
     773        2082 :                                 addlen = backrefs[i].start - prevend + len;
     774        2082 :                                 if (len_result + addlen >= *max_result) {
     775          19 :                                         tmp = GDKrealloc(result, len_result + addlen + 1);
     776          19 :                                         if (tmp == NULL) {
     777           0 :                                                 GDKfree(result);
     778           0 :                                                 return NULL;
     779             :                                         }
     780          19 :                                         result = tmp;
     781          19 :                                         *max_result = len_result + addlen + 1;
     782             :                                 }
     783        2082 :                                 if (backrefs[i].start > prevend) {
     784           2 :                                         strncpy(result + len_result, replacement + prevend,
     785           2 :                                                         backrefs[i].start - prevend);
     786           2 :                                         len_result += backrefs[i].start - prevend;
     787             :                                 }
     788        2082 :                                 if (len > 0) {
     789        2082 :                                         strncpy(result + len_result, origin_str + off, len);
     790        2082 :                                         len_result += len;
     791             :                                 }
     792        2082 :                                 prevend = backrefs[i].end;
     793             :                         }
     794             :                         /* copy rest of replacement string (after last backref) */
     795        2082 :                         addlen = len_replacement - prevend;
     796        2082 :                         if (addlen > 0) {
     797           2 :                                 if (len_result + addlen >= *max_result) {
     798           1 :                                         tmp = GDKrealloc(result, len_result + addlen + 1);
     799           1 :                                         if (tmp == NULL) {
     800           0 :                                                 GDKfree(result);
     801           0 :                                                 return NULL;
     802             :                                         }
     803           1 :                                         result = tmp;
     804           1 :                                         *max_result = len_result + addlen + 1;
     805             :                                 }
     806           2 :                                 strncpy(result + len_result, replacement + prevend, addlen);
     807           2 :                                 len_result += addlen;
     808             :                         }
     809             :                 }
     810       80729 :                 offset = ovector[1];
     811       80729 :         } while (offset < len_origin_str && global);
     812       30324 :         if (offset < len_origin_str) {
     813       28239 :                 addlen = len_origin_str - offset;
     814       28239 :                 if (len_result + addlen >= *max_result) {
     815         314 :                         tmp = GDKrealloc(result, len_result + addlen + 1);
     816         314 :                         if (tmp == NULL) {
     817           0 :                                 GDKfree(result);
     818           0 :                                 return NULL;
     819             :                         }
     820         314 :                         result = tmp;
     821         314 :                         *max_result = len_result + addlen + 1;
     822             :                 }
     823       28239 :                 strncpy(result + len_result, origin_str + offset, addlen);
     824       28239 :                 len_result += addlen;
     825             :         }
     826             :         /* null terminate string */
     827       30324 :         result[len_result] = '\0';
     828       30324 :         return result;
     829             : }
     830             : #endif
     831             : 
     832             : static str
     833          10 : pcre_replace(str *res, const char *origin_str, const char *pattern,
     834             :                          const char *replacement, const char *flags, bool global)
     835             : {
     836             : #ifdef HAVE_LIBPCRE
     837          10 :         const char *err_p = NULL;
     838          10 :         pcre *pcre_code = NULL;
     839          10 :         pcre_extra *extra;
     840          10 :         char *tmpres;
     841          10 :         int max_result;
     842          10 :         int i, errpos = 0;
     843          10 :         int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
     844          10 :         int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
     845          10 :         int *ovector, ovecsize;
     846          10 :         int len_origin_str = (int) strlen(origin_str);
     847          10 :         int len_replacement = (int) strlen(replacement);
     848          10 :         struct backref backrefs[MAX_NR_REFS];
     849          10 :         int nbackrefs = 0;
     850             : 
     851          14 :         while (*flags) {
     852           4 :                 switch (*flags) {
     853             :                 case 'e':
     854             :                         exec_options &= ~PCRE_NOTEMPTY;
     855             :                         break;
     856           1 :                 case 'i':
     857           1 :                         compile_options |= PCRE_CASELESS;
     858           1 :                         break;
     859           1 :                 case 'm':
     860           1 :                         compile_options |= PCRE_MULTILINE;
     861           1 :                         break;
     862           1 :                 case 's':
     863           1 :                         compile_options |= PCRE_DOTALL;
     864           1 :                         break;
     865           1 :                 case 'x':
     866           1 :                         compile_options |= PCRE_EXTENDED;
     867           1 :                         break;
     868           0 :                 default:
     869           0 :                         throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     870             :                                   ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
     871             :                                   *flags);
     872             :                 }
     873           4 :                 flags++;
     874             :         }
     875             : 
     876          10 :         if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
     877           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     878             :                           OPERATION_FAILED
     879             :                           ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
     880             :                           pattern, errpos, err_p);
     881             :         }
     882             : 
     883             :         /* Since the compiled pattern is going to be used several times, it is
     884             :          * worth spending more time analyzing it in order to speed up the time
     885             :          * taken for matching.
     886             :          */
     887          10 :         extra = pcre_study(pcre_code, 0, &err_p);
     888          10 :         if (err_p != NULL) {
     889           0 :                 pcre_free(pcre_code);
     890           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     891             :                           OPERATION_FAILED
     892             :                           ": pcre study of pattern (%s) failed with '%s'.\n", pattern,
     893             :                           err_p);
     894             :         }
     895          10 :         pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
     896          10 :         ovecsize = (i + 1) * 3;
     897          10 :         if ((ovector = (int *) GDKmalloc(sizeof(int) * ovecsize)) == NULL) {
     898           0 :                 pcre_free_study(extra);
     899           0 :                 pcre_free(pcre_code);
     900           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     901             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     902             :         }
     903             : 
     904             :         /* identify back references in the replacement string */
     905          10 :         nbackrefs = parse_replacement(replacement, len_replacement,
     906             :                                                                   backrefs, MAX_NR_REFS);
     907             : 
     908          10 :         max_result = len_origin_str + 1;
     909          10 :         tmpres = GDKmalloc(max_result);
     910          10 :         if (tmpres == NULL) {
     911           0 :                 GDKfree(ovector);
     912           0 :                 pcre_free_study(extra);
     913           0 :                 pcre_free(pcre_code);
     914           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     915             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     916             :         }
     917             : 
     918          10 :         tmpres = single_replace(pcre_code, extra, origin_str, len_origin_str,
     919             :                                                         exec_options, ovector, ovecsize, replacement,
     920             :                                                         len_replacement, backrefs, nbackrefs, global,
     921             :                                                         tmpres, &max_result);
     922          10 :         GDKfree(ovector);
     923          10 :         pcre_free_study(extra);
     924          10 :         pcre_free(pcre_code);
     925          10 :         if (tmpres == NULL)
     926           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     927             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     928             : 
     929          10 :         *res = tmpres;
     930          10 :         return MAL_SUCCEED;
     931             : #else
     932             :         (void) res;
     933             :         (void) origin_str;
     934             :         (void) pattern;
     935             :         (void) replacement;
     936             :         (void) flags;
     937             :         (void) global;
     938             :         throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     939             :                   "Database was compiled without PCRE support.");
     940             : #endif
     941             : }
     942             : 
     943             : static str
     944          50 : pcre_replace_bat(BAT **res, BAT *origin_strs, const char *pattern,
     945             :                                  const char *replacement, const char *flags, bool global)
     946             : {
     947             : #ifdef HAVE_LIBPCRE
     948          50 :         const char *err_p = NULL;
     949          50 :         char *tmpres;
     950          50 :         int i, errpos = 0;
     951          50 :         int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
     952          50 :         int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
     953          50 :         pcre *pcre_code = NULL;
     954          50 :         pcre_extra *extra;
     955          50 :         BAT *tmpbat;
     956          50 :         BUN p, q;
     957          50 :         int *ovector, ovecsize;
     958          50 :         int len_replacement = (int) strlen(replacement);
     959          50 :         struct backref backrefs[MAX_NR_REFS];
     960          50 :         int nbackrefs = 0;
     961          50 :         const char *origin_str;
     962          50 :         int max_dest_size = 0;
     963             : 
     964          70 :         while (*flags) {
     965          20 :                 switch (*flags) {
     966             :                 case 'e':
     967             :                         exec_options &= ~PCRE_NOTEMPTY;
     968             :                         break;
     969           5 :                 case 'i':
     970           5 :                         compile_options |= PCRE_CASELESS;
     971           5 :                         break;
     972          10 :                 case 'm':
     973          10 :                         compile_options |= PCRE_MULTILINE;
     974          10 :                         break;
     975           5 :                 case 's':
     976           5 :                         compile_options |= PCRE_DOTALL;
     977           5 :                         break;
     978           0 :                 case 'x':
     979           0 :                         compile_options |= PCRE_EXTENDED;
     980           0 :                         break;
     981           0 :                 default:
     982           0 :                         throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     983             :                                   ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
     984             :                                   *flags);
     985             :                 }
     986          20 :                 flags++;
     987             :         }
     988             : 
     989          50 :         if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
     990           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     991             :                           OPERATION_FAILED
     992             :                           ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
     993             :                           pattern, errpos, err_p);
     994             :         }
     995             : 
     996             :         /* Since the compiled pattern is going to be used several times,
     997             :          * it is worth spending more time analyzing it in order to speed
     998             :          * up the time taken for matching.
     999             :          */
    1000         100 :         extra = pcre_study(pcre_code,
    1001          50 :                                            BATcount(origin_strs) >
    1002             :                                            JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &err_p);
    1003          50 :         if (err_p != NULL) {
    1004           0 :                 pcre_free(pcre_code);
    1005           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
    1006             :                           OPERATION_FAILED);
    1007             :         }
    1008          50 :         pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
    1009          50 :         ovecsize = (i + 1) * 3;
    1010          50 :         if ((ovector = (int *) GDKzalloc(sizeof(int) * ovecsize)) == NULL) {
    1011           0 :                 pcre_free_study(extra);
    1012           0 :                 pcre_free(pcre_code);
    1013           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
    1014             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1015             :         }
    1016             : 
    1017             :         /* identify back references in the replacement string */
    1018          50 :         nbackrefs = parse_replacement(replacement, len_replacement,
    1019             :                                                                   backrefs, MAX_NR_REFS);
    1020             : 
    1021          50 :         tmpbat = COLnew(origin_strs->hseqbase, TYPE_str, BATcount(origin_strs),
    1022             :                                         TRANSIENT);
    1023             : 
    1024             :         /* the buffer for all destination strings is allocated only once,
    1025             :          * and extended when needed */
    1026          50 :         max_dest_size = len_replacement + 1;
    1027          50 :         tmpres = GDKmalloc(max_dest_size);
    1028          50 :         if (tmpbat == NULL || tmpres == NULL) {
    1029           0 :                 pcre_free_study(extra);
    1030           0 :                 pcre_free(pcre_code);
    1031           0 :                 GDKfree(ovector);
    1032           0 :                 BBPreclaim(tmpbat);
    1033           0 :                 GDKfree(tmpres);
    1034           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
    1035             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1036             :         }
    1037          50 :         BATiter origin_strsi = bat_iterator(origin_strs);
    1038       30232 :         BATloop(origin_strs, p, q) {
    1039       30182 :                 origin_str = BUNtvar(origin_strsi, p);
    1040       60483 :                 tmpres = single_replace(pcre_code, extra, origin_str,
    1041       30182 :                                                                 (int) strlen(origin_str), exec_options,
    1042             :                                                                 ovector, ovecsize, replacement,
    1043             :                                                                 len_replacement, backrefs, nbackrefs, global,
    1044             :                                                                 tmpres, &max_dest_size);
    1045       30301 :                 if (tmpres == NULL || BUNappend(tmpbat, tmpres, false) != GDK_SUCCEED) {
    1046           0 :                         bat_iterator_end(&origin_strsi);
    1047           0 :                         pcre_free_study(extra);
    1048           0 :                         pcre_free(pcre_code);
    1049           0 :                         GDKfree(ovector);
    1050           0 :                         GDKfree(tmpres);
    1051           0 :                         BBPreclaim(tmpbat);
    1052           0 :                         throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
    1053             :                                   SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1054             :                 }
    1055             :         }
    1056          50 :         bat_iterator_end(&origin_strsi);
    1057          50 :         pcre_free_study(extra);
    1058          50 :         pcre_free(pcre_code);
    1059          50 :         GDKfree(ovector);
    1060          50 :         GDKfree(tmpres);
    1061          50 :         *res = tmpbat;
    1062          50 :         return MAL_SUCCEED;
    1063             : #else
    1064             :         (void) res;
    1065             :         (void) origin_strs;
    1066             :         (void) pattern;
    1067             :         (void) replacement;
    1068             :         (void) flags;
    1069             :         (void) global;
    1070             :         throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
    1071             :                   "Database was compiled without PCRE support.");
    1072             : #endif
    1073             : }
    1074             : 
    1075             : static str
    1076         130 : pcre_match_with_flags(bit *ret, const char *val, const char *pat,
    1077             :                                           const char *flags)
    1078             : {
    1079         130 :         int pos;
    1080             : #ifdef HAVE_LIBPCRE
    1081         130 :         const char *err_p = NULL;
    1082         130 :         int errpos = 0;
    1083         130 :         int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
    1084         130 :         pcre *re;
    1085             : #else
    1086             :         int options = REG_NOSUB;
    1087             :         regex_t re;
    1088             :         int errcode;
    1089             :         int retval;
    1090             : #endif
    1091             : 
    1092         260 :         while (*flags) {
    1093         130 :                 switch (*flags) {
    1094           6 :                 case 'i':
    1095             : #ifdef HAVE_LIBPCRE
    1096           6 :                         options |= PCRE_CASELESS;
    1097             : #else
    1098             :                         options |= REG_ICASE;
    1099             : #endif
    1100           6 :                         break;
    1101           0 :                 case 'm':
    1102             : #ifdef HAVE_LIBPCRE
    1103           0 :                         options |= PCRE_MULTILINE;
    1104             : #else
    1105             :                         options |= REG_NEWLINE;
    1106             : #endif
    1107           0 :                         break;
    1108             : #ifdef HAVE_LIBPCRE
    1109         124 :                 case 's':
    1110         124 :                         options |= PCRE_DOTALL;
    1111         124 :                         break;
    1112             : #endif
    1113           0 :                 case 'x':
    1114             : #ifdef HAVE_LIBPCRE
    1115           0 :                         options |= PCRE_EXTENDED;
    1116             : #else
    1117             :                         options |= REG_EXTENDED;
    1118             : #endif
    1119           0 :                         break;
    1120           0 :                 default:
    1121           0 :                         throw(MAL, "pcre.match", ILLEGAL_ARGUMENT
    1122             :                                   ": unsupported flag character '%c'\n", *flags);
    1123             :                 }
    1124         130 :                 flags++;
    1125             :         }
    1126         130 :         if (strNil(val)) {
    1127           0 :                 *ret = FALSE;
    1128           0 :                 return MAL_SUCCEED;
    1129             :         }
    1130             : 
    1131             : #ifdef HAVE_LIBPCRE
    1132         130 :         if ((re = pcre_compile(pat, options, &err_p, &errpos, NULL)) == NULL)
    1133             : #else
    1134             :         if ((errcode = regcomp(&re, pat, options)) != 0)
    1135             : #endif
    1136             :         {
    1137           0 :                 throw(MAL, "pcre.match", OPERATION_FAILED
    1138             :                           ": compilation of regular expression (%s) failed "
    1139             : #ifdef HAVE_LIBPCRE
    1140             :                           "at %d with '%s'", pat, errpos, err_p
    1141             : #else
    1142             :                           , pat
    1143             : #endif
    1144             :                                 );
    1145             :         }
    1146             : #ifdef HAVE_LIBPCRE
    1147         130 :         pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, PCRE_NO_UTF8_CHECK,
    1148             :                                         NULL, 0);
    1149         130 :         pcre_free(re);
    1150             : #else
    1151             :         retval = regexec(&re, val, (size_t) 0, NULL, 0);
    1152             :         pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
    1153             :         regfree(&re);
    1154             : #endif
    1155         130 :         if (pos >= 0)
    1156          46 :                 *ret = TRUE;
    1157          84 :         else if (pos == -1)
    1158          84 :                 *ret = FALSE;
    1159             :         else
    1160           0 :                 throw(MAL, "pcre.match", OPERATION_FAILED
    1161             :                           ": matching of regular expression (%s) failed with %d", pat, pos);
    1162             :         return MAL_SUCCEED;
    1163             : }
    1164             : 
    1165             : #ifdef HAVE_LIBPCRE
    1166             : /* special characters in PCRE that need to be escaped */
    1167             : static const char *pcre_specials = ".+?*()[]{}|^$\\";
    1168             : #else
    1169             : /* special characters in POSIX basic regular expressions that need to
    1170             :  * be escaped */
    1171             : static const char *pcre_specials = "^.[$()|*+?{\\";
    1172             : #endif
    1173             : 
    1174             : /* change SQL LIKE pattern into PCRE pattern */
    1175             : static str
    1176         385 : sql2pcre(str *r, const char *pat, const char *esc_str)
    1177             : {
    1178         385 :         int escaped = 0;
    1179         385 :         int hasWildcard = 0;
    1180         385 :         char *ppat;
    1181         770 :         int esc = strNil(esc_str) ? 0 : esc_str[0];     /* should change to utf8_convert() */
    1182         385 :         int specials;
    1183         385 :         int c;
    1184             : 
    1185         385 :         if (strlen(esc_str) > 1)
    1186           0 :                 throw(MAL, "pcre.sql2pcre",
    1187             :                           SQLSTATE(22019) ILLEGAL_ARGUMENT
    1188             :                           ": ESCAPE string must have length 1");
    1189         385 :         if (pat == NULL)
    1190           0 :                 throw(MAL, "pcre.sql2pcre",
    1191             :                           SQLSTATE(22019) ILLEGAL_ARGUMENT
    1192             :                           ": (I)LIKE pattern must not be NULL");
    1193         385 :         ppat = GDKmalloc(strlen(pat) * 3 +
    1194             :                                          3 /* 3 = "^'the translated regexp'$0" */ );
    1195         385 :         if (ppat == NULL)
    1196           0 :                 throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1197             : 
    1198         385 :         *r = ppat;
    1199             :         /* The escape character can be a char which is special in a PCRE
    1200             :          * expression.  If the user used the "+" char as escape and has "++"
    1201             :          * in their pattern, then replacing this with "+" is not correct and
    1202             :          * should be "\+" instead. */
    1203         385 :         specials = (esc && strchr(pcre_specials, esc) != NULL);
    1204             : 
    1205         385 :         *ppat++ = '^';
    1206        2927 :         while ((c = *pat++) != 0) {
    1207        2542 :                 if (c == esc) {
    1208          15 :                         if (escaped) {
    1209           1 :                                 if (specials) { /* change ++ into \+ */
    1210           1 :                                         *ppat++ = esc;
    1211             :                                 } else {                /* do not escape simple escape symbols */
    1212           0 :                                         ppat[-1] = esc; /* overwrite backslash */
    1213             :                                 }
    1214             :                                 escaped = 0;
    1215             :                         } else {
    1216          14 :                                 *ppat++ = '\\';
    1217          14 :                                 escaped = 1;
    1218             :                         }
    1219             :                         hasWildcard = 1;
    1220        2527 :                 } else if (strchr(pcre_specials, c) != NULL) {
    1221             :                         /* escape PCRE special chars, avoid double backslash if the
    1222             :                          * user uses an invalid escape sequence */
    1223          28 :                         if (!escaped)
    1224          28 :                                 *ppat++ = '\\';
    1225          28 :                         *ppat++ = c;
    1226          28 :                         hasWildcard = 1;
    1227          28 :                         escaped = 0;
    1228        2499 :                 } else if (c == '%' && !escaped) {
    1229         317 :                         *ppat++ = '.';
    1230         317 :                         *ppat++ = '*';
    1231         317 :                         *ppat++ = '?';
    1232         317 :                         hasWildcard = 1;
    1233             :                         /* collapse multiple %, but only if it isn't the escape */
    1234         317 :                         if (esc != '%')
    1235         317 :                                 while (*pat == '%')
    1236           0 :                                         pat++;
    1237        2182 :                 } else if (c == '_' && !escaped) {
    1238         492 :                         *ppat++ = '.';
    1239         492 :                         hasWildcard = 1;
    1240             :                 } else {
    1241        1690 :                         if (escaped) {
    1242          13 :                                 ppat[-1] = c;   /* overwrite backslash of invalid escape */
    1243             :                         } else {
    1244        1677 :                                 *ppat++ = c;
    1245             :                         }
    1246             :                         escaped = 0;
    1247             :                 }
    1248             :         }
    1249             :         /* no wildcard or escape character at end of string */
    1250         385 :         if (!hasWildcard || escaped) {
    1251           1 :                 GDKfree(*r);
    1252           1 :                 *r = NULL;
    1253           1 :                 if (escaped)
    1254           0 :                         throw(MAL, "pcre.sql2pcre",
    1255             :                                   SQLSTATE(22019) ILLEGAL_ARGUMENT
    1256             :                                   ": (I)LIKE pattern must not end with escape character");
    1257           1 :                 *r = GDKstrdup(str_nil);
    1258           1 :                 if (*r == NULL)
    1259           0 :                         throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1260             :         } else {
    1261         384 :                 *ppat++ = '$';
    1262         384 :                 *ppat = 0;
    1263             :         }
    1264             :         return MAL_SUCCEED;
    1265             : }
    1266             : 
    1267             : #ifdef HAVE_LIBPCRE
    1268             : /* change SQL PATINDEX pattern into PCRE pattern */
    1269             : static str
    1270          25 : pat2pcre(str *r, const char *pat)
    1271             : {
    1272          25 :         size_t len = strlen(pat);
    1273          25 :         char *ppat = GDKmalloc(len * 2 + 3 /* 3 = "^'the translated regexp'$0" */ );
    1274          25 :         int start = 0;
    1275             : 
    1276          25 :         if (ppat == NULL)
    1277           0 :                 throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1278          25 :         *r = ppat;
    1279          77 :         while (*pat) {
    1280          52 :                 int c = *pat++;
    1281             : 
    1282          52 :                 if (strchr(pcre_specials, c) != NULL) {
    1283          17 :                         *ppat++ = '\\';
    1284          17 :                         *ppat++ = c;
    1285          35 :                 } else if (c == '%') {
    1286           3 :                         if (start && *pat) {
    1287           0 :                                 *ppat++ = '.';
    1288           0 :                                 *ppat++ = '*';
    1289             :                         }
    1290           3 :                         start++;
    1291          32 :                 } else if (c == '_') {
    1292           0 :                         *ppat++ = '.';
    1293             :                 } else {
    1294          32 :                         *ppat++ = c;
    1295             :                 }
    1296             :         }
    1297          25 :         *ppat = 0;
    1298          25 :         return MAL_SUCCEED;
    1299             : }
    1300             : #endif
    1301             : 
    1302             : /*
    1303             :  * @+ Wrapping
    1304             :  */
    1305             : 
    1306             : static str
    1307          10 : PCREreplace_wrap(str *res, const str *or, const str *pat, const str *repl,
    1308             :                                  const str *flags)
    1309             : {
    1310          10 :         return pcre_replace(res, *or, *pat, *repl, *flags, true);
    1311             : }
    1312             : 
    1313             : static str
    1314           0 : PCREreplacefirst_wrap(str *res, const str *or, const str *pat, const str *repl,
    1315             :                                           const str *flags)
    1316             : {
    1317           0 :         return pcre_replace(res, *or, *pat, *repl, *flags, false);
    1318             : }
    1319             : 
    1320             : static str
    1321          50 : PCREreplace_bat_wrap(bat *res, const bat *bid, const str *pat, const str *repl,
    1322             :                                          const str *flags)
    1323             : {
    1324          50 :         BAT *b, *bn = NULL;
    1325          50 :         str msg;
    1326          50 :         if ((b = BATdescriptor(*bid)) == NULL)
    1327           0 :                 throw(MAL, "batpcre.replace", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1328             : 
    1329          50 :         msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, true);
    1330          50 :         if (msg == MAL_SUCCEED) {
    1331          50 :                 *res = bn->batCacheid;
    1332          50 :                 BBPkeepref(bn);
    1333             :         }
    1334          50 :         BBPunfix(b->batCacheid);
    1335          50 :         return msg;
    1336             : }
    1337             : 
    1338             : static str
    1339           0 : PCREreplacefirst_bat_wrap(bat *res, const bat *bid, const str *pat,
    1340             :                                                   const str *repl, const str *flags)
    1341             : {
    1342           0 :         BAT *b, *bn = NULL;
    1343           0 :         str msg;
    1344           0 :         if ((b = BATdescriptor(*bid)) == NULL)
    1345           0 :                 throw(MAL, "batpcre.replace_first", RUNTIME_OBJECT_MISSING);
    1346             : 
    1347           0 :         msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, false);
    1348           0 :         if (msg == MAL_SUCCEED) {
    1349           0 :                 *res = bn->batCacheid;
    1350           0 :                 BBPkeepref(bn);
    1351             :         }
    1352           0 :         BBPunfix(b->batCacheid);
    1353           0 :         return msg;
    1354             : }
    1355             : 
    1356             : static str
    1357         124 : PCREmatch(bit *ret, const str *val, const str *pat)
    1358             : {
    1359           4 :         return pcre_match_with_flags(ret, *val, *pat,
    1360             : #ifdef HAVE_LIBPCRE
    1361             :                                                                  "s"
    1362             : #else
    1363             :                                                                  "x"
    1364             : #endif
    1365             :                         );
    1366             : }
    1367             : 
    1368             : static str
    1369           6 : PCREimatch(bit *ret, const str *val, const str *pat)
    1370             : {
    1371           0 :         return pcre_match_with_flags(ret, *val, *pat, "i"
    1372             : #ifndef HAVE_LIBPCRE
    1373             :                                                                  "x"
    1374             : #endif
    1375             :                         );
    1376             : }
    1377             : 
    1378             : static str
    1379          25 : PCREindex(int *res, const pcre *pattern, const str *s)
    1380             : {
    1381             : #ifdef HAVE_LIBPCRE
    1382          25 :         int v[3];
    1383             : 
    1384          25 :         v[0] = v[1] = *res = 0;
    1385          25 :         if (pcre_exec(pattern, NULL, *s, (int) strlen(*s), 0,
    1386             :                                   PCRE_NO_UTF8_CHECK, v, 3) >= 0) {
    1387          23 :                 *res = v[1];
    1388             :         }
    1389          25 :         return MAL_SUCCEED;
    1390             : #else
    1391             :         (void) res;
    1392             :         (void) pattern;
    1393             :         (void) s;
    1394             :         throw(MAL, "pcre.index", "Database was compiled without PCRE support.");
    1395             : #endif
    1396             : }
    1397             : 
    1398             : static str
    1399          27 : PCREpatindex(int *ret, const str *pat, const str *val)
    1400             : {
    1401             : #ifdef HAVE_LIBPCRE
    1402          27 :         pcre *re = NULL;
    1403          27 :         char *ppat = NULL, *msg;
    1404             : 
    1405          53 :         if (strNil(*pat) || strNil(*val)) {
    1406           2 :                 *ret = int_nil;
    1407           2 :                 return MAL_SUCCEED;
    1408             :         }
    1409             : 
    1410          25 :         if ((msg = pat2pcre(&ppat, *pat)) != MAL_SUCCEED)
    1411             :                 return msg;
    1412          25 :         if ((msg = pcre_compile_wrap(&re, ppat, FALSE)) != MAL_SUCCEED) {
    1413           0 :                 GDKfree(ppat);
    1414           0 :                 return msg;
    1415             :         }
    1416          25 :         GDKfree(ppat);
    1417          25 :         msg = PCREindex(ret, re, val);
    1418          25 :         pcre_free(re);
    1419          25 :         return msg;
    1420             : #else
    1421             :         (void) ret;
    1422             :         (void) pat;
    1423             :         (void) val;
    1424             :         throw(MAL, "pcre.patindex", "Database was compiled without PCRE support.");
    1425             : #endif
    1426             : }
    1427             : 
    1428             : static str
    1429           0 : PCREquote(str *ret, const str *val)
    1430             : {
    1431           0 :         char *p;
    1432           0 :         const char *s = *val;
    1433             : 
    1434           0 :         *ret = p = GDKmalloc(strlen(s) * 2 + 1);        /* certainly long enough */
    1435           0 :         if (p == NULL)
    1436           0 :                 throw(MAL, "pcre.quote", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1437             :         /* quote all non-alphanumeric ASCII characters (i.e. leave
    1438             :            non-ASCII and alphanumeric alone) */
    1439           0 :         while (*s) {
    1440           0 :                 if (!((*s & 0x80) != 0 ||
    1441           0 :                           ('a' <= *s && *s <= 'z') ||
    1442           0 :                           ('A' <= *s && *s <= 'Z') || isdigit((unsigned char) *s)))
    1443           0 :                         *p++ = '\\';
    1444           0 :                 *p++ = *s++;
    1445             :         }
    1446           0 :         *p = 0;
    1447           0 :         return MAL_SUCCEED;
    1448             : }
    1449             : 
    1450             : static str
    1451           6 : PCREsql2pcre(str *ret, const str *pat, const str *esc)
    1452             : {
    1453           6 :         return sql2pcre(ret, *pat, *esc);
    1454             : }
    1455             : 
    1456             : static bool
    1457        7562 : is_ascii_str(const char *pat)
    1458             : {
    1459        7562 :         size_t len = strlen(pat);
    1460       57350 :         for (size_t i = 0; i < len; i++) {
    1461       50436 :                 if (pat[i] & 0x80)
    1462             :                         return false;
    1463             :         }
    1464             : 
    1465             :         return true;
    1466             : }
    1467             : 
    1468             : static inline str
    1469        7562 : choose_like_path(char **ppat, bool *use_re, bool *use_strcmp, bool *empty,
    1470             :                                  bool *ascii_pattern, const char *pat, const char *esc)
    1471             : {
    1472        7562 :         str res = MAL_SUCCEED;
    1473        7562 :         *use_re = false;
    1474        7562 :         *use_strcmp = false;
    1475        7562 :         *empty = false;
    1476             : 
    1477             : 
    1478        7562 :         *ascii_pattern = is_ascii_str(pat);
    1479             : 
    1480       14649 :         if (strNil(pat) || strNil(esc)) {
    1481         475 :                 *empty = true;
    1482             :         } else {
    1483        7087 :                 if (!re_is_pattern_properly_escaped(pat, (unsigned char) *esc))
    1484           5 :                         throw(MAL, "pcre.sql2pcre",
    1485             :                                   SQLSTATE(22019) ILLEGAL_ARGUMENT
    1486             :                                   ": (I)LIKE pattern must not end with escape character");
    1487        7081 :                 if (is_strcmpable(pat, esc)) {
    1488         865 :                         *use_re = true;
    1489         865 :                         *use_strcmp = true;
    1490        6216 :                 } else if (re_simple(pat, (unsigned char) *esc)) {
    1491        5836 :                         *use_re = true;
    1492             :                 } else {
    1493         379 :                         if ((res = sql2pcre(ppat, pat, esc)) != MAL_SUCCEED)
    1494             :                                 return res;
    1495         379 :                         if (strNil(*ppat)) {
    1496           0 :                                 GDKfree(*ppat);
    1497           0 :                                 *ppat = NULL;
    1498           0 :                                 *use_re = true;
    1499           0 :                                 *use_strcmp = true;
    1500             :                         }
    1501             :                 }
    1502             :         }
    1503             :         return res;
    1504             : }
    1505             : 
    1506             : static str
    1507         420 : PCRElike_imp(bit *ret, const str *s, const str *pat, const str *esc,
    1508             :                          const bit *isens)
    1509             : {
    1510         420 :         str res = MAL_SUCCEED;
    1511         420 :         char *ppat = NULL;
    1512         420 :         bool use_re = false, use_strcmp = false, empty = false, ascii_pattern = false;
    1513         420 :         struct RE *re = NULL;
    1514             : 
    1515         420 :         if ((res = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, &ascii_pattern,
    1516             :                                                                 *pat, *esc)) != MAL_SUCCEED)
    1517             :                 return res;
    1518             : 
    1519         787 :         MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp ?
    1520         372 :                                                    "pcrelike: pattern matching using strcmp" : use_re ?
    1521             :                                                    "pcrelike: pattern matching using RE" :
    1522             :                                                    "pcrelike: pattern matching using pcre");
    1523             : 
    1524         822 :         if (strNil(*s) || empty) {
    1525          12 :                 *ret = bit_nil;
    1526         403 :         } else if (use_re) {
    1527         277 :                 if (use_strcmp) {
    1528          31 :                         *ret = *isens ? (ascii_pattern
    1529           4 :                                                          ? istrcmp(*s, *pat) == 0
    1530           1 :                                                          : mystrcasecmp(*s, *pat) == 0)
    1531          27 :                                 : strcmp(*s, *pat) == 0;
    1532             :                 } else {
    1533         246 :                         if (!(re = re_create(*pat, *isens, ascii_pattern, (unsigned char) **esc)))
    1534           0 :                                 res = createException(MAL, "pcre.like4",
    1535             :                                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1536             :                         else
    1537         492 :                                 *ret = (*isens && !re->is_ascii)
    1538           0 :                                         ? re_match_ignore(*s, re)
    1539         246 :                                         : re_match_no_ignore(*s, re);
    1540             :                 }
    1541             :         } else {
    1542         126 :                 res = *isens ? PCREimatch(ret, s, &ppat) : PCREmatch(ret, s, &ppat);
    1543             :         }
    1544             : 
    1545         289 :         if (re)
    1546         246 :                 re_destroy(re);
    1547         415 :         GDKfree(ppat);
    1548         415 :         return res;
    1549             : }
    1550             : 
    1551             : static str
    1552         420 : PCRElike(bit *ret, const str *s, const str *pat, const str *esc,
    1553             :                  const bit *isens)
    1554             : {
    1555         313 :         return PCRElike_imp(ret, s, pat, esc, isens);
    1556             : }
    1557             : 
    1558             : static str
    1559         107 : PCREnotlike(bit *ret, const str *s, const str *pat, const str *esc,
    1560             :                         const bit *isens)
    1561             : {
    1562         107 :         str tmp;
    1563         107 :         bit r;
    1564             : 
    1565         107 :         rethrow("str.not_like", tmp, PCRElike(&r, s, pat, esc, isens));
    1566         103 :         *ret = r == bit_nil ? bit_nil : !r;
    1567         103 :         return MAL_SUCCEED;
    1568             : }
    1569             : 
    1570             : static inline str
    1571        6420 : re_like_build(struct RE **re, uint32_t **wpat, const char *pat, bool caseignore,
    1572             :                           bool use_strcmp, bool ascii_pattern, uint32_t esc)
    1573             : {
    1574        6420 :         if (!use_strcmp) {
    1575        5590 :                 if (!(*re = re_create(pat, caseignore, ascii_pattern, esc)))
    1576           0 :                         return createException(MAL, "pcre.re_like_build",
    1577             :                                                                    SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1578         830 :         } else if (caseignore && !ascii_pattern) {
    1579          29 :                 if (!(*wpat = utf8stoucs(pat)))
    1580           0 :                         return createException(MAL, "pcre.re_like_build",
    1581             :                                                                    SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1582             :         }
    1583             :         return MAL_SUCCEED;
    1584             : }
    1585             : 
    1586             : #define proj_scanloop(TEST)     \
    1587             :         do {                                    \
    1588             :                 if (strNil(s))          \
    1589             :                         return bit_nil; \
    1590             :                 else                            \
    1591             :                         return TEST;    \
    1592             :         } while (0)
    1593             : 
                       /*
                        * Evaluate one LIKE test for the single string `s` and return the
                        * outcome as a bit; a nil `s` always yields bit_nil (see proj_scanloop).
                        *
                        * The matcher is picked from the flags:
                        *   use_strcmp, caseignore, is_ascii   -> istrcmp(s, pat)
                        *   use_strcmp, caseignore, !is_ascii  -> mywstrcasecmp(s, wpat)
                        *   use_strcmp, !caseignore            -> strcmp(s, pat)
                        *   !use_strcmp, caseignore, !is_ascii -> re_match_ignore(s, re)
                        *   !use_strcmp, otherwise             -> re_match_no_ignore(s, re)
                        * When `anti` is set the result of the chosen matcher is negated.
                        *
                        * Every branch ends in proj_scanloop, which returns, so control never
                        * reaches the closing brace.
                        */
    1594             : static inline bit
    1595        4459 : re_like_proj_apply(const char *s, const struct RE *restrict re,
    1596             :                                    const uint32_t *restrict wpat, const char *pat,
    1597             :                                    bool caseignore, bool anti, bool use_strcmp, bool is_ascii)
    1598             : {
    1599        4459 :         if (use_strcmp) {
    1600         635 :                 if (caseignore) {
    1601         158 :                         if (is_ascii) {
    1602         140 :                                 if (anti)
    1603         198 :                                         proj_scanloop(istrcmp(s, pat) != 0);
    1604             :                                 else
    1605          82 :                                         proj_scanloop(istrcmp(s, pat) == 0);
    1606             :                         } else {
    1607          18 :                                 if (anti)
    1608          28 :                                         proj_scanloop(mywstrcasecmp(s, wpat) != 0);
    1609             :                                 else
    1610           8 :                                         proj_scanloop(mywstrcasecmp(s, wpat) == 0);
    1611             :                         }
    1612             :                 } else {
    1613         477 :                         if (anti)
    1614         596 :                                 proj_scanloop(strcmp(s, pat) != 0);
    1615             :                         else
    1616         358 :                                 proj_scanloop(strcmp(s, pat) == 0);
    1617             :                 }
    1618             :         } else {
    1619             :                 /* Use re_match_ignore only if the pattern is UTF-8
    1620             :                  * and we need to ignore case
    1621             :                  */
    1622        3824 :                 if (caseignore && !is_ascii) {
    1623           3 :                         if (anti)
    1624           6 :                                 proj_scanloop(!re_match_ignore(s, re));
    1625             :                         else
    1626           0 :                                 proj_scanloop(re_match_ignore(s, re));
    1627             :                 } else {
    1628        3821 :                         if (anti)
    1629         160 :                                 proj_scanloop(!re_match_no_ignore(s, re));
    1630             :                         else
    1631        7482 :                                 proj_scanloop(re_match_no_ignore(s, re));
    1632             :                 }
    1633             :         }
    1634             : }
    1635             : 
                       /*
                        * Release what re_like_build handed out: a non-NULL *re is passed to
                        * re_destroy and a non-NULL *wpat to GDKfree.  Each pointer is reset to
                        * NULL afterwards, so calling this again on the same variables does
                        * nothing.
                        */
    1636             : static inline void
    1637        6587 : re_like_clean(struct RE **re, uint32_t **wpat)
    1638             : {
    1639        6587 :         if (*re) {
    1640        5590 :                 re_destroy(*re);
    1641        5591 :                 *re = NULL;
    1642             :         }
    1643        6588 :         if (*wpat) {
    1644          29 :                 GDKfree(*wpat);
    1645          29 :                 *wpat = NULL;
    1646             :         }
    1647        6588 : }
    1648             : 
                       /*
                        * Compile the regular expression `ppat` for use by pcre_like_apply or
                        * the select scan.  Two variants exist, chosen by HAVE_LIBPCRE.
                        *
                        * PCRE variant: the pattern is compiled with
                        * PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE | PCRE_DOTALL, plus
                        * PCRE_CASELESS when `caseignore` is set.  The compiled pattern is then
                        * studied; JIT compilation is requested only when `count` exceeds
                        * JIT_COMPILE_MIN.  *res and *ex are set to NULL before anything else.
                        * Returns MAL_SUCCEED, or an exception when compiling or studying
                        * fails.  When only the study step fails, *res is still set on return;
                        * the callers visible in this file release it through pcre_clean.
                        */
    1649             : #ifdef HAVE_LIBPCRE
    1650             : static inline str
    1651         253 : pcre_like_build(pcre **res, pcre_extra **ex, const char *ppat, bool caseignore,
    1652             :                                 BUN count)
    1653             : {
    1654         253 :         const char *err_p = NULL;
    1655         253 :         int errpos = 0;
    1656         253 :         int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE | PCRE_DOTALL;
    1657         253 :         int pcrestopt = count > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0;
    1658             : 
    1659         253 :         *res = NULL;
    1660         253 :         *ex = NULL;
    1661             : 
    1662         253 :         if (caseignore) {
    1663          12 :                 options |= PCRE_CASELESS;
    1664             :         }
    1665         253 :         if ((*res = pcre_compile(ppat, options, &err_p, &errpos, NULL)) == NULL)
    1666           0 :                 return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
    1667             :                                                            ": compilation of regular expression (%s) failed"
    1668             :                                                            " at %d with '%s'", ppat, errpos, err_p);
    1669         253 :         *ex = pcre_study(*res, pcrestopt, &err_p);
                               /* the study step reports failure through err_p, not through *ex */
    1670         253 :         if (err_p != NULL)
    1671           0 :                 return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
    1672             :                                                            ": pcre study of pattern (%s) "
    1673             :                                                            "failed with '%s'", ppat, err_p);
    1674             :         return MAL_SUCCEED;
    1675             : }
    1676             : #else
                       /*
                        * POSIX regex variant: *res is zeroed and then filled by regcomp using
                        * REG_NEWLINE | REG_NOSUB | REG_EXTENDED, plus REG_ICASE when
                        * `caseignore` is set.  `count` and `ex` are not used.  The regcomp
                        * error code is only tested against zero; it is not included in the
                        * exception text.
                        */
    1677             : static inline str
    1678             : pcre_like_build(regex_t *res, void *ex, const char *ppat, bool caseignore,
    1679             :                                 BUN count)
    1680             : {
    1681             :         int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED;
    1682             :         int errcode;
    1683             : 
    1684             :         *res = (regex_t) {
    1685             :         0};
    1686             :         (void) count;
    1687             : 
    1688             :         if (caseignore) {
    1689             :                 options |= REG_ICASE;
    1690             :         }
    1691             :         if ((errcode = regcomp(res, ppat, options)) != 0)
    1692             :                 return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
    1693             :                                                            ": compilation of regular expression (%s) failed",
    1694             :                                                            ppat);
    1695             :         (void) ex;
    1696             :         return MAL_SUCCEED;
    1697             : }
    1698             : #endif
    1699             : 
                       /*
                        * Body shared by both branches of pcre_like_apply.  Expects `s`, `pos`,
                        * `ret` and `ppat` to be in scope.
                        *
                        * LOOP_BODY runs the matcher and stores its outcome in `pos`; it is
                        * expanded before the nil test, so the matcher is also invoked for a
                        * nil `s`.  Afterwards *ret becomes bit_nil for a nil `s`, RES1 when
                        * pos >= 0 (match), and RES2 when pos == -1 (no match).  Any other
                        * value of pos makes the enclosing function return an exception.
                        */
    1700             : #define PCRE_LIKE_BODY(LOOP_BODY, RES1, RES2) \
    1701             :         do { \
    1702             :                 LOOP_BODY  \
    1703             :                 if (strNil(s))          \
    1704             :                         *ret = bit_nil; \
    1705             :                 else if (pos >= 0) \
    1706             :                         *ret = RES1; \
    1707             :                 else if (pos == -1) \
    1708             :                         *ret = RES2; \
    1709             :                 else \
    1710             :                         return createException(MAL, "pcre.match", OPERATION_FAILED ": matching of regular expression (%s) failed with %d", ppat, pos); \
    1711             :         } while(0)
    1712             : 
                       /*
                        * Match the string `s` against a pattern compiled by pcre_like_build and
                        * store the outcome in *ret: TRUE on a match and FALSE on no match
                        * (swapped when `anti` is set), bit_nil when `s` is nil.  `ppat` is only
                        * used in the error message.
                        *
                        * Returns MAL_SUCCEED, or an exception when the matcher reports
                        * something other than "match" or "no match".  In the POSIX variant
                        * only REG_ENOSYS is turned into such an error; every regexec result
                        * other than REG_NOMATCH and REG_ENOSYS is treated as a match.
                        *
                        * The function-local LOOP_BODY macro is defined here and is not
                        * undefined again within this function.
                        */
    1713             : static inline str
    1714        1096 : pcre_like_apply(bit *ret, const char *s,
    1715             : #ifdef HAVE_LIBPCRE
    1716             :                                 const pcre *re, const pcre_extra *ex
    1717             : #else
    1718             :                                 regex_t re, void *ex
    1719             : #endif
    1720             :                                 , const char *ppat, bool anti)
    1721             : {
    1722        1096 :         int pos;
    1723             : 
    1724             : #ifdef HAVE_LIBPCRE
    1725             : #define LOOP_BODY       \
    1726             :         pos = pcre_exec(re, ex, s, (int) strlen(s), 0, PCRE_NO_UTF8_CHECK, NULL, 0);
    1727             : #else
    1728             : #define LOOP_BODY       \
    1729             :         int retval = regexec(&re, s, (size_t) 0, NULL, 0); \
    1730             :         (void) ex; \
    1731             :         pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
    1732             : #endif
    1733             : 
    1734        1096 :         if (anti)
    1735           6 :                 PCRE_LIKE_BODY(LOOP_BODY, FALSE, TRUE);
    1736             :         else
    1737        1090 :                 PCRE_LIKE_BODY(LOOP_BODY, TRUE, FALSE);
    1738             : 
    1739             :         return MAL_SUCCEED;
    1740             : }
    1741             : 
                       /*
                        * Release a pattern produced by pcre_like_build.
                        *
                        * PCRE variant: a non-NULL *re goes to pcre_free and a non-NULL *ex to
                        * pcre_free_study; both pointers are then set to NULL, so a second call
                        * does nothing.
                        * POSIX variant: regfree is called on `re` unconditionally and the
                        * structure is zeroed; `ex` is not used.
                        *
                        * The parameter list and the opening brace live inside the #ifdef
                        * branches, the closing brace after the #endif is shared.
                        */
    1742             : static inline void
    1743         752 : pcre_clean(
    1744             : #ifdef HAVE_LIBPCRE
    1745             :                           pcre **re, pcre_extra **ex)
    1746             : {
    1747         752 :         if (*re)
    1748         253 :                 pcre_free(*re);
    1749         752 :         if (*ex)
    1750         253 :                 pcre_free_study(*ex);
    1751         752 :         *re = NULL;
    1752         752 :         *ex = NULL;
    1753             : #else
    1754             :                           regex_t *re, void *ex)
    1755             : {
    1756             :         regfree(re);
    1757             :         *re = (regex_t) {
    1758             :         0};
    1759             :         (void) ex;
    1760             : #endif
    1761         752 : }
    1762             : 
                       /*
                        * Shared implementation behind BATPCRElike and BATPCREnotlike.
                        *
                        * MAL arguments read from the stack:
                        *   0: receives the id of the result BAT
                        *   1: the input, either a BAT of strings or a single string
                        *   2: the pattern, either a BAT of strings or a single string
                        * `esc` is the escape string.  `*isens` is forwarded to the helpers as
                        * their `caseignore` flag, so despite the name of the local variable
                        * `isensitive`, a true value selects case-insensitive matching.
                        * `*not` negates the outcome of each test.
                        *
                        * The result is a transient bit BAT with one value per row.  For every
                        * pattern choose_like_path selects one of three strategies: the simple
                        * matcher (use_re, possibly a plain string compare when use_strcmp is
                        * also set), the regex library, or "empty", in which case the row
                        * result is bit_nil.
                        *
                        * Returns MAL_SUCCEED or an exception.  On failure the result BAT is
                        * reclaimed and argument 0 is left untouched.
                        */
    1763             : static str
    1764         461 : BATPCRElike_imp(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci,
    1765             :                                 const str *esc, const bit *isens, const bit *not)
    1766             : {
    1767         461 :         str msg = MAL_SUCCEED;
    1768         461 :         BAT *b = NULL, *pbn = NULL, *bn = NULL;
    1769         461 :         char *ppat = NULL;
    1770         461 :         const char *input = NULL;
    1771         461 :         bool use_re = false,
    1772         461 :                 use_strcmp = false,
    1773         461 :                 empty = false,
    1774         461 :                 isensitive = (bool) *isens,
    1775         461 :                 anti = (bool) *not,
    1776         461 :                 has_nil = false,
    1777         461 :                 ascii_pattern = false,
    1778         461 :                 input_is_a_bat = isaBatType(getArgType(mb, pci, 1)),
    1779         461 :                 pattern_is_a_bat = isaBatType(getArgType(mb, pci, 2));
    1780         461 :         bat *r = getArgReference_bat(stk, pci, 0);
    1781         461 :         BUN q = 0;
    1782         461 :         bit *restrict ret = NULL;
    1783             : #ifdef HAVE_LIBPCRE
    1784         461 :         pcre *re = NULL;
    1785         461 :         pcre_extra *ex = NULL;
    1786             : #else
    1787             :         regex_t re = (regex_t) { 0 };
    1788             :         void *ex = NULL;
    1789             : #endif
    1790         461 :         struct RE *re_simple = NULL;
    1791         461 :         uint32_t *wpat = NULL;
    1792         461 :         BATiter bi = (BATiter) { 0 }, pi;
    1793             : 
    1794         461 :         (void) cntxt;
    1795         461 :         if (input_is_a_bat) {
    1796         458 :                 bat *bid = getArgReference_bat(stk, pci, 1);
    1797         458 :                 if (!(b = BATdescriptor(*bid))) {
    1798           0 :                         msg = createException(MAL, "batalgebra.batpcrelike3",
    1799             :                                                                   SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1800           0 :                         goto bailout;
    1801             :                 }
    1802             :         }
    1803         461 :         if (pattern_is_a_bat) {
    1804          80 :                 bat *pb = getArgReference_bat(stk, pci, 2);
    1805          80 :                 if (!(pbn = BATdescriptor(*pb))) {
    1806           0 :                         msg = createException(MAL, "batalgebra.batpcrelike3",
    1807             :                                                                   SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1808           0 :                         goto bailout;
    1809             :                 }
    1810             :         }
    1811         461 :         assert((!b || ATOMstorage(b->ttype) == TYPE_str)
    1812             :                    && (!pbn || ATOMstorage(pbn->ttype) == TYPE_str));
    1813             : 
                               /* The row count comes from the input BAT when there is one,
                                * otherwise from the pattern BAT.
                                * NOTE(review): when both arguments are BATs the pattern BAT is
                                * read at the same positions without a visible length check --
                                * presumably the caller guarantees equal counts; confirm.
                                * NOTE(review): this also assumes at least one of the two
                                * arguments is a BAT. */
    1814         461 :         q = BATcount(b ? b : pbn);
    1815         461 :         if (!(bn = COLnew(b ? b->hseqbase : pbn->hseqbase, TYPE_bit, q, TRANSIENT))) {
    1816           0 :                 msg = createException(MAL, "batalgebra.batpcrelike3",
    1817             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1818           0 :                 goto bailout;
    1819             :         }
    1820         461 :         ret = (bit *) Tloc(bn, 0);
    1821             : 
    1822         461 :         if (pattern_is_a_bat) {
                                       /* Each row has its own pattern, so the strategy is
                                        * chosen and the matcher is built, applied and released
                                        * once per row. */
    1823          80 :                 pi = bat_iterator(pbn);
    1824          80 :                 if (b)
    1825          77 :                         bi = bat_iterator(b);
    1826             :                 else
    1827           3 :                         input = *getArgReference_str(stk, pci, 1);
    1828             : 
    1829        1167 :                 for (BUN p = 0; p < q; p++) {
    1830        1088 :                         const char *next_input = b ? BUNtvar(bi, p) : input,
    1831        1088 :                                 *np = BUNtvar(pi, p);
    1832             : 
    1833        1088 :                         if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty,
    1834             :                                                                                 &ascii_pattern, np, *esc)) != MAL_SUCCEED) {
    1835           0 :                                 bat_iterator_end(&pi);
    1836           0 :                                 if (b)
    1837           0 :                                         bat_iterator_end(&bi);
    1838           0 :                                 goto bailout;
    1839             :                         }
    1840             : 
    1841        1087 :                         if (use_re) {
    1842         626 :                                 if ((msg = re_like_build(&re_simple, &wpat, np, isensitive,
    1843             :                                                                                  use_strcmp, ascii_pattern,
    1844         626 :                                                                                  (unsigned char) **esc)) != MAL_SUCCEED) {
    1845           0 :                                         bat_iterator_end(&pi);
    1846           0 :                                         if (b)
    1847           0 :                                                 bat_iterator_end(&bi);
    1848           0 :                                         goto bailout;
    1849             :                                 }
    1850         626 :                                 ret[p] = re_like_proj_apply(next_input, re_simple, wpat, np,
    1851             :                                                                                         isensitive, anti, use_strcmp,
    1852             :                                                                                         ascii_pattern);
    1853         626 :                                 re_like_clean(&re_simple, &wpat);
    1854         461 :                         } else if (empty) {
    1855         455 :                                 ret[p] = bit_nil;
    1856             :                         } else {
                                                       /* count is passed as 1: the compiled
                                                        * pattern is used for this row only */
    1857           6 :                                 if ((msg = pcre_like_build(&re, &ex, ppat, isensitive, 1)) != MAL_SUCCEED) {
    1858           0 :                                         bat_iterator_end(&pi);
    1859           0 :                                         if (b)
    1860           0 :                                                 bat_iterator_end(&bi);
    1861           0 :                                         goto bailout;
    1862             :                                 }
    1863           6 :                                 if ((msg = pcre_like_apply(&(ret[p]), next_input, re, ex, ppat, anti)) != MAL_SUCCEED) {
    1864           0 :                                         bat_iterator_end(&pi);
    1865           0 :                                         if (b)
    1866           0 :                                                 bat_iterator_end(&bi);
    1867           0 :                                         goto bailout;
    1868             :                                 }
    1869           6 :                                 pcre_clean(&re, &ex);
    1870             :                         }
    1871        1087 :                         has_nil |= is_bit_nil(ret[p]);
                                               /* ppat was set by choose_like_path for this row;
                                                * release it before the next iteration */
    1872        1087 :                         GDKfree(ppat);
    1873        1087 :                         ppat = NULL;
    1874             :                 }
    1875          79 :                 bat_iterator_end(&pi);
    1876          80 :                 if (b)
    1877          77 :                         bat_iterator_end(&bi);
    1878             :         } else {
                                       /* One pattern for all rows: choose the strategy and
                                        * build the matcher once, then apply it to every row. */
    1879         381 :                 const char *pat = *getArgReference_str(stk, pci, 2);
    1880         381 :                 if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty,
    1881             :                                                                         &ascii_pattern, pat, *esc)) != MAL_SUCCEED)
    1882           0 :                         goto bailout;
    1883             : 
    1884         381 :                 bi = bat_iterator(b);
    1885         753 :                 MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp
    1886             :                                                            ? "pcrelike: pattern matching using strcmp" :
    1887         373 :                                                            use_re ? "pcrelike: pattern matching using RE" :
    1888             :                                                            "pcrelike: pattern matching using pcre");
    1889             : 
    1890         380 :                 if (use_re) {
    1891         300 :                         if ((msg = re_like_build(&re_simple, &wpat, pat, isensitive, use_strcmp,
    1892         299 :                                                                          ascii_pattern, (unsigned char) **esc)) != MAL_SUCCEED) {
    1893           0 :                                 bat_iterator_end(&bi);
    1894           0 :                                 goto bailout;
    1895             :                         }
    1896        4134 :                         for (BUN p = 0; p < q; p++) {
    1897        3835 :                                 const char *s = BUNtvar(bi, p);
    1898        3834 :                                 ret[p] = re_like_proj_apply(s, re_simple, wpat, pat, isensitive,
    1899             :                                                                                         anti, use_strcmp, ascii_pattern);
    1900        3834 :                                 has_nil |= is_bit_nil(ret[p]);
    1901             :                         }
    1902          81 :                 } else if (empty) {
    1903           0 :                         for (BUN p = 0; p < q; p++)
    1904           0 :                                 ret[p] = bit_nil;
    1905             :                         has_nil = true;
    1906             :                 } else {
    1907          81 :                         if ((msg = pcre_like_build(&re, &ex, ppat, isensitive, q)) != MAL_SUCCEED) {
    1908           0 :                                 bat_iterator_end(&bi);
    1909           0 :                                 goto bailout;
    1910             :                         }
    1911        1172 :                         for (BUN p = 0; p < q; p++) {
    1912        1091 :                                 const char *s = BUNtvar(bi, p);
    1913        1090 :                                 if ((msg = pcre_like_apply(&(ret[p]), s, re, ex, ppat, anti)) != MAL_SUCCEED) {
    1914           0 :                                         bat_iterator_end(&bi);
    1915           0 :                                         goto bailout;
    1916             :                                 }
    1917        1091 :                                 has_nil |= is_bit_nil(ret[p]);
    1918             :                         }
    1919             :                 }
    1920         380 :                 bat_iterator_end(&bi);
    1921             :         }
    1922             : 
                               /* Reached on success as well as on error: whatever matcher state
                                * is still held is released here.  Only when msg is unset is the
                                * result BAT finalised and handed out through argument 0. */
    1923         461 :   bailout:
    1924         461 :         GDKfree(ppat);
    1925         461 :         re_like_clean(&re_simple, &wpat);
    1926         461 :         pcre_clean(&re, &ex);
    1927         461 :         if (bn && !msg) {
    1928         461 :                 BATsetcount(bn, q);
    1929         461 :                 bn->tnil = has_nil;
    1930         461 :                 bn->tnonil = !has_nil;
                                       /* key/sorted properties are only claimed for results of
                                        * at most one row */
    1931         461 :                 bn->tkey = BATcount(bn) <= 1;
    1932         461 :                 bn->tsorted = BATcount(bn) <= 1;
    1933         461 :                 bn->trevsorted = BATcount(bn) <= 1;
    1934         461 :                 *r = bn->batCacheid;
    1935         461 :                 BBPkeepref(bn);
    1936           0 :         } else if (bn)
    1937           0 :                 BBPreclaim(bn);
    1938         461 :         BBPreclaim(b);
    1939         461 :         BBPreclaim(pbn);
    1940         461 :         return msg;
    1941             : }
    1942             : 
                       /*
                        * MAL entry point for the bulk LIKE test: takes the escape string from
                        * argument 3 and the case flag from argument 4, then runs
                        * BATPCRElike_imp without negation.
                        */
    1943             : static str
    1944         429 : BATPCRElike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
    1945             : {
    1946         429 :         const str *esc = getArgReference_str(stk, pci, 3);
    1947         429 :         const bit *ci = getArgReference_bit(stk, pci, 4);
    1948         429 :         bit no = FALSE;
    1949             : 
    1950         429 :         return BATPCRElike_imp(cntxt, mb, stk, pci, esc, ci, &no);
    1951             : }
    1952             : 
                       /*
                        * MAL entry point for the bulk NOT LIKE test: takes the escape string
                        * from argument 3 and the case flag from argument 4, then runs
                        * BATPCRElike_imp with negation switched on.
                        */
    1953             : static str
    1954          32 : BATPCREnotlike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
    1955             : {
    1956          32 :         const str *esc = getArgReference_str(stk, pci, 3);
    1957          32 :         const bit *ci = getArgReference_bit(stk, pci, 4);
    1958          32 :         bit yes = TRUE;
    1959             : 
    1960          32 :         return BATPCRElike_imp(cntxt, mb, stk, pci, esc, ci, &yes);
    1961             : }
    1962             : 
    1963             : /* scan select loop with or without candidates */
                       /*
                        * Expects these names in scope at the expansion site: b, s, ci, p, q,
                        * ncands, off, bi, vals, cnt, anti, timeoffset, counter, and a label
                        * `bailout` that the timeout handler jumps to.
                        *
                        * Without a candidate list, or with a dense one, positions p up to q
                        * are visited and the value is read at p - off; otherwise the loop
                        * counts p up to ncands and takes each oid from canditer_next(ci).
                        * In both forms the current string is available to TEST as `v`, and
                        * the oid is appended to vals when TEST holds, or when KEEP_NULLS is
                        * set and `v` is nil.  GDK_CHECK_TIMEOUT is evaluated on every
                        * iteration.
                        *
                        * NOTE(review): the candidate branch starts counting at the incoming
                        * value of p -- presumably callers pass p == 0 in that case; confirm.
                        */
    1964             : #define pcrescanloop(TEST, KEEP_NULLS)                                                                  \
    1965             :         do {                                                                                                                            \
    1966             :                 TRC_DEBUG(ALGO,                                                                                                 \
    1967             :                                   "PCREselect(b=%s#"BUNFMT",anti=%d): "                                     \
    1968             :                                   "scanselect %s\n", BATgetId(b), BATcount(b),                        \
    1969             :                                   anti, #TEST);                                                                                 \
    1970             :                 if (!s || BATtdense(s)) {                                                                               \
    1971             :                         for (; p < q; p++) {                                                                         \
    1972             :                                 GDK_CHECK_TIMEOUT(timeoffset, counter,                                  \
    1973             :                                                                   GOTO_LABEL_TIMEOUT_HANDLER(bailout)); \
    1974             :                                 const char *restrict v = BUNtvar(bi, p - off);                  \
    1975             :                                 if ((TEST) || ((KEEP_NULLS) && strNil(v)))                              \
    1976             :                                         vals[cnt++] = p;                                                                        \
    1977             :                         }                                                                                                                       \
    1978             :                 } else {                                                                                                                \
    1979             :                         for (; p < ncands; p++) {                                                                    \
    1980             :                                 GDK_CHECK_TIMEOUT(timeoffset, counter,                                  \
    1981             :                                                                   GOTO_LABEL_TIMEOUT_HANDLER(bailout)); \
    1982             :                                 oid o = canditer_next(ci);                                                              \
    1983             :                                 const char *restrict v = BUNtvar(bi, o - off);                  \
    1984             :                                 if ((TEST) || ((KEEP_NULLS) && strNil(v)))                              \
    1985             :                                         vals[cnt++] = o;                                                                        \
    1986             :                         }                                                                                                                       \
    1987             :                 }                                                                                                                               \
    1988             :         } while (0)
    1989             : 
                       /*
                        * Match expression used as the TEST of pcrescanloop in pcre_likeselect;
                        * it reads `re`, `ex` and the current string `v` from the expansion
                        * site.  With PCRE any non-negative pcre_exec result is a match; with
                        * POSIX regex every regexec result other than REG_NOMATCH is a match.
                        */
    1990             : #ifdef HAVE_LIBPCRE
    1991             : #define PCRE_LIKESELECT_BODY (pcre_exec(re, ex, v, (int) strlen(v), 0, PCRE_NO_UTF8_CHECK, NULL, 0) >= 0)
    1992             : #else
    1993             : #define PCRE_LIKESELECT_BODY (regexec(&re, v, (size_t) 0, NULL, 0) != REG_NOMATCH)
    1994             : #endif
    1995             : 
                       /*
                        * LIKE select scan that uses the regex library.
                        *
                        * Compiles `pat` once (ignoring case when `caseignore` is set), then
                        * scans the rows of `b` with pcrescanloop and appends the oid of every
                        * qualifying row to the tail of `bn`.  A non-nil row qualifies when it
                        * matches, or, with `anti`, when it does not match.  Nil rows qualify
                        * only when `keep_nulls` is set.  `s`, `ci`, `p` and `q` are consumed by
                        * pcrescanloop.
                        *
                        * The number of oids written is stored in *rcnt on every path; the
                        * count of `bn` itself is not set here.  Returns MAL_SUCCEED or the
                        * exception from pattern compilation or the timeout handler.
                        */
    1996             : static str
    1997         160 : pcre_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q,
    1998             :                                 BUN *rcnt, const char *pat, bool caseignore, bool anti,
    1999             :                                 bool keep_nulls)
    2000             : {
    2001             : #ifdef HAVE_LIBPCRE
    2002         160 :         pcre *re = NULL;
    2003         160 :         pcre_extra *ex = NULL;
    2004             : #else
    2005             :         regex_t re = (regex_t) { 0 };
    2006             :         void *ex = NULL;
    2007             : #endif
    2008         160 :         BATiter bi = bat_iterator(b);
    2009         160 :         BUN cnt = 0, ncands = ci->ncand;
    2010         160 :         oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
    2011         160 :         str msg = MAL_SUCCEED;
    2012             : 
    2013         160 :         size_t counter = 0;
    2014         160 :         lng timeoffset = 0;
    2015         160 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
                               /* timeoffset is start time plus query timeout when both are
                                * set in the query context, and 0 otherwise */
    2016         160 :         if (qry_ctx != NULL) {
    2017         144 :                 timeoffset = (qry_ctx->starttime
    2018         144 :                                           && qry_ctx->querytimeout) ? (qry_ctx->starttime +
    2019         144 :                                                                                                    qry_ctx->querytimeout) : 0;
    2020             :         }
    2021             : 
    2022         160 :         if ((msg = pcre_like_build(&re, &ex, pat, caseignore, ci->ncand)) != MAL_SUCCEED)
    2023           0 :                 goto bailout;
    2024             : 
    2025         160 :         if (anti)
    2026           0 :                 pcrescanloop(!strNil(v) && !PCRE_LIKESELECT_BODY, keep_nulls);
    2027             :         else
    2028       37521 :                 pcrescanloop(!strNil(v) && PCRE_LIKESELECT_BODY, keep_nulls);
    2029             : 
                               /* reached on success, on a build failure and from the timeout
                                * handler inside pcrescanloop */
    2030           4 :   bailout:
    2031         160 :         bat_iterator_end(&bi);
    2032         160 :         pcre_clean(&re, &ex);
    2033         160 :         *rcnt = cnt;
    2034         160 :         return msg;
    2035             : }
    2036             : 
    2037             : static str
    2038        5376 : re_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q,
    2039             :                           BUN *rcnt, const char *pat, bool caseignore, bool anti,
    2040             :                           bool use_strcmp, uint32_t esc, bool keep_nulls,
    2041             :                           bool ascii_pattern)
    2042             : {
    2043        5376 :         BATiter bi = bat_iterator(b);
    2044        5376 :         BUN cnt = 0, ncands = ci->ncand;
    2045        5376 :         oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
    2046        5376 :         struct RE *re = NULL;
    2047        5376 :         uint32_t *wpat = NULL;
    2048        5376 :         str msg = MAL_SUCCEED;
    2049             : 
    2050        5376 :         size_t counter = 0;
    2051        5376 :         lng timeoffset = 0;
    2052        5376 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
    2053        5376 :         if (qry_ctx != NULL) {
    2054        2898 :                 timeoffset = (qry_ctx->starttime
    2055        2898 :                                           && qry_ctx->querytimeout) ? (qry_ctx->starttime +
    2056        2898 :                                                                                                    qry_ctx->querytimeout) : 0;
    2057             :         }
    2058             : 
    2059        5376 :         if ((msg = re_like_build(&re, &wpat, pat, caseignore, use_strcmp, ascii_pattern,
    2060             :                                                          esc)) != MAL_SUCCEED)
    2061           0 :                 goto bailout;
    2062             : 
    2063        5375 :         if (use_strcmp) {
    2064          90 :                 if (caseignore) {
    2065          30 :                         if (ascii_pattern) {
    2066          22 :                                 if (anti)
    2067          64 :                                         pcrescanloop(!strNil(v)
    2068             :                                                                  && istrcmp(v, pat) != 0, keep_nulls);
    2069             :                                 else
    2070         635 :                                         pcrescanloop(!strNil(v)
    2071             :                                                                  && istrcmp(v, pat) == 0, keep_nulls);
    2072             :                         } else {
    2073           8 :                                 if (anti)
    2074           0 :                                         pcrescanloop(!strNil(v)
    2075             :                                                                  && mywstrcasecmp(v, wpat) != 0, keep_nulls);
    2076             :                                 else
    2077          36 :                                         pcrescanloop(!strNil(v)
    2078             :                                                                  && mywstrcasecmp(v, wpat) == 0, keep_nulls);
    2079             :                         }
    2080             :                 } else {
    2081          60 :                         if (anti)
    2082          54 :                                 pcrescanloop(!strNil(v) && strcmp(v, pat) != 0, keep_nulls);
    2083             :                         else
    2084       10263 :                                 pcrescanloop(!strNil(v) && strcmp(v, pat) == 0, keep_nulls);
    2085             :                 }
    2086             :         } else {
    2087        5285 :                 if (caseignore) {
    2088             :                         /* ascii_pattern == true is encoded in re */
    2089          52 :                         if (anti) {
    2090           0 :                                 if (ascii_pattern)
    2091           0 :                                         pcrescanloop(!strNil(v)
    2092             :                                                                  && !re_match_no_ignore(v, re), keep_nulls);
    2093             :                                 else
    2094           0 :                                         pcrescanloop(!strNil(v)
    2095             :                                                                  && !re_match_ignore(v, re), keep_nulls);
    2096             :                         } else {
    2097          52 :                                 if (ascii_pattern)
    2098        6795 :                                         pcrescanloop(!strNil(v)
    2099             :                                                                  && re_match_no_ignore(v, re), keep_nulls);
    2100             :                                 else
    2101          72 :                                         pcrescanloop(!strNil(v)
    2102             :                                                                  && re_match_ignore(v, re), keep_nulls);
    2103             :                         }
    2104             :                 } else {
    2105        5233 :                         if (anti)
    2106       60004 :                                 pcrescanloop(!strNil(v)
    2107             :                                                          && !re_match_no_ignore(v, re), keep_nulls);
    2108             :                         else
    2109      133119 :                                 pcrescanloop(!strNil(v)
    2110             :                                                          && re_match_no_ignore(v, re), keep_nulls);
    2111             :                 }
    2112             :         }
    2113             : 
    2114          80 :   bailout:
    2115        5375 :         bat_iterator_end(&bi);
    2116        5375 :         re_like_clean(&re, &wpat);
    2117        5376 :         *rcnt = cnt;
    2118        5376 :         return msg;
    2119             : }
    2120             : /*
                     :  * Select the oids of the string BAT *bid (optionally restricted to the
                     :  * candidate list *sid) whose values match, or with *anti do not match,
                     :  * the LIKE pattern *pat.
                     :  *
                     :  * ret        - on success receives the id of the result BAT of oids
                     :  * bid        - BAT to scan; its storage type must be str (asserted)
                     :  * sid        - optional candidate list; NULL or a nil bat id means none
                     :  * pat        - LIKE pattern
                     :  * esc        - escape string; choose_like_path() gets the whole string,
                     :  *              re_likeselect() only its first byte
                     :  * caseignore - case-insensitive matching when true
                     :  * anti       - true for NOT LIKE
                     :  *
                     :  * choose_like_path() selects the matching strategy through its output
                     :  * flags: use_re routes to re_likeselect() (which also receives
                     :  * use_strcmp), otherwise pcre_likeselect() is called with the pattern
                     :  * returned in ppat.  When it reports `empty`, an empty dense BAT is
                     :  * returned without scanning.
                     :  *
                     :  * Returns MAL_SUCCEED, or an exception message when a BAT descriptor
                     :  * cannot be obtained, an allocation fails, or one of the helpers fails.
                     :  */
    2121             : static str
    2122        5536 : PCRElikeselect(bat *ret, const bat *bid, const bat *sid, const str *pat,
    2123             :                            const str *esc, const bit *caseignore, const bit *anti)
    2124             : {
    2125        5536 :         BAT *b, *s = NULL, *bn = NULL, *old_s = NULL;
    2126        5536 :         str msg = MAL_SUCCEED;
    2127        5536 :         char *ppat = NULL;
    2128        5536 :         bool use_re = false,
    2129        5536 :                 use_strcmp = false,
    2130        5536 :                 empty = false,
    2131        5536 :                 ascii_pattern = false;
    2132        5536 :         bool with_strimps = false;
    2133        5536 :         bool with_strimps_anti = false;
    2134        5536 :         BUN p = 0, q = 0, rcnt = 0;
    2135        5536 :         struct canditer ci;
    2136             : 
    2137        5536 :         if ((b = BATdescriptor(*bid)) == NULL) {
    2138           0 :                 msg = createException(MAL, "algebra.likeselect",
    2139             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    2140           0 :                 goto bailout;
    2141             :         }
    2142        5536 :         if (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL) {
    2143           0 :                 msg = createException(MAL, "algebra.likeselect",
    2144             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    2145           0 :                 goto bailout;
    2146             :         }
    2147             : 
    2148        5536 :         assert(ATOMstorage(b->ttype) == TYPE_str);
    2149             : 
    2150        5536 :         if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, &ascii_pattern,
    2151             :                                                                 *pat, *esc)) != MAL_SUCCEED)
    2152           0 :                 goto bailout;
    2153             : 
    2154        5536 :         if (empty) {
    2155           0 :                 if (!(bn = BATdense(0, 0, 0)))
    2156           0 :                         msg = createException(MAL, "algebra.likeselect",
    2157             :                                                                   SQLSTATE(HY013) MAL_MALLOC_FAIL);
    2158             : 
    2159           0 :                 goto bailout;
    2160             :         }
    2161             :         /* Since the strimp pre-filtering of a LIKE query produces a superset of the actual result the complement of that
    2162             :          * set will necessarily reject some of the matching entries in the NOT LIKE query.
    2163             :          *
    2164             :          * In this case we run the PCRElikeselect as a LIKE query with strimps and return the complement of the result,
    2165             :          * taking extra care to not return NULLs. This currently means that we do not run strimps for NOT LIKE queries if
    2166             :          * the BAT contains NULLs.
    2167             :          */
    2168        5536 :         if (BAThasstrimps(b)) {
    2169          24 :                 if (STRMPcreate(b, NULL) == GDK_SUCCEED) {
    2170          24 :                         BAT *tmp_s = STRMPfilter(b, s, *pat, *anti);
    2171          24 :                         if (tmp_s) {
                     :                                 /* keep the caller's candidates: they are needed to
                     :                                  * complement the result in the anti case and must be
                     :                                  * released at bailout */
    2172          24 :                                 old_s = s;
    2173          24 :                                 s = tmp_s;
    2174          24 :                                 if (!*anti)
    2175             :                                         with_strimps = true;
    2176             :                                 else
    2177           0 :                                         with_strimps_anti = true;
    2178             :                         }
    2179             :                 } else {                                /* If we cannot filter with the strimp just continue normally */
    2180           0 :                         GDKclrerr();
    2181             :                 }
    2182             :         }
    2183             : 
    2184             : 
    2185        5535 :         MT_thread_setalgorithm(use_strcmp
    2186        5535 :                                                    ? (with_strimps ?
    2187             :                                                           "pcrelike: pattern matching using strcmp with strimps"
    2188             :                                                           : (with_strimps_anti ?
    2189             :                                                                  "pcrelike: pattern matching using strcmp with strimps anti"
    2190        5535 :                                                                  : "pcrelike: pattern matching using strcmp")) :
    2191        5445 :                                                    use_re ? (with_strimps ?
    2192             :                                                                          "pcrelike: pattern matching using RE with strimps"
    2193             :                                                                          : (with_strimps_anti ?
    2194             :                                                                                 "pcrelike: pattern matching using RE with strimps anti"
    2195             :                                                                                 :
    2196             :                                                                                 "pcrelike: pattern matching using RE"))
    2197             :                                                    : (with_strimps ?
    2198             :                                                           "pcrelike: pattern matching using pcre with strimps"
    2199             :                                                           : (with_strimps_anti ?
    2200             :                                                                  "pcrelike: pattern matching using pcre with strimps anti"
    2201             :                                                                  : "pcrelike: pattern matching using pcre")));
    2202             : 
    2203        5536 :         canditer_init(&ci, b, s);
    2204        5536 :         if (!(bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT))) {
    2205           0 :                 msg = createException(MAL, "algebra.likeselect",
    2206             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    2207           0 :                 goto bailout;
    2208             :         }
    2209             : 
                     :         /* without candidates, or with dense ones, scan the oid range [p, q)
                     :          * clipped to b's own oid range */
    2210        5535 :         if (!s || BATtdense(s)) {
    2211        1344 :                 if (s) {
    2212        4108 :                         assert(BATtdense(s));
    2213        4108 :                         p = (BUN) s->tseqbase;
    2214        4108 :                         q = p + BATcount(s);
    2215        4108 :                         if ((oid) p < b->hseqbase)
    2216             :                                 p = b->hseqbase;
    2217        4108 :                         if ((oid) q > b->hseqbase + BATcount(b))
    2218             :                                 q = b->hseqbase + BATcount(b);
    2219             :                 } else {
    2220        1344 :                         p = b->hseqbase;
    2221        1344 :                         q = BATcount(b) + b->hseqbase;
    2222             :                 }
    2223             :         }
    2224             : 
                     :         /* in the strimps-anti case the helpers run as a positive LIKE; the
                     :          * result is complemented below */
    2225        5535 :         if (use_re) {
    2226        5375 :                 msg = re_likeselect(bn, b, s, &ci, p, q, &rcnt, *pat, *caseignore, *anti
    2227         774 :                                                         && !with_strimps_anti, use_strcmp,
    2228        5375 :                                                         (unsigned char) **esc, with_strimps_anti,
    2229             :                                                         ascii_pattern);
    2230             :         } else {
    2231         160 :                 msg = pcre_likeselect(bn, b, s, &ci, p, q, &rcnt, ppat, *caseignore,
    2232         160 :                                                           *anti && !with_strimps_anti, with_strimps_anti);
    2233             :         }
    2234             : 
    2235        5534 :         if (!msg) {                                     /* set some properties */
    2236        5534 :                 BATsetcount(bn, rcnt);
    2237        5534 :                 bn->tsorted = true;
    2238        5534 :                 bn->trevsorted = bn->batCount <= 1;
    2239        5534 :                 bn->tkey = true;
    2240        5534 :                 bn->tnil = false;
    2241        5534 :                 bn->tnonil = true;
    2242        5534 :                 bn->tseqbase = rcnt == 0 ? 0 : rcnt == 1 ? *(const oid *) Tloc(bn, 0) : rcnt == b->batCount ? b->hseqbase : oid_nil;
    2243        5534 :                 if (with_strimps_anti) {
    2244             :                         /* Reverse the result taking into account the original candidate list. */
    2246           0 :                         BAT *rev;
    2247           0 :                         if (old_s) {
    2248           0 :                                 rev = BATdiffcand(old_s, bn);
    2249             : #ifndef NDEBUG
    2250           0 :                                 BAT *is = BATintersectcand(old_s, bn);
    2251           0 :                                 if (is) {
    2252           0 :                                         assert(is->batCount == bn->batCount);
    2253           0 :                                         BBPreclaim(is);
    2254             :                                 }
    2255           0 :                                 assert(rev == NULL || rev->batCount == old_s->batCount - bn->batCount);
    2256             : #endif
    2257             :                         }
    2258             : 
    2259             :                         else
    2260           0 :                                 rev = BATnegcands(b->batCount, bn);
                     :                         if (rev == NULL) {
                     :                                 /* do not report success without a result BAT; bn is
                     :                                  * released at bailout because msg is now set */
                     :                                 msg = createException(MAL, "algebra.likeselect",
                     :                                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
                     :                         } else {
    2262           0 :                                 BBPunfix(bn->batCacheid);
    2263           0 :                                 bn = rev;
                     :                         }
    2264             :                 }
    2265             :         }
    2266             : 
    2267             : 
    2268        5534 :   bailout:
    2269        5534 :         BBPreclaim(b);
    2270        5534 :         BBPreclaim(s);
    2271        5536 :         BBPreclaim(old_s);
    2272        5536 :         GDKfree(ppat);
    2273        5535 :         if (bn && !msg) {
    2274        5535 :                 *ret = bn->batCacheid;
    2275        5535 :                 BBPkeepref(bn);
    2276           0 :         } else if (bn)
    2277           0 :                 BBPreclaim(bn);
    2278        5536 :         return msg;
    2279             : }
    2280             : /*
                     :  * APPEND(b, o): store oid o at index b->batCount of the array found at
                     :  * b->theap->base (viewed as oid[]) and post-increment b->batCount.  No
                     :  * capacity check is done here, and `b` is expanded twice and without
                     :  * parentheses, so pass a plain, side-effect-free identifier.
                     :  *
                     :  * VALUE(s, x): via token pasting expands to
                     :  * <s>vars + VarHeapVal(<s>vals, x, <s>i.width); the expanding scope must
                     :  * therefore declare <s>vars, <s>vals and <s>i (pcrejoin does so for
                     :  * s = l and s = r).
                     :  */
    2281             : #define APPEND(b, o)    (((oid *) b->theap->base)[b->batCount++] = (o))
    2282             : #define VALUE(s, x)             (s##vars + VarHeapVal(s##vals, (x), s##i.width))
    2283             : /*
                     :  * Matcher invocation used inside pcre_join_loop, in two build variants.
                     :  *
                     :  * PCRE_EXEC assigns the match result for the string `vl` to `retval`:
                     :  * with HAVE_LIBPCRE it calls pcre_exec() on pcrere/pcreex with
                     :  * PCRE_NO_UTF8_CHECK and no output vector, otherwise regexec() on
                     :  * &pcrere without a match array.  PCRE_EXEC_COND then tests `retval`:
                     :  * negative for libpcre, REG_NOMATCH or REG_ENOSYS for regexec.
                     :  * NOTE(review): this looks like the "did not match" test (it would also
                     :  * cover libpcre error codes, which are negative) -- confirm at the use
                     :  * sites.
                     :  *
                     :  * Both macros rely on `retval`, `vl`, `pcrere` and `pcreex` being
                     :  * declared in the expanding scope.
                     :  */
    2284             : #ifdef HAVE_LIBPCRE
    2285             : #define PCRE_EXEC \
    2286             :         do { \
    2287             :                 retval = pcre_exec(pcrere, pcreex, vl, (int) strlen(vl), 0, PCRE_NO_UTF8_CHECK, NULL, 0); \
    2288             :         } while (0)
    2289             : #define PCRE_EXEC_COND (retval < 0)
    2290             : #else
    2291             : #define PCRE_EXEC \
    2292             :         do { \
    2293             :                 retval = regexec(&pcrere, vl, (size_t) 0, NULL, 0); \
    2294             :         } while (0)
    2295             : #define PCRE_EXEC_COND (retval == REG_NOMATCH || retval == REG_ENOSYS)
    2296             : #endif
    2297             : 
    2298             : /* nested loop implementation for PCRE join
                     :  *
                     :  * The three macro arguments are expressions evaluated per left value;
                     :  * whenever the applicable one is true the (left, right) pair is skipped:
                     :  *   STRCMP    - used when use_re and use_strcmp are both set
                     :  *   RE_MATCH  - used when use_re is set and use_strcmp is not
                     :  *   PCRE_COND - used otherwise, after PCRE_EXEC has set `retval`
                     :  *
                     :  * Outer loop, once per right candidate (rci):
                     :  *   - check the query timeout (jumps to `bailout` through the timeout
                     :  *     handler);
                     :  *   - fetch the pattern `vr` and let choose_like_path() set use_re,
                     :  *     use_strcmp, empty, ascii_pattern and possibly pcrepat;
                     :  *   - unless `empty`, build the matcher: re_like_build() when use_re,
                     :  *     else pcre_like_build() from pcrepat, which is freed right after;
                     :  *   - rewind the left candidate iterator and run the inner loop;
                     :  *   - release the per-pattern matcher state (re_like_clean, pcre_clean);
                     :  *   - update r2 (or, without r2, r1) properties from `nl`, the number of
                     :  *     left values that joined with this right value.
                     :  *
                     :  * Inner loop, once per left candidate (lci):
                     :  *   - nil strings never join;
                     :  *   - apply the reject condition selected above;
                     :  *   - when r1 is full, grow r1 (and r2) to BATgrows(r1); on failure set
                     :  *     msg to a "pcre.join" allocation exception and jump to `bailout`;
                     :  *   - when r1 already holds results, downgrade its tseqbase, sortedness
                     :  *     and key flags by comparing `lo` with the previously appended
                     :  *     `lastl`;
                     :  *   - append lo to r1 and, if r2 is given, ro to r2.
                     :  *
                     :  * The macro uses many names from the expanding function (lci, rci, l/r
                     :  * value accessors for VALUE, r1, r2, msg, re, wpat, pcrere, pcreex,
                     :  * pcrepat, esc, caseignore, timeoffset, counter, ...) and needs a
                     :  * `bailout` label there; every error path leaves through that label
                     :  * with msg set.
                     :  */
    2299             : #define pcre_join_loop(STRCMP, RE_MATCH, PCRE_COND)                                             \
    2300             :         do {                                                                                                                            \
    2301             :                 for (BUN ridx = 0; ridx < rci.ncand; ridx++) {                                       \
    2302             :                         GDK_CHECK_TIMEOUT(timeoffset, counter,                                          \
    2303             :                                                           GOTO_LABEL_TIMEOUT_HANDLER(bailout));         \
    2304             :                         ro = canditer_next(&rci);                                                                   \
    2305             :                         vr = VALUE(r, ro - rbase);                                                                      \
    2306             :                         nl = 0;                                                                                                         \
    2307             :                         ascii_pattern = use_re = use_strcmp = empty = false;            \
    2308             :                         if ((msg = choose_like_path(&pcrepat, &use_re, &use_strcmp, &empty, &ascii_pattern, vr, esc))) \
    2309             :                                 goto bailout;                                                                                   \
    2310             :                         if (!empty) {                                                                                           \
    2311             :                                 if (use_re) {                                                                                   \
    2312             :                                         if ((msg = re_like_build(&re, &wpat, vr, caseignore, use_strcmp, ascii_pattern, (unsigned char) *esc)) != MAL_SUCCEED) \
    2313             :                                                 goto bailout;                                                                   \
    2314             :                                 } else if (pcrepat) {                                                                   \
    2315             :                                         if ((msg = pcre_like_build(&pcrere, &pcreex, pcrepat, caseignore, lci.ncand)) != MAL_SUCCEED) \
    2316             :                                                 goto bailout;                                                                   \
    2317             :                                         GDKfree(pcrepat);                                                                       \
    2318             :                                         pcrepat = NULL;                                                                         \
    2319             :                                 }                                                                                                               \
    2320             :                                 canditer_reset(&lci);                                                                       \
    2321             :                                 for (BUN lidx = 0; lidx < lci.ncand; lidx++) {                       \
    2322             :                                         lo = canditer_next(&lci);                                                   \
    2323             :                                         vl = VALUE(l, lo - lbase);                                                      \
    2324             :                                         if (strNil(vl)) {                                                                       \
    2325             :                                                 continue;                                                                               \
    2326             :                                         } else if (use_re) {                                                            \
    2327             :                                                 if (use_strcmp) {                                                               \
    2328             :                                                         if (STRCMP)                                                                     \
    2329             :                                                                 continue;                                                               \
    2330             :                                                 } else {                                                                                \
    2331             :                                                         assert(re);                                                                     \
    2332             :                                                         if (RE_MATCH)                                                           \
    2333             :                                                                 continue;                                                               \
    2334             :                                                 }                                                                                               \
    2335             :                                         } else {                                                                                        \
    2336             :                                                 int retval;                                                                             \
    2337             :                                                 PCRE_EXEC;                                                                              \
    2338             :                                                 if (PCRE_COND)                                                                  \
    2339             :                                                         continue;                                                                       \
    2340             :                                         }                                                                                                       \
    2341             :                                         if (BATcount(r1) == BATcapacity(r1)) {                          \
    2342             :                                                 newcap = BATgrows(r1);                                                  \
    2343             :                                                 BATsetcount(r1, BATcount(r1));                                  \
    2344             :                                                 if (r2)                                                                                 \
    2345             :                                                         BATsetcount(r2, BATcount(r2));                          \
    2346             :                                                 if (BATextend(r1, newcap) != GDK_SUCCEED || (r2 && BATextend(r2, newcap) != GDK_SUCCEED)) { \
    2347             :                                                         msg = createException(MAL, "pcre.join", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
    2348             :                                                         goto bailout;                                                           \
    2349             :                                                 }                                                                                               \
    2350             :                                                 assert(!r2 || BATcapacity(r1) == BATcapacity(r2)); \
    2351             :                                         }                                                                                                       \
    2352             :                                         if (BATcount(r1) > 0) {                                                              \
    2353             :                                                 if (lastl + 1 != lo)                                                    \
    2354             :                                                         r1->tseqbase = oid_nil;                                              \
    2355             :                                                 if (nl == 0) {                                                                  \
    2356             :                                                         if (r2)                                                                         \
    2357             :                                                                 r2->trevsorted = false;                                      \
    2358             :                                                         if (lastl > lo) {                                                    \
    2359             :                                                                 r1->tsorted = false;                                 \
    2360             :                                                                 r1->tkey = false;                                            \
    2361             :                                                         } else if (lastl < lo) {                                     \
    2362             :                                                                 r1->trevsorted = false;                                      \
    2363             :                                                         } else {                                                                        \
    2364             :                                                                 r1->tkey = false;                                            \
    2365             :                                                         }                                                                                       \
    2366             :                                                 }                                                                                               \
    2367             :                                         }                                                                                                       \
    2368             :                                         APPEND(r1, lo);                                                                         \
    2369             :                                         if (r2)                                                                                         \
    2370             :                                                 APPEND(r2, ro);                                                                 \
    2371             :                                         lastl = lo;                                                                                     \
    2372             :                                         nl++;                                                                                           \
    2373             :                                 }                                                                                                               \
    2374             :                                 re_like_clean(&re, &wpat);                                                              \
    2375             :                                 pcre_clean(&pcrere, &pcreex);                                                   \
    2376             :                         }                                                                                                                       \
    2377             :                         if (r2) {                                                                                                       \
    2378             :                                 if (nl > 1) {                                                                                        \
    2379             :                                         r2->tkey = false;                                                                    \
    2380             :                                         r2->tseqbase = oid_nil;                                                              \
    2381             :                                         r1->trevsorted = false;                                                              \
    2382             :                                 } else if (nl == 0) {                                                                   \
    2383             :                                         rskipped = BATcount(r2) > 0;                                         \
    2384             :                                 } else if (rskipped) {                                                                  \
    2385             :                                         r2->tseqbase = oid_nil;                                                              \
    2386             :                                 }                                                                                                               \
    2387             :                         } else if (nl > 1) {                                                                         \
    2388             :                                 r1->trevsorted = false;                                                                      \
    2389             :                         }                                                                                                                       \
    2390             :                 }                                                                                                                               \
    2391             :         } while (0)
    2392             : 
    2393             : static char *
    2394          43 : pcrejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, const char *esc,
    2395             :                  bit caseignore, bit anti)
    2396             : {
    2397          43 :         struct canditer lci, rci;
    2398          43 :         const char *lvals, *rvals, *lvars, *rvars, *vl, *vr;
    2399          43 :         int rskipped = 0;                       /* whether we skipped values in r */
    2400          43 :         oid lbase, rbase, lo, ro, lastl = 0;    /* last value inserted into r1 */
    2401          43 :         BUN nl, newcap;
    2402          43 :         char *pcrepat = NULL, *msg = MAL_SUCCEED;
    2403          43 :         struct RE *re = NULL;
    2404          43 :         bool use_re = false,
    2405          43 :                 use_strcmp = false,
    2406          43 :                 empty = false,
    2407          43 :                 ascii_pattern = false;
    2408          43 :         uint32_t *wpat = NULL;
    2409             : #ifdef HAVE_LIBPCRE
    2410          43 :         pcre *pcrere = NULL;
    2411          43 :         pcre_extra *pcreex = NULL;
    2412             : #else
    2413             :         regex_t pcrere = (regex_t) { 0 };
    2414             :         void *pcreex = NULL;
    2415             : #endif
    2416             : 
    2417          43 :         size_t counter = 0;
    2418          43 :         lng timeoffset = 0;
    2419          43 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
    2420          43 :         if (qry_ctx != NULL) {
    2421          43 :                 timeoffset = (qry_ctx->starttime
    2422          43 :                                           && qry_ctx->querytimeout) ? (qry_ctx->starttime +
    2423          43 :                                                                                                    qry_ctx->querytimeout) : 0;
    2424             :         }
    2425             : 
    2426          43 :         TRC_DEBUG(ALGO,
    2427             :                           "pcrejoin(l=%s#" BUNFMT "[%s]%s%s,"
    2428             :                           "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
    2429             :                           "sr=%s#" BUNFMT "%s%s)\n",
    2430             :                           BATgetId(l), BATcount(l), ATOMname(l->ttype),
    2431             :                           l->tsorted ? "-sorted" : "",
    2432             :                           l->trevsorted ? "-revsorted" : "",
    2433             :                           BATgetId(r), BATcount(r), ATOMname(r->ttype),
    2434             :                           r->tsorted ? "-sorted" : "",
    2435             :                           r->trevsorted ? "-revsorted" : "",
    2436             :                           sl ? BATgetId(sl) : "NULL", sl ? BATcount(sl) : 0,
    2437             :                           sl && sl->tsorted ? "-sorted" : "",
    2438             :                           sl && sl->trevsorted ? "-revsorted" : "",
    2439             :                           sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
    2440             :                           sr && sr->tsorted ? "-sorted" : "",
    2441             :                           sr && sr->trevsorted ? "-revsorted" : "");
    2442             : 
    2443         129 :         assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
    2444          43 :         assert(ATOMtype(l->ttype) == TYPE_str);
    2445             : 
    2446          43 :         canditer_init(&lci, l, sl);
    2447          43 :         canditer_init(&rci, r, sr);
    2448             : 
    2449          43 :         BATiter li = bat_iterator(l);
    2450          43 :         BATiter ri = bat_iterator(r);
    2451          43 :         lbase = l->hseqbase;
    2452          43 :         rbase = r->hseqbase;
    2453          43 :         lvals = (const char *) li.base;
    2454          43 :         rvals = (const char *) ri.base;
    2455          43 :         assert(ri.vh && r->ttype);
    2456          43 :         lvars = li.vh->base;
    2457          43 :         rvars = ri.vh->base;
    2458             : 
    2459          43 :         r1->tkey = true;
    2460          43 :         r1->tsorted = true;
    2461          43 :         r1->trevsorted = true;
    2462          43 :         r1->tnil = false;
    2463          43 :         r1->tnonil = true;
    2464          43 :         if (r2) {
    2465          26 :                 r2->tkey = true;
    2466          26 :                 r2->tsorted = true;
    2467          26 :                 r2->trevsorted = true;
    2468          26 :                 r2->tnil = false;
    2469          26 :                 r2->tnonil = true;
    2470             :         }
    2471             : 
    2472          43 :         if (anti) {
    2473          23 :                 if (caseignore) {
    2474         123 :                         pcre_join_loop(ascii_pattern ? istrcmp(vl, vr) == 0 : mywstrcasecmp(vl, wpat) == 0,
    2475             :                                                    re_match_ignore(vl, re), !PCRE_EXEC_COND);
    2476             :                 } else {
    2477         326 :                         pcre_join_loop(strcmp(vl, vr) == 0, re_match_no_ignore(vl, re), !PCRE_EXEC_COND);
    2478             :                 }
    2479             :         } else {
    2480          20 :                 if (caseignore) {
    2481           5 :                         pcre_join_loop(ascii_pattern ? istrcmp(vl, vr) != 0 : mywstrcasecmp(vl, wpat) != 0,
    2482             :                                                    !re_match_ignore(vl, re), PCRE_EXEC_COND);
    2483             :                 } else {
    2484         381 :                         pcre_join_loop(strcmp(vl, vr) != 0, !re_match_no_ignore(vl, re), PCRE_EXEC_COND);
    2485             :                 }
    2486             :         }
    2487          43 :         bat_iterator_end(&li);
    2488          43 :         bat_iterator_end(&ri);
    2489             : 
    2490          43 :         assert(!r2 || BATcount(r1) == BATcount(r2));
    2491             :         /* also set other bits of heap to correct value to indicate size */
    2492          43 :         BATsetcount(r1, BATcount(r1));
    2493          43 :         if (r2)
    2494          26 :                 BATsetcount(r2, BATcount(r2));
    2495          43 :         if (BATcount(r1) > 0) {
    2496          30 :                 if (BATtdense(r1))
    2497           7 :                         r1->tseqbase = ((oid *) r1->theap->base)[0];
    2498          30 :                 if (r2 && BATtdense(r2))
    2499          14 :                         r2->tseqbase = ((oid *) r2->theap->base)[0];
    2500             :         } else {
    2501          13 :                 r1->tseqbase = 0;
    2502          13 :                 if (r2)
    2503           6 :                         r2->tseqbase = 0;
    2504             :         }
    2505          20 :         if (r2)
    2506          26 :                 TRC_DEBUG(ALGO,
    2507             :                                   "pcrejoin(l=%s,r=%s)=(%s#" BUNFMT "%s%s,%s#" BUNFMT "%s%s\n",
    2508             :                                   BATgetId(l), BATgetId(r),
    2509             :                                   BATgetId(r1), BATcount(r1),
    2510             :                                   r1->tsorted ? "-sorted" : "",
    2511             :                                   r1->trevsorted ? "-revsorted" : "",
    2512             :                                   BATgetId(r2), BATcount(r2),
    2513             :                                   r2->tsorted ? "-sorted" : "",
    2514             :                                   r2->trevsorted ? "-revsorted" : "");
    2515             :         else
    2516          17 :                 TRC_DEBUG(ALGO,
    2517             :                                   "pcrejoin(l=%s,r=%s)=(%s#" BUNFMT "%s%s\n",
    2518             :                                   BATgetId(l), BATgetId(r),
    2519             :                                   BATgetId(r1), BATcount(r1),
    2520             :                                   r1->tsorted ? "-sorted" : "",
    2521             :                                   r1->trevsorted ? "-revsorted" : "");
    2522             :         return MAL_SUCCEED;
    2523             : 
    2524           0 :   bailout:
    2525           0 :         bat_iterator_end(&li);
    2526           0 :         bat_iterator_end(&ri);
    2527           0 :         GDKfree(pcrepat);
    2528           0 :         re_like_clean(&re, &wpat);
    2529           0 :         pcre_clean(&pcrere, &pcreex);
    2530           0 :         assert(msg != MAL_SUCCEED);
    2531             :         return msg;
    2532             : }
    2533             : 
    2534             : static str
    2535          43 : PCREjoin(bat *r1, bat *r2, bat lid, bat rid, bat slid, bat srid, bat elid,
    2536             :                  bat ciid, bit anti)
    2537             : {
                               /*
                                * BAT-id level front end for pcrejoin().
                                *
                                * Loads the BATs identified by lid (left input), rid (right
                                * input), elid (escape) and ciid (case-ignore flag), plus the
                                * candidate lists slid/srid when those ids are not bat-nil,
                                * creates one or two transient oid result BATs and passes
                                * everything, together with anti, to pcrejoin().  r2 may be
                                * NULL, in which case only one result BAT is created and
                                * returned.
                                *
                                * On success the result BAT ids are stored in *r1 (and *r2)
                                * and MAL_SUCCEED is returned.  On failure every BAT obtained
                                * so far is released and either the pending message or an
                                * HY002 RUNTIME_OBJECT_MISSING exception is returned.
                                */
    2538          43 :         BAT *left = NULL, *right = NULL, *escape = NULL, *caseignore = NULL,
    2539          43 :                 *candleft = NULL, *candright = NULL;
    2540          43 :         BAT *result1 = NULL, *result2 = NULL;
    2541          43 :         char *msg = MAL_SUCCEED;
    2542          43 :         const char *esc = "";
    2543          43 :         bit ci;
    2544          43 :         BATiter bi;
    2545             : 
                               /* A failed lookup jumps to fail with msg still unset, so the
                                * fail path raises the generic HY002 exception for it. */
    2546          43 :         if ((left = BATdescriptor(lid)) == NULL)
    2547           0 :                 goto fail;
    2548          43 :         if ((right = BATdescriptor(rid)) == NULL)
    2549           0 :                 goto fail;
    2550          43 :         if ((escape = BATdescriptor(elid)) == NULL)
    2551           0 :                 goto fail;
    2552          43 :         if ((caseignore = BATdescriptor(ciid)) == NULL)
    2553           0 :                 goto fail;
                               /* Candidate lists are optional: they stay NULL when their id
                                * is bat-nil. */
    2554          43 :         if (!is_bat_nil(slid) && (candleft = BATdescriptor(slid)) == NULL)
    2555           0 :                 goto fail;
    2556          43 :         if (!is_bat_nil(srid) && (candright = BATdescriptor(srid)) == NULL)
    2557           0 :                 goto fail;
                               /* Both results get the left input's count as their size
                                * argument; the second one only exists when r2 was given. */
    2558          43 :         result1 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
    2559          43 :         if (r2)
    2560          26 :                 result2 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
    2561          43 :         if (!result1 || (r2 && !result2)) {
    2562           0 :                 msg = createException(MAL, "pcre.join",
    2563             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    2564           0 :                 goto fail;
    2565             :         }
                               /* Initial properties for the still-empty results; pcrejoin()
                                * assigns the same flags again before running its join loop. */
    2566          43 :         result1->tnil = false;
    2567          43 :         result1->tnonil = true;
    2568          43 :         result1->tkey = true;
    2569          43 :         result1->tsorted = true;
    2570          43 :         result1->trevsorted = true;
    2571          43 :         result1->tseqbase = 0;
    2572          43 :         if (r2) {
    2573          26 :                 result2->tnil = false;
    2574          26 :                 result2->tnonil = true;
    2575          26 :                 result2->tkey = true;
    2576          26 :                 result2->tsorted = true;
    2577          26 :                 result2->trevsorted = true;
    2578          26 :                 result2->tseqbase = 0;
    2579             :         }
                               /* Escape and case-ignore arrive as BATs but must each hold
                                * exactly one value, which then applies to the whole join. */
    2580          43 :         if (BATcount(escape) != 1) {
    2581           0 :                 msg = createException(MAL, "pcre.join",
    2582             :                                                           SQLSTATE(42000)
    2583             :                                                           "At the moment, only one value is allowed for the escape input at pcre join");
    2584           0 :                 goto fail;
    2585             :         }
    2586          43 :         if (BATcount(caseignore) != 1) {
    2587           0 :                 msg = createException(MAL, "pcre.join",
    2588             :                                                           SQLSTATE(42000)
    2589             :                                                           "At the moment, only one value is allowed for the case ignore input at pcre join");
    2590           0 :                 goto fail;
    2591             :         }
                               /* The flag is copied out, so its iterator can be closed
                                * right away. */
    2592          43 :         bi = bat_iterator(caseignore);
    2593          43 :         ci = *(bit *) BUNtloc(bi, 0);
    2594          43 :         bat_iterator_end(&bi);
                               /* esc is read through bi and the iterator on escape is only
                                * ended after pcrejoin() has returned -- presumably because
                                * esc points into the BAT's own storage (TODO confirm). */
    2595          43 :         bi = bat_iterator(escape);
    2596          43 :         esc = BUNtvar(bi, 0);
    2597          43 :         msg = pcrejoin(result1, result2, left, right, candleft, candright, esc, ci,
    2598             :                                    anti);
    2599          43 :         bat_iterator_end(&bi);
    2600          43 :         if (msg)
    2601           0 :                 goto fail;
                               /* Success: publish the result ids and release the inputs.
                                * candleft/candright can still be NULL here, so this relies
                                * on BBPreclaim accepting a NULL pointer. */
    2602          43 :         *r1 = result1->batCacheid;
    2603          43 :         BBPkeepref(result1);
    2604          43 :         if (r2) {
    2605          26 :                 *r2 = result2->batCacheid;
    2606          26 :                 BBPkeepref(result2);
    2607             :         }
    2608          43 :         BBPunfix(left->batCacheid);
    2609          43 :         BBPunfix(right->batCacheid);
    2610          43 :         BBPreclaim(escape);
    2611          43 :         BBPreclaim(caseignore);
    2612          43 :         BBPreclaim(candleft);
    2613          43 :         BBPreclaim(candright);
    2614             :         return MAL_SUCCEED;
    2615             : 
                               /* Shared cleanup: pointers that were never assigned are still
                                * NULL at this point.  A message set by an earlier step wins;
                                * otherwise the failure was a BAT lookup and HY002 is thrown. */
    2616           0 :   fail:
    2617           0 :         BBPreclaim(left);
    2618           0 :         BBPreclaim(right);
    2619           0 :         BBPreclaim(escape);
    2620           0 :         BBPreclaim(caseignore);
    2621           0 :         BBPreclaim(candleft);
    2622           0 :         BBPreclaim(candright);
    2623           0 :         BBPreclaim(result1);
    2624           0 :         BBPreclaim(result2);
    2625           0 :         if (msg)
    2626             :                 return msg;
    2627           0 :         throw(MAL, "pcre.join", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    2628             : }
    2629             : 
    2630             : static str
    2631          26 : LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *elid,
    2632             :                  const bat *cid, const bat *slid, const bat *srid,
    2633             :                  const bit *nil_matches, const lng *estimate, const bit *anti)
    2634             : {
                               /*
                                * Two-result entry point registered as algebra.likejoin.
                                *
                                * Dereferences the pointer-style arguments and forwards them
                                * to PCREjoin(), which takes them in a different order
                                * (candidate ids before escape/case-ignore).  nil_matches and
                                * estimate are accepted but not used.  A NULL slid/srid is
                                * forwarded as bat id 0; PCREjoin() tests the id with
                                * is_bat_nil() before loading it -- presumably 0 counts as
                                * nil there (TODO confirm against the is_bat_nil definition).
                                * Returns whatever PCREjoin() returns.
                                */
    2635          26 :         (void) nil_matches;
    2636          26 :         (void) estimate;
    2637          26 :         return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
    2638          26 :                                         *elid, *cid, *anti);
    2639             : }
    2640             : 
    2641             : static str
    2642          17 : LIKEjoin1(bat *r1, const bat *lid, const bat *rid, const bat *elid,
    2643             :                   const bat *cid, const bat *slid, const bat *srid,
    2644             :                   const bit *nil_matches, const lng *estimate, const bit *anti)
    2645             : {
                               /*
                                * Single-result entry point registered as algebra.likejoin.
                                *
                                * Forwards to PCREjoin() with NULL for the second result
                                * pointer, so only *r1 is produced.  nil_matches and estimate
                                * are accepted but not used.  A NULL slid/srid is forwarded
                                * as bat id 0 -- presumably treated as "no candidate list" by
                                * the is_bat_nil() test in PCREjoin() (TODO confirm).
                                * Returns whatever PCREjoin() returns.
                                */
    2646          17 :         (void) nil_matches;
    2647          17 :         (void) estimate;
    2648          17 :         return PCREjoin(r1, NULL, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
    2649          17 :                                         *elid, *cid, *anti);
    2650             : }
    2651             : 
    2652             : #include "mel.h"
    2653             : mel_atom pcre_init_atoms[] = {  /* atom table passed to mal_module() by init_pcre_mal */
    2654             :  { .name="pcre", },  { .cmp=NULL }  /* one atom named "pcre"; the second entry looks like the end-of-table marker (TODO confirm against mel.h) */
    2655             : };
    2656             : mel_func pcre_init_funcs[] = {  /* function table passed to mal_module() by init_pcre_mal; each entry names a module, a function and its C implementation */
                        /* NOTE(review): args(r, n, ...) appears to carry the number of
                         * return values followed by the total number of listed
                         * arguments -- confirm against mel.h. */
    2657             :  command("pcre", "index", PCREindex, false, "match a pattern, return matched position (or 0 when not found)", args(1,3, arg("",int),arg("pat",pcre),arg("s",str))),
    2658             :  command("pcre", "match", PCREmatch, false, "Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
    2659             :  command("pcre", "imatch", PCREimatch, false, "Caseless Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
    2660             :  command("pcre", "patindex", PCREpatindex, false, "Location of the first POSIX pattern matching against a string", args(1,3, arg("",int),arg("pat",str),arg("s",str))),
                        /* NOTE(review): the two replace descriptions below first list
                         * the accepted flags as 'i', 'm', 's' and 'x', then also
                         * describe 'e' and never explain 'x'. */
    2661             :  command("pcre", "replace", PCREreplace_wrap, false, "Replace _all_ matches of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
    2662             :  command("pcre", "replace_first", PCREreplacefirst_wrap, false, "Replace _the first_ match of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
    2663             :  command("pcre", "pcre_quote", PCREquote, false, "Return a PCRE pattern string that matches the argument exactly.", args(1,2, arg("",str),arg("s",str))),
    2664             :  command("pcre", "sql2pcre", PCREsql2pcre, false, "Convert a SQL like pattern with the given escape character into a PCRE pattern.", args(1,3, arg("",str),arg("pat",str),arg("esc",str))),
                        /* str.replace shares its implementation with pcre.replace. */
    2665             :  command("str", "replace", PCREreplace_wrap, false, "", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
    2666             :  command("batpcre", "replace", PCREreplace_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
    2667             :  command("batpcre", "replace_first", PCREreplacefirst_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
    2668             :  command("algebra", "like", PCRElike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2669             :  command("algebra", "not_like", PCREnotlike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
                        /* batalgebra.like / not_like are each registered three times:
                         * BAT string with scalar pattern, scalar string with BAT
                         * pattern, and BAT string with BAT pattern. */
    2670             :  pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2671             :  pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2672             :  pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2673             :  pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2674             :  pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2675             :  pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2676             :  command("algebra", "likeselect", PCRElikeselect, false, "Select all head values of the first input BAT for which the\ntail value is \"like\" the given (SQL-style) pattern and for\nwhich the head value occurs in the tail of the second input\nBAT.\nInput is a dense-headed BAT, output is a dense-headed BAT with in\nthe tail the head value of the input BAT for which the\nrelationship holds.  The output BAT is sorted on the tail value.", args(1,7, batarg("",oid),batarg("b",str),batarg("s",oid),arg("pat",str),arg("esc",str),arg("caseignore",bit),arg("anti",bit))),
                        /* NOTE(review): the description below says "case sensitive
                         * match", yet the signature takes a caseignore BAT that
                         * PCREjoin() reads and passes on to pcrejoin(). */
    2677             :  command("algebra", "likejoin", LIKEjoin, false, "Join the string bat L with the pattern bat R\nwith optional candidate lists SL and SR using pattern escape string ESC\nand doing a case sensitive match.\nThe result is two aligned bats with oids of matching rows.", args(2,11, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
                        /* NOTE(review): the description below refers to "LIKEjoin_esc",
                         * a name that is not registered in this table; the two-result
                         * variant above is implemented by LIKEjoin. */
    2678             :  command("algebra", "likejoin", LIKEjoin1, false, "The same as LIKEjoin_esc, but only produce one output", args(1,10,batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
    2679             :  { .imp=NULL }  /* entry without an implementation; looks like the end-of-table marker (TODO confirm against mel.h) */
    2680             : };
    2681             : #include "mal_import.h"
    2682             : #ifdef _MSC_VER  /* MSVC-only preamble for the startup function below */
                        /* `read` is used as the section attribute in the pragma, so any
                         * macro of that name is removed first. */
    2683             : #undef read
                        /* NOTE(review): .CRT$XCU is presumably the section the MSVC
                         * runtime scans for initializers, used by LIB_STARTUP_FUNC --
                         * confirm against the macro's definition. */
    2684             : #pragma section(".CRT$XCU",read)
    2685             : #endif
                        /* Startup hook: registers module "pcre" with its atom and
                         * function tables through mal_module().  LIB_STARTUP_FUNC is
                         * defined elsewhere; presumably it arranges for this body to run
                         * when the library is loaded (TODO confirm). */
    2686         329 : LIB_STARTUP_FUNC(init_pcre_mal)
    2687         329 : { mal_module("pcre", pcre_init_atoms, pcre_init_funcs); }

Generated by: LCOV version 1.14