Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * N. Nes
15 : * PCRE library interface
16 : * The PCRE library is a set of functions that implement regular
17 : * expression pattern matching using the same syntax and semantics as Perl,
18 : * with just a few differences. The current implementation of PCRE
19 : * (release 4.x) corresponds approximately with Perl 5.8, including support
20 : * for UTF-8 encoded strings. However, this support has to be
21 : * explicitly enabled; it is not the default.
22 : *
23 : * ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
24 : */
25 : #include "monetdb_config.h"
26 : #include <string.h>
27 :
28 : #include "mal.h"
29 : #include "mal_client.h"
30 : #include "mal_interpreter.h"
31 : #include "mal_exception.h"
32 :
33 : #include <wchar.h>
34 : #include <wctype.h>
35 :
36 : #ifdef HAVE_LIBPCRE
37 : #include <pcre.h>
38 : #ifndef PCRE_STUDY_JIT_COMPILE
39 : /* old library version on e.g. EPEL 6 */
40 : #define pcre_free_study(x) pcre_free(x)
41 : #define PCRE_STUDY_JIT_COMPILE 0
42 : #endif
43 : #define JIT_COMPILE_MIN 1024 /* when to try JIT compilation of patterns */
44 :
45 : #else
46 :
47 : #include <regex.h>
48 :
49 : typedef regex_t pcre;
50 : #endif
51 :
52 : /* current implementation assumes simple %keyword% [keyw%]* */
/* One fragment of a LIKE pattern.  re_create splits a pattern at its
 * unescaped '%' wildcards and chains the fragments through `n`; the
 * re_match_* functions walk that chain.  Only one of `k` and `w` is set
 * for a given list. */
53 : struct RE {
54 : char *k; /* fragment as a NUL-terminated byte string (byte-wise matching) */
55 : uint32_t *w; /* fragment as a 0-terminated array of code points (case-insensitive, non-ASCII pattern) */
56 : bool search:1, atend:1, is_ascii:1, case_ignore:1; /* search: a '%' precedes the fragment; atend: no '%' follows it; is_ascii/case_ignore: copied from the re_create arguments */
57 : size_t len; /* number of units stored in the fragment: bytes for k, code points for w */
58 : struct RE *n; /* next fragment, NULL for the last one */
59 : };
60 :
61 : /* We cannot use strcasecmp and strncasecmp since they work byte for
62 : * byte and don't deal with multibyte encodings (such as UTF-8).
63 : *
64 : * We implement our own conversion from UTF-8 encoding to Unicode code
65 : * points which we store in uint32_t. The reason for this is,
66 : * functions like mbsrtowcs are locale-dependent (so we need a UTF-8
67 : * locale to use them), and on Windows, wchar_t is only 2 bytes and
68 : * therefore cannot hold all Unicode code points. We do use functions
69 : * such as towlower to convert a Unicode code point to its lower-case
70 : * equivalent, but again on Windows, if the code point doesn't fit in
71 : * 2 bytes, we skip this conversion and compare the unconverted code
72 : * points.
73 : *
74 : * Note, towlower is also locale-dependent, but we don't need a UTF-8
75 : * locale in order to use it. */
76 :
/* Decode the single UTF-8 encoded character that starts at src into the
 * code point *dest.
 *
 * Returns the number of bytes consumed (1..4), 0 when src points at the
 * terminating NUL byte (*dest is then set to 0), or (size_t) -1 when the
 * bytes do not form a valid UTF-8 sequence (*dest is left untouched).
 *
 * Rejected as invalid: stray continuation bytes, truncated sequences,
 * overlong encodings, code points above U+10FFFF, and the UTF-16
 * surrogate range U+D800..U+DFFF.  Surrogates can only be spelled with
 * the three-byte form, so that is where they are checked. */
static size_t
utfc8touc(uint32_t *restrict dest, const char *restrict src)
{
	if ((src[0] & 0x80) == 0) {
		/* plain ASCII, including the NUL terminator */
		*dest = src[0];
		return src[0] != 0;
	} else if ((src[0] & 0xE0) == 0xC0
			   && (src[1] & 0xC0) == 0x80 && (src[0] & 0x1E) != 0) {
		/* two bytes; (src[0] & 0x1E) != 0 rules out overlong forms */
		*dest = (src[0] & 0x1F) << 6 | (src[1] & 0x3F);
		return 2;
	} else if ((src[0] & 0xF0) == 0xE0
			   && (src[1] & 0xC0) == 0x80
			   && (src[2] & 0xC0) == 0x80
			   && ((src[0] & 0x0F) != 0 || (src[1] & 0x20) != 0)) {
		/* three bytes; the last condition rules out overlong forms */
		uint32_t c = (src[0] & 0x0F) << 12
			| (src[1] & 0x3F) << 6 | (src[2] & 0x3F);
		if ((c & 0xF800) == 0xD800)
			return (size_t) -1;	/* UTF-16 surrogate, not a character */
		*dest = c;
		return 3;
	} else if ((src[0] & 0xF8) == 0xF0
			   && (src[1] & 0xC0) == 0x80
			   && (src[2] & 0xC0) == 0x80 && (src[3] & 0xC0) == 0x80) {
		uint32_t c = (src[0] & 0x07) << 18
			| (src[1] & 0x3F) << 12
			| (src[2] & 0x3F) << 6 | (src[3] & 0x3F);
		/* overlong encoding or beyond the Unicode range */
		if (c < 0x10000 || c > 0x10FFFF)
			return (size_t) -1;
		*dest = c;
		return 4;
	}
	return (size_t) -1;
}
108 :
/* Convert the NUL-terminated UTF-8 string src into a freshly allocated,
 * 0-terminated array of code points.
 *
 * Returns the array (allocated with GDKmalloc), or NULL when src is not
 * valid UTF-8 or the allocation fails.  Rejected as invalid: stray
 * continuation bytes, truncated sequences, overlong encodings, code
 * points above U+10FFFF, and the UTF-16 surrogates U+D800..U+DFFF. */
static uint32_t *
utf8stoucs(const char *src)
{
	uint32_t *dest;
	size_t i = 0;
	size_t j = 0;

	/* count how many uint32_t's we need, while also checking for
	 * correctness of the input */
	while (src[j]) {
		i++;
		if ((src[j + 0] & 0x80) == 0) {
			j += 1;
		} else if ((src[j + 0] & 0xE0) == 0xC0
				   && (src[j + 1] & 0xC0) == 0x80 && (src[j + 0] & 0x1E) != 0) {
			j += 2;
		} else if ((src[j + 0] & 0xF0) == 0xE0
				   && (src[j + 1] & 0xC0) == 0x80
				   && (src[j + 2] & 0xC0) == 0x80
				   && ((src[j + 0] & 0x0F) != 0 || (src[j + 1] & 0x20) != 0)) {
			/* lead byte 0xED followed by 0xA0..0xBF encodes a UTF-16
			 * surrogate (U+D800..U+DFFF), which is not a character */
			if ((src[j + 0] & 0x0F) == 0x0D && (src[j + 1] & 0x20) != 0)
				return NULL;
			j += 3;
		} else if ((src[j + 0] & 0xF8) == 0xF0
				   && (src[j + 1] & 0xC0) == 0x80
				   && (src[j + 2] & 0xC0) == 0x80
				   && (src[j + 3] & 0xC0) == 0x80) {
			uint32_t c = (src[j + 0] & 0x07) << 18
				| (src[j + 1] & 0x3F) << 12
				| (src[j + 2] & 0x3F) << 6 | (src[j + 3] & 0x3F);
			/* overlong encoding or beyond the Unicode range */
			if (c < 0x10000 || c > 0x10FFFF)
				return NULL;
			j += 4;
		} else {
			return NULL;
		}
	}
	dest = GDKmalloc((i + 1) * sizeof(uint32_t));
	if (dest == NULL)
		return NULL;
	/* go through the source string again, this time we can skip
	 * the correctness tests */
	i = j = 0;
	while (src[j]) {
		if ((src[j + 0] & 0x80) == 0) {
			dest[i++] = src[j + 0];
			j += 1;
		} else if ((src[j + 0] & 0xE0) == 0xC0) {
			dest[i++] = (src[j + 0] & 0x1F) << 6 | (src[j + 1] & 0x3F);
			j += 2;
		} else if ((src[j + 0] & 0xF0) == 0xE0) {
			dest[i++] = (src[j + 0] & 0x0F) << 12
				| (src[j + 1] & 0x3F) << 6 | (src[j + 2] & 0x3F);
			j += 3;
		} else if ((src[j + 0] & 0xF8) == 0xF0) {
			dest[i++] = (src[j + 0] & 0x07) << 18
				| (src[j + 1] & 0x3F) << 12
				| (src[j + 2] & 0x3F) << 6 | (src[j + 3] & 0x3F);
			j += 4;
		}
	}
	dest[i] = 0;
	return dest;
}
173 :
/* Number of code points in the 0-terminated array ucs, not counting
 * the terminator. */
static size_t
myucslen(const uint32_t *ucs)
{
	const uint32_t *end = ucs;

	while (*end != 0)
		end++;
	return (size_t) (end - ucs);
}
183 :
/* Case-insensitively compare the first n2 characters of the UTF-8
 * string s1 with the code point array s2.
 *
 * Returns true when all n2 characters are equal and, if atend is set,
 * s1 ends right after them.  When s1 ends (or turns invalid) early, the
 * result is true only if s2 ends at that same position. */
static inline bool
mywstrncaseeq(const char *restrict s1, const uint32_t *restrict s2, size_t n2,
			  bool atend)
{
	uint32_t cp;

	for (size_t left = n2; left > 0; left--) {
		const size_t width = utfc8touc(&cp, s1);
		bool same;

		if (width == 0 || width == (size_t) -1)
			return *s2 == 0;
		if (*s2 == 0)
			return false;
#if SIZEOF_WCHAR_T == 2
		/* towlower cannot take values that do not fit in wchar_t */
		if (cp > 0xFFFF || *s2 > 0xFFFF)
			same = cp == *s2;
		else
#endif
			same = towlower((wint_t) cp) == towlower((wint_t) *s2);
		if (!same)
			return false;
		s1 += width;
		s2++;
	}
	if (atend)
		return *s1 == 0;
	return true;
}
210 :
/* Case-insensitive comparison of two UTF-8 strings, character by
 * character.  Returns 0 when equal, a negative value when s1 sorts
 * first and a positive value when s2 sorts first.  An invalid byte
 * sequence is treated as the end of the string it occurs in. */
static inline int
mystrcasecmp(const char *s1, const char *s2)
{
	uint32_t cp1 = 0, cp2 = 0;

	while (true) {
		const size_t used1 = utfc8touc(&cp1, s1);
		const size_t used2 = utfc8touc(&cp2, s2);
		const bool end1 = used1 == 0 || used1 == (size_t) -1;
		const bool end2 = used2 == 0 || used2 == (size_t) -1;

		if (end1)
			return end2 ? 0 : -1;
		if (end2)
			return 1;
#if SIZEOF_WCHAR_T == 2
		/* towlower cannot take values that do not fit in wchar_t */
		if (cp1 > 0xFFFF || cp2 > 0xFFFF) {
			if (cp1 != cp2)
				return cp1 - cp2;
		} else
#endif
		{
			const wint_t low1 = towlower((wint_t) cp1);
			const wint_t low2 = towlower((wint_t) cp2);

			if (low1 != low2)
				return low1 - low2;
		}
		s1 += used1;
		s2 += used2;
	}
}
235 :
/* Case-insensitive comparison of the UTF-8 string s1 with the
 * 0-terminated code point array s2.  Returns 0 when equal, a negative
 * value when s1 sorts first and a positive value when s2 sorts first.
 * An invalid byte sequence in s1 is treated as its end. */
static inline int
mywstrcasecmp(const char *restrict s1, const uint32_t *restrict s2)
{
	uint32_t cp = 0;

	while (true) {
		const size_t used = utfc8touc(&cp, s1);

		if (used == 0 || used == (size_t) -1)
			return (*s2 != 0) ? -1 : 0;
		if (*s2 == 0)
			return 1;
#if SIZEOF_WCHAR_T == 2
		/* towlower cannot take values that do not fit in wchar_t */
		if (cp > 0xFFFF || *s2 > 0xFFFF) {
			if (cp != *s2)
				return cp - *s2;
		} else
#endif
		{
			const wint_t low1 = towlower((wint_t) cp);
			const wint_t low2 = towlower((wint_t) *s2);

			if (low1 != low2)
				return low1 - low2;
		}
		s1 += used;
		s2++;
	}
}
259 :
/* Case-insensitive search for the code point array wneedle inside the
 * UTF-8 string haystack.
 *
 * Returns a pointer to the first match in haystack, or NULL when there
 * is none or an invalid byte sequence is hit.  With atend set, only a
 * match that reaches the end of haystack counts.  An empty needle
 * matches at the start (or at the end, when atend is set). */
static inline const char *
mywstrcasestr(const char *restrict haystack, const uint32_t *restrict wneedle,
			  bool atend)
{
	const size_t needle_len = myucslen(wneedle);

	if (needle_len == 0) {
		if (atend)
			return haystack + strlen(haystack);
		return haystack;
	}

	while (*haystack != '\0') {
		size_t matched = 0;		/* needle characters matched so far */
		size_t bytes = 0;		/* haystack bytes those characters occupy */
		size_t first_width = 0;	/* byte width of the character at haystack */

		while (matched < needle_len) {
			uint32_t cp = 0;
			const size_t width = utfc8touc(&cp, haystack + bytes);

			if (width == 0 || width == (size_t) -1)
				return NULL;
			if (matched == 0)
				first_width = width;
#if SIZEOF_WCHAR_T == 2
			/* towlower cannot take values that do not fit in wchar_t */
			if (cp > 0xFFFF || wneedle[matched] > 0xFFFF) {
				if (cp != wneedle[matched])
					break;
			} else
#endif
			if (towlower((wint_t) cp) != towlower((wint_t) wneedle[matched]))
				break;
			bytes += width;
			matched++;
		}
		if (matched == needle_len && (!atend || haystack[bytes] == '\0'))
			return haystack;
		/* retry one whole character further on */
		haystack += first_width;
	}
	return NULL;
}
297 :
/* Returns true if the pattern contains no unescaped `_' (single
 * character match).  A single leading `%' is skipped before scanning,
 * even when `%' is also the escape character.  A NULL pattern yields
 * false. */
static inline bool
re_simple(const char *pat, unsigned char esc)
{
	if (pat == NULL)
		return false;

	const char *cur = (pat[0] == '%') ? pat + 1 : pat;
	bool skip_next = false;		/* previous character was the escape */

	for (; *cur != '\0'; cur++) {
		if (skip_next) {
			skip_next = false;
			continue;
		}
		if ((unsigned char) *cur == esc) {
			skip_next = true;
			continue;
		}
		if (*cur == '_')
			return false;
	}
	return true;
}
323 :
/* Returns false when the pattern ends in a dangling escape character,
 * i.e. an escape with nothing after it to escape; true otherwise.  A
 * NULL pattern counts as properly escaped. */
static inline bool
re_is_pattern_properly_escaped(const char *pat, unsigned char esc)
{
	if (pat == NULL)
		return true;

	bool pending = false;		/* an escape is waiting for its character */

	for (const char *cur = pat; *cur != '\0'; cur++)
		pending = !pending && (unsigned char) *cur == esc;
	return !pending;
}
341 :
/* Returns true if the pattern can be compared as a plain string: it
 * contains neither wildcard character ('%' or '_') and the escape
 * string, when there is one, does not occur in it. */
static inline bool
is_strcmpable(const char *pat, const char *esc)
{
	if (strpbrk(pat, "%_") != NULL)
		return false;
	if (strlen(esc) == 0)
		return true;
	if (strNil(esc))
		return true;
	return strstr(pat, esc) == NULL;
}
352 :
/* Compare two strings byte by byte, folding only the ASCII letters
 * A-Z to lower case.  For two lower-case strings the result is the
 * same as that of strcmp: zero when equal, otherwise the difference
 * of the first pair of bytes that differ. */
static int
istrcmp(const char *s1, const char *s2)
{
	size_t pos = 0;

	while (s1[pos] != '\0' && s2[pos] != '\0') {
		char a = s1[pos];
		char b = s2[pos];

		if (a >= 'A' && a <= 'Z')
			a += 'a' - 'A';
		if (b >= 'A' && b <= 'Z')
			b += 'a' - 'A';
		if (a != b)
			return a - b;
		pos++;
	}
	/* at least one string ended here; the difference is 0 if both did */
	return s1[pos] - s2[pos];
}
380 :
/* Compare at most len bytes of two strings, folding only the ASCII
 * letters A-Z to lower case.  For two lower-case strings the result
 * is the same as that of strncmp. */
static int
istrncmp(const char *s1, const char *s2, size_t len)
{
	size_t pos;

	for (pos = 0; pos < len && s1[pos] != '\0' && s2[pos] != '\0'; pos++) {
		char a = s1[pos];
		char b = s2[pos];

		if (a >= 'A' && a <= 'Z')
			a += 'a' - 'A';
		if (b >= 'A' && b <= 'Z')
			b += 'a' - 'A';
		if (a != b)
			return a - b;
	}
	/* stopped before len bytes: one string ended, compare the tails */
	if (pos < len)
		return s1[pos] - s2[pos];
	return 0;
}
411 :
412 :
/* Find the first occurrence of the substring needle in haystack,
 * folding the ASCII letters A-Z of haystack to lower case.
 *
 * NOTE: This function assumes that the needle is already lowercase.
 *
 * Returns a pointer into haystack, or NULL when there is no match.
 * An empty haystack never matches, not even an empty needle.
 */
static const char *
istrstr(const char *haystack, const char *needle)
{
	for (const char *start = haystack; *start != '\0'; start++) {
		size_t k = 0;

		while (needle[k] != '\0' && start[k] != '\0') {
			char h = start[k];

			if (h >= 'A' && h <= 'Z')
				h = h - 'A' + 'a';
			if (h != needle[k])
				break;
			k++;
		}
		/* the whole needle was consumed: match */
		if (needle[k] == '\0')
			return start;
		/* The haystack ran out while needle characters remain; later
		 * start positions are shorter still, so none can match. */
		if (start[k] == '\0')
			return NULL;
	}
	return NULL;
}
451 :
452 : /* Match regular expression by comparing bytes.
453 : *
454 : * This is faster than re_match_ignore, because it does not
455 : * need to decode characters. This function should be used
456 : * in all cases except when we need to perform UTF-8
457 : * comparisons ignoring case.
458 : *
459 : * TODO: The name of the function is no longer accurate and
460 : * needs to change.
461 : */
462 : static inline bool
463 135083 : re_match_no_ignore(const char *restrict s, const struct RE *restrict pattern)
464 : {
465 135083 : const struct RE *r;
466 135083 : size_t l;
467 :
468 184916 : for (r = pattern; r; r = r->n) {
469 135834 : if (*r->k == 0 && (r->search || *s == 0))
470 : return true;
471 122028 : if (!*s ||
472 : (r->search
473 121955 : ? (r->atend
474 108567 : ? (r->case_ignore
475 4997 : ? (l = strlen(s)) < r->len || istrcmp(s + l - r->len, r->k) != 0
476 4913 : : (l = strlen(s)) < r->len || strcmp(s + l - r->len, r->k) != 0)
477 103570 : : (r->case_ignore ? (s = istrstr(s, r->k)) == NULL
478 97464 : : (s = strstr(s, r->k)) == NULL))
479 : : (r->atend
480 13388 : ? (r->case_ignore ? istrcmp(s, r->k) != 0
481 93 : : strcmp(s, r->k) != 0)
482 13295 : : (r->case_ignore ? istrncmp(s, r->k, r->len) != 0
483 13279 : : strncmp(s, r->k, r->len) != 0))))
484 : return false;
485 49833 : s += r->len;
486 : }
487 : return true;
488 : }
489 :
490 : /* Match a regular expression by comparing wide characters.
491 : *
492 : * This needs to be used when we need to perform a
493 : * case-ignoring comparions involving UTF-8 characters.
494 : */
495 : static inline bool
496 44 : re_match_ignore(const char *restrict s, const struct RE *restrict pattern)
497 : {
498 44 : const struct RE *r;
499 :
500 : /* Since the pattern is ascii, do the cheaper comparison */
501 44 : if (pattern->is_ascii) {
502 0 : return re_match_no_ignore(s, pattern);
503 : }
504 :
505 66 : for (r = pattern; r; r = r->n) {
506 47 : if (*r->w == 0 && (r->search || *s == 0))
507 : return true;
508 47 : if (!*s ||
509 : (r->search
510 47 : ? (s = mywstrcasestr(s, r->w, r->atend)) == NULL
511 14 : : !mywstrncaseeq(s, r->w, r->len, r->atend)))
512 : return false;
513 22 : s += r->len;
514 : }
515 : return true;
516 : }
517 :
518 : static void
519 7894 : re_destroy(struct RE *p)
520 : {
521 7894 : if (p) {
522 7894 : GDKfree(p->k);
523 7898 : GDKfree(p->w);
524 8029 : do {
525 8029 : struct RE *n = p->n;
526 :
527 8029 : GDKfree(p);
528 8031 : p = n;
529 8031 : } while (p);
530 : }
531 7900 : }
532 :
533 : /* Create a linked list of RE structures. Depending on the
534 : * caseignore and the ascii_pattern flags, the w
535 : * (if caseignore == true && ascii_pattern == false) or the k
536 : * (in every other case) field is used. These in the first
537 : * structure are allocated, whereas in all subsequent
538 : * structures the fields point into the allocated buffer of
539 : * the first.
 *
 * The pattern is split at every unescaped '%'; each fragment gets its
 * own node.  Escape characters are dropped while the fragment text is
 * compacted in place inside the buffer, and each fragment is
 * 0-terminated there.  A node has search set when a '%' precedes its
 * fragment and atend cleared when a '%' follows it.
 *
 * Returns the head of the list, or NULL when an allocation fails or
 * (wide-character case) utf8stoucs rejects the pattern.  The list is
 * released with re_destroy.
540 : */
541 : static struct RE *
542 7891 : re_create(const char *pat, bool caseignore, bool ascii_pattern, uint32_t esc)
543 : {
544 7891 : struct RE *r = GDKmalloc(sizeof(struct RE)), *n = r; /* r: head of the list, n: node being filled */
545 7897 : bool escaped = false; /* previous character was the escape character */
546 :
547 7897 : if (r == NULL)
548 : return NULL;
549 7897 : *r = (struct RE) {.atend = true };
550 :
551 14997 : while (esc != '%' && *pat == '%') { /* leading '%'s turn the first fragment into a search, unless '%' is the escape */
552 7100 : pat++; /* skip % */
553 7100 : r->search = true;
554 : }
555 7897 : if (caseignore && !ascii_pattern) { /* wide-character variant: fragments are code point arrays in w */
556 36 : uint32_t *wp; /* read position in the code point buffer */
557 36 : uint32_t *wq; /* write position; never ahead of wp */
558 36 : wp = utf8stoucs(pat);
559 36 : if (wp == NULL) {
560 0 : GDKfree(r);
561 0 : return NULL;
562 : }
563 36 : r->w = wp;
564 36 : wq = wp;
565 112 : while (*wp) {
566 76 : if (escaped) { /* escaped code point: copy literally, even '%' or '_' */
567 0 : *wq++ = *wp;
568 0 : n->len++;
569 0 : escaped = false;
570 76 : } else if (*wp == esc) {
571 : escaped = true;
572 76 : } else if (*wp == '%') { /* unescaped wildcard: the current fragment ends here */
573 28 : n->atend = false;
574 28 : while (wp[1] == '%') /* a run of '%' counts as one */
575 0 : wp++;
576 28 : if (wp[1]) { /* more pattern follows: start a new fragment after the '%' */
577 4 : n = n->n = GDKmalloc(sizeof(struct RE)); /* on failure the list ends in NULL, so bailout can free it */
578 4 : if (n == NULL)
579 0 : goto bailout;
580 4 : *n = (struct RE) {
581 : .search = true,
582 : .atend = true,
583 4 : .w = wp + 1,
584 : };
585 : }
586 28 : *wq = 0; /* terminate the fragment that just ended */
587 28 : wq = wp + 1; /* the next fragment is written from where it starts */
588 : } else {
589 48 : *wq++ = *wp;
590 48 : n->len++;
591 : }
592 76 : wp++;
593 : }
594 36 : *wq = 0;
595 : } else { /* byte variant: fragments are byte strings in k */
596 7861 : char *p, *q; /* p: read position, q: write position in the copy of pat */
597 7861 : if ((p = GDKstrdup(pat)) == NULL) {
598 0 : GDKfree(r);
599 0 : return NULL;
600 : }
601 7856 : if (ascii_pattern)
602 7853 : n->is_ascii = true;
603 7856 : if (caseignore)
604 94 : n->case_ignore = true;
605 :
606 94 : if (ascii_pattern && caseignore) { /* lower-case the whole copy once, up front */
607 991 : for (q = p; *q != 0; q++) {
608 896 : if ('A' <= *q && *q <= 'Z')
609 21 : *q += 'a' - 'A';
610 : }
611 : }
612 :
613 7856 : r->k = p;
614 7856 : q = p;
615 58761 : while (*p) {
616 50905 : if (escaped) { /* escaped byte: copy literally, even '%' or '_' */
617 158 : *q++ = *p;
618 158 : n->len++;
619 158 : escaped = false;
620 50747 : } else if ((unsigned char) *p == esc) {
621 : escaped = true;
622 50589 : } else if (*p == '%') { /* unescaped wildcard: the current fragment ends here */
623 7507 : n->atend = false;
624 7563 : while (p[1] == '%') /* a run of '%' counts as one */
625 56 : p++;
626 7507 : if (p[1]) { /* more pattern follows: start a new fragment after the '%' */
627 127 : n = n->n = GDKmalloc(sizeof(struct RE)); /* on failure the list ends in NULL, so bailout can free it */
628 127 : if (n == NULL)
629 0 : goto bailout;
630 127 : *n = (struct RE) {
631 : .search = true,
632 : .atend = true,
633 127 : .k = p + 1
634 : };
635 127 : if (ascii_pattern) {
636 124 : n->is_ascii = true;
637 : }
638 127 : if (caseignore) {
639 25 : n->case_ignore = true;
640 : }
641 : }
642 7507 : *q = 0; /* terminate the fragment that just ended */
643 7507 : q = p + 1; /* the next fragment is written from where it starts */
644 : } else {
645 43082 : char c = *p;
646 43082 : if (ascii_pattern && caseignore && 'A' <= c && c <= 'Z') { /* the copy was already lower-cased above, so this never applies */
647 0 : c += 'a' - 'A';
648 : }
649 43082 : *q++ = c;
650 43082 : n->len++;
651 : }
652 50905 : p++;
653 : }
654 7856 : *q = 0;
655 : }
656 : return r;
657 0 : bailout: /* allocation of a later node failed: release everything built so far */
658 0 : re_destroy(r);
659 0 : return NULL;
660 : }
661 :
662 : #ifdef HAVE_LIBPCRE
663 : static str
664 25 : pcre_compile_wrap(pcre **res, const char *pattern, bit insensitive)
665 : {
666 25 : pcre *r;
667 25 : const char *err_p = NULL;
668 25 : int errpos = 0;
669 25 : int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE;
670 25 : if (insensitive)
671 0 : options |= PCRE_CASELESS;
672 :
673 25 : if ((r = pcre_compile(pattern, options, &err_p, &errpos, NULL)) == NULL) {
674 0 : throw(MAL, "pcre.compile", OPERATION_FAILED
675 : " with\n'%s'\nat %d in\n'%s'.\n", err_p, errpos, pattern);
676 : }
677 25 : *res = r;
678 25 : return MAL_SUCCEED;
679 : }
680 : #endif
681 :
682 : /* maximum number of back references and quoted \ or $ in replacement string */
683 : #define MAX_NR_REFS 20
684 :
/* One `$n' / `\n' back reference, or one doubled `$$' / `\\', found in
 * a replacement string by parse_replacement. */
685 : struct backref {
686 : int idx; /* group number that followed the `$' or `\'; INT_MAX marks a doubled `$' or `\' */
687 : int start; /* offset in the replacement string of the `$' or `\' */
688 : int end; /* offset just past the reference; literal copying resumes here */
689 : };
690 :
691 : #ifdef HAVE_LIBPCRE
692 : /* fill in parameter backrefs (length maxrefs) with information about
693 : * back references in the replacement string; a back reference is a
694 : * dollar or backslash followed by a number */
695 : static int
696 78 : parse_replacement(const char *replacement, int len_replacement,
697 : struct backref *backrefs, int maxrefs)
698 : {
699 78 : int nbackrefs = 0;
700 :
701 126 : for (int i = 0; i < len_replacement && nbackrefs < maxrefs; i++) {
702 48 : if (replacement[i] == '$' || replacement[i] == '\\') {
703 6 : char *endptr;
704 6 : backrefs[nbackrefs].idx = strtol(replacement + i + 1, &endptr, 10);
705 6 : if (endptr > replacement + i + 1) {
706 6 : int k = (int) (endptr - (replacement + i + 1));
707 6 : backrefs[nbackrefs].start = i;
708 6 : backrefs[nbackrefs].end = i + k + 1;
709 6 : nbackrefs++;
710 0 : } else if (replacement[i] == replacement[i + 1]) {
711 : /* doubled $ or \, we must copy just one to the output */
712 0 : backrefs[nbackrefs].idx = INT_MAX; /* impossible value > 0 */
713 0 : backrefs[nbackrefs].start = i;
714 0 : backrefs[nbackrefs].end = i + 1;
715 0 : i++; /* don't look at second $ or \ again */
716 0 : nbackrefs++;
717 : }
718 : /* else: $ or \ followed by something we don't recognize,
719 : * so just leave it */
720 : }
721 : }
722 78 : return nbackrefs;
723 : }
724 :
725 : static char *
726 27892 : single_replace(pcre *pcre_code, pcre_extra *extra,
727 : const char *origin_str, int len_origin_str,
728 : int exec_options, int *ovector, int ovecsize,
729 : const char *replacement, int len_replacement,
730 : struct backref *backrefs, int nbackrefs,
731 : bool global, char *result, int *max_result)
732 : {
733 27892 : int offset = 0;
734 27892 : int len_result = 0;
735 104375 : int addlen;
736 104375 : char *tmp;
737 :
738 104375 : do {
739 104375 : int j = pcre_exec(pcre_code, extra, origin_str, len_origin_str, offset,
740 : exec_options, ovector, ovecsize);
741 104471 : if (j <= 0)
742 : break;
743 78653 : addlen = ovector[0] - offset + (nbackrefs == 0 ? len_replacement : 0);
744 78653 : if (len_result + addlen >= *max_result) {
745 6840 : tmp = GDKrealloc(result, len_result + addlen + 1);
746 6840 : if (tmp == NULL) {
747 0 : GDKfree(result);
748 0 : return NULL;
749 : }
750 6840 : result = tmp;
751 6840 : *max_result = len_result + addlen + 1;
752 : }
753 78653 : if (ovector[0] > offset) {
754 76482 : strncpy(result + len_result, origin_str + offset,
755 76482 : ovector[0] - offset);
756 76482 : len_result += ovector[0] - offset;
757 : }
758 78653 : if (nbackrefs == 0) {
759 76486 : strncpy(result + len_result, replacement, len_replacement);
760 76486 : len_result += len_replacement;
761 : } else {
762 : int prevend = 0;
763 4334 : for (int i = 0; i < nbackrefs; i++) {
764 2167 : int off, len;
765 2167 : if (backrefs[i].idx >= ovecsize / 3) {
766 : /* out of bounds, replace with empty string */
767 : off = 0;
768 : len = 0;
769 : } else {
770 2167 : off = ovector[backrefs[i].idx * 2];
771 2167 : len = ovector[backrefs[i].idx * 2 + 1] - off;
772 : }
773 2167 : addlen = backrefs[i].start - prevend + len;
774 2167 : if (len_result + addlen >= *max_result) {
775 21 : tmp = GDKrealloc(result, len_result + addlen + 1);
776 21 : if (tmp == NULL) {
777 0 : GDKfree(result);
778 0 : return NULL;
779 : }
780 21 : result = tmp;
781 21 : *max_result = len_result + addlen + 1;
782 : }
783 2167 : if (backrefs[i].start > prevend) {
784 2 : strncpy(result + len_result, replacement + prevend,
785 2 : backrefs[i].start - prevend);
786 2 : len_result += backrefs[i].start - prevend;
787 : }
788 2167 : if (len > 0) {
789 2167 : strncpy(result + len_result, origin_str + off, len);
790 2167 : len_result += len;
791 : }
792 2167 : prevend = backrefs[i].end;
793 : }
794 : /* copy rest of replacement string (after last backref) */
795 2167 : addlen = len_replacement - prevend;
796 2167 : if (addlen > 0) {
797 2 : if (len_result + addlen >= *max_result) {
798 1 : tmp = GDKrealloc(result, len_result + addlen + 1);
799 1 : if (tmp == NULL) {
800 0 : GDKfree(result);
801 0 : return NULL;
802 : }
803 1 : result = tmp;
804 1 : *max_result = len_result + addlen + 1;
805 : }
806 2 : strncpy(result + len_result, replacement + prevend, addlen);
807 2 : len_result += addlen;
808 : }
809 : }
810 78653 : offset = ovector[1];
811 78653 : } while (offset < len_origin_str && global);
812 27988 : if (offset < len_origin_str) {
813 25768 : addlen = len_origin_str - offset;
814 25768 : if (len_result + addlen >= *max_result) {
815 367 : tmp = GDKrealloc(result, len_result + addlen + 1);
816 368 : if (tmp == NULL) {
817 0 : GDKfree(result);
818 0 : return NULL;
819 : }
820 368 : result = tmp;
821 368 : *max_result = len_result + addlen + 1;
822 : }
823 25769 : strncpy(result + len_result, origin_str + offset, addlen);
824 25769 : len_result += addlen;
825 : }
826 : /* null terminate string */
827 27989 : result[len_result] = '\0';
828 27989 : return result;
829 : }
830 : #endif
831 :
/* Replace the first match (global == false) or all matches (global ==
 * true) of the regular expression `pattern' in `origin_str' by
 * `replacement', which may contain `$n' / `\n' back references.
 *
 * Each character of `flags' selects an option: 'e' also allows empty
 * matches, 'i' caseless, 'm' multi-line, 's' dot matches newline, 'x'
 * extended syntax; any other character raises an exception.
 *
 * On success *res receives a newly allocated string and MAL_SUCCEED is
 * returned (presumably the caller has to GDKfree it; confirm against
 * the callers).  On any failure a MAL exception is thrown and *res is
 * not written.  Without PCRE support the function always throws. */
832 : static str
833 10 : pcre_replace(str *res, const char *origin_str, const char *pattern,
834 : const char *replacement, const char *flags, bool global)
835 : {
836 : #ifdef HAVE_LIBPCRE
837 10 : const char *err_p = NULL;
838 10 : pcre *pcre_code = NULL;
839 10 : pcre_extra *extra;
840 10 : char *tmpres;
841 10 : int max_result;
842 10 : int i, errpos = 0;
843 10 : int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
844 10 : int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK; /* empty matches are refused unless flag 'e' is given */
845 10 : int *ovector, ovecsize;
846 10 : int len_origin_str = (int) strlen(origin_str);
847 10 : int len_replacement = (int) strlen(replacement);
848 10 : struct backref backrefs[MAX_NR_REFS];
849 10 : int nbackrefs = 0;
850 :
851 14 : while (*flags) { /* translate the flag characters into PCRE option bits */
852 4 : switch (*flags) {
853 : case 'e':
854 : exec_options &= ~PCRE_NOTEMPTY;
855 : break;
856 1 : case 'i':
857 1 : compile_options |= PCRE_CASELESS;
858 1 : break;
859 1 : case 'm':
860 1 : compile_options |= PCRE_MULTILINE;
861 1 : break;
862 1 : case 's':
863 1 : compile_options |= PCRE_DOTALL;
864 1 : break;
865 1 : case 'x':
866 1 : compile_options |= PCRE_EXTENDED;
867 1 : break;
868 0 : default:
869 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
870 : ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
871 : *flags);
872 : }
873 4 : flags++;
874 : }
875 :
876 10 : if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
877 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
878 : OPERATION_FAILED
879 : ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
880 : pattern, errpos, err_p);
881 : }
882 :
883 : /* Since the compiled pattern is going to be used several times, it is
884 : * worth spending more time analyzing it in order to speed up the time
885 : * taken for matching.
886 : */
887 10 : extra = pcre_study(pcre_code, 0, &err_p);
888 10 : if (err_p != NULL) {
889 0 : pcre_free(pcre_code);
890 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
891 : OPERATION_FAILED
892 : ": pcre study of pattern (%s) failed with '%s'.\n", pattern,
893 : err_p);
894 : }
895 10 : pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i); /* i = number of capturing groups; the return value is not checked */
896 10 : ovecsize = (i + 1) * 3; /* three ints per group, plus three for the whole match */
897 10 : if ((ovector = (int *) GDKmalloc(sizeof(int) * ovecsize)) == NULL) {
898 0 : pcre_free_study(extra);
899 0 : pcre_free(pcre_code);
900 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
901 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
902 : }
903 :
904 : /* identify back references in the replacement string */
905 10 : nbackrefs = parse_replacement(replacement, len_replacement,
906 : backrefs, MAX_NR_REFS);
907 :
908 10 : max_result = len_origin_str + 1; /* initial guess; single_replace grows the buffer when needed */
909 10 : tmpres = GDKmalloc(max_result);
910 10 : if (tmpres == NULL) {
911 0 : GDKfree(ovector);
912 0 : pcre_free_study(extra);
913 0 : pcre_free(pcre_code);
914 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
915 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
916 : }
917 :
918 10 : tmpres = single_replace(pcre_code, extra, origin_str, len_origin_str,
919 : exec_options, ovector, ovecsize, replacement,
920 : len_replacement, backrefs, nbackrefs, global,
921 : tmpres, &max_result);
922 10 : GDKfree(ovector);
923 10 : pcre_free_study(extra);
924 10 : pcre_free(pcre_code);
925 10 : if (tmpres == NULL) /* single_replace frees the buffer itself before returning NULL */
926 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
927 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
928 :
929 10 : *res = tmpres;
930 10 : return MAL_SUCCEED;
931 : #else
932 : (void) res;
933 : (void) origin_str;
934 : (void) pattern;
935 : (void) replacement;
936 : (void) flags;
937 : (void) global;
938 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
939 : "Database was compiled without PCRE support.");
940 : #endif
941 : }
942 :
/* Column-wise variant of pcre_replace: apply the same replacement to
 * every string of the BAT `origin_strs' and collect the outcomes in a
 * new transient string BAT with the same hseqbase.
 *
 * The pattern is compiled and studied once (with JIT compilation when
 * the input holds more than JIT_COMPILE_MIN values) and one output
 * buffer is reused for all values.  `flags' and `global' have the same
 * meaning as in pcre_replace.
 *
 * On success *res receives the new BAT and MAL_SUCCEED is returned; on
 * any failure a MAL exception is thrown and *res is not written.
 * Without PCRE support the function always throws. */
943 : static str
944 70 : pcre_replace_bat(BAT **res, BAT *origin_strs, const char *pattern,
945 : const char *replacement, const char *flags, bool global)
946 : {
947 : #ifdef HAVE_LIBPCRE
948 70 : const char *err_p = NULL;
949 70 : char *tmpres;
950 70 : int i, errpos = 0;
951 70 : int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
952 70 : int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK; /* empty matches are refused unless flag 'e' is given */
953 70 : pcre *pcre_code = NULL;
954 70 : pcre_extra *extra;
955 70 : BAT *tmpbat;
956 70 : BUN p, q;
957 70 : int *ovector, ovecsize;
958 70 : int len_replacement = (int) strlen(replacement);
959 70 : struct backref backrefs[MAX_NR_REFS];
960 70 : int nbackrefs = 0;
961 70 : const char *origin_str;
962 70 : int max_dest_size = 0;
963 :
964 90 : while (*flags) { /* translate the flag characters into PCRE option bits */
965 20 : switch (*flags) {
966 : case 'e':
967 : exec_options &= ~PCRE_NOTEMPTY;
968 : break;
969 5 : case 'i':
970 5 : compile_options |= PCRE_CASELESS;
971 5 : break;
972 10 : case 'm':
973 10 : compile_options |= PCRE_MULTILINE;
974 10 : break;
975 5 : case 's':
976 5 : compile_options |= PCRE_DOTALL;
977 5 : break;
978 0 : case 'x':
979 0 : compile_options |= PCRE_EXTENDED;
980 0 : break;
981 0 : default:
982 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
983 : ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
984 : *flags);
985 : }
986 20 : flags++;
987 : }
988 :
989 70 : if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
990 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
991 : OPERATION_FAILED
992 : ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
993 : pattern, errpos, err_p);
994 : }
995 :
996 : /* Since the compiled pattern is going to be used several times,
997 : * it is worth spending more time analyzing it in order to speed
998 : * up the time taken for matching.
999 : */
1000 138 : extra = pcre_study(pcre_code,
1001 69 : BATcount(origin_strs) >
1002 : JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &err_p);
1003 69 : if (err_p != NULL) {
1004 0 : pcre_free(pcre_code);
1005 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1006 : OPERATION_FAILED); /* NOTE(review): unlike pcre_replace, the study error text is not reported here */
1007 : }
1008 69 : pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i); /* i = number of capturing groups; the return value is not checked */
1009 67 : ovecsize = (i + 1) * 3; /* three ints per group, plus three for the whole match */
1010 67 : if ((ovector = (int *) GDKzalloc(sizeof(int) * ovecsize)) == NULL) {
1011 0 : pcre_free_study(extra);
1012 0 : pcre_free(pcre_code);
1013 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1014 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1015 : }
1016 :
1017 : /* identify back references in the replacement string */
1018 70 : nbackrefs = parse_replacement(replacement, len_replacement,
1019 : backrefs, MAX_NR_REFS);
1020 :
1021 68 : tmpbat = COLnew(origin_strs->hseqbase, TYPE_str, BATcount(origin_strs),
1022 : TRANSIENT);
1023 :
1024 : /* the buffer for all destination strings is allocated only once,
1025 : * and extended when needed */
1026 70 : max_dest_size = len_replacement + 1;
1027 70 : tmpres = GDKmalloc(max_dest_size);
1028 70 : if (tmpbat == NULL || tmpres == NULL) { /* both allocations are checked together; each cleanup call below may receive NULL */
1029 0 : pcre_free_study(extra);
1030 0 : pcre_free(pcre_code);
1031 0 : GDKfree(ovector);
1032 0 : BBPreclaim(tmpbat);
1033 0 : GDKfree(tmpres);
1034 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1035 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1036 : }
1037 70 : BATiter origin_strsi = bat_iterator(origin_strs);
1038 28092 : BATloop(origin_strs, p, q) {
1039 28023 : origin_str = BUNtvar(origin_strsi, p);
1040 56003 : tmpres = single_replace(pcre_code, extra, origin_str,
1041 28045 : (int) strlen(origin_str), exec_options,
1042 : ovector, ovecsize, replacement,
1043 : len_replacement, backrefs, nbackrefs, global,
1044 : tmpres, &max_dest_size);
1045 27958 : if (tmpres == NULL || BUNappend(tmpbat, tmpres, false) != GDK_SUCCEED) { /* either failure is reported as a memory allocation failure */
1046 0 : bat_iterator_end(&origin_strsi);
1047 0 : pcre_free_study(extra);
1048 0 : pcre_free(pcre_code);
1049 0 : GDKfree(ovector);
1050 0 : GDKfree(tmpres); /* tmpres is NULL here when single_replace failed, as it frees the buffer itself */
1051 0 : BBPreclaim(tmpbat);
1052 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1053 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1054 : }
1055 : }
1056 69 : bat_iterator_end(&origin_strsi);
1057 70 : pcre_free_study(extra);
1058 70 : pcre_free(pcre_code);
1059 70 : GDKfree(ovector);
1060 70 : GDKfree(tmpres);
1061 69 : *res = tmpbat;
1062 69 : return MAL_SUCCEED;
1063 : #else
1064 : (void) res;
1065 : (void) origin_strs;
1066 : (void) pattern;
1067 : (void) replacement;
1068 : (void) flags;
1069 : (void) global;
1070 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1071 : "Database was compiled without PCRE support.");
1072 : #endif
1073 : }
1074 :
1075 : static str
1076 74 : pcre_match_with_flags(bit *ret, const char *val, const char *pat,
1077 : const char *flags)
1078 : {
1079 74 : int pos;
1080 : #ifdef HAVE_LIBPCRE
1081 74 : const char *err_p = NULL;
1082 74 : int errpos = 0;
1083 74 : int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
1084 74 : pcre *re;
1085 : #else
1086 : int options = REG_NOSUB;
1087 : regex_t re;
1088 : int errcode;
1089 : int retval;
1090 : #endif
1091 :
1092 148 : while (*flags) {
1093 74 : switch (*flags) {
1094 0 : case 'i':
1095 : #ifdef HAVE_LIBPCRE
1096 0 : options |= PCRE_CASELESS;
1097 : #else
1098 : options |= REG_ICASE;
1099 : #endif
1100 0 : break;
1101 0 : case 'm':
1102 : #ifdef HAVE_LIBPCRE
1103 0 : options |= PCRE_MULTILINE;
1104 : #else
1105 : options |= REG_NEWLINE;
1106 : #endif
1107 0 : break;
1108 : #ifdef HAVE_LIBPCRE
1109 74 : case 's':
1110 74 : options |= PCRE_DOTALL;
1111 74 : break;
1112 : #endif
1113 0 : case 'x':
1114 : #ifdef HAVE_LIBPCRE
1115 0 : options |= PCRE_EXTENDED;
1116 : #else
1117 : options |= REG_EXTENDED;
1118 : #endif
1119 0 : break;
1120 0 : default:
1121 0 : throw(MAL, "pcre.match", ILLEGAL_ARGUMENT
1122 : ": unsupported flag character '%c'\n", *flags);
1123 : }
1124 74 : flags++;
1125 : }
1126 74 : if (strNil(val)) {
1127 0 : *ret = FALSE;
1128 0 : return MAL_SUCCEED;
1129 : }
1130 :
1131 : #ifdef HAVE_LIBPCRE
1132 74 : if ((re = pcre_compile(pat, options, &err_p, &errpos, NULL)) == NULL)
1133 : #else
1134 : if ((errcode = regcomp(&re, pat, options)) != 0)
1135 : #endif
1136 : {
1137 0 : throw(MAL, "pcre.match", OPERATION_FAILED
1138 : ": compilation of regular expression (%s) failed "
1139 : #ifdef HAVE_LIBPCRE
1140 : "at %d with '%s'", pat, errpos, err_p
1141 : #else
1142 : , pat
1143 : #endif
1144 : );
1145 : }
1146 : #ifdef HAVE_LIBPCRE
1147 74 : pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, PCRE_NO_UTF8_CHECK,
1148 : NULL, 0);
1149 74 : pcre_free(re);
1150 : #else
1151 : retval = regexec(&re, val, (size_t) 0, NULL, 0);
1152 : pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
1153 : regfree(&re);
1154 : #endif
1155 74 : if (pos >= 0)
1156 10 : *ret = TRUE;
1157 64 : else if (pos == -1)
1158 64 : *ret = FALSE;
1159 : else
1160 0 : throw(MAL, "pcre.match", OPERATION_FAILED
1161 : ": matching of regular expression (%s) failed with %d", pat, pos);
1162 : return MAL_SUCCEED;
1163 : }
1164 :
1165 : #ifdef HAVE_LIBPCRE
1166 : /* special characters in PCRE that need to be escaped */
1167 : static const char *pcre_specials = ".+?*()[]{}|^$\\";
1168 : #else
1169 : /* special characters in POSIX extended regular expressions that need to
1170 : * be escaped (the non-PCRE code paths in this file compile their
/* patterns with REG_EXTENDED) */
1171 : static const char *pcre_specials = "^.[$()|*+?{\\";
1172 : #endif
1173 :
1174 : /* change SQL LIKE pattern into PCRE pattern */
1175 : static str
1176 753 : sql2pcre(str *r, const char *pat, const char *esc_str)
1177 : {
1178 753 : int escaped = 0;
1179 753 : int hasWildcard = 0;
1180 753 : char *ppat;
1181 1505 : int esc = strNil(esc_str) ? 0 : esc_str[0]; /* should change to utf8_convert() */
1182 753 : int specials;
1183 753 : int c;
1184 :
1185 753 : if (strlen(esc_str) > 1)
1186 0 : throw(MAL, "pcre.sql2pcre",
1187 : SQLSTATE(22019) ILLEGAL_ARGUMENT
1188 : ": ESCAPE string must have length 1");
1189 753 : if (pat == NULL)
1190 0 : throw(MAL, "pcre.sql2pcre",
1191 : SQLSTATE(22019) ILLEGAL_ARGUMENT
1192 : ": (I)LIKE pattern must not be NULL");
1193 753 : ppat = GDKmalloc(strlen(pat) * 3 +
1194 : 3 /* 3 = "^'the translated regexp'$0" */ );
1195 753 : if (ppat == NULL)
1196 0 : throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1197 :
1198 753 : *r = ppat;
1199 : /* The escape character can be a char which is special in a PCRE
1200 : * expression. If the user used the "+" char as escape and has "++"
1201 : * in their pattern, then replacing this with "+" is not correct and
1202 : * should be "\+" instead. */
1203 753 : specials = (esc && strchr(pcre_specials, esc) != NULL);
1204 :
1205 753 : *ppat++ = '^';
1206 7264 : while ((c = *pat++) != 0) {
1207 6511 : if (c == esc) {
1208 13 : if (escaped) {
1209 0 : if (specials) { /* change ++ into \+ */
1210 0 : *ppat++ = esc;
1211 : } else { /* do not escape simple escape symbols */
1212 0 : ppat[-1] = esc; /* overwrite backslash */
1213 : }
1214 : escaped = 0;
1215 : } else {
1216 13 : *ppat++ = '\\';
1217 13 : escaped = 1;
1218 : }
1219 : hasWildcard = 1;
1220 6498 : } else if (strchr(pcre_specials, c) != NULL) {
1221 : /* escape PCRE special chars, avoid double backslash if the
1222 : * user uses an invalid escape sequence */
1223 36 : if (!escaped)
1224 36 : *ppat++ = '\\';
1225 36 : *ppat++ = c;
1226 36 : hasWildcard = 1;
1227 36 : escaped = 0;
1228 6462 : } else if (c == '%' && !escaped) {
1229 909 : *ppat++ = '.';
1230 909 : *ppat++ = '*';
1231 909 : *ppat++ = '?';
1232 909 : hasWildcard = 1;
1233 : /* collapse multiple %, but only if it isn't the escape */
1234 909 : if (esc != '%')
1235 909 : while (*pat == '%')
1236 0 : pat++;
1237 5553 : } else if (c == '_' && !escaped) {
1238 901 : *ppat++ = '.';
1239 901 : hasWildcard = 1;
1240 : } else {
1241 4652 : if (escaped) {
1242 13 : ppat[-1] = c; /* overwrite backslash of invalid escape */
1243 : } else {
1244 4639 : *ppat++ = c;
1245 : }
1246 : escaped = 0;
1247 : }
1248 : }
1249 : /* no wildcard or escape character at end of string */
1250 753 : if (!hasWildcard || escaped) {
1251 0 : GDKfree(*r);
1252 0 : *r = NULL;
1253 0 : if (escaped)
1254 0 : throw(MAL, "pcre.sql2pcre",
1255 : SQLSTATE(22019) ILLEGAL_ARGUMENT
1256 : ": (I)LIKE pattern must not end with escape character");
1257 0 : *r = GDKstrdup(str_nil);
1258 0 : if (*r == NULL)
1259 0 : throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1260 : } else {
1261 753 : *ppat++ = '$';
1262 753 : *ppat = 0;
1263 : }
1264 : return MAL_SUCCEED;
1265 : }
1266 :
1267 : #ifdef HAVE_LIBPCRE
1268 : /* change SQL PATINDEX pattern into PCRE pattern */
1269 : static str
1270 25 : pat2pcre(str *r, const char *pat)
1271 : {
1272 25 : size_t len = strlen(pat);
1273 25 : char *ppat = GDKmalloc(len * 2 + 3 /* 3 = "^'the translated regexp'$0" */ );
1274 25 : int start = 0;
1275 :
1276 25 : if (ppat == NULL)
1277 0 : throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1278 25 : *r = ppat;
1279 77 : while (*pat) {
1280 52 : int c = *pat++;
1281 :
1282 52 : if (strchr(pcre_specials, c) != NULL) {
1283 17 : *ppat++ = '\\';
1284 17 : *ppat++ = c;
1285 35 : } else if (c == '%') {
1286 3 : if (start && *pat) {
1287 0 : *ppat++ = '.';
1288 0 : *ppat++ = '*';
1289 : }
1290 3 : start++;
1291 32 : } else if (c == '_') {
1292 0 : *ppat++ = '.';
1293 : } else {
1294 32 : *ppat++ = c;
1295 : }
1296 : }
1297 25 : *ppat = 0;
1298 25 : return MAL_SUCCEED;
1299 : }
1300 : #endif
1301 :
1302 : /*
1303 : * @+ Wrapping
1304 : */
1305 :
1306 : static str
1307 10 : PCREreplace_wrap(str *res, const str *or, const str *pat, const str *repl,
1308 : const str *flags)
1309 : {
1310 10 : return pcre_replace(res, *or, *pat, *repl, *flags, true);
1311 : }
1312 :
1313 : static str
1314 0 : PCREreplacefirst_wrap(str *res, const str *or, const str *pat, const str *repl,
1315 : const str *flags)
1316 : {
1317 0 : return pcre_replace(res, *or, *pat, *repl, *flags, false);
1318 : }
1319 :
1320 : static str
1321 70 : PCREreplace_bat_wrap(bat *res, const bat *bid, const str *pat, const str *repl,
1322 : const str *flags)
1323 : {
1324 70 : BAT *b, *bn = NULL;
1325 70 : str msg;
1326 70 : if ((b = BATdescriptor(*bid)) == NULL)
1327 0 : throw(MAL, "batpcre.replace", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1328 :
1329 70 : msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, true);
1330 69 : if (msg == MAL_SUCCEED) {
1331 70 : *res = bn->batCacheid;
1332 70 : BBPkeepref(bn);
1333 : }
1334 69 : BBPunfix(b->batCacheid);
1335 69 : return msg;
1336 : }
1337 :
1338 : static str
1339 0 : PCREreplacefirst_bat_wrap(bat *res, const bat *bid, const str *pat,
1340 : const str *repl, const str *flags)
1341 : {
1342 0 : BAT *b, *bn = NULL;
1343 0 : str msg;
1344 0 : if ((b = BATdescriptor(*bid)) == NULL)
1345 0 : throw(MAL, "batpcre.replace_first", RUNTIME_OBJECT_MISSING);
1346 :
1347 0 : msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, false);
1348 0 : if (msg == MAL_SUCCEED) {
1349 0 : *res = bn->batCacheid;
1350 0 : BBPkeepref(bn);
1351 : }
1352 0 : BBPunfix(b->batCacheid);
1353 0 : return msg;
1354 : }
1355 :
1356 : static str
1357 74 : PCREmatch(bit *ret, const str *val, const str *pat)
1358 : {
1359 4 : return pcre_match_with_flags(ret, *val, *pat,
1360 : #ifdef HAVE_LIBPCRE
1361 : "s"
1362 : #else
1363 : "x"
1364 : #endif
1365 : );
1366 : }
1367 :
1368 : static str
1369 0 : PCREimatch(bit *ret, const str *val, const str *pat)
1370 : {
1371 0 : return pcre_match_with_flags(ret, *val, *pat, "i"
1372 : #ifndef HAVE_LIBPCRE
1373 : "x"
1374 : #endif
1375 : );
1376 : }
1377 :
1378 : static str
1379 25 : PCREindex(int *res, const pcre *pattern, const str *s)
1380 : {
1381 : #ifdef HAVE_LIBPCRE
1382 25 : int v[3];
1383 :
1384 25 : v[0] = v[1] = *res = 0;
1385 25 : if (pcre_exec(pattern, NULL, *s, (int) strlen(*s), 0,
1386 : PCRE_NO_UTF8_CHECK, v, 3) >= 0) {
1387 23 : *res = v[1];
1388 : }
1389 25 : return MAL_SUCCEED;
1390 : #else
1391 : (void) res;
1392 : (void) pattern;
1393 : (void) s;
1394 : throw(MAL, "pcre.index", "Database was compiled without PCRE support.");
1395 : #endif
1396 : }
1397 :
1398 : static str
1399 27 : PCREpatindex(int *ret, const str *pat, const str *val)
1400 : {
1401 : #ifdef HAVE_LIBPCRE
1402 27 : pcre *re = NULL;
1403 27 : char *ppat = NULL, *msg;
1404 :
1405 53 : if (strNil(*pat) || strNil(*val)) {
1406 2 : *ret = int_nil;
1407 2 : return MAL_SUCCEED;
1408 : }
1409 :
1410 25 : if ((msg = pat2pcre(&ppat, *pat)) != MAL_SUCCEED)
1411 : return msg;
1412 25 : if ((msg = pcre_compile_wrap(&re, ppat, FALSE)) != MAL_SUCCEED) {
1413 0 : GDKfree(ppat);
1414 0 : return msg;
1415 : }
1416 25 : GDKfree(ppat);
1417 25 : msg = PCREindex(ret, re, val);
1418 25 : pcre_free(re);
1419 25 : return msg;
1420 : #else
1421 : (void) ret;
1422 : (void) pat;
1423 : (void) val;
1424 : throw(MAL, "pcre.patindex", "Database was compiled without PCRE support.");
1425 : #endif
1426 : }
1427 :
1428 : static str
1429 0 : PCREquote(str *ret, const str *val)
1430 : {
1431 0 : char *p;
1432 0 : const char *s = *val;
1433 :
1434 0 : *ret = p = GDKmalloc(strlen(s) * 2 + 1); /* certainly long enough */
1435 0 : if (p == NULL)
1436 0 : throw(MAL, "pcre.quote", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1437 : /* quote all non-alphanumeric ASCII characters (i.e. leave
1438 : non-ASCII and alphanumeric alone) */
1439 0 : while (*s) {
1440 0 : if (!((*s & 0x80) != 0 ||
1441 0 : ('a' <= *s && *s <= 'z') ||
1442 0 : ('A' <= *s && *s <= 'Z') || isdigit((unsigned char) *s)))
1443 0 : *p++ = '\\';
1444 0 : *p++ = *s++;
1445 : }
1446 0 : *p = 0;
1447 0 : return MAL_SUCCEED;
1448 : }
1449 :
1450 : static str
1451 0 : PCREsql2pcre(str *ret, const str *pat, const str *esc)
1452 : {
1453 0 : return sql2pcre(ret, *pat, *esc);
1454 : }
1455 :
/* Return true when no byte of the NUL-terminated string pat has its high
 * bit set, i.e. when the string is pure 7-bit ASCII. */
static bool
is_ascii_str(const char *pat)
{
	for (const unsigned char *p = (const unsigned char *) pat; *p != 0; p++) {
		if ((*p & 0x80) != 0)
			return false;
	}
	return true;
}
1467 :
1468 : static inline str
1469 10045 : choose_like_path(char **ppat, bool *use_re, bool *use_strcmp, bool *empty,
1470 : bool *ascii_pattern, const char *pat, const char *esc)
1471 : {
1472 10045 : str res = MAL_SUCCEED;
1473 10045 : *use_re = false;
1474 10045 : *use_strcmp = false;
1475 10045 : *empty = false;
1476 :
1477 :
1478 10045 : *ascii_pattern = is_ascii_str(pat);
1479 :
1480 19618 : if (strNil(pat) || strNil(esc)) {
1481 472 : *empty = true;
1482 : } else {
1483 9573 : if (!re_is_pattern_properly_escaped(pat, (unsigned char) *esc))
1484 5 : throw(MAL, "pcre.sql2pcre",
1485 : SQLSTATE(22019) ILLEGAL_ARGUMENT
1486 : ": (I)LIKE pattern must not end with escape character");
1487 9551 : if (is_strcmpable(pat, esc)) {
1488 923 : *use_re = true;
1489 923 : *use_strcmp = true;
1490 8628 : } else if (re_simple(pat, (unsigned char) *esc)) {
1491 7889 : *use_re = true;
1492 : } else {
1493 755 : if ((res = sql2pcre(ppat, pat, esc)) != MAL_SUCCEED)
1494 : return res;
1495 756 : if (strNil(*ppat)) {
1496 0 : GDKfree(*ppat);
1497 0 : *ppat = NULL;
1498 0 : *use_re = true;
1499 0 : *use_strcmp = true;
1500 : }
1501 : }
1502 : }
1503 : return res;
1504 : }
1505 :
1506 : static str
1507 234 : PCRElike_imp(bit *ret, const str *s, const str *pat, const str *esc,
1508 : const bit *isens)
1509 : {
1510 234 : str res = MAL_SUCCEED;
1511 234 : char *ppat = NULL;
1512 234 : bool use_re = false, use_strcmp = false, empty = false, ascii_pattern = false;
1513 234 : struct RE *re = NULL;
1514 :
1515 234 : if ((res = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, &ascii_pattern,
1516 : *pat, *esc)) != MAL_SUCCEED)
1517 : return res;
1518 :
1519 459 : MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp ?
1520 225 : "pcrelike: pattern matching using strcmp" : use_re ?
1521 : "pcrelike: pattern matching using RE" :
1522 : "pcrelike: pattern matching using pcre");
1523 :
1524 468 : if (strNil(*s) || empty) {
1525 0 : *ret = bit_nil;
1526 234 : } else if (use_re) {
1527 164 : if (use_strcmp) {
1528 9 : *ret = *isens ? (ascii_pattern
1529 2 : ? istrcmp(*s, *pat) == 0
1530 0 : : mystrcasecmp(*s, *pat) == 0)
1531 7 : : strcmp(*s, *pat) == 0;
1532 : } else {
1533 155 : if (!(re = re_create(*pat, *isens, ascii_pattern, (unsigned char) **esc)))
1534 0 : res = createException(MAL, "pcre.like4",
1535 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1536 : else
1537 310 : *ret = (*isens && !re->is_ascii)
1538 0 : ? re_match_ignore(*s, re)
1539 155 : : re_match_no_ignore(*s, re);
1540 : }
1541 : } else {
1542 70 : res = *isens ? PCREimatch(ret, s, &ppat) : PCREmatch(ret, s, &ppat);
1543 : }
1544 :
1545 164 : if (re)
1546 155 : re_destroy(re);
1547 234 : GDKfree(ppat);
1548 234 : return res;
1549 : }
1550 :
1551 : static str
1552 234 : PCRElike(bit *ret, const str *s, const str *pat, const str *esc,
1553 : const bit *isens)
1554 : {
1555 229 : return PCRElike_imp(ret, s, pat, esc, isens);
1556 : }
1557 :
1558 : static str
1559 5 : PCREnotlike(bit *ret, const str *s, const str *pat, const str *esc,
1560 : const bit *isens)
1561 : {
1562 5 : str tmp;
1563 5 : bit r;
1564 :
1565 5 : rethrow("str.not_like", tmp, PCRElike(&r, s, pat, esc, isens));
1566 5 : *ret = r == bit_nil ? bit_nil : !r;
1567 5 : return MAL_SUCCEED;
1568 : }
1569 :
1570 : static inline str
1571 8655 : re_like_build(struct RE **re, uint32_t **wpat, const char *pat, bool caseignore,
1572 : bool use_strcmp, bool ascii_pattern, uint32_t esc)
1573 : {
1574 8655 : if (!use_strcmp) {
1575 7740 : if (!(*re = re_create(pat, caseignore, ascii_pattern, esc)))
1576 0 : return createException(MAL, "pcre.re_like_build",
1577 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1578 915 : } else if (caseignore && !ascii_pattern) {
1579 38 : if (!(*wpat = utf8stoucs(pat)))
1580 0 : return createException(MAL, "pcre.re_like_build",
1581 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1582 : }
1583 : return MAL_SUCCEED;
1584 : }
1585 :
/* Helper for re_like_proj_apply(): returns from the enclosing function
 * with bit_nil when the local string s is nil, and with the value of
 * TEST otherwise.  Expects a variable named s in the expanding scope. */
1586 : #define proj_scanloop(TEST) \
1587 : do { \
1588 : if (strNil(s)) \
1589 : return bit_nil; \
1590 : else \
1591 : return TEST; \
1592 : } while (0)
1593 :
1594 : static inline bit
1595 5034 : re_like_proj_apply(const char *s, const struct RE *restrict re,
1596 : const uint32_t *restrict wpat, const char *pat,
1597 : bool caseignore, bool anti, bool use_strcmp, bool is_ascii)
1598 : {
1599 5034 : if (use_strcmp) {
1600 1126 : if (caseignore) {
1601 498 : if (is_ascii) {
1602 479 : if (anti)
1603 874 : proj_scanloop(istrcmp(s, pat) != 0);
1604 : else
1605 84 : proj_scanloop(istrcmp(s, pat) == 0);
1606 : } else {
1607 19 : if (anti)
1608 28 : proj_scanloop(mywstrcasecmp(s, wpat) != 0);
1609 : else
1610 10 : proj_scanloop(mywstrcasecmp(s, wpat) == 0);
1611 : }
1612 : } else {
1613 628 : if (anti)
1614 606 : proj_scanloop(strcmp(s, pat) != 0);
1615 : else
1616 650 : proj_scanloop(strcmp(s, pat) == 0);
1617 : }
1618 : } else {
1619 : /* Use re_match_ignore only if the pattern is UTF-8
1620 : * and we need to ignore case
1621 : */
1622 3908 : if (caseignore && !is_ascii) {
1623 3 : if (anti)
1624 6 : proj_scanloop(!re_match_ignore(s, re));
1625 : else
1626 0 : proj_scanloop(re_match_ignore(s, re));
1627 : } else {
1628 3905 : if (anti)
1629 174 : proj_scanloop(!re_match_no_ignore(s, re));
1630 : else
1631 7636 : proj_scanloop(re_match_no_ignore(s, re));
1632 : }
1633 : }
1634 : }
1635 :
1636 : static inline void
1637 9010 : re_like_clean(struct RE **re, uint32_t **wpat)
1638 : {
1639 9010 : if (*re) {
1640 7742 : re_destroy(*re);
1641 7745 : *re = NULL;
1642 : }
1643 9013 : if (*wpat) {
1644 38 : GDKfree(*wpat);
1645 38 : *wpat = NULL;
1646 : }
1647 9013 : }
1648 :
1649 : #ifdef HAVE_LIBPCRE
1650 : static inline str
1651 687 : pcre_like_build(pcre **res, pcre_extra **ex, const char *ppat, bool caseignore,
1652 : BUN count)
1653 : {
1654 687 : const char *err_p = NULL;
1655 687 : int errpos = 0;
1656 687 : int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE | PCRE_DOTALL;
1657 687 : int pcrestopt = count > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0;
1658 :
1659 687 : *res = NULL;
1660 687 : *ex = NULL;
1661 :
1662 687 : if (caseignore) {
1663 22 : options |= PCRE_CASELESS;
1664 : }
1665 687 : if ((*res = pcre_compile(ppat, options, &err_p, &errpos, NULL)) == NULL)
1666 0 : return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
1667 : ": compilation of regular expression (%s) failed"
1668 : " at %d with '%s'", ppat, errpos, err_p);
1669 676 : *ex = pcre_study(*res, pcrestopt, &err_p);
1670 678 : if (err_p != NULL)
1671 0 : return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
1672 : ": pcre study of pattern (%s) "
1673 : "failed with '%s'", ppat, err_p);
1674 : return MAL_SUCCEED;
1675 : }
1676 : #else
1677 : static inline str
1678 : pcre_like_build(regex_t *res, void *ex, const char *ppat, bool caseignore,
1679 : BUN count)
1680 : {
1681 : int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED;
1682 : int errcode;
1683 :
1684 : *res = (regex_t) {
1685 : 0};
1686 : (void) count;
1687 :
1688 : if (caseignore) {
1689 : options |= REG_ICASE;
1690 : }
1691 : if ((errcode = regcomp(res, ppat, options)) != 0)
1692 : return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
1693 : ": compilation of regular expression (%s) failed",
1694 : ppat);
1695 : (void) ex;
1696 : return MAL_SUCCEED;
1697 : }
1698 : #endif
1699 :
/* Body shared by the two branches of pcre_like_apply().  LOOP_BODY is
 * expanded first and must assign the match outcome to the local pos.
 * Then *ret becomes bit_nil when s is nil, RES1 when pos >= 0, RES2 when
 * pos == -1; any other pos returns a MAL exception from the enclosing
 * function.  Expects s, pos, ret and ppat in the expanding scope. */
1699 : #define PCRE_LIKE_BODY(LOOP_BODY, RES1, RES2) \
1700 : do { \
1701 : LOOP_BODY \
1702 : if (strNil(s)) \
1703 : *ret = bit_nil; \
1704 : else if (pos >= 0) \
1705 : *ret = RES1; \
1706 : else if (pos == -1) \
1707 : *ret = RES2; \
1708 : else \
1709 : return createException(MAL, "pcre.match", OPERATION_FAILED ": matching of regular expression (%s) failed with %d", ppat, pos); \
1710 : } while(0)
1712 :
1713 : static inline str
1714 1120 : pcre_like_apply(bit *ret, const char *s,
1715 : #ifdef HAVE_LIBPCRE
1716 : const pcre *re, const pcre_extra *ex
1717 : #else
1718 : regex_t re, void *ex
1719 : #endif
1720 : , const char *ppat, bool anti)
1721 : {
1722 1120 : int pos;
1723 :
1724 : #ifdef HAVE_LIBPCRE
1725 : #define LOOP_BODY \
1726 : pos = pcre_exec(re, ex, s, (int) strlen(s), 0, PCRE_NO_UTF8_CHECK, NULL, 0);
1727 : #else
1728 : #define LOOP_BODY \
1729 : int retval = regexec(&re, s, (size_t) 0, NULL, 0); \
1730 : (void) ex; \
1731 : pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
1732 : #endif
1733 :
1734 1120 : if (anti)
1735 43 : PCRE_LIKE_BODY(LOOP_BODY, FALSE, TRUE);
1736 : else
1737 1077 : PCRE_LIKE_BODY(LOOP_BODY, TRUE, FALSE);
1738 :
1739 : return MAL_SUCCEED;
1740 : }
1741 :
/* Release the compiled pattern and its study data (or, in the POSIX
 * fallback, the regex_t contents) and reset the caller's variables so a
 * second call is harmless. */
static inline void
pcre_clean(
#ifdef HAVE_LIBPCRE
	pcre **re, pcre_extra **ex)
{
	pcre *code = *re;
	pcre_extra *study = *ex;

	*re = NULL;
	*ex = NULL;
	if (code != NULL)
		pcre_free(code);
	if (study != NULL)
		pcre_free_study(study);
#else
	regex_t *re, void *ex)
{
	(void) ex;
	regfree(re);
	*re = (regex_t) { 0 };
#endif
}
1762 :
/*
 * Bulk (NOT) (I)LIKE.  Argument 0 of pci is the result BAT id, argument 1
 * the input and argument 2 the pattern; either of the latter two is
 * treated as a BAT when its declared MAL type is a BAT type.  esc is the
 * escape string, *isens selects case insensitive matching and *not
 * inverts the outcome.
 *
 * The result is a TYPE_bit BAT with one value per row: bit_nil for nil
 * inputs or nil pattern/escape, otherwise the (negated) match outcome.
 * With a pattern BAT the matching strategy is chosen again for every
 * row; with a scalar pattern it is chosen once for the whole column.
 *
 * All exits go through the bailout label, which releases the helper
 * objects and BAT references; BAT iterators are ended before jumping
 * there.  NOTE(review): the code assumes at least one of input/pattern
 * is a BAT (q is taken from b or pbn) -- confirm against the MAL
 * signatures.
 */
1762 : static str
1763 1041 : BATPCRElike_imp(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci,
1764 : const str *esc, const bit *isens, const bit *not)
1765 : {
1766 1041 : str msg = MAL_SUCCEED;
1767 1041 : BAT *b = NULL, *pbn = NULL, *bn = NULL;
1768 1041 : char *ppat = NULL;
1769 1041 : const char *input = NULL;
1770 1041 : bool use_re = false,
1771 1041 : use_strcmp = false,
1772 1041 : empty = false,
1773 1041 : isensitive = (bool) *isens,
1774 1041 : anti = (bool) *not,
1775 1041 : has_nil = false,
1776 1041 : ascii_pattern = false,
1777 1041 : input_is_a_bat = isaBatType(getArgType(mb, pci, 1)),
1778 1041 : pattern_is_a_bat = isaBatType(getArgType(mb, pci, 2));
1779 1041 : bat *r = getArgReference_bat(stk, pci, 0);
1780 1041 : BUN q = 0;
1781 1041 : bit *restrict ret = NULL;
1782 : #ifdef HAVE_LIBPCRE
1783 1041 : pcre *re = NULL;
1784 1041 : pcre_extra *ex = NULL;
1785 : #else
1786 : regex_t re = (regex_t) { 0 };
1787 : void *ex = NULL;
1788 : #endif
1789 1041 : struct RE *re_simple = NULL;
1790 1041 : uint32_t *wpat = NULL;
1791 1041 : BATiter bi = (BATiter) { 0 }, pi;
1792 :
1793 1041 : (void) cntxt;
/* fetch the BAT descriptors for whichever arguments are BATs */
1794 1041 : if (input_is_a_bat) {
1795 1043 : bat *bid = getArgReference_bat(stk, pci, 1);
1796 1043 : if (!(b = BATdescriptor(*bid))) {
1797 0 : msg = createException(MAL, "batalgebra.batpcrelike3",
1798 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1799 0 : goto bailout;
1800 : }
1801 : }
1802 1047 : if (pattern_is_a_bat) {
1803 112 : bat *pb = getArgReference_bat(stk, pci, 2);
1804 112 : if (!(pbn = BATdescriptor(*pb))) {
1805 0 : msg = createException(MAL, "batalgebra.batpcrelike3",
1806 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1807 0 : goto bailout;
1808 : }
1809 : }
1810 1047 : assert((!b || ATOMstorage(b->ttype) == TYPE_str)
1811 : && (!pbn || ATOMstorage(pbn->ttype) == TYPE_str));
1812 :
/* the result has as many rows as the input BAT, or as the pattern BAT
 * when the input is a scalar */
1813 1047 : q = BATcount(b ? b : pbn);
1814 1047 : if (!(bn = COLnew(b ? b->hseqbase : pbn->hseqbase, TYPE_bit, q, TRANSIENT))) {
1815 0 : msg = createException(MAL, "batalgebra.batpcrelike3",
1816 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1817 0 : goto bailout;
1818 : }
1819 1042 : ret = (bit *) Tloc(bn, 0);
1820 :
1821 1042 : if (pattern_is_a_bat) {
/* per-row pattern: pick the strategy, build, apply and clean up the
 * matcher again for every row */
1822 111 : pi = bat_iterator(pbn);
1823 112 : if (b)
1824 112 : bi = bat_iterator(b);
1825 : else
1826 0 : input = *getArgReference_str(stk, pci, 1);
1827 :
1828 1188 : for (BUN p = 0; p < q; p++) {
1829 1076 : const char *next_input = b ? BUNtvar(bi, p) : input,
1830 1074 : *np = BUNtvar(pi, p);
1831 :
1832 1072 : if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty,
1833 : &ascii_pattern, np, *esc)) != MAL_SUCCEED) {
1834 0 : bat_iterator_end(&pi);
1835 0 : if (b)
1836 0 : bat_iterator_end(&bi);
1837 0 : goto bailout;
1838 : }
1839 :
1840 1091 : if (use_re) {
1841 639 : if ((msg = re_like_build(&re_simple, &wpat, np, isensitive,
1842 : use_strcmp, ascii_pattern,
1843 639 : (unsigned char) **esc)) != MAL_SUCCEED) {
1844 0 : bat_iterator_end(&pi);
1845 0 : if (b)
1846 0 : bat_iterator_end(&bi);
1847 0 : goto bailout;
1848 : }
1849 639 : ret[p] = re_like_proj_apply(next_input, re_simple, wpat, np,
1850 : isensitive, anti, use_strcmp,
1851 : ascii_pattern);
1852 639 : re_like_clean(&re_simple, &wpat);
1853 452 : } else if (empty) {
1854 446 : ret[p] = bit_nil;
1855 : } else {
1856 6 : if ((msg = pcre_like_build(&re, &ex, ppat, isensitive, 1)) != MAL_SUCCEED) {
1857 0 : bat_iterator_end(&pi);
1858 0 : if (b)
1859 0 : bat_iterator_end(&bi);
1860 0 : goto bailout;
1861 : }
1862 6 : if ((msg = pcre_like_apply(&(ret[p]), next_input, re, ex, ppat, anti)) != MAL_SUCCEED) {
1863 0 : bat_iterator_end(&pi);
1864 0 : if (b)
1865 0 : bat_iterator_end(&bi);
1866 0 : goto bailout;
1867 : }
1868 6 : pcre_clean(&re, &ex);
1869 : }
1870 1091 : has_nil |= is_bit_nil(ret[p]);
/* ppat belongs to this row only; drop it before the next row */
1871 1091 : GDKfree(ppat);
1872 1076 : ppat = NULL;
1873 : }
1874 112 : bat_iterator_end(&pi);
1875 112 : if (b)
1876 112 : bat_iterator_end(&bi);
1877 : } else {
/* scalar pattern: choose the strategy and build the matcher once, then
 * apply it to every row of the input BAT */
1878 931 : const char *pat = *getArgReference_str(stk, pci, 2);
1879 931 : if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty,
1880 : &ascii_pattern, pat, *esc)) != MAL_SUCCEED)
1881 5 : goto bailout;
1882 :
1883 926 : bi = bat_iterator(b);
1884 1787 : MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp
1885 : ? "pcrelike: pattern matching using strcmp" :
1886 857 : use_re ? "pcrelike: pattern matching using RE" :
1887 : "pcrelike: pattern matching using pcre");
1888 :
1889 932 : if (use_re) {
1890 701 : if ((msg = re_like_build(&re_simple, &wpat, pat, isensitive, use_strcmp,
1891 701 : ascii_pattern, (unsigned char) **esc)) != MAL_SUCCEED) {
1892 0 : bat_iterator_end(&bi);
1893 0 : goto bailout;
1894 : }
1895 5097 : for (BUN p = 0; p < q; p++) {
1896 4397 : const char *s = BUNtvar(bi, p);
1897 4399 : ret[p] = re_like_proj_apply(s, re_simple, wpat, pat, isensitive,
1898 : anti, use_strcmp, ascii_pattern);
1899 4396 : has_nil |= is_bit_nil(ret[p]);
1900 : }
1901 231 : } else if (empty) {
1902 40 : for (BUN p = 0; p < q; p++)
1903 23 : ret[p] = bit_nil;
1904 : has_nil = true;
1905 : } else {
1906 214 : if ((msg = pcre_like_build(&re, &ex, ppat, isensitive, q)) != MAL_SUCCEED) {
1907 0 : bat_iterator_end(&bi);
1908 0 : goto bailout;
1909 : }
1910 1328 : for (BUN p = 0; p < q; p++) {
1911 1115 : const char *s = BUNtvar(bi, p);
1912 1117 : if ((msg = pcre_like_apply(&(ret[p]), s, re, ex, ppat, anti)) != MAL_SUCCEED) {
1913 0 : bat_iterator_end(&bi);
1914 0 : goto bailout;
1915 : }
1916 1119 : has_nil |= is_bit_nil(ret[p]);
1917 : }
1918 : }
1919 930 : bat_iterator_end(&bi);
1920 : }
1921 :
/* common exit: release the helpers, then either publish the result BAT
 * with its properties set or discard it when an error occurred */
1922 1049 : bailout:
1923 1049 : GDKfree(ppat);
1924 1044 : re_like_clean(&re_simple, &wpat);
1925 1046 : pcre_clean(&re, &ex);
1926 1043 : if (bn && !msg) {
1927 1038 : BATsetcount(bn, q);
1928 1043 : bn->tnil = has_nil;
1929 1043 : bn->tnonil = !has_nil;
1930 1043 : bn->tkey = BATcount(bn) <= 1;
1931 1043 : bn->tsorted = BATcount(bn) <= 1;
1932 1043 : bn->trevsorted = BATcount(bn) <= 1;
1933 1043 : *r = bn->batCacheid;
1934 1043 : BBPkeepref(bn);
1935 5 : } else if (bn)
1936 5 : BBPreclaim(bn);
1937 1045 : BBPreclaim(b);
1938 1049 : BBPreclaim(pbn);
1939 1048 : return msg;
1940 : }
1942 :
1943 : static str
1944 887 : BATPCRElike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
1945 : {
1946 887 : const str *esc = getArgReference_str(stk, pci, 3);
1947 887 : const bit *ci = getArgReference_bit(stk, pci, 4);
1948 887 : bit no = FALSE;
1949 :
1950 887 : return BATPCRElike_imp(cntxt, mb, stk, pci, esc, ci, &no);
1951 : }
1952 :
1953 : static str
1954 158 : BATPCREnotlike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
1955 : {
1956 158 : const str *esc = getArgReference_str(stk, pci, 3);
1957 158 : const bit *ci = getArgReference_bit(stk, pci, 4);
1958 158 : bit yes = TRUE;
1959 :
1960 158 : return BATPCRElike_imp(cntxt, mb, stk, pci, esc, ci, &yes);
1961 : }
1962 :
1962 : /* scan select loop with or without candidates
 *
 * Appends to vals[] (advancing cnt) the oid of every row whose string v
 * satisfies TEST, or is nil while KEEP_NULLS is set.  Without a
 * candidate BAT, or with a dense one, rows p..q-1 are visited and read at
 * position p - off; otherwise ncands oids are fetched from
 * canditer_next(ci) and read at o - off.
 *
 * The macro relies on these names in the expanding scope: b, s, anti, p,
 * q, ncands, ci, bi, off, vals, cnt, qry_ctx, counter and the label
 * bailout (handed to the timeout check).  TEST is also stringified into
 * the TRC_DEBUG message, so its spelling shows up in the debug output.
 * NOTE(review): the candidate loop counts p up to ncands, which looks
 * like it expects p to start at 0 -- confirm with the callers. */
1963 : #define pcrescanloop(TEST, KEEP_NULLS) \
1964 : do { \
1965 : TRC_DEBUG(ALGO, \
1966 : "PCREselect(b=%s#"BUNFMT",anti=%d): " \
1967 : "scanselect %s\n", BATgetId(b), BATcount(b), \
1968 : anti, #TEST); \
1969 : if (!s || BATtdense(s)) { \
1970 : for (; p < q; p++) { \
1971 : GDK_CHECK_TIMEOUT(qry_ctx, counter, \
1972 : GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
1973 : const char *restrict v = BUNtvar(bi, p - off); \
1974 : if ((TEST) || ((KEEP_NULLS) && strNil(v))) \
1975 : vals[cnt++] = p; \
1976 : } \
1977 : } else { \
1978 : for (; p < ncands; p++) { \
1979 : GDK_CHECK_TIMEOUT(qry_ctx, counter, \
1980 : GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
1981 : oid o = canditer_next(ci); \
1982 : const char *restrict v = BUNtvar(bi, o - off); \
1983 : if ((TEST) || ((KEEP_NULLS) && strNil(v))) \
1984 : vals[cnt++] = o; \
1985 : } \
1986 : } \
1987 : } while (0)
1989 :
/* Match test used by pcre_likeselect() inside pcrescanloop: true when
 * the string v matches the compiled pattern re.  Expects re, ex (PCRE
 * build only) and v in the expanding scope.  In the POSIX variant every
 * regexec() result other than REG_NOMATCH counts as a match. */
1989 : #ifdef HAVE_LIBPCRE
1990 : #define PCRE_LIKESELECT_BODY (pcre_exec(re, ex, v, (int) strlen(v), 0, PCRE_NO_UTF8_CHECK, NULL, 0) >= 0)
1991 : #else
1992 : #define PCRE_LIKESELECT_BODY (regexec(&re, v, (size_t) 0, NULL, 0) != REG_NOMATCH)
1993 : #endif
1995 :
/*
 * LIKE selection using a full regular expression: compiles pat (already
 * a regular expression at this point) and writes the oids of the
 * qualifying rows of b into the tail of bn, storing their number in
 * *rcnt.  anti inverts the test; keep_nulls additionally selects nil
 * values.  s/ci describe the optional candidate list, p and q the range
 * to scan; see pcrescanloop for how they are used.
 *
 * The local names (bi, cnt, ncands, off, vals, counter, qry_ctx, re, ex)
 * and the bailout label are referenced by the pcrescanloop and
 * PCRE_LIKESELECT_BODY macros and must keep their spelling.
 */
1995 : static str
1996 458 : pcre_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q,
1997 : BUN *rcnt, const char *pat, bool caseignore, bool anti,
1998 : bool keep_nulls)
1999 : {
2000 : #ifdef HAVE_LIBPCRE
2001 458 : pcre *re = NULL;
2002 458 : pcre_extra *ex = NULL;
2003 : #else
2004 : regex_t re = (regex_t) { 0 };
2005 : void *ex = NULL;
2006 : #endif
2007 458 : BATiter bi = bat_iterator(b);
2008 460 : BUN cnt = 0, ncands = ci->ncand;
2009 460 : oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
2010 460 : str msg = MAL_SUCCEED;
2011 :
2012 460 : size_t counter = 0;
2013 460 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
2014 :
2015 461 : if ((msg = pcre_like_build(&re, &ex, pat, caseignore, ci->ncand)) != MAL_SUCCEED)
2016 0 : goto bailout;
2017 :
/* nil values never reach the regular expression: the test starts with
 * !strNil(v) */
2018 456 : if (anti)
2019 0 : pcrescanloop(!strNil(v) && !PCRE_LIKESELECT_BODY, keep_nulls);
2020 : else
2021 37477 : pcrescanloop(!strNil(v) && PCRE_LIKESELECT_BODY, keep_nulls);
2022 :
/* reached on success, on a build failure and from the timeout check */
2023 4 : bailout:
2024 455 : bat_iterator_end(&bi);
2025 459 : pcre_clean(&re, &ex);
2026 458 : *rcnt = cnt;
2027 458 : return msg;
2028 : }
2030 :
/*
 * LIKE selection without PCRE: prepares the matcher with
 * re_like_build() and writes the oids of the qualifying rows of b into
 * the tail of bn, storing their number in *rcnt.
 *
 * use_strcmp selects plain string comparison (strcmp, istrcmp or
 * mywstrcasecmp depending on caseignore and ascii_pattern); otherwise
 * the struct RE matcher is used.  anti inverts the test and keep_nulls
 * additionally selects nil values.  Every combination gets its own
 * pcrescanloop expansion, so the choice is made once, outside the loop.
 *
 * The local names (bi, cnt, ncands, off, vals, counter, qry_ctx) and the
 * bailout label are referenced by the pcrescanloop macro and must keep
 * their spelling.
 */
2030 : static str
2031 7162 : re_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q,
2032 : BUN *rcnt, const char *pat, bool caseignore, bool anti,
2033 : bool use_strcmp, uint32_t esc, bool keep_nulls,
2034 : bool ascii_pattern)
2035 : {
2036 7162 : BATiter bi = bat_iterator(b);
2037 7199 : BUN cnt = 0, ncands = ci->ncand;
2038 7199 : oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
2039 7199 : struct RE *re = NULL;
2040 7199 : uint32_t *wpat = NULL;
2041 7199 : str msg = MAL_SUCCEED;
2042 :
2043 7199 : size_t counter = 0;
2044 7199 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
2045 :
2046 7199 : if ((msg = re_like_build(&re, &wpat, pat, caseignore, use_strcmp, ascii_pattern,
2047 : esc)) != MAL_SUCCEED)
2048 0 : goto bailout;
2049 :
2050 7188 : if (use_strcmp) {
2051 114 : if (caseignore) {
2052 31 : if (ascii_pattern) {
2053 15 : if (anti)
2054 59 : pcrescanloop(!strNil(v)
2055 : && istrcmp(v, pat) != 0, keep_nulls);
2056 : else
2057 19 : pcrescanloop(!strNil(v)
2058 : && istrcmp(v, pat) == 0, keep_nulls);
2059 : } else {
2060 16 : if (anti)
2061 0 : pcrescanloop(!strNil(v)
2062 : && mywstrcasecmp(v, wpat) != 0, keep_nulls);
2063 : else
2064 52 : pcrescanloop(!strNil(v)
2065 : && mywstrcasecmp(v, wpat) == 0, keep_nulls);
2066 : }
2067 : } else {
2068 83 : if (anti)
2069 62 : pcrescanloop(!strNil(v) && strcmp(v, pat) != 0, keep_nulls);
2070 : else
2071 9863 : pcrescanloop(!strNil(v) && strcmp(v, pat) == 0, keep_nulls);
2072 : }
2073 : } else {
2074 7074 : if (caseignore) {
2075 : /* ascii_pattern == true is encoded in re */
2076 100 : if (anti) {
2077 1 : if (ascii_pattern)
2078 42 : pcrescanloop(!strNil(v)
2079 : && !re_match_no_ignore(v, re), keep_nulls);
2080 : else
2081 0 : pcrescanloop(!strNil(v)
2082 : && !re_match_ignore(v, re), keep_nulls);
2083 : } else {
2084 99 : if (ascii_pattern)
2085 6226 : pcrescanloop(!strNil(v)
2086 : && re_match_no_ignore(v, re), keep_nulls);
2087 : else
2088 104 : pcrescanloop(!strNil(v)
2089 : && re_match_ignore(v, re), keep_nulls);
2090 : }
2091 : } else {
2092 6974 : if (anti)
2093 37389 : pcrescanloop(!strNil(v)
2094 : && !re_match_no_ignore(v, re), keep_nulls);
2095 : else
2096 105183 : pcrescanloop(!strNil(v)
2097 : && re_match_no_ignore(v, re), keep_nulls);
2098 : }
2099 : }
2100 :
/* reached on success, on a build failure and from the timeout check */
2101 87 : bailout:
2102 7185 : bat_iterator_end(&bi);
2103 7200 : re_like_clean(&re, &wpat);
2104 7200 : *rcnt = cnt;
2105 7200 : return msg;
2106 : }
2108 :
2109 : static str
2110 7642 : PCRElikeselect(bat *ret, const bat *bid, const bat *sid, const str *pat,
2111 : const str *esc, const bit *caseignore, const bit *anti)
2112 : {
2113 7642 : BAT *b, *s = NULL, *bn = NULL, *old_s = NULL;
2114 7642 : str msg = MAL_SUCCEED;
2115 7642 : char *ppat = NULL;
2116 7642 : bool use_re = false,
2117 7642 : use_strcmp = false,
2118 7642 : empty = false,
2119 7642 : ascii_pattern = false;
2120 7642 : bool with_strimps = false;
2121 7642 : bool with_strimps_anti = false;
2122 7642 : BUN p = 0, q = 0, rcnt = 0;
2123 7642 : struct canditer ci;
2124 :
2125 7642 : if ((b = BATdescriptor(*bid)) == NULL) {
2126 0 : msg = createException(MAL, "algebra.likeselect",
2127 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
2128 0 : goto bailout;
2129 : }
2130 7661 : if (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL) {
2131 0 : msg = createException(MAL, "algebra.likeselect",
2132 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
2133 0 : goto bailout;
2134 : }
2135 :
2136 7662 : assert(ATOMstorage(b->ttype) == TYPE_str);
2137 :
2138 7662 : if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, &ascii_pattern,
2139 : *pat, *esc)) != MAL_SUCCEED)
2140 0 : goto bailout;
2141 :
2142 7634 : if (empty) {
2143 0 : if (!(bn = BATdense(0, 0, 0)))
2144 0 : msg = createException(MAL, "algebra.likeselect",
2145 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
2146 :
2147 0 : goto bailout;
2148 : }
2149 : /* Since the strimp pre-filtering of a LIKE query produces a superset of the actual result the complement of that
2150 : * set will necessarily reject some of the matching entries in the NOT LIKE query.
2151 : *
2152 : * In this case we run the PCRElikeselect as a LIKE query with strimps and return the complement of the result,
2153 : * taking extra care to not return NULLs. This currently means that we do not run strimps for NOT LIKE queries if
2154 : * the BAT contains NULLs.
2155 : */
2156 7634 : if (BAThasstrimps(b)) {
2157 48 : if (STRMPcreate(b, NULL) == GDK_SUCCEED) {
2158 48 : BAT *tmp_s = STRMPfilter(b, s, *pat, *anti);
2159 48 : if (tmp_s) {
2160 48 : old_s = s;
2161 48 : s = tmp_s;
2162 48 : if (!*anti)
2163 : with_strimps = true;
2164 : else
2165 0 : with_strimps_anti = true;
2166 : }
2167 : } else { /* If we cannot filter with the strimp just continue normally */
2168 0 : GDKclrerr();
2169 : }
2170 : }
2171 :
2172 :
2173 7656 : MT_thread_setalgorithm(use_strcmp
2174 7656 : ? (with_strimps ?
2175 : "pcrelike: pattern matching using strcmp with strimps"
2176 : : (with_strimps_anti ?
2177 : "pcrelike: pattern matching using strcmp with strimps anti"
2178 7656 : : "pcrelike: pattern matching using strcmp")) :
2179 7540 : use_re ? (with_strimps ?
2180 : "pcrelike: pattern matching using RE with strimps"
2181 : : (with_strimps_anti ?
2182 : "pcrelike: patterm matching using RE with strimps anti"
2183 : :
2184 : "pcrelike: pattern matching using RE"))
2185 : : (with_strimps ?
2186 : "pcrelike: pattern matching using pcre with strimps"
2187 : : (with_strimps_anti ?
2188 : "pcrelike: pattermatching using pcre with strimps anti"
2189 : : "pcrelike: pattern matching using pcre")));
2190 :
2191 7660 : canditer_init(&ci, b, s);
2192 7659 : if (!(bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT))) {
2193 0 : msg = createException(MAL, "algebra.likeselect",
2194 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
2195 0 : goto bailout;
2196 : }
2197 :
2198 7640 : if (!s || BATtdense(s)) {
2199 2429 : if (s) {
2200 5128 : assert(BATtdense(s));
2201 5128 : p = (BUN) s->tseqbase;
2202 5128 : q = p + BATcount(s);
2203 5128 : if ((oid) p < b->hseqbase)
2204 : p = b->hseqbase;
2205 5128 : if ((oid) q > b->hseqbase + BATcount(b))
2206 : q = b->hseqbase + BATcount(b);
2207 : } else {
2208 2429 : p = b->hseqbase;
2209 2429 : q = BATcount(b) + b->hseqbase;
2210 : }
2211 : }
2212 :
2213 7640 : if (use_re) {
2214 7182 : msg = re_likeselect(bn, b, s, &ci, p, q, &rcnt, *pat, *caseignore, *anti
2215 1584 : && !with_strimps_anti, use_strcmp,
2216 7182 : (unsigned char) **esc, with_strimps_anti,
2217 : ascii_pattern);
2218 : } else {
2219 458 : msg = pcre_likeselect(bn, b, s, &ci, p, q, &rcnt, ppat, *caseignore,
2220 458 : *anti && !with_strimps_anti, with_strimps_anti);
2221 : }
2222 :
2223 7646 : if (!msg) { /* set some properties */
2224 7646 : BATsetcount(bn, rcnt);
2225 7649 : bn->tsorted = true;
2226 7649 : bn->trevsorted = bn->batCount <= 1;
2227 7649 : bn->tkey = true;
2228 7649 : bn->tnil = false;
2229 7649 : bn->tnonil = true;
2230 7649 : bn->tseqbase = rcnt == 0 ? 0 : rcnt == 1 ? *(const oid *) Tloc(bn, 0) : rcnt == b->batCount ? b->hseqbase : oid_nil;
2231 7649 : if (with_strimps_anti) {
2232 : /* Reverse the result taking into account the original candidate list. */
2233 : // BAT *rev = BATdiffcand(BATdense(b->hseqbase, 0, b->batCount), bn);
2234 0 : BAT *rev;
2235 0 : if (old_s) {
2236 0 : rev = BATdiffcand(old_s, bn);
2237 : #ifndef NDEBUG
2238 0 : BAT *is = BATintersectcand(old_s, bn);
2239 0 : if (is) {
2240 0 : assert(is->batCount == bn->batCount);
2241 0 : BBPreclaim(is);
2242 : }
2243 0 : assert(rev->batCount == old_s->batCount - bn->batCount);
2244 : #endif
2245 : }
2246 :
2247 : else
2248 0 : rev = BATnegcands(b->batCount, bn);
2249 : /* BAT *rev = BATnegcands(b->batCount, bn); */
2250 0 : BBPunfix(bn->batCacheid);
2251 0 : bn = rev;
2252 : }
2253 : }
2254 :
2255 :
2256 7649 : bailout:
2257 7649 : BBPreclaim(b);
2258 7655 : BBPreclaim(s);
2259 7654 : BBPreclaim(old_s);
2260 7653 : GDKfree(ppat);
2261 7647 : if (bn && !msg) {
2262 7647 : *ret = bn->batCacheid;
2263 7647 : BBPkeepref(bn);
2264 0 : } else if (bn)
2265 0 : BBPreclaim(bn);
2266 7653 : return msg;
2267 : }
2268 :
/* APPEND(b, o): store oid o in the slot at index b->batCount of the oid
 * array at b->theap->base and post-increment b->batCount.  The macro does
 * not check the capacity of b; the caller has to make room beforehand.
 * Note that b is evaluated twice. */
2269 : #define APPEND(b, o) (((oid *) b->theap->base)[b->batCount++] = (o))
/* VALUE(s, x): address of the string at position x of the BAT with name
 * prefix s.  The prefix is token-pasted, so the expansion uses the
 * variables <s>vars, <s>vals and <s>i (e.g. lvars, lvals and li for
 * VALUE(l, x)), which must exist in the calling scope. */
2270 : #define VALUE(s, x) (s##vars + VarHeapVal(s##vals, (x), s##i.width))
2271 :
/*
 * PCRE_EXEC matches the string vl against the compiled pattern and stores
 * the outcome in retval; PCRE_EXEC_COND is true when that outcome means
 * "no match".  Both macros use the variables pcrere, pcreex (PCRE build
 * only), vl and retval of the calling scope.
 *
 * With libpcre the match is done with pcre_exec() and any negative return
 * value counts as no match.  Without libpcre the POSIX regexec() is used
 * and REG_NOMATCH or REG_ENOSYS counts as no match.
 */
2272 : #ifdef HAVE_LIBPCRE
2273 : #define PCRE_EXEC \
2274 : do { \
2275 : retval = pcre_exec(pcrere, pcreex, vl, (int) strlen(vl), 0, PCRE_NO_UTF8_CHECK, NULL, 0); \
2276 : } while (0)
2277 : #define PCRE_EXEC_COND (retval < 0)
2278 : #else
2279 : #define PCRE_EXEC \
2280 : do { \
2281 : retval = regexec(&pcrere, vl, (size_t) 0, NULL, 0); \
2282 : } while (0)
2283 : #define PCRE_EXEC_COND (retval == REG_NOMATCH || retval == REG_ENOSYS)
2284 : #endif
2285 :
2286 : /* nested loop implementation for PCRE join
 *
 * The outer loop walks over the candidates of the right-hand side (the
 * patterns, vr), the inner loop over the candidates of the left-hand
 * side (the strings, vl).  For every pattern choose_like_path() selects
 * the matching strategy and the pattern is compiled with re_like_build()
 * or pcre_like_build(); the compiled pattern is released again once all
 * left-hand values have been tested.  Patterns for which choose_like_path()
 * reports "empty" produce no output.
 *
 * The three macro arguments are expressions that are true when the pair
 * (vl, vr) must be SKIPPED:
 *   STRCMP     used when use_re and use_strcmp are both set
 *   RE_MATCH   used when use_re is set and use_strcmp is not
 *   PCRE_COND  used otherwise, evaluated after PCRE_EXEC has set retval
 * Nil values on the left-hand side are always skipped.
 *
 * For every pair that is not skipped, lo is appended to r1 and, when r2
 * is not NULL, ro is appended to r2.  Both BATs are extended when r1 is
 * full, and the tsorted, trevsorted, tkey and tseqbase properties of the
 * results are downgraded as soon as an appended value violates them.
 *
 * The macro relies on the local variables of the calling function (lci,
 * rci, lo, ro, vl, vr, lbase, rbase, nl, lastl, newcap, rskipped, re,
 * wpat, pcrepat, pcrere, pcreex, use_re, use_strcmp, empty,
 * ascii_pattern, esc, caseignore, r1, r2, msg, counter, qry_ctx and the
 * variables needed by VALUE) and jumps to the label bailout on a timeout
 * or on an error, in which case msg holds the error message.
 */
2287 : #define pcre_join_loop(STRCMP, RE_MATCH, PCRE_COND) \
2288 : do { \
2289 : for (BUN ridx = 0; ridx < rci.ncand; ridx++) { \
2290 : GDK_CHECK_TIMEOUT(qry_ctx, counter, \
2291 : GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
2292 : ro = canditer_next(&rci); \
2293 : vr = VALUE(r, ro - rbase); \
2294 : nl = 0; \
2295 : ascii_pattern = use_re = use_strcmp = empty = false; \
2296 : if ((msg = choose_like_path(&pcrepat, &use_re, &use_strcmp, &empty, &ascii_pattern, vr, esc))) \
2297 : goto bailout; \
2298 : if (!empty) { \
2299 : if (use_re) { \
2300 : if ((msg = re_like_build(&re, &wpat, vr, caseignore, use_strcmp, ascii_pattern, (unsigned char) *esc)) != MAL_SUCCEED) \
2301 : goto bailout; \
2302 : } else if (pcrepat) { \
2303 : if ((msg = pcre_like_build(&pcrere, &pcreex, pcrepat, caseignore, lci.ncand)) != MAL_SUCCEED) \
2304 : goto bailout; \
2305 : GDKfree(pcrepat); \
2306 : pcrepat = NULL; \
2307 : } \
2308 : canditer_reset(&lci); \
2309 : for (BUN lidx = 0; lidx < lci.ncand; lidx++) { \
2310 : lo = canditer_next(&lci); \
2311 : vl = VALUE(l, lo - lbase); \
2312 : if (strNil(vl)) { \
2313 : continue; \
2314 : } else if (use_re) { \
2315 : if (use_strcmp) { \
2316 : if (STRCMP) \
2317 : continue; \
2318 : } else { \
2319 : assert(re); \
2320 : if (RE_MATCH) \
2321 : continue; \
2322 : } \
2323 : } else { \
2324 : int retval; \
2325 : PCRE_EXEC; \
2326 : if (PCRE_COND) \
2327 : continue; \
2328 : } \
2329 : if (BATcount(r1) == BATcapacity(r1)) { \
2330 : newcap = BATgrows(r1); \
2331 : BATsetcount(r1, BATcount(r1)); \
2332 : if (r2) \
2333 : BATsetcount(r2, BATcount(r2)); \
2334 : if (BATextend(r1, newcap) != GDK_SUCCEED || (r2 && BATextend(r2, newcap) != GDK_SUCCEED)) { \
2335 : msg = createException(MAL, "pcre.join", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
2336 : goto bailout; \
2337 : } \
2338 : assert(!r2 || BATcapacity(r1) == BATcapacity(r2)); \
2339 : } \
2340 : if (BATcount(r1) > 0) { \
2341 : if (lastl + 1 != lo) \
2342 : r1->tseqbase = oid_nil; \
2343 : if (nl == 0) { \
2344 : if (r2) \
2345 : r2->trevsorted = false; \
2346 : if (lastl > lo) { \
2347 : r1->tsorted = false; \
2348 : r1->tkey = false; \
2349 : } else if (lastl < lo) { \
2350 : r1->trevsorted = false; \
2351 : } else { \
2352 : r1->tkey = false; \
2353 : } \
2354 : } \
2355 : } \
2356 : APPEND(r1, lo); \
2357 : if (r2) \
2358 : APPEND(r2, ro); \
2359 : lastl = lo; \
2360 : nl++; \
2361 : } \
2362 : re_like_clean(&re, &wpat); \
2363 : pcre_clean(&pcrere, &pcreex); \
2364 : } \
2365 : if (r2) { \
2366 : if (nl > 1) { \
2367 : r2->tkey = false; \
2368 : r2->tseqbase = oid_nil; \
2369 : r1->trevsorted = false; \
2370 : } else if (nl == 0) { \
2371 : rskipped = BATcount(r2) > 0; \
2372 : } else if (rskipped) { \
2373 : r2->tseqbase = oid_nil; \
2374 : } \
2375 : } else if (nl > 1) { \
2376 : r1->trevsorted = false; \
2377 : } \
2378 : } \
2379 : } while (0)
2380 :
2381 : static char *
2382 59 : pcrejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, const char *esc,
2383 : bit caseignore, bit anti)
2384 : {
2385 59 : struct canditer lci, rci;
2386 59 : const char *lvals, *rvals, *lvars, *rvars, *vl, *vr;
2387 59 : int rskipped = 0; /* whether we skipped values in r */
2388 59 : oid lbase, rbase, lo, ro, lastl = 0; /* last value inserted into r1 */
2389 59 : BUN nl, newcap;
2390 59 : char *pcrepat = NULL, *msg = MAL_SUCCEED;
2391 59 : struct RE *re = NULL;
2392 59 : bool use_re = false,
2393 59 : use_strcmp = false,
2394 59 : empty = false,
2395 59 : ascii_pattern = false;
2396 59 : uint32_t *wpat = NULL;
2397 : #ifdef HAVE_LIBPCRE
2398 59 : pcre *pcrere = NULL;
2399 59 : pcre_extra *pcreex = NULL;
2400 : #else
2401 : regex_t pcrere = (regex_t) { 0 };
2402 : void *pcreex = NULL;
2403 : #endif
2404 :
2405 59 : size_t counter = 0;
2406 59 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
2407 :
2408 59 : TRC_DEBUG(ALGO,
2409 : "pcrejoin(l=%s#" BUNFMT "[%s]%s%s,"
2410 : "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
2411 : "sr=%s#" BUNFMT "%s%s)\n",
2412 : BATgetId(l), BATcount(l), ATOMname(l->ttype),
2413 : l->tsorted ? "-sorted" : "",
2414 : l->trevsorted ? "-revsorted" : "",
2415 : BATgetId(r), BATcount(r), ATOMname(r->ttype),
2416 : r->tsorted ? "-sorted" : "",
2417 : r->trevsorted ? "-revsorted" : "",
2418 : sl ? BATgetId(sl) : "NULL", sl ? BATcount(sl) : 0,
2419 : sl && sl->tsorted ? "-sorted" : "",
2420 : sl && sl->trevsorted ? "-revsorted" : "",
2421 : sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
2422 : sr && sr->tsorted ? "-sorted" : "",
2423 : sr && sr->trevsorted ? "-revsorted" : "");
2424 :
2425 177 : assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
2426 59 : assert(ATOMtype(l->ttype) == TYPE_str);
2427 :
2428 59 : canditer_init(&lci, l, sl);
2429 59 : canditer_init(&rci, r, sr);
2430 :
2431 59 : BATiter li = bat_iterator(l);
2432 59 : BATiter ri = bat_iterator(r);
2433 59 : lbase = l->hseqbase;
2434 59 : rbase = r->hseqbase;
2435 59 : lvals = (const char *) li.base;
2436 59 : rvals = (const char *) ri.base;
2437 59 : assert(ri.vh && r->ttype);
2438 59 : lvars = li.vh->base;
2439 59 : rvars = ri.vh->base;
2440 :
2441 59 : r1->tkey = true;
2442 59 : r1->tsorted = true;
2443 59 : r1->trevsorted = true;
2444 59 : r1->tnil = false;
2445 59 : r1->tnonil = true;
2446 59 : if (r2) {
2447 43 : r2->tkey = true;
2448 43 : r2->tsorted = true;
2449 43 : r2->trevsorted = true;
2450 43 : r2->tnil = false;
2451 43 : r2->tnonil = true;
2452 : }
2453 :
2454 59 : if (anti) {
2455 35 : if (caseignore) {
2456 127 : pcre_join_loop(ascii_pattern ? istrcmp(vl, vr) == 0 : mywstrcasecmp(vl, wpat) == 0,
2457 : re_match_ignore(vl, re), !PCRE_EXEC_COND);
2458 : } else {
2459 327 : pcre_join_loop(strcmp(vl, vr) == 0, re_match_no_ignore(vl, re), !PCRE_EXEC_COND);
2460 : }
2461 : } else {
2462 24 : if (caseignore) {
2463 5 : pcre_join_loop(ascii_pattern ? istrcmp(vl, vr) != 0 : mywstrcasecmp(vl, wpat) != 0,
2464 : !re_match_ignore(vl, re), PCRE_EXEC_COND);
2465 : } else {
2466 387 : pcre_join_loop(strcmp(vl, vr) != 0, !re_match_no_ignore(vl, re), PCRE_EXEC_COND);
2467 : }
2468 : }
2469 57 : bat_iterator_end(&li);
2470 58 : bat_iterator_end(&ri);
2471 :
2472 58 : assert(!r2 || BATcount(r1) == BATcount(r2));
2473 : /* also set other bits of heap to correct value to indicate size */
2474 58 : BATsetcount(r1, BATcount(r1));
2475 59 : if (r2)
2476 43 : BATsetcount(r2, BATcount(r2));
2477 59 : if (BATcount(r1) > 0) {
2478 44 : if (BATtdense(r1))
2479 14 : r1->tseqbase = ((oid *) r1->theap->base)[0];
2480 44 : if (r2 && BATtdense(r2))
2481 33 : r2->tseqbase = ((oid *) r2->theap->base)[0];
2482 : } else {
2483 15 : r1->tseqbase = 0;
2484 15 : if (r2)
2485 9 : r2->tseqbase = 0;
2486 : }
2487 42 : if (r2)
2488 43 : TRC_DEBUG(ALGO,
2489 : "pcrejoin(l=%s,r=%s)=(%s#" BUNFMT "%s%s,%s#" BUNFMT "%s%s\n",
2490 : BATgetId(l), BATgetId(r),
2491 : BATgetId(r1), BATcount(r1),
2492 : r1->tsorted ? "-sorted" : "",
2493 : r1->trevsorted ? "-revsorted" : "",
2494 : BATgetId(r2), BATcount(r2),
2495 : r2->tsorted ? "-sorted" : "",
2496 : r2->trevsorted ? "-revsorted" : "");
2497 : else
2498 16 : TRC_DEBUG(ALGO,
2499 : "pcrejoin(l=%s,r=%s)=(%s#" BUNFMT "%s%s\n",
2500 : BATgetId(l), BATgetId(r),
2501 : BATgetId(r1), BATcount(r1),
2502 : r1->tsorted ? "-sorted" : "",
2503 : r1->trevsorted ? "-revsorted" : "");
2504 : return MAL_SUCCEED;
2505 :
2506 0 : bailout:
2507 0 : bat_iterator_end(&li);
2508 0 : bat_iterator_end(&ri);
2509 0 : GDKfree(pcrepat);
2510 0 : re_like_clean(&re, &wpat);
2511 0 : pcre_clean(&pcrere, &pcreex);
2512 0 : assert(msg != MAL_SUCCEED);
2513 : return msg;
2514 : }
2515 :
2516 : static str
2517 55 : PCREjoin(bat *r1, bat *r2, bat lid, bat rid, bat slid, bat srid, bat elid,
2518 : bat ciid, bit anti)
2519 : {
2520 55 : BAT *left = NULL, *right = NULL, *escape = NULL, *caseignore = NULL,
2521 55 : *candleft = NULL, *candright = NULL;
2522 55 : BAT *result1 = NULL, *result2 = NULL;
2523 55 : char *msg = MAL_SUCCEED;
2524 55 : const char *esc = "";
2525 55 : bit ci;
2526 55 : BATiter bi;
2527 :
2528 55 : if ((left = BATdescriptor(lid)) == NULL)
2529 0 : goto fail;
2530 59 : if ((right = BATdescriptor(rid)) == NULL)
2531 0 : goto fail;
2532 59 : if ((escape = BATdescriptor(elid)) == NULL)
2533 0 : goto fail;
2534 59 : if ((caseignore = BATdescriptor(ciid)) == NULL)
2535 0 : goto fail;
2536 59 : if (!is_bat_nil(slid) && (candleft = BATdescriptor(slid)) == NULL)
2537 0 : goto fail;
2538 59 : if (!is_bat_nil(srid) && (candright = BATdescriptor(srid)) == NULL)
2539 0 : goto fail;
2540 59 : result1 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
2541 58 : if (r2)
2542 43 : result2 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
2543 57 : if (!result1 || (r2 && !result2)) {
2544 0 : msg = createException(MAL, "pcre.join",
2545 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
2546 0 : goto fail;
2547 : }
2548 57 : result1->tnil = false;
2549 57 : result1->tnonil = true;
2550 57 : result1->tkey = true;
2551 57 : result1->tsorted = true;
2552 57 : result1->trevsorted = true;
2553 57 : result1->tseqbase = 0;
2554 57 : if (r2) {
2555 42 : result2->tnil = false;
2556 42 : result2->tnonil = true;
2557 42 : result2->tkey = true;
2558 42 : result2->tsorted = true;
2559 42 : result2->trevsorted = true;
2560 42 : result2->tseqbase = 0;
2561 : }
2562 57 : if (BATcount(escape) != 1) {
2563 0 : msg = createException(MAL, "pcre.join",
2564 : SQLSTATE(42000)
2565 : "At the moment, only one value is allowed for the escape input at pcre join");
2566 0 : goto fail;
2567 : }
2568 57 : if (BATcount(caseignore) != 1) {
2569 0 : msg = createException(MAL, "pcre.join",
2570 : SQLSTATE(42000)
2571 : "At the moment, only one value is allowed for the case ignore input at pcre join");
2572 0 : goto fail;
2573 : }
2574 57 : bi = bat_iterator(caseignore);
2575 59 : ci = *(bit *) BUNtloc(bi, 0);
2576 59 : bat_iterator_end(&bi);
2577 59 : bi = bat_iterator(escape);
2578 59 : esc = BUNtvar(bi, 0);
2579 59 : msg = pcrejoin(result1, result2, left, right, candleft, candright, esc, ci,
2580 : anti);
2581 59 : bat_iterator_end(&bi);
2582 59 : if (msg)
2583 0 : goto fail;
2584 59 : *r1 = result1->batCacheid;
2585 59 : BBPkeepref(result1);
2586 59 : if (r2) {
2587 43 : *r2 = result2->batCacheid;
2588 43 : BBPkeepref(result2);
2589 : }
2590 59 : BBPunfix(left->batCacheid);
2591 59 : BBPunfix(right->batCacheid);
2592 59 : BBPreclaim(escape);
2593 59 : BBPreclaim(caseignore);
2594 59 : BBPreclaim(candleft);
2595 59 : BBPreclaim(candright);
2596 : return MAL_SUCCEED;
2597 :
2598 0 : fail:
2599 0 : BBPreclaim(left);
2600 0 : BBPreclaim(right);
2601 0 : BBPreclaim(escape);
2602 0 : BBPreclaim(caseignore);
2603 0 : BBPreclaim(candleft);
2604 0 : BBPreclaim(candright);
2605 0 : BBPreclaim(result1);
2606 0 : BBPreclaim(result2);
2607 0 : if (msg)
2608 : return msg;
2609 0 : throw(MAL, "pcre.join", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
2610 : }
2611 :
2612 : static str
2613 39 : LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *elid,
2614 : const bat *cid, const bat *slid, const bat *srid,
2615 : const bit *nil_matches, const lng *estimate, const bit *anti)
2616 : {
2617 39 : (void) nil_matches;
2618 39 : (void) estimate;
2619 39 : return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
2620 39 : *elid, *cid, *anti);
2621 : }
2622 :
2623 : static str
2624 16 : LIKEjoin1(bat *r1, const bat *lid, const bat *rid, const bat *elid,
2625 : const bat *cid, const bat *slid, const bat *srid,
2626 : const bit *nil_matches, const lng *estimate, const bit *anti)
2627 : {
2628 16 : (void) nil_matches;
2629 16 : (void) estimate;
2630 16 : return PCREjoin(r1, NULL, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
2631 16 : *elid, *cid, *anti);
2632 : }
2633 :
2634 : #include "mel.h"
/* Atoms of the pcre module: a single atom named "pcre", followed by the
 * terminating entry (.cmp == NULL).  The table is passed to mal_module()
 * by the startup function at the end of this file. */
2635 : mel_atom pcre_init_atoms[] = {
2636 : { .name="pcre", }, { .cmp=NULL }
2637 : };
/* MAL functions implemented in this file.  Every entry lists the MAL
 * module name, the MAL function name, the C function that implements it,
 * a boolean flag (false for all entries here), the help text and the
 * argument signature; the table ends with an entry whose .imp is NULL.
 * NOTE(review): the first two numbers of args() presumably are the number
 * of results and the total number of arguments -- confirm against mel.h.
 * The table is passed to mal_module() by the startup function at the end
 * of this file. */
2638 : mel_func pcre_init_funcs[] = {
2639 : command("pcre", "index", PCREindex, false, "match a pattern, return matched position (or 0 when not found)", args(1,3, arg("",int),arg("pat",pcre),arg("s",str))),
2640 : command("pcre", "match", PCREmatch, false, "Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
2641 : command("pcre", "imatch", PCREimatch, false, "Caseless Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
2642 : command("pcre", "patindex", PCREpatindex, false, "Location of the first POSIX pattern matching against a string", args(1,3, arg("",int),arg("pat",str),arg("s",str))),
2643 : command("pcre", "replace", PCREreplace_wrap, false, "Replace _all_ matches of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
2644 : command("pcre", "replace_first", PCREreplacefirst_wrap, false, "Replace _the first_ match of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
2645 : command("pcre", "pcre_quote", PCREquote, false, "Return a PCRE pattern string that matches the argument exactly.", args(1,2, arg("",str),arg("s",str))),
2646 : command("pcre", "sql2pcre", PCREsql2pcre, false, "Convert a SQL like pattern with the given escape character into a PCRE pattern.", args(1,3, arg("",str),arg("pat",str),arg("esc",str))),
2647 : command("str", "replace", PCREreplace_wrap, false, "", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
2648 : command("batpcre", "replace", PCREreplace_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
2649 : command("batpcre", "replace_first", PCREreplacefirst_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
2650 : command("algebra", "like", PCRElike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2651 : command("algebra", "not_like", PCREnotlike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2652 : pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2653 : pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2654 : pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2655 : pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2656 : pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2657 : pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2658 : command("algebra", "likeselect", PCRElikeselect, false, "Select all head values of the first input BAT for which the\ntail value is \"like\" the given (SQL-style) pattern and for\nwhich the head value occurs in the tail of the second input\nBAT.\nInput is a dense-headed BAT, output is a dense-headed BAT with in\nthe tail the head value of the input BAT for which the\nrelationship holds. The output BAT is sorted on the tail value.", args(1,7, batarg("",oid),batarg("b",str),batarg("s",oid),arg("pat",str),arg("esc",str),arg("caseignore",bit),arg("anti",bit))),
2659 : command("algebra", "likejoin", LIKEjoin, false, "Join the string bat L with the pattern bat R\nwith optional candidate lists SL and SR using pattern escape string ESC\nand doing a case sensitive match.\nThe result is two aligned bats with oids of matching rows.", args(2,11, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
2660 : command("algebra", "likejoin", LIKEjoin1, false, "The same as LIKEjoin_esc, but only produce one output", args(1,10,batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
2661 : { .imp=NULL }
2662 : };
2663 : #include "mal_import.h"
2664 : #ifdef _MSC_VER
2665 : #undef read
2666 : #pragma section(".CRT$XCU",read)
2667 : #endif
/* Register the "pcre" module with its atom table and function table by
 * calling mal_module().
 * NOTE(review): LIB_STARTUP_FUNC presumably arranges for this function to
 * run when the library is loaded -- confirm against the macro definition. */
2668 334 : LIB_STARTUP_FUNC(init_pcre_mal)
2669 334 : { mal_module("pcre", pcre_init_atoms, pcre_init_funcs); }
|