Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * N. Nes
15 : * PCRE library interface
16 : * The PCRE library is a set of functions that implement regular
17 : * expression pattern matching using the same syntax and semantics as Perl,
18 : * with just a few differences. The current implementation of PCRE
19 : * (release 4.x) corresponds approximately with Perl 5.8, including support
20 : * for UTF-8 encoded strings. However, this support has to be
21 : * explicitly enabled; it is not the default.
22 : *
23 : * ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
24 : */
25 : #include "monetdb_config.h"
26 : #include <string.h>
27 :
28 : #include "mal.h"
29 : #include "mal_client.h"
30 : #include "mal_interpreter.h"
31 : #include "mal_exception.h"
32 :
33 : #include <wchar.h>
34 : #include <wctype.h>
35 :
36 : #ifdef HAVE_LIBPCRE
37 : #include <pcre.h>
38 : #ifndef PCRE_STUDY_JIT_COMPILE
39 : /* old library version on e.g. EPEL 6 */
40 : #define pcre_free_study(x) pcre_free(x)
41 : #define PCRE_STUDY_JIT_COMPILE 0
42 : #endif
43 : #define JIT_COMPILE_MIN 1024 /* when to try JIT compilation of patterns */
44 :
45 : #else
46 :
47 : #include <regex.h>
48 :
49 : typedef regex_t pcre;
50 : #endif
51 :
/* One segment of a LIKE pattern, i.e. the literal text between two
 * unescaped '%' wildcards.  Segments are chained through n.
 * The current implementation assumes simple %keyword% [keyw%]* patterns. */
struct RE {
	char *k;			/* segment text as bytes (byte-wise matching) */
	uint32_t *w;		/* segment text as code points (wide matching) */
	bool search:1;		/* segment was preceded by '%': find it anywhere */
	bool atend:1;		/* segment is not followed by '%': must end the string */
	bool is_ascii:1;	/* set when the pattern was created as ASCII-only */
	bool case_ignore:1;	/* set when matching must ignore case */
	size_t len;			/* number of characters stored in k or w */
	struct RE *n;		/* next segment, NULL for the last one */
};
60 :
61 : /* We cannot use strcasecmp and strncasecmp since they work byte for
62 : * byte and don't deal with multibyte encodings (such as UTF-8).
63 : *
64 : * We implement our own conversion from UTF-8 encoding to Unicode code
65 : * points which we store in uint32_t. The reason for this is,
66 : * functions like mbsrtowcs are locale-dependent (so we need a UTF-8
67 : * locale to use them), and on Windows, wchar_t is only 2 bytes and
68 : * therefore cannot hold all Unicode code points. We do use functions
69 : * such as towlower to convert a Unicode code point to its lower-case
70 : * equivalent, but again on Windows, if the code point doesn't fit in
71 : * 2 bytes, we skip this conversion and compare the unconverted code
72 : * points.
73 : *
74 : * Note, towlower is also locale-dependent, but we don't need a UTF-8
75 : * locale in order to use it. */
76 :
/* Helper function to convert a single UTF-8 multibyte character to a
 * Unicode code point.
 *
 * dest receives the decoded code point; src points at the first byte
 * of the (NUL-terminated) input.
 *
 * Returns the number of bytes consumed (1 to 4), 0 when src points at
 * the terminating NUL (in which case *dest is set to 0), or
 * (size_t) -1 when the bytes at src are not a valid UTF-8 sequence
 * (*dest is then left untouched).  Overlong encodings, code points
 * above U+10FFFF and UTF-16 surrogates (U+D800..U+DFFF) are invalid. */
static size_t
utfc8touc(uint32_t *restrict dest, const char *restrict src)
{
	if ((src[0] & 0x80) == 0) {
		/* plain ASCII, including the terminating NUL */
		*dest = src[0];
		return src[0] != 0;
	}
	if ((src[0] & 0xE0) == 0xC0
		&& (src[1] & 0xC0) == 0x80 && (src[0] & 0x1E) != 0) {
		/* two bytes; the 0x1E test rejects overlong encodings */
		*dest = (src[0] & 0x1F) << 6 | (src[1] & 0x3F);
		return 2;
	}
	if ((src[0] & 0xF0) == 0xE0
		&& (src[1] & 0xC0) == 0x80
		&& (src[2] & 0xC0) == 0x80
		&& ((src[0] & 0x0F) != 0 || (src[1] & 0x20) != 0)) {
		/* three bytes, not overlong */
		uint32_t c = (src[0] & 0x0F) << 12
			| (src[1] & 0x3F) << 6 | (src[2] & 0x3F);
		/* surrogates can only be spelled with three bytes, so this
		 * is the place to reject them */
		if ((c & 0xF800) == 0xD800)
			return (size_t) -1;
		*dest = c;
		return 3;
	}
	if ((src[0] & 0xF8) == 0xF0
		&& (src[1] & 0xC0) == 0x80
		&& (src[2] & 0xC0) == 0x80 && (src[3] & 0xC0) == 0x80) {
		uint32_t c = (src[0] & 0x07) << 18
			| (src[1] & 0x3F) << 12
			| (src[2] & 0x3F) << 6 | (src[3] & 0x3F);
		/* overlong or beyond the Unicode range */
		if (c < 0x10000 || c > 0x10FFFF)
			return (size_t) -1;
		*dest = c;
		return 4;
	}
	return (size_t) -1;
}
108 :
/* Helper function to convert a UTF-8 string to a string of Unicode
 * code points.
 *
 * Returns a buffer obtained from GDKmalloc holding one uint32_t per
 * character of src followed by a terminating 0, or NULL when src is
 * not valid UTF-8 (overlong encodings, UTF-16 surrogates and code
 * points above U+10FFFF are rejected) or when the allocation fails. */
static uint32_t *
utf8stoucs(const char *src)
{
	uint32_t *dest;
	size_t i = 0;
	size_t j = 0;

	/* count how many uint32_t's we need, while also checking for
	 * correctness of the input */
	while (src[j]) {
		i++;
		if ((src[j + 0] & 0x80) == 0) {
			j += 1;
		} else if ((src[j + 0] & 0xE0) == 0xC0
				   && (src[j + 1] & 0xC0) == 0x80
				   && (src[j + 0] & 0x1E) != 0) {
			j += 2;
		} else if ((src[j + 0] & 0xF0) == 0xE0
				   && (src[j + 1] & 0xC0) == 0x80
				   && (src[j + 2] & 0xC0) == 0x80
				   && ((src[j + 0] & 0x0F) != 0
					   || (src[j + 1] & 0x20) != 0)) {
			uint32_t c = (src[j + 0] & 0x0F) << 12
				| (src[j + 1] & 0x3F) << 6 | (src[j + 2] & 0x3F);
			/* surrogates can only be spelled with three bytes */
			if ((c & 0xF800) == 0xD800)
				return NULL;
			j += 3;
		} else if ((src[j + 0] & 0xF8) == 0xF0
				   && (src[j + 1] & 0xC0) == 0x80
				   && (src[j + 2] & 0xC0) == 0x80
				   && (src[j + 3] & 0xC0) == 0x80) {
			uint32_t c = (src[j + 0] & 0x07) << 18
				| (src[j + 1] & 0x3F) << 12
				| (src[j + 2] & 0x3F) << 6 | (src[j + 3] & 0x3F);
			/* overlong or beyond the Unicode range */
			if (c < 0x10000 || c > 0x10FFFF)
				return NULL;
			j += 4;
		} else {
			return NULL;
		}
	}
	dest = GDKmalloc((i + 1) * sizeof(uint32_t));
	if (dest == NULL)
		return NULL;
	/* go through the source string again, this time we can skip
	 * the correctness tests since the first pass accepted every
	 * sequence */
	i = j = 0;
	while (src[j]) {
		if ((src[j + 0] & 0x80) == 0) {
			dest[i++] = src[j + 0];
			j += 1;
		} else if ((src[j + 0] & 0xE0) == 0xC0) {
			dest[i++] = (src[j + 0] & 0x1F) << 6 | (src[j + 1] & 0x3F);
			j += 2;
		} else if ((src[j + 0] & 0xF0) == 0xE0) {
			dest[i++] = (src[j + 0] & 0x0F) << 12
				| (src[j + 1] & 0x3F) << 6 | (src[j + 2] & 0x3F);
			j += 3;
		} else if ((src[j + 0] & 0xF8) == 0xF0) {
			dest[i++] = (src[j + 0] & 0x07) << 18
				| (src[j + 1] & 0x3F) << 12
				| (src[j + 2] & 0x3F) << 6 | (src[j + 3] & 0x3F);
			j += 4;
		}
	}
	dest[i] = 0;
	return dest;
}
173 :
/* Return the number of code points in the zero-terminated array ucs,
 * not counting the terminator. */
static size_t
myucslen(const uint32_t *ucs)
{
	const uint32_t *end = ucs;

	while (*end != 0)
		end++;
	return (size_t) (end - ucs);
}
183 :
/* Case-insensitively compare the start of the UTF-8 string s1 with
 * the first n2 code points of s2.
 *
 * If s1 ends (or stops being decodable) before n2 characters were
 * compared, the result is whether s2 ended at the same point.  If
 * all n2 characters match, the result is true unless atend is set
 * and s1 continues past the matched prefix. */
static inline bool
mywstrncaseeq(const char *restrict s1, const uint32_t *restrict s2, size_t n2,
			  bool atend)
{
	for (size_t left = n2; left > 0; left--) {
		uint32_t hc;
		const size_t used = utfc8touc(&hc, s1);
		const uint32_t pc = *s2;

		if (used == 0 || used == (size_t) -1)
			return pc == 0;
		if (pc == 0)
			return false;
#if SIZEOF_WCHAR_T == 2
		/* towlower cannot handle code points that do not fit in a
		 * 2-byte wchar_t: compare those unconverted */
		if (hc > 0xFFFF || pc > 0xFFFF) {
			if (hc != pc)
				return false;
		} else
#endif
		if (towlower((wint_t) hc) != towlower((wint_t) pc))
			return false;
		s1 += used;
		s2++;
	}
	if (atend)
		return *s1 == 0;
	return true;
}
210 :
/* Case-insensitive comparison of two UTF-8 strings, character by
 * character.
 *
 * Returns 0 when both strings end (or stop being decodable) at the
 * same point, -1 when only s1 does, 1 when only s2 does, and
 * otherwise the difference between the first pair of characters that
 * differ after lower-casing. */
static inline int
mystrcasecmp(const char *s1, const char *s2)
{
	uint32_t c1 = 0, c2 = 0;

	for (;;) {
		const size_t len1 = utfc8touc(&c1, s1);
		const size_t len2 = utfc8touc(&c2, s2);
		const bool done1 = len1 == 0 || len1 == (size_t) -1;
		const bool done2 = len2 == 0 || len2 == (size_t) -1;

		if (done1)
			return done2 ? 0 : -1;
		if (done2)
			return 1;
#if SIZEOF_WCHAR_T == 2
		/* code points outside the 2-byte wchar_t range are compared
		 * unconverted */
		if (c1 > 0xFFFF || c2 > 0xFFFF) {
			if (c1 != c2)
				return c1 - c2;
		} else
#endif
		if (towlower((wint_t) c1) != towlower((wint_t) c2))
			return towlower((wint_t) c1) - towlower((wint_t) c2);
		s1 += len1;
		s2 += len2;
	}
}
235 :
/* Case-insensitive comparison of the UTF-8 string s1 with the
 * zero-terminated code point string s2.
 *
 * Returns 0 when both end at the same point, -1 when s1 ends (or
 * stops being decodable) first, 1 when s2 ends first, and otherwise
 * the difference between the first pair of characters that differ
 * after lower-casing. */
static inline int
mywstrcasecmp(const char *restrict s1, const uint32_t *restrict s2)
{
	uint32_t c1 = 0;

	for (;; s2++) {
		const size_t len1 = utfc8touc(&c1, s1);

		if (len1 == 0 || len1 == (size_t) -1)
			return *s2 != 0 ? -1 : 0;
		if (*s2 == 0)
			return 1;
#if SIZEOF_WCHAR_T == 2
		/* code points outside the 2-byte wchar_t range are compared
		 * unconverted */
		if (c1 > 0xFFFF || *s2 > 0xFFFF) {
			if (c1 != *s2)
				return c1 - *s2;
		} else
#endif
		if (towlower((wint_t) c1) != towlower((wint_t) * s2))
			return towlower((wint_t) c1) - towlower((wint_t) * s2);
		s1 += len1;
	}
}
259 :
/* Case-insensitive search for the code point string wneedle in the
 * UTF-8 string haystack.
 *
 * Returns a pointer to the first position in haystack where wneedle
 * matches, or NULL when there is none or when haystack cannot be
 * decoded.  With atend set, only a match that reaches the end of
 * haystack is accepted.  An empty needle matches at the start of
 * haystack, or at its terminating NUL when atend is set. */
static inline const char *
mywstrcasestr(const char *restrict haystack, const uint32_t *restrict wneedle,
			  bool atend)
{
	const size_t needle_len = myucslen(wneedle);

	if (needle_len == 0) {
		if (atend)
			return haystack + strlen(haystack);
		return haystack;
	}

	while (*haystack) {
		size_t matched = 0;		/* needle code points matched so far */
		size_t consumed = 0;	/* haystack bytes covered by them */
		size_t first_len = 0;	/* byte length of the first haystack char */

		while (matched < needle_len) {
			uint32_t hc = 0;
			const size_t nbytes = utfc8touc(&hc, haystack + consumed);

			if (nbytes == 0 || nbytes == (size_t) -1)
				return NULL;
			if (matched == 0)
				first_len = nbytes;
#if SIZEOF_WCHAR_T == 2
			/* code points outside the 2-byte wchar_t range are
			 * compared unconverted */
			if (hc > 0xFFFF || wneedle[matched] > 0xFFFF) {
				if (hc != wneedle[matched])
					break;
			} else
#endif
			if (towlower((wint_t) hc) != towlower((wint_t) wneedle[matched]))
				break;
			consumed += nbytes;
			matched++;
		}
		if (matched == needle_len && (!atend || haystack[consumed] == 0))
			return haystack;
		/* retry one whole character further on */
		haystack += first_len;
	}
	return NULL;
}
297 :
/* Returns true if the pattern, after skipping one optional leading
 * `%', does not contain an unescaped `_' (single character match).
 * A character following the escape character esc is taken literally.
 * A NULL pattern is not simple. */
static inline bool
re_simple(const char *pat, unsigned char esc)
{
	if (pat == NULL)
		return false;
	if (*pat == '%')
		pat++;
	for (bool escaped = false; *pat != '\0'; pat++) {
		if (escaped)
			escaped = false;
		else if ((unsigned char) *pat == esc)
			escaped = true;
		else if (*pat == '_')
			return false;
	}
	return true;
}
323 :
/* Returns false if the pattern ends in the middle of an escape
 * sequence, i.e. its last character is an escape character esc that
 * is not itself escaped.  A NULL pattern counts as properly
 * escaped. */
static inline bool
re_is_pattern_properly_escaped(const char *pat, unsigned char esc)
{
	bool pending = false;		/* previous character started an escape */

	if (pat == NULL)
		return true;
	for (const char *p = pat; *p != '\0'; p++)
		pending = !pending && (unsigned char) *p == esc;
	return !pending;
}
341 :
/* Returns true if the pattern can be compared as a plain string:
 * it contains no wildcard character ('%' or '_') and, when a
 * non-empty, non-nil escape string is given, that escape string does
 * not occur in the pattern.
 */
static inline bool
is_strcmpable(const char *pat, const char *esc)
{
	if (strpbrk(pat, "%_") != NULL)
		return false;
	if (esc[0] == '\0' || strNil(esc))
		return true;
	return strstr(pat, esc) == NULL;
}
352 :
/* Compare two strings byte by byte, treating the ASCII letters A-Z
 * as their lower-case counterparts.  When both strings are lower
 * case this function returns the same result as strcmp.
 *
 * Returns 0 when the strings are equal, otherwise the difference
 * between the first pair of (folded) bytes that differ.
 */
static int
istrcmp(const char *s1, const char *s2)
{
	for (; *s1 && *s2; s1++, s2++) {
		char a = *s1;
		char b = *s2;

		if (a >= 'A' && a <= 'Z')
			a += 'a' - 'A';
		if (b >= 'A' && b <= 'Z')
			b += 'a' - 'A';
		if (a != b)
			return a - b;
	}
	/* at least one string ended: equal only if both did */
	return *s1 - *s2;
}
380 :
/* Compare at most len bytes of two strings, treating the ASCII
 * letters A-Z as their lower-case counterparts.  When both strings
 * are lower case this function returns the same result as strncmp.
 *
 * Returns 0 when the first len bytes are equal (or both strings end
 * earlier at the same point), otherwise the difference between the
 * first pair of (folded) bytes that differ.
 */
static int
istrncmp(const char *s1, const char *s2, size_t len)
{
	size_t i;

	for (i = 0; i < len && s1[i] && s2[i]; i++) {
		char a = s1[i];
		char b = s2[i];

		if (a >= 'A' && a <= 'Z')
			a += 'a' - 'A';
		if (b >= 'A' && b <= 'Z')
			b += 'a' - 'A';
		if (a != b)
			return a - b;
	}
	if (i == len)
		return 0;
	/* one of the strings ended before len bytes were compared */
	return s1[i] - s2[i];
}
411 :
412 :
/* Find the first occurrence of the substring needle in haystack,
 * treating the ASCII letters A-Z in haystack as lower case.
 *
 * NOTE: This function assumes that the needle is already lowercase;
 * the needle is compared as is.
 *
 * Returns a pointer to the start of the match, or NULL when there is
 * none.  An empty haystack never matches, not even an empty needle.
 */
static const char *
istrstr(const char *haystack, const char *needle)
{
	for (const char *start = haystack; *start; start++) {
		const char *h = start;
		const char *n = needle;

		while (*n && *h) {
			char hc = *h;

			if (hc >= 'A' && hc <= 'Z')
				hc = hc - 'A' + 'a';
			if (hc != *n)
				break;
			n++;
			h++;
		}
		if (*n == 0)
			return start;		/* the whole needle was matched */
		if (*h == 0)
			return NULL;		/* haystack exhausted with needle left
								 * over: no later start can match */
	}
	return NULL;
}
451 :
452 : /* Match regular expression by comparing bytes.
453 : *
454 : * This is faster than re_match_ignore, because it does not
455 : * need to decode characters. This function should be used
456 : * in all cases except when we need to perform UTF-8
457 : * comparisons ignoring case.
458 : *
459 : * TODO: The name of the function is no longer accurate and
460 : * needs to change.
461 : */
462 : static inline bool
463 189629 : re_match_no_ignore(const char *restrict s, const struct RE *restrict pattern)
464 : {
465 189629 : const struct RE *r;
466 189629 : size_t l;
467 :
468 252293 : for (r = pattern; r; r = r->n) {
469 190431 : if (*r->k == 0 && (r->search || *s == 0))
470 : return true;
471 169010 : if (!*s ||
472 : (r->search
473 168937 : ? (r->atend
474 155444 : ? (r->case_ignore
475 5970 : ? (l = strlen(s)) < r->len || istrcmp(s + l - r->len, r->k) != 0
476 5886 : : (l = strlen(s)) < r->len || strcmp(s + l - r->len, r->k) != 0)
477 149474 : : (r->case_ignore ? (s = istrstr(s, r->k)) == NULL
478 142741 : : (s = strstr(s, r->k)) == NULL))
479 : : (r->atend
480 13493 : ? (r->case_ignore ? istrcmp(s, r->k) != 0
481 95 : : strcmp(s, r->k) != 0)
482 13398 : : (r->case_ignore ? istrncmp(s, r->k, r->len) != 0
483 13382 : : strncmp(s, r->k, r->len) != 0))))
484 : return false;
485 62664 : s += r->len;
486 : }
487 : return true;
488 : }
489 :
490 : /* Match a regular expression by comparing wide characters.
491 : *
492 : * This needs to be used when we need to perform a
493 : * case-ignoring comparions involving UTF-8 characters.
494 : */
495 : static inline bool
496 44 : re_match_ignore(const char *restrict s, const struct RE *restrict pattern)
497 : {
498 44 : const struct RE *r;
499 :
500 : /* Since the pattern is ascii, do the cheaper comparison */
501 44 : if (pattern->is_ascii) {
502 0 : return re_match_no_ignore(s, pattern);
503 : }
504 :
505 66 : for (r = pattern; r; r = r->n) {
506 47 : if (*r->w == 0 && (r->search || *s == 0))
507 : return true;
508 47 : if (!*s ||
509 : (r->search
510 47 : ? (s = mywstrcasestr(s, r->w, r->atend)) == NULL
511 14 : : !mywstrncaseeq(s, r->w, r->len, r->atend)))
512 : return false;
513 22 : s += r->len;
514 : }
515 : return true;
516 : }
517 :
518 : static void
519 5836 : re_destroy(struct RE *p)
520 : {
521 5836 : if (p) {
522 5836 : GDKfree(p->k);
523 5837 : GDKfree(p->w);
524 5931 : do {
525 5931 : struct RE *n = p->n;
526 :
527 5931 : GDKfree(p);
528 5931 : p = n;
529 5931 : } while (p);
530 : }
531 5837 : }
532 :
533 : /* Create a linked list of RE structures. Depending on the
534 : * caseignore and the ascii_pattern flags, the w
535 : * (if caseignore == true && ascii_pattern == false) or the k
536 : * (in every other case) field is used. These in the first
537 : * structure are allocated, whereas in all subsequent
538 : * structures the fields point into the allocated buffer of
539 : * the first.
540 : */
541 : static struct RE *
542 5837 : re_create(const char *pat, bool caseignore, bool ascii_pattern, uint32_t esc)
543 : {
544 5837 : struct RE *r = GDKmalloc(sizeof(struct RE)), *n = r;
545 5837 : bool escaped = false;
546 :
547 5837 : if (r == NULL)
548 : return NULL;
549 5837 : *r = (struct RE) {.atend = true };
550 :
551 11165 : while (esc != '%' && *pat == '%') {
552 5328 : pat++; /* skip % */
553 5328 : r->search = true;
554 : }
555 5837 : if (caseignore && !ascii_pattern) {
556 20 : uint32_t *wp;
557 20 : uint32_t *wq;
558 20 : wp = utf8stoucs(pat);
559 20 : if (wp == NULL) {
560 0 : GDKfree(r);
561 0 : return NULL;
562 : }
563 20 : r->w = wp;
564 20 : wq = wp;
565 68 : while (*wp) {
566 48 : if (escaped) {
567 0 : *wq++ = *wp;
568 0 : n->len++;
569 0 : escaped = false;
570 48 : } else if (*wp == esc) {
571 : escaped = true;
572 48 : } else if (*wp == '%') {
573 16 : n->atend = false;
574 16 : while (wp[1] == '%')
575 0 : wp++;
576 16 : if (wp[1]) {
577 4 : n = n->n = GDKmalloc(sizeof(struct RE));
578 4 : if (n == NULL)
579 0 : goto bailout;
580 4 : *n = (struct RE) {
581 : .search = true,
582 : .atend = true,
583 4 : .w = wp + 1,
584 : };
585 : }
586 16 : *wq = 0;
587 16 : wq = wp + 1;
588 : } else {
589 32 : *wq++ = *wp;
590 32 : n->len++;
591 : }
592 48 : wp++;
593 : }
594 20 : *wq = 0;
595 : } else {
596 5817 : char *p, *q;
597 5817 : if ((p = GDKstrdup(pat)) == NULL) {
598 0 : GDKfree(r);
599 0 : return NULL;
600 : }
601 5817 : if (ascii_pattern)
602 5814 : n->is_ascii = true;
603 5817 : if (caseignore)
604 62 : n->case_ignore = true;
605 :
606 62 : if (ascii_pattern && caseignore) {
607 586 : for (q = p; *q != 0; q++) {
608 524 : if ('A' <= *q && *q <= 'Z')
609 21 : *q += 'a' - 'A';
610 : }
611 : }
612 :
613 5817 : r->k = p;
614 5817 : q = p;
615 44175 : while (*p) {
616 38358 : if (escaped) {
617 136 : *q++ = *p;
618 136 : n->len++;
619 136 : escaped = false;
620 38222 : } else if ((unsigned char) *p == esc) {
621 : escaped = true;
622 38086 : } else if (*p == '%') {
623 5575 : n->atend = false;
624 5603 : while (p[1] == '%')
625 28 : p++;
626 5575 : if (p[1]) {
627 90 : n = n->n = GDKmalloc(sizeof(struct RE));
628 90 : if (n == NULL)
629 0 : goto bailout;
630 90 : *n = (struct RE) {
631 : .search = true,
632 : .atend = true,
633 90 : .k = p + 1
634 : };
635 90 : if (ascii_pattern) {
636 87 : n->is_ascii = true;
637 : }
638 90 : if (caseignore) {
639 16 : n->case_ignore = true;
640 : }
641 : }
642 5575 : *q = 0;
643 5575 : q = p + 1;
644 : } else {
645 32511 : char c = *p;
646 32511 : if (ascii_pattern && caseignore && 'A' <= c && c <= 'Z') {
647 0 : c += 'a' - 'A';
648 : }
649 32511 : *q++ = c;
650 32511 : n->len++;
651 : }
652 38358 : p++;
653 : }
654 5817 : *q = 0;
655 : }
656 : return r;
657 0 : bailout:
658 0 : re_destroy(r);
659 0 : return NULL;
660 : }
661 :
662 : #ifdef HAVE_LIBPCRE
663 : static str
664 25 : pcre_compile_wrap(pcre **res, const char *pattern, bit insensitive)
665 : {
666 25 : pcre *r;
667 25 : const char *err_p = NULL;
668 25 : int errpos = 0;
669 25 : int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE;
670 25 : if (insensitive)
671 0 : options |= PCRE_CASELESS;
672 :
673 25 : if ((r = pcre_compile(pattern, options, &err_p, &errpos, NULL)) == NULL) {
674 0 : throw(MAL, "pcre.compile", OPERATION_FAILED
675 : " with\n'%s'\nat %d in\n'%s'.\n", err_p, errpos, pattern);
676 : }
677 25 : *res = r;
678 25 : return MAL_SUCCEED;
679 : }
680 : #endif
681 :
682 : /* maximum number of back references and quoted \ or $ in replacement string */
683 : #define MAX_NR_REFS 20
684 :
/* One special sequence found in a replacement string: either a back
 * reference ($ or \ followed by a number) or a doubled $ or \. */
struct backref {
	int idx;		/* referenced group number; INT_MAX for a doubled
					 * $ or \ */
	int start;		/* offset in the replacement string of the $ or \ */
	int end;		/* offset just past the part of the replacement
					 * string that this entry replaces */
};
690 :
691 : #ifdef HAVE_LIBPCRE
692 : /* fill in parameter backrefs (length maxrefs) with information about
693 : * back references in the replacement string; a back reference is a
694 : * dollar or backslash followed by a number */
695 : static int
696 60 : parse_replacement(const char *replacement, int len_replacement,
697 : struct backref *backrefs, int maxrefs)
698 : {
699 60 : int nbackrefs = 0;
700 :
701 108 : for (int i = 0; i < len_replacement && nbackrefs < maxrefs; i++) {
702 48 : if (replacement[i] == '$' || replacement[i] == '\\') {
703 6 : char *endptr;
704 6 : backrefs[nbackrefs].idx = strtol(replacement + i + 1, &endptr, 10);
705 6 : if (endptr > replacement + i + 1) {
706 6 : int k = (int) (endptr - (replacement + i + 1));
707 6 : backrefs[nbackrefs].start = i;
708 6 : backrefs[nbackrefs].end = i + k + 1;
709 6 : nbackrefs++;
710 0 : } else if (replacement[i] == replacement[i + 1]) {
711 : /* doubled $ or \, we must copy just one to the output */
712 0 : backrefs[nbackrefs].idx = INT_MAX; /* impossible value > 0 */
713 0 : backrefs[nbackrefs].start = i;
714 0 : backrefs[nbackrefs].end = i + 1;
715 0 : i++; /* don't look at second $ or \ again */
716 0 : nbackrefs++;
717 : }
718 : /* else: $ or \ followed by something we don't recognize,
719 : * so just leave it */
720 : }
721 : }
722 60 : return nbackrefs;
723 : }
724 :
725 : static char *
726 30269 : single_replace(pcre *pcre_code, pcre_extra *extra,
727 : const char *origin_str, int len_origin_str,
728 : int exec_options, int *ovector, int ovecsize,
729 : const char *replacement, int len_replacement,
730 : struct backref *backrefs, int nbackrefs,
731 : bool global, char *result, int *max_result)
732 : {
733 30269 : int offset = 0;
734 30269 : int len_result = 0;
735 108913 : int addlen;
736 108913 : char *tmp;
737 :
738 108913 : do {
739 108913 : int j = pcre_exec(pcre_code, extra, origin_str, len_origin_str, offset,
740 : exec_options, ovector, ovecsize);
741 108968 : if (j <= 0)
742 : break;
743 80729 : addlen = ovector[0] - offset + (nbackrefs == 0 ? len_replacement : 0);
744 80729 : if (len_result + addlen >= *max_result) {
745 6892 : tmp = GDKrealloc(result, len_result + addlen + 1);
746 6892 : if (tmp == NULL) {
747 0 : GDKfree(result);
748 0 : return NULL;
749 : }
750 6892 : result = tmp;
751 6892 : *max_result = len_result + addlen + 1;
752 : }
753 80729 : if (ovector[0] > offset) {
754 78643 : strncpy(result + len_result, origin_str + offset,
755 78643 : ovector[0] - offset);
756 78643 : len_result += ovector[0] - offset;
757 : }
758 80729 : if (nbackrefs == 0) {
759 78647 : strncpy(result + len_result, replacement, len_replacement);
760 78647 : len_result += len_replacement;
761 : } else {
762 : int prevend = 0;
763 4164 : for (int i = 0; i < nbackrefs; i++) {
764 2082 : int off, len;
765 2082 : if (backrefs[i].idx >= ovecsize / 3) {
766 : /* out of bounds, replace with empty string */
767 : off = 0;
768 : len = 0;
769 : } else {
770 2082 : off = ovector[backrefs[i].idx * 2];
771 2082 : len = ovector[backrefs[i].idx * 2 + 1] - off;
772 : }
773 2082 : addlen = backrefs[i].start - prevend + len;
774 2082 : if (len_result + addlen >= *max_result) {
775 19 : tmp = GDKrealloc(result, len_result + addlen + 1);
776 19 : if (tmp == NULL) {
777 0 : GDKfree(result);
778 0 : return NULL;
779 : }
780 19 : result = tmp;
781 19 : *max_result = len_result + addlen + 1;
782 : }
783 2082 : if (backrefs[i].start > prevend) {
784 2 : strncpy(result + len_result, replacement + prevend,
785 2 : backrefs[i].start - prevend);
786 2 : len_result += backrefs[i].start - prevend;
787 : }
788 2082 : if (len > 0) {
789 2082 : strncpy(result + len_result, origin_str + off, len);
790 2082 : len_result += len;
791 : }
792 2082 : prevend = backrefs[i].end;
793 : }
794 : /* copy rest of replacement string (after last backref) */
795 2082 : addlen = len_replacement - prevend;
796 2082 : if (addlen > 0) {
797 2 : if (len_result + addlen >= *max_result) {
798 1 : tmp = GDKrealloc(result, len_result + addlen + 1);
799 1 : if (tmp == NULL) {
800 0 : GDKfree(result);
801 0 : return NULL;
802 : }
803 1 : result = tmp;
804 1 : *max_result = len_result + addlen + 1;
805 : }
806 2 : strncpy(result + len_result, replacement + prevend, addlen);
807 2 : len_result += addlen;
808 : }
809 : }
810 80729 : offset = ovector[1];
811 80729 : } while (offset < len_origin_str && global);
812 30324 : if (offset < len_origin_str) {
813 28239 : addlen = len_origin_str - offset;
814 28239 : if (len_result + addlen >= *max_result) {
815 314 : tmp = GDKrealloc(result, len_result + addlen + 1);
816 314 : if (tmp == NULL) {
817 0 : GDKfree(result);
818 0 : return NULL;
819 : }
820 314 : result = tmp;
821 314 : *max_result = len_result + addlen + 1;
822 : }
823 28239 : strncpy(result + len_result, origin_str + offset, addlen);
824 28239 : len_result += addlen;
825 : }
826 : /* null terminate string */
827 30324 : result[len_result] = '\0';
828 30324 : return result;
829 : }
830 : #endif
831 :
832 : static str
833 10 : pcre_replace(str *res, const char *origin_str, const char *pattern,
834 : const char *replacement, const char *flags, bool global)
835 : {
836 : #ifdef HAVE_LIBPCRE
837 10 : const char *err_p = NULL;
838 10 : pcre *pcre_code = NULL;
839 10 : pcre_extra *extra;
840 10 : char *tmpres;
841 10 : int max_result;
842 10 : int i, errpos = 0;
843 10 : int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
844 10 : int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
845 10 : int *ovector, ovecsize;
846 10 : int len_origin_str = (int) strlen(origin_str);
847 10 : int len_replacement = (int) strlen(replacement);
848 10 : struct backref backrefs[MAX_NR_REFS];
849 10 : int nbackrefs = 0;
850 :
851 14 : while (*flags) {
852 4 : switch (*flags) {
853 : case 'e':
854 : exec_options &= ~PCRE_NOTEMPTY;
855 : break;
856 1 : case 'i':
857 1 : compile_options |= PCRE_CASELESS;
858 1 : break;
859 1 : case 'm':
860 1 : compile_options |= PCRE_MULTILINE;
861 1 : break;
862 1 : case 's':
863 1 : compile_options |= PCRE_DOTALL;
864 1 : break;
865 1 : case 'x':
866 1 : compile_options |= PCRE_EXTENDED;
867 1 : break;
868 0 : default:
869 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
870 : ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
871 : *flags);
872 : }
873 4 : flags++;
874 : }
875 :
876 10 : if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
877 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
878 : OPERATION_FAILED
879 : ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
880 : pattern, errpos, err_p);
881 : }
882 :
883 : /* Since the compiled pattern is going to be used several times, it is
884 : * worth spending more time analyzing it in order to speed up the time
885 : * taken for matching.
886 : */
887 10 : extra = pcre_study(pcre_code, 0, &err_p);
888 10 : if (err_p != NULL) {
889 0 : pcre_free(pcre_code);
890 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
891 : OPERATION_FAILED
892 : ": pcre study of pattern (%s) failed with '%s'.\n", pattern,
893 : err_p);
894 : }
895 10 : pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
896 10 : ovecsize = (i + 1) * 3;
897 10 : if ((ovector = (int *) GDKmalloc(sizeof(int) * ovecsize)) == NULL) {
898 0 : pcre_free_study(extra);
899 0 : pcre_free(pcre_code);
900 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
901 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
902 : }
903 :
904 : /* identify back references in the replacement string */
905 10 : nbackrefs = parse_replacement(replacement, len_replacement,
906 : backrefs, MAX_NR_REFS);
907 :
908 10 : max_result = len_origin_str + 1;
909 10 : tmpres = GDKmalloc(max_result);
910 10 : if (tmpres == NULL) {
911 0 : GDKfree(ovector);
912 0 : pcre_free_study(extra);
913 0 : pcre_free(pcre_code);
914 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
915 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
916 : }
917 :
918 10 : tmpres = single_replace(pcre_code, extra, origin_str, len_origin_str,
919 : exec_options, ovector, ovecsize, replacement,
920 : len_replacement, backrefs, nbackrefs, global,
921 : tmpres, &max_result);
922 10 : GDKfree(ovector);
923 10 : pcre_free_study(extra);
924 10 : pcre_free(pcre_code);
925 10 : if (tmpres == NULL)
926 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
927 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
928 :
929 10 : *res = tmpres;
930 10 : return MAL_SUCCEED;
931 : #else
932 : (void) res;
933 : (void) origin_str;
934 : (void) pattern;
935 : (void) replacement;
936 : (void) flags;
937 : (void) global;
938 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
939 : "Database was compiled without PCRE support.");
940 : #endif
941 : }
942 :
943 : static str
944 50 : pcre_replace_bat(BAT **res, BAT *origin_strs, const char *pattern,
945 : const char *replacement, const char *flags, bool global)
946 : {
947 : #ifdef HAVE_LIBPCRE
948 50 : const char *err_p = NULL;
949 50 : char *tmpres;
950 50 : int i, errpos = 0;
951 50 : int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
952 50 : int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
953 50 : pcre *pcre_code = NULL;
954 50 : pcre_extra *extra;
955 50 : BAT *tmpbat;
956 50 : BUN p, q;
957 50 : int *ovector, ovecsize;
958 50 : int len_replacement = (int) strlen(replacement);
959 50 : struct backref backrefs[MAX_NR_REFS];
960 50 : int nbackrefs = 0;
961 50 : const char *origin_str;
962 50 : int max_dest_size = 0;
963 :
964 70 : while (*flags) {
965 20 : switch (*flags) {
966 : case 'e':
967 : exec_options &= ~PCRE_NOTEMPTY;
968 : break;
969 5 : case 'i':
970 5 : compile_options |= PCRE_CASELESS;
971 5 : break;
972 10 : case 'm':
973 10 : compile_options |= PCRE_MULTILINE;
974 10 : break;
975 5 : case 's':
976 5 : compile_options |= PCRE_DOTALL;
977 5 : break;
978 0 : case 'x':
979 0 : compile_options |= PCRE_EXTENDED;
980 0 : break;
981 0 : default:
982 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
983 : ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
984 : *flags);
985 : }
986 20 : flags++;
987 : }
988 :
989 50 : if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
990 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
991 : OPERATION_FAILED
992 : ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
993 : pattern, errpos, err_p);
994 : }
995 :
996 : /* Since the compiled pattern is going to be used several times,
997 : * it is worth spending more time analyzing it in order to speed
998 : * up the time taken for matching.
999 : */
1000 100 : extra = pcre_study(pcre_code,
1001 50 : BATcount(origin_strs) >
1002 : JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &err_p);
1003 50 : if (err_p != NULL) {
1004 0 : pcre_free(pcre_code);
1005 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1006 : OPERATION_FAILED);
1007 : }
1008 50 : pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
1009 50 : ovecsize = (i + 1) * 3;
1010 50 : if ((ovector = (int *) GDKzalloc(sizeof(int) * ovecsize)) == NULL) {
1011 0 : pcre_free_study(extra);
1012 0 : pcre_free(pcre_code);
1013 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1014 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1015 : }
1016 :
1017 : /* identify back references in the replacement string */
1018 50 : nbackrefs = parse_replacement(replacement, len_replacement,
1019 : backrefs, MAX_NR_REFS);
1020 :
1021 50 : tmpbat = COLnew(origin_strs->hseqbase, TYPE_str, BATcount(origin_strs),
1022 : TRANSIENT);
1023 :
1024 : /* the buffer for all destination strings is allocated only once,
1025 : * and extended when needed */
1026 50 : max_dest_size = len_replacement + 1;
1027 50 : tmpres = GDKmalloc(max_dest_size);
1028 50 : if (tmpbat == NULL || tmpres == NULL) {
1029 0 : pcre_free_study(extra);
1030 0 : pcre_free(pcre_code);
1031 0 : GDKfree(ovector);
1032 0 : BBPreclaim(tmpbat);
1033 0 : GDKfree(tmpres);
1034 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1035 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1036 : }
1037 50 : BATiter origin_strsi = bat_iterator(origin_strs);
1038 30232 : BATloop(origin_strs, p, q) {
1039 30182 : origin_str = BUNtvar(origin_strsi, p);
1040 60483 : tmpres = single_replace(pcre_code, extra, origin_str,
1041 30182 : (int) strlen(origin_str), exec_options,
1042 : ovector, ovecsize, replacement,
1043 : len_replacement, backrefs, nbackrefs, global,
1044 : tmpres, &max_dest_size);
1045 30301 : if (tmpres == NULL || BUNappend(tmpbat, tmpres, false) != GDK_SUCCEED) {
1046 0 : bat_iterator_end(&origin_strsi);
1047 0 : pcre_free_study(extra);
1048 0 : pcre_free(pcre_code);
1049 0 : GDKfree(ovector);
1050 0 : GDKfree(tmpres);
1051 0 : BBPreclaim(tmpbat);
1052 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1053 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1054 : }
1055 : }
1056 50 : bat_iterator_end(&origin_strsi);
1057 50 : pcre_free_study(extra);
1058 50 : pcre_free(pcre_code);
1059 50 : GDKfree(ovector);
1060 50 : GDKfree(tmpres);
1061 50 : *res = tmpbat;
1062 50 : return MAL_SUCCEED;
1063 : #else
1064 : (void) res;
1065 : (void) origin_strs;
1066 : (void) pattern;
1067 : (void) replacement;
1068 : (void) flags;
1069 : (void) global;
1070 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1071 : "Database was compiled without PCRE support.");
1072 : #endif
1073 : }
1074 :
1075 : static str
1076 130 : pcre_match_with_flags(bit *ret, const char *val, const char *pat,
1077 : const char *flags)
1078 : {
1079 130 : int pos;
1080 : #ifdef HAVE_LIBPCRE
1081 130 : const char *err_p = NULL;
1082 130 : int errpos = 0;
1083 130 : int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
1084 130 : pcre *re;
1085 : #else
1086 : int options = REG_NOSUB;
1087 : regex_t re;
1088 : int errcode;
1089 : int retval;
1090 : #endif
1091 :
1092 260 : while (*flags) {
1093 130 : switch (*flags) {
1094 6 : case 'i':
1095 : #ifdef HAVE_LIBPCRE
1096 6 : options |= PCRE_CASELESS;
1097 : #else
1098 : options |= REG_ICASE;
1099 : #endif
1100 6 : break;
1101 0 : case 'm':
1102 : #ifdef HAVE_LIBPCRE
1103 0 : options |= PCRE_MULTILINE;
1104 : #else
1105 : options |= REG_NEWLINE;
1106 : #endif
1107 0 : break;
1108 : #ifdef HAVE_LIBPCRE
1109 124 : case 's':
1110 124 : options |= PCRE_DOTALL;
1111 124 : break;
1112 : #endif
1113 0 : case 'x':
1114 : #ifdef HAVE_LIBPCRE
1115 0 : options |= PCRE_EXTENDED;
1116 : #else
1117 : options |= REG_EXTENDED;
1118 : #endif
1119 0 : break;
1120 0 : default:
1121 0 : throw(MAL, "pcre.match", ILLEGAL_ARGUMENT
1122 : ": unsupported flag character '%c'\n", *flags);
1123 : }
1124 130 : flags++;
1125 : }
1126 130 : if (strNil(val)) {
1127 0 : *ret = FALSE;
1128 0 : return MAL_SUCCEED;
1129 : }
1130 :
1131 : #ifdef HAVE_LIBPCRE
1132 130 : if ((re = pcre_compile(pat, options, &err_p, &errpos, NULL)) == NULL)
1133 : #else
1134 : if ((errcode = regcomp(&re, pat, options)) != 0)
1135 : #endif
1136 : {
1137 0 : throw(MAL, "pcre.match", OPERATION_FAILED
1138 : ": compilation of regular expression (%s) failed "
1139 : #ifdef HAVE_LIBPCRE
1140 : "at %d with '%s'", pat, errpos, err_p
1141 : #else
1142 : , pat
1143 : #endif
1144 : );
1145 : }
1146 : #ifdef HAVE_LIBPCRE
1147 130 : pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, PCRE_NO_UTF8_CHECK,
1148 : NULL, 0);
1149 130 : pcre_free(re);
1150 : #else
1151 : retval = regexec(&re, val, (size_t) 0, NULL, 0);
1152 : pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
1153 : regfree(&re);
1154 : #endif
1155 130 : if (pos >= 0)
1156 46 : *ret = TRUE;
1157 84 : else if (pos == -1)
1158 84 : *ret = FALSE;
1159 : else
1160 0 : throw(MAL, "pcre.match", OPERATION_FAILED
1161 : ": matching of regular expression (%s) failed with %d", pat, pos);
1162 : return MAL_SUCCEED;
1163 : }
1164 :
#ifdef HAVE_LIBPCRE
/* Characters with a special meaning in PCRE patterns.  sql2pcre and
 * pat2pcre look characters up in this list (strchr) and put a
 * backslash in front of the ones they find. */
static const char *pcre_specials = ".+?*()[]{}|^$\\";
#else
/* Characters with a special meaning in POSIX regular expressions, used
 * when building without PCRE.  sql2pcre looks characters up in this
 * list (strchr) and puts a backslash in front of the ones it finds. */
static const char *pcre_specials = "^.[$()|*+?{\\";
#endif
1173 :
1174 : /* change SQL LIKE pattern into PCRE pattern */
1175 : static str
1176 385 : sql2pcre(str *r, const char *pat, const char *esc_str)
1177 : {
1178 385 : int escaped = 0;
1179 385 : int hasWildcard = 0;
1180 385 : char *ppat;
1181 770 : int esc = strNil(esc_str) ? 0 : esc_str[0]; /* should change to utf8_convert() */
1182 385 : int specials;
1183 385 : int c;
1184 :
1185 385 : if (strlen(esc_str) > 1)
1186 0 : throw(MAL, "pcre.sql2pcre",
1187 : SQLSTATE(22019) ILLEGAL_ARGUMENT
1188 : ": ESCAPE string must have length 1");
1189 385 : if (pat == NULL)
1190 0 : throw(MAL, "pcre.sql2pcre",
1191 : SQLSTATE(22019) ILLEGAL_ARGUMENT
1192 : ": (I)LIKE pattern must not be NULL");
1193 385 : ppat = GDKmalloc(strlen(pat) * 3 +
1194 : 3 /* 3 = "^'the translated regexp'$0" */ );
1195 385 : if (ppat == NULL)
1196 0 : throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1197 :
1198 385 : *r = ppat;
1199 : /* The escape character can be a char which is special in a PCRE
1200 : * expression. If the user used the "+" char as escape and has "++"
1201 : * in their pattern, then replacing this with "+" is not correct and
1202 : * should be "\+" instead. */
1203 385 : specials = (esc && strchr(pcre_specials, esc) != NULL);
1204 :
1205 385 : *ppat++ = '^';
1206 2927 : while ((c = *pat++) != 0) {
1207 2542 : if (c == esc) {
1208 15 : if (escaped) {
1209 1 : if (specials) { /* change ++ into \+ */
1210 1 : *ppat++ = esc;
1211 : } else { /* do not escape simple escape symbols */
1212 0 : ppat[-1] = esc; /* overwrite backslash */
1213 : }
1214 : escaped = 0;
1215 : } else {
1216 14 : *ppat++ = '\\';
1217 14 : escaped = 1;
1218 : }
1219 : hasWildcard = 1;
1220 2527 : } else if (strchr(pcre_specials, c) != NULL) {
1221 : /* escape PCRE special chars, avoid double backslash if the
1222 : * user uses an invalid escape sequence */
1223 28 : if (!escaped)
1224 28 : *ppat++ = '\\';
1225 28 : *ppat++ = c;
1226 28 : hasWildcard = 1;
1227 28 : escaped = 0;
1228 2499 : } else if (c == '%' && !escaped) {
1229 317 : *ppat++ = '.';
1230 317 : *ppat++ = '*';
1231 317 : *ppat++ = '?';
1232 317 : hasWildcard = 1;
1233 : /* collapse multiple %, but only if it isn't the escape */
1234 317 : if (esc != '%')
1235 317 : while (*pat == '%')
1236 0 : pat++;
1237 2182 : } else if (c == '_' && !escaped) {
1238 492 : *ppat++ = '.';
1239 492 : hasWildcard = 1;
1240 : } else {
1241 1690 : if (escaped) {
1242 13 : ppat[-1] = c; /* overwrite backslash of invalid escape */
1243 : } else {
1244 1677 : *ppat++ = c;
1245 : }
1246 : escaped = 0;
1247 : }
1248 : }
1249 : /* no wildcard or escape character at end of string */
1250 385 : if (!hasWildcard || escaped) {
1251 1 : GDKfree(*r);
1252 1 : *r = NULL;
1253 1 : if (escaped)
1254 0 : throw(MAL, "pcre.sql2pcre",
1255 : SQLSTATE(22019) ILLEGAL_ARGUMENT
1256 : ": (I)LIKE pattern must not end with escape character");
1257 1 : *r = GDKstrdup(str_nil);
1258 1 : if (*r == NULL)
1259 0 : throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1260 : } else {
1261 384 : *ppat++ = '$';
1262 384 : *ppat = 0;
1263 : }
1264 : return MAL_SUCCEED;
1265 : }
1266 :
#ifdef HAVE_LIBPCRE
/*
 * Change an SQL PATINDEX pattern into a PCRE pattern.
 *
 * Characters from pcre_specials are backslash-escaped, '_' becomes
 * "." and '%' becomes ".*" -- except for the first '%' encountered and
 * a '%' that is the last character of the pattern, which are dropped.
 * The result is not anchored.  On success *r receives a GDKmalloc'ed
 * string that the caller must GDKfree.
 */
static str
pat2pcre(str *r, const char *pat)
{
	size_t len = strlen(pat);
	/* every input byte produces at most two output bytes; the extra
	 * room comfortably covers the terminator */
	char *ppat = GDKmalloc(len * 2 + 3);
	bool seen_percent = false;	/* has a '%' been consumed already? */

	if (ppat == NULL)
		throw(MAL, "pcre.pat2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
	*r = ppat;
	while (*pat) {
		int c = *pat++;

		if (strchr(pcre_specials, c) != NULL) {
			*ppat++ = '\\';
			*ppat++ = c;
		} else if (c == '%') {
			/* only a '%' that is neither the first one nor the final
			 * character is translated */
			if (seen_percent && *pat) {
				*ppat++ = '.';
				*ppat++ = '*';
			}
			seen_percent = true;
		} else if (c == '_') {
			*ppat++ = '.';
		} else {
			*ppat++ = c;
		}
	}
	*ppat = 0;
	return MAL_SUCCEED;
}
#endif
1301 :
1302 : /*
1303 : * @+ Wrapping
1304 : */
1305 :
1306 : static str
1307 10 : PCREreplace_wrap(str *res, const str *or, const str *pat, const str *repl,
1308 : const str *flags)
1309 : {
1310 10 : return pcre_replace(res, *or, *pat, *repl, *flags, true);
1311 : }
1312 :
1313 : static str
1314 0 : PCREreplacefirst_wrap(str *res, const str *or, const str *pat, const str *repl,
1315 : const str *flags)
1316 : {
1317 0 : return pcre_replace(res, *or, *pat, *repl, *flags, false);
1318 : }
1319 :
1320 : static str
1321 50 : PCREreplace_bat_wrap(bat *res, const bat *bid, const str *pat, const str *repl,
1322 : const str *flags)
1323 : {
1324 50 : BAT *b, *bn = NULL;
1325 50 : str msg;
1326 50 : if ((b = BATdescriptor(*bid)) == NULL)
1327 0 : throw(MAL, "batpcre.replace", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1328 :
1329 50 : msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, true);
1330 50 : if (msg == MAL_SUCCEED) {
1331 50 : *res = bn->batCacheid;
1332 50 : BBPkeepref(bn);
1333 : }
1334 50 : BBPunfix(b->batCacheid);
1335 50 : return msg;
1336 : }
1337 :
1338 : static str
1339 0 : PCREreplacefirst_bat_wrap(bat *res, const bat *bid, const str *pat,
1340 : const str *repl, const str *flags)
1341 : {
1342 0 : BAT *b, *bn = NULL;
1343 0 : str msg;
1344 0 : if ((b = BATdescriptor(*bid)) == NULL)
1345 0 : throw(MAL, "batpcre.replace_first", RUNTIME_OBJECT_MISSING);
1346 :
1347 0 : msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, false);
1348 0 : if (msg == MAL_SUCCEED) {
1349 0 : *res = bn->batCacheid;
1350 0 : BBPkeepref(bn);
1351 : }
1352 0 : BBPunfix(b->batCacheid);
1353 0 : return msg;
1354 : }
1355 :
1356 : static str
1357 124 : PCREmatch(bit *ret, const str *val, const str *pat)
1358 : {
1359 4 : return pcre_match_with_flags(ret, *val, *pat,
1360 : #ifdef HAVE_LIBPCRE
1361 : "s"
1362 : #else
1363 : "x"
1364 : #endif
1365 : );
1366 : }
1367 :
1368 : static str
1369 6 : PCREimatch(bit *ret, const str *val, const str *pat)
1370 : {
1371 0 : return pcre_match_with_flags(ret, *val, *pat, "i"
1372 : #ifndef HAVE_LIBPCRE
1373 : "x"
1374 : #endif
1375 : );
1376 : }
1377 :
1378 : static str
1379 25 : PCREindex(int *res, const pcre *pattern, const str *s)
1380 : {
1381 : #ifdef HAVE_LIBPCRE
1382 25 : int v[3];
1383 :
1384 25 : v[0] = v[1] = *res = 0;
1385 25 : if (pcre_exec(pattern, NULL, *s, (int) strlen(*s), 0,
1386 : PCRE_NO_UTF8_CHECK, v, 3) >= 0) {
1387 23 : *res = v[1];
1388 : }
1389 25 : return MAL_SUCCEED;
1390 : #else
1391 : (void) res;
1392 : (void) pattern;
1393 : (void) s;
1394 : throw(MAL, "pcre.index", "Database was compiled without PCRE support.");
1395 : #endif
1396 : }
1397 :
1398 : static str
1399 27 : PCREpatindex(int *ret, const str *pat, const str *val)
1400 : {
1401 : #ifdef HAVE_LIBPCRE
1402 27 : pcre *re = NULL;
1403 27 : char *ppat = NULL, *msg;
1404 :
1405 53 : if (strNil(*pat) || strNil(*val)) {
1406 2 : *ret = int_nil;
1407 2 : return MAL_SUCCEED;
1408 : }
1409 :
1410 25 : if ((msg = pat2pcre(&ppat, *pat)) != MAL_SUCCEED)
1411 : return msg;
1412 25 : if ((msg = pcre_compile_wrap(&re, ppat, FALSE)) != MAL_SUCCEED) {
1413 0 : GDKfree(ppat);
1414 0 : return msg;
1415 : }
1416 25 : GDKfree(ppat);
1417 25 : msg = PCREindex(ret, re, val);
1418 25 : pcre_free(re);
1419 25 : return msg;
1420 : #else
1421 : (void) ret;
1422 : (void) pat;
1423 : (void) val;
1424 : throw(MAL, "pcre.patindex", "Database was compiled without PCRE support.");
1425 : #endif
1426 : }
1427 :
1428 : static str
1429 0 : PCREquote(str *ret, const str *val)
1430 : {
1431 0 : char *p;
1432 0 : const char *s = *val;
1433 :
1434 0 : *ret = p = GDKmalloc(strlen(s) * 2 + 1); /* certainly long enough */
1435 0 : if (p == NULL)
1436 0 : throw(MAL, "pcre.quote", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1437 : /* quote all non-alphanumeric ASCII characters (i.e. leave
1438 : non-ASCII and alphanumeric alone) */
1439 0 : while (*s) {
1440 0 : if (!((*s & 0x80) != 0 ||
1441 0 : ('a' <= *s && *s <= 'z') ||
1442 0 : ('A' <= *s && *s <= 'Z') || isdigit((unsigned char) *s)))
1443 0 : *p++ = '\\';
1444 0 : *p++ = *s++;
1445 : }
1446 0 : *p = 0;
1447 0 : return MAL_SUCCEED;
1448 : }
1449 :
1450 : static str
1451 6 : PCREsql2pcre(str *ret, const str *pat, const str *esc)
1452 : {
1453 6 : return sql2pcre(ret, *pat, *esc);
1454 : }
1455 :
/* Return true when no byte of pat has its high bit set, i.e. when the
 * string consists of 7-bit ASCII only (the empty string qualifies). */
static bool
is_ascii_str(const char *pat)
{
	const unsigned char *cur = (const unsigned char *) pat;

	while (*cur != 0) {
		if (*cur >= 0x80)
			return false;
		cur++;
	}
	return true;
}
1467 :
1468 : static inline str
1469 7562 : choose_like_path(char **ppat, bool *use_re, bool *use_strcmp, bool *empty,
1470 : bool *ascii_pattern, const char *pat, const char *esc)
1471 : {
1472 7562 : str res = MAL_SUCCEED;
1473 7562 : *use_re = false;
1474 7562 : *use_strcmp = false;
1475 7562 : *empty = false;
1476 :
1477 :
1478 7562 : *ascii_pattern = is_ascii_str(pat);
1479 :
1480 14649 : if (strNil(pat) || strNil(esc)) {
1481 475 : *empty = true;
1482 : } else {
1483 7087 : if (!re_is_pattern_properly_escaped(pat, (unsigned char) *esc))
1484 5 : throw(MAL, "pcre.sql2pcre",
1485 : SQLSTATE(22019) ILLEGAL_ARGUMENT
1486 : ": (I)LIKE pattern must not end with escape character");
1487 7081 : if (is_strcmpable(pat, esc)) {
1488 865 : *use_re = true;
1489 865 : *use_strcmp = true;
1490 6216 : } else if (re_simple(pat, (unsigned char) *esc)) {
1491 5836 : *use_re = true;
1492 : } else {
1493 379 : if ((res = sql2pcre(ppat, pat, esc)) != MAL_SUCCEED)
1494 : return res;
1495 379 : if (strNil(*ppat)) {
1496 0 : GDKfree(*ppat);
1497 0 : *ppat = NULL;
1498 0 : *use_re = true;
1499 0 : *use_strcmp = true;
1500 : }
1501 : }
1502 : }
1503 : return res;
1504 : }
1505 :
1506 : static str
1507 420 : PCRElike_imp(bit *ret, const str *s, const str *pat, const str *esc,
1508 : const bit *isens)
1509 : {
1510 420 : str res = MAL_SUCCEED;
1511 420 : char *ppat = NULL;
1512 420 : bool use_re = false, use_strcmp = false, empty = false, ascii_pattern = false;
1513 420 : struct RE *re = NULL;
1514 :
1515 420 : if ((res = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, &ascii_pattern,
1516 : *pat, *esc)) != MAL_SUCCEED)
1517 : return res;
1518 :
1519 787 : MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp ?
1520 372 : "pcrelike: pattern matching using strcmp" : use_re ?
1521 : "pcrelike: pattern matching using RE" :
1522 : "pcrelike: pattern matching using pcre");
1523 :
1524 822 : if (strNil(*s) || empty) {
1525 12 : *ret = bit_nil;
1526 403 : } else if (use_re) {
1527 277 : if (use_strcmp) {
1528 31 : *ret = *isens ? (ascii_pattern
1529 4 : ? istrcmp(*s, *pat) == 0
1530 1 : : mystrcasecmp(*s, *pat) == 0)
1531 27 : : strcmp(*s, *pat) == 0;
1532 : } else {
1533 246 : if (!(re = re_create(*pat, *isens, ascii_pattern, (unsigned char) **esc)))
1534 0 : res = createException(MAL, "pcre.like4",
1535 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1536 : else
1537 492 : *ret = (*isens && !re->is_ascii)
1538 0 : ? re_match_ignore(*s, re)
1539 246 : : re_match_no_ignore(*s, re);
1540 : }
1541 : } else {
1542 126 : res = *isens ? PCREimatch(ret, s, &ppat) : PCREmatch(ret, s, &ppat);
1543 : }
1544 :
1545 289 : if (re)
1546 246 : re_destroy(re);
1547 415 : GDKfree(ppat);
1548 415 : return res;
1549 : }
1550 :
1551 : static str
1552 420 : PCRElike(bit *ret, const str *s, const str *pat, const str *esc,
1553 : const bit *isens)
1554 : {
1555 313 : return PCRElike_imp(ret, s, pat, esc, isens);
1556 : }
1557 :
1558 : static str
1559 107 : PCREnotlike(bit *ret, const str *s, const str *pat, const str *esc,
1560 : const bit *isens)
1561 : {
1562 107 : str tmp;
1563 107 : bit r;
1564 :
1565 107 : rethrow("str.not_like", tmp, PCRElike(&r, s, pat, esc, isens));
1566 103 : *ret = r == bit_nil ? bit_nil : !r;
1567 103 : return MAL_SUCCEED;
1568 : }
1569 :
1570 : static inline str
1571 6420 : re_like_build(struct RE **re, uint32_t **wpat, const char *pat, bool caseignore,
1572 : bool use_strcmp, bool ascii_pattern, uint32_t esc)
1573 : {
1574 6420 : if (!use_strcmp) {
1575 5590 : if (!(*re = re_create(pat, caseignore, ascii_pattern, esc)))
1576 0 : return createException(MAL, "pcre.re_like_build",
1577 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1578 830 : } else if (caseignore && !ascii_pattern) {
1579 29 : if (!(*wpat = utf8stoucs(pat)))
1580 0 : return createException(MAL, "pcre.re_like_build",
1581 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1582 : }
1583 : return MAL_SUCCEED;
1584 : }
1585 :
/* Return from the enclosing function: bit_nil when the string `s' in
 * scope is nil, the value of TEST otherwise. */
#define proj_scanloop(TEST)		\
	do {						\
		if (!strNil(s))			\
			return TEST;		\
		return bit_nil;			\
	} while (0)
1593 :
1594 : static inline bit
1595 4459 : re_like_proj_apply(const char *s, const struct RE *restrict re,
1596 : const uint32_t *restrict wpat, const char *pat,
1597 : bool caseignore, bool anti, bool use_strcmp, bool is_ascii)
1598 : {
1599 4459 : if (use_strcmp) {
1600 635 : if (caseignore) {
1601 158 : if (is_ascii) {
1602 140 : if (anti)
1603 198 : proj_scanloop(istrcmp(s, pat) != 0);
1604 : else
1605 82 : proj_scanloop(istrcmp(s, pat) == 0);
1606 : } else {
1607 18 : if (anti)
1608 28 : proj_scanloop(mywstrcasecmp(s, wpat) != 0);
1609 : else
1610 8 : proj_scanloop(mywstrcasecmp(s, wpat) == 0);
1611 : }
1612 : } else {
1613 477 : if (anti)
1614 596 : proj_scanloop(strcmp(s, pat) != 0);
1615 : else
1616 358 : proj_scanloop(strcmp(s, pat) == 0);
1617 : }
1618 : } else {
1619 : /* Use re_match_ignore only if the pattern is UTF-8
1620 : * and we need to ignore case
1621 : */
1622 3824 : if (caseignore && !is_ascii) {
1623 3 : if (anti)
1624 6 : proj_scanloop(!re_match_ignore(s, re));
1625 : else
1626 0 : proj_scanloop(re_match_ignore(s, re));
1627 : } else {
1628 3821 : if (anti)
1629 160 : proj_scanloop(!re_match_no_ignore(s, re));
1630 : else
1631 7482 : proj_scanloop(re_match_no_ignore(s, re));
1632 : }
1633 : }
1634 : }
1635 :
/* Release whatever re_like_build left in *re and *wpat and reset both
 * to NULL; either may already be NULL. */
static inline void
re_like_clean(struct RE **re, uint32_t **wpat)
{
	if (*re != NULL) {
		struct RE *compiled = *re;

		re_destroy(compiled);
		*re = NULL;
	}
	if (*wpat != NULL) {
		uint32_t *wide = *wpat;

		GDKfree(wide);
		*wpat = NULL;
	}
}
1648 :
1649 : #ifdef HAVE_LIBPCRE
1650 : static inline str
1651 253 : pcre_like_build(pcre **res, pcre_extra **ex, const char *ppat, bool caseignore,
1652 : BUN count)
1653 : {
1654 253 : const char *err_p = NULL;
1655 253 : int errpos = 0;
1656 253 : int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE | PCRE_DOTALL;
1657 253 : int pcrestopt = count > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0;
1658 :
1659 253 : *res = NULL;
1660 253 : *ex = NULL;
1661 :
1662 253 : if (caseignore) {
1663 12 : options |= PCRE_CASELESS;
1664 : }
1665 253 : if ((*res = pcre_compile(ppat, options, &err_p, &errpos, NULL)) == NULL)
1666 0 : return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
1667 : ": compilation of regular expression (%s) failed"
1668 : " at %d with '%s'", ppat, errpos, err_p);
1669 253 : *ex = pcre_study(*res, pcrestopt, &err_p);
1670 253 : if (err_p != NULL)
1671 0 : return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
1672 : ": pcre study of pattern (%s) "
1673 : "failed with '%s'", ppat, err_p);
1674 : return MAL_SUCCEED;
1675 : }
1676 : #else
1677 : static inline str
1678 : pcre_like_build(regex_t *res, void *ex, const char *ppat, bool caseignore,
1679 : BUN count)
1680 : {
1681 : int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED;
1682 : int errcode;
1683 :
1684 : *res = (regex_t) {
1685 : 0};
1686 : (void) count;
1687 :
1688 : if (caseignore) {
1689 : options |= REG_ICASE;
1690 : }
1691 : if ((errcode = regcomp(res, ppat, options)) != 0)
1692 : return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
1693 : ": compilation of regular expression (%s) failed",
1694 : ppat);
1695 : (void) ex;
1696 : return MAL_SUCCEED;
1697 : }
1698 : #endif
1699 :
/*
 * Body shared by the LIKE and NOT LIKE variants of pcre_like_apply.
 *
 * Relies on `s', `ret', `pos' and `ppat' from the enclosing scope.
 * LOOP_BODY is a statement sequence that performs the match and sets
 * `pos'; note that it is executed before the nil test on `s'.  The
 * result stored in *ret is bit_nil for a nil `s', RES1 when pos >= 0
 * (match) and RES2 when pos == -1 (no match).  Any other value of pos
 * makes the enclosing function return a "pcre.match" exception.
 */
#define PCRE_LIKE_BODY(LOOP_BODY, RES1, RES2)							\
	do {																\
		LOOP_BODY														\
		if (strNil(s))													\
			*ret = bit_nil;												\
		else if (pos >= 0)												\
			*ret = RES1;												\
		else if (pos == -1)												\
			*ret = RES2;												\
		else															\
			return createException(MAL, "pcre.match", OPERATION_FAILED ": matching of regular expression (%s) failed with %d", ppat, pos); \
	} while(0)
1712 :
1713 : static inline str
1714 1096 : pcre_like_apply(bit *ret, const char *s,
1715 : #ifdef HAVE_LIBPCRE
1716 : const pcre *re, const pcre_extra *ex
1717 : #else
1718 : regex_t re, void *ex
1719 : #endif
1720 : , const char *ppat, bool anti)
1721 : {
1722 1096 : int pos;
1723 :
1724 : #ifdef HAVE_LIBPCRE
1725 : #define LOOP_BODY \
1726 : pos = pcre_exec(re, ex, s, (int) strlen(s), 0, PCRE_NO_UTF8_CHECK, NULL, 0);
1727 : #else
1728 : #define LOOP_BODY \
1729 : int retval = regexec(&re, s, (size_t) 0, NULL, 0); \
1730 : (void) ex; \
1731 : pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
1732 : #endif
1733 :
1734 1096 : if (anti)
1735 6 : PCRE_LIKE_BODY(LOOP_BODY, FALSE, TRUE);
1736 : else
1737 1090 : PCRE_LIKE_BODY(LOOP_BODY, TRUE, FALSE);
1738 :
1739 : return MAL_SUCCEED;
1740 : }
1741 :
#ifdef HAVE_LIBPCRE
/* Free the compiled pattern and its study data, if present, and reset
 * both pointers to NULL. */
static inline void
pcre_clean(pcre **re, pcre_extra **ex)
{
	if (*re != NULL)
		pcre_free(*re);
	if (*ex != NULL)
		pcre_free_study(*ex);
	*re = NULL;
	*ex = NULL;
}
#else
/* POSIX fallback: release the compiled expression and zero the
 * structure; ex is unused. */
static inline void
pcre_clean(regex_t *re, void *ex)
{
	(void) ex;
	regfree(re);
	*re = (regex_t) { 0 };
}
#endif
1762 :
1763 : static str
1764 461 : BATPCRElike_imp(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci,
1765 : const str *esc, const bit *isens, const bit *not)
1766 : {
1767 461 : str msg = MAL_SUCCEED;
1768 461 : BAT *b = NULL, *pbn = NULL, *bn = NULL;
1769 461 : char *ppat = NULL;
1770 461 : const char *input = NULL;
1771 461 : bool use_re = false,
1772 461 : use_strcmp = false,
1773 461 : empty = false,
1774 461 : isensitive = (bool) *isens,
1775 461 : anti = (bool) *not,
1776 461 : has_nil = false,
1777 461 : ascii_pattern = false,
1778 461 : input_is_a_bat = isaBatType(getArgType(mb, pci, 1)),
1779 461 : pattern_is_a_bat = isaBatType(getArgType(mb, pci, 2));
1780 461 : bat *r = getArgReference_bat(stk, pci, 0);
1781 461 : BUN q = 0;
1782 461 : bit *restrict ret = NULL;
1783 : #ifdef HAVE_LIBPCRE
1784 461 : pcre *re = NULL;
1785 461 : pcre_extra *ex = NULL;
1786 : #else
1787 : regex_t re = (regex_t) { 0 };
1788 : void *ex = NULL;
1789 : #endif
1790 461 : struct RE *re_simple = NULL;
1791 461 : uint32_t *wpat = NULL;
1792 461 : BATiter bi = (BATiter) { 0 }, pi;
1793 :
1794 461 : (void) cntxt;
1795 461 : if (input_is_a_bat) {
1796 458 : bat *bid = getArgReference_bat(stk, pci, 1);
1797 458 : if (!(b = BATdescriptor(*bid))) {
1798 0 : msg = createException(MAL, "batalgebra.batpcrelike3",
1799 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1800 0 : goto bailout;
1801 : }
1802 : }
1803 461 : if (pattern_is_a_bat) {
1804 80 : bat *pb = getArgReference_bat(stk, pci, 2);
1805 80 : if (!(pbn = BATdescriptor(*pb))) {
1806 0 : msg = createException(MAL, "batalgebra.batpcrelike3",
1807 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1808 0 : goto bailout;
1809 : }
1810 : }
1811 461 : assert((!b || ATOMstorage(b->ttype) == TYPE_str)
1812 : && (!pbn || ATOMstorage(pbn->ttype) == TYPE_str));
1813 :
1814 461 : q = BATcount(b ? b : pbn);
1815 461 : if (!(bn = COLnew(b ? b->hseqbase : pbn->hseqbase, TYPE_bit, q, TRANSIENT))) {
1816 0 : msg = createException(MAL, "batalgebra.batpcrelike3",
1817 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1818 0 : goto bailout;
1819 : }
1820 461 : ret = (bit *) Tloc(bn, 0);
1821 :
1822 461 : if (pattern_is_a_bat) {
1823 80 : pi = bat_iterator(pbn);
1824 80 : if (b)
1825 77 : bi = bat_iterator(b);
1826 : else
1827 3 : input = *getArgReference_str(stk, pci, 1);
1828 :
1829 1167 : for (BUN p = 0; p < q; p++) {
1830 1088 : const char *next_input = b ? BUNtvar(bi, p) : input,
1831 1088 : *np = BUNtvar(pi, p);
1832 :
1833 1088 : if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty,
1834 : &ascii_pattern, np, *esc)) != MAL_SUCCEED) {
1835 0 : bat_iterator_end(&pi);
1836 0 : if (b)
1837 0 : bat_iterator_end(&bi);
1838 0 : goto bailout;
1839 : }
1840 :
1841 1087 : if (use_re) {
1842 626 : if ((msg = re_like_build(&re_simple, &wpat, np, isensitive,
1843 : use_strcmp, ascii_pattern,
1844 626 : (unsigned char) **esc)) != MAL_SUCCEED) {
1845 0 : bat_iterator_end(&pi);
1846 0 : if (b)
1847 0 : bat_iterator_end(&bi);
1848 0 : goto bailout;
1849 : }
1850 626 : ret[p] = re_like_proj_apply(next_input, re_simple, wpat, np,
1851 : isensitive, anti, use_strcmp,
1852 : ascii_pattern);
1853 626 : re_like_clean(&re_simple, &wpat);
1854 461 : } else if (empty) {
1855 455 : ret[p] = bit_nil;
1856 : } else {
1857 6 : if ((msg = pcre_like_build(&re, &ex, ppat, isensitive, 1)) != MAL_SUCCEED) {
1858 0 : bat_iterator_end(&pi);
1859 0 : if (b)
1860 0 : bat_iterator_end(&bi);
1861 0 : goto bailout;
1862 : }
1863 6 : if ((msg = pcre_like_apply(&(ret[p]), next_input, re, ex, ppat, anti)) != MAL_SUCCEED) {
1864 0 : bat_iterator_end(&pi);
1865 0 : if (b)
1866 0 : bat_iterator_end(&bi);
1867 0 : goto bailout;
1868 : }
1869 6 : pcre_clean(&re, &ex);
1870 : }
1871 1087 : has_nil |= is_bit_nil(ret[p]);
1872 1087 : GDKfree(ppat);
1873 1087 : ppat = NULL;
1874 : }
1875 79 : bat_iterator_end(&pi);
1876 80 : if (b)
1877 77 : bat_iterator_end(&bi);
1878 : } else {
1879 381 : const char *pat = *getArgReference_str(stk, pci, 2);
1880 381 : if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty,
1881 : &ascii_pattern, pat, *esc)) != MAL_SUCCEED)
1882 0 : goto bailout;
1883 :
1884 381 : bi = bat_iterator(b);
1885 753 : MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp
1886 : ? "pcrelike: pattern matching using strcmp" :
1887 373 : use_re ? "pcrelike: pattern matching using RE" :
1888 : "pcrelike: pattern matching using pcre");
1889 :
1890 380 : if (use_re) {
1891 300 : if ((msg = re_like_build(&re_simple, &wpat, pat, isensitive, use_strcmp,
1892 299 : ascii_pattern, (unsigned char) **esc)) != MAL_SUCCEED) {
1893 0 : bat_iterator_end(&bi);
1894 0 : goto bailout;
1895 : }
1896 4134 : for (BUN p = 0; p < q; p++) {
1897 3835 : const char *s = BUNtvar(bi, p);
1898 3834 : ret[p] = re_like_proj_apply(s, re_simple, wpat, pat, isensitive,
1899 : anti, use_strcmp, ascii_pattern);
1900 3834 : has_nil |= is_bit_nil(ret[p]);
1901 : }
1902 81 : } else if (empty) {
1903 0 : for (BUN p = 0; p < q; p++)
1904 0 : ret[p] = bit_nil;
1905 : has_nil = true;
1906 : } else {
1907 81 : if ((msg = pcre_like_build(&re, &ex, ppat, isensitive, q)) != MAL_SUCCEED) {
1908 0 : bat_iterator_end(&bi);
1909 0 : goto bailout;
1910 : }
1911 1172 : for (BUN p = 0; p < q; p++) {
1912 1091 : const char *s = BUNtvar(bi, p);
1913 1090 : if ((msg = pcre_like_apply(&(ret[p]), s, re, ex, ppat, anti)) != MAL_SUCCEED) {
1914 0 : bat_iterator_end(&bi);
1915 0 : goto bailout;
1916 : }
1917 1091 : has_nil |= is_bit_nil(ret[p]);
1918 : }
1919 : }
1920 380 : bat_iterator_end(&bi);
1921 : }
1922 :
1923 461 : bailout:
1924 461 : GDKfree(ppat);
1925 461 : re_like_clean(&re_simple, &wpat);
1926 461 : pcre_clean(&re, &ex);
1927 461 : if (bn && !msg) {
1928 461 : BATsetcount(bn, q);
1929 461 : bn->tnil = has_nil;
1930 461 : bn->tnonil = !has_nil;
1931 461 : bn->tkey = BATcount(bn) <= 1;
1932 461 : bn->tsorted = BATcount(bn) <= 1;
1933 461 : bn->trevsorted = BATcount(bn) <= 1;
1934 461 : *r = bn->batCacheid;
1935 461 : BBPkeepref(bn);
1936 0 : } else if (bn)
1937 0 : BBPreclaim(bn);
1938 461 : BBPreclaim(b);
1939 461 : BBPreclaim(pbn);
1940 461 : return msg;
1941 : }
1942 :
1943 : static str
1944 429 : BATPCRElike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
1945 : {
1946 429 : const str *esc = getArgReference_str(stk, pci, 3);
1947 429 : const bit *ci = getArgReference_bit(stk, pci, 4);
1948 429 : bit no = FALSE;
1949 :
1950 429 : return BATPCRElike_imp(cntxt, mb, stk, pci, esc, ci, &no);
1951 : }
1952 :
1953 : static str
1954 32 : BATPCREnotlike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
1955 : {
1956 32 : const str *esc = getArgReference_str(stk, pci, 3);
1957 32 : const bit *ci = getArgReference_bit(stk, pci, 4);
1958 32 : bit yes = TRUE;
1959 :
1960 32 : return BATPCRElike_imp(cntxt, mb, stk, pci, esc, ci, &yes);
1961 : }
1962 :
/*
 * Scan select loop with or without candidates.
 *
 * Expands in place and relies on these names from the enclosing
 * function: b, s, ci, bi, p, q, off, ncands, vals, cnt, anti,
 * timeoffset, counter and a label `bailout'.
 *
 * TEST is an expression over the current string value `v'.  Without a
 * candidate list (or with a dense one) positions p..q-1 are visited
 * and every p for which TEST holds is appended to vals; otherwise
 * candidates are fetched with canditer_next while p counts up to
 * ncands, and the matching candidate oids are appended.  When
 * KEEP_NULLS is true, nil values are appended as well.
 * GDK_CHECK_TIMEOUT is invoked on every iteration with
 * GOTO_LABEL_TIMEOUT_HANDLER(bailout) as its handler.
 */
#define pcrescanloop(TEST, KEEP_NULLS)									\
	do {																\
		TRC_DEBUG(ALGO,													\
				  "PCREselect(b=%s#"BUNFMT",anti=%d): "					\
				  "scanselect %s\n", BATgetId(b), BATcount(b),			\
				  anti, #TEST);											\
		if (!s || BATtdense(s)) {										\
			for (; p < q; p++) {										\
				GDK_CHECK_TIMEOUT(timeoffset, counter,					\
								  GOTO_LABEL_TIMEOUT_HANDLER(bailout));	\
				const char *restrict v = BUNtvar(bi, p - off);			\
				if ((TEST) || ((KEEP_NULLS) && strNil(v)))				\
					vals[cnt++] = p;									\
			}															\
		} else {														\
			for (; p < ncands; p++) {									\
				GDK_CHECK_TIMEOUT(timeoffset, counter,					\
								  GOTO_LABEL_TIMEOUT_HANDLER(bailout));	\
				oid o = canditer_next(ci);								\
				const char *restrict v = BUNtvar(bi, o - off);			\
				if ((TEST) || ((KEEP_NULLS) && strNil(v)))				\
					vals[cnt++] = o;									\
			}															\
		}																\
	} while (0)
1989 :
/*
 * Match test for use inside pcrescanloop: true when the string `v'
 * matches the compiled pattern `re' (and `ex') from the enclosing
 * scope.  With PCRE every negative pcre_exec result, i.e. no match as
 * well as errors, counts as "no match"; in the POSIX fallback every
 * regexec result other than REG_NOMATCH counts as a match.
 */
#ifdef HAVE_LIBPCRE
#define PCRE_LIKESELECT_BODY (pcre_exec(re, ex, v, (int) strlen(v), 0, PCRE_NO_UTF8_CHECK, NULL, 0) >= 0)
#else
#define PCRE_LIKESELECT_BODY (regexec(&re, v, (size_t) 0, NULL, 0) != REG_NOMATCH)
#endif
1995 :
1996 : static str
1997 160 : pcre_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q,
1998 : BUN *rcnt, const char *pat, bool caseignore, bool anti,
1999 : bool keep_nulls)
2000 : {
2001 : #ifdef HAVE_LIBPCRE
2002 160 : pcre *re = NULL;
2003 160 : pcre_extra *ex = NULL;
2004 : #else
2005 : regex_t re = (regex_t) { 0 };
2006 : void *ex = NULL;
2007 : #endif
2008 160 : BATiter bi = bat_iterator(b);
2009 160 : BUN cnt = 0, ncands = ci->ncand;
2010 160 : oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
2011 160 : str msg = MAL_SUCCEED;
2012 :
2013 160 : size_t counter = 0;
2014 160 : lng timeoffset = 0;
2015 160 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
2016 160 : if (qry_ctx != NULL) {
2017 144 : timeoffset = (qry_ctx->starttime
2018 144 : && qry_ctx->querytimeout) ? (qry_ctx->starttime +
2019 144 : qry_ctx->querytimeout) : 0;
2020 : }
2021 :
2022 160 : if ((msg = pcre_like_build(&re, &ex, pat, caseignore, ci->ncand)) != MAL_SUCCEED)
2023 0 : goto bailout;
2024 :
2025 160 : if (anti)
2026 0 : pcrescanloop(!strNil(v) && !PCRE_LIKESELECT_BODY, keep_nulls);
2027 : else
2028 37521 : pcrescanloop(!strNil(v) && PCRE_LIKESELECT_BODY, keep_nulls);
2029 :
2030 4 : bailout:
2031 160 : bat_iterator_end(&bi);
2032 160 : pcre_clean(&re, &ex);
2033 160 : *rcnt = cnt;
2034 160 : return msg;
2035 : }
2036 :
2037 : static str
2038 5376 : re_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q,
2039 : BUN *rcnt, const char *pat, bool caseignore, bool anti,
2040 : bool use_strcmp, uint32_t esc, bool keep_nulls,
2041 : bool ascii_pattern)
2042 : {
2043 5376 : BATiter bi = bat_iterator(b);
2044 5376 : BUN cnt = 0, ncands = ci->ncand;
2045 5376 : oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
2046 5376 : struct RE *re = NULL;
2047 5376 : uint32_t *wpat = NULL;
2048 5376 : str msg = MAL_SUCCEED;
2049 :
2050 5376 : size_t counter = 0;
2051 5376 : lng timeoffset = 0;
2052 5376 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
2053 5376 : if (qry_ctx != NULL) {
2054 2898 : timeoffset = (qry_ctx->starttime
2055 2898 : && qry_ctx->querytimeout) ? (qry_ctx->starttime +
2056 2898 : qry_ctx->querytimeout) : 0;
2057 : }
2058 :
2059 5376 : if ((msg = re_like_build(&re, &wpat, pat, caseignore, use_strcmp, ascii_pattern,
2060 : esc)) != MAL_SUCCEED)
2061 0 : goto bailout;
2062 :
2063 5375 : if (use_strcmp) {
2064 90 : if (caseignore) {
2065 30 : if (ascii_pattern) {
2066 22 : if (anti)
2067 64 : pcrescanloop(!strNil(v)
2068 : && istrcmp(v, pat) != 0, keep_nulls);
2069 : else
2070 635 : pcrescanloop(!strNil(v)
2071 : && istrcmp(v, pat) == 0, keep_nulls);
2072 : } else {
2073 8 : if (anti)
2074 0 : pcrescanloop(!strNil(v)
2075 : && mywstrcasecmp(v, wpat) != 0, keep_nulls);
2076 : else
2077 36 : pcrescanloop(!strNil(v)
2078 : && mywstrcasecmp(v, wpat) == 0, keep_nulls);
2079 : }
2080 : } else {
2081 60 : if (anti)
2082 54 : pcrescanloop(!strNil(v) && strcmp(v, pat) != 0, keep_nulls);
2083 : else
2084 10263 : pcrescanloop(!strNil(v) && strcmp(v, pat) == 0, keep_nulls);
2085 : }
2086 : } else {
2087 5285 : if (caseignore) {
2088 : /* ascii_pattern == true is encoded in re */
2089 52 : if (anti) {
2090 0 : if (ascii_pattern)
2091 0 : pcrescanloop(!strNil(v)
2092 : && !re_match_no_ignore(v, re), keep_nulls);
2093 : else
2094 0 : pcrescanloop(!strNil(v)
2095 : && !re_match_ignore(v, re), keep_nulls);
2096 : } else {
2097 52 : if (ascii_pattern)
2098 6795 : pcrescanloop(!strNil(v)
2099 : && re_match_no_ignore(v, re), keep_nulls);
2100 : else
2101 72 : pcrescanloop(!strNil(v)
2102 : && re_match_ignore(v, re), keep_nulls);
2103 : }
2104 : } else {
2105 5233 : if (anti)
2106 60004 : pcrescanloop(!strNil(v)
2107 : && !re_match_no_ignore(v, re), keep_nulls);
2108 : else
2109 133119 : pcrescanloop(!strNil(v)
2110 : && re_match_no_ignore(v, re), keep_nulls);
2111 : }
2112 : }
2113 :
2114 80 : bailout:
2115 5375 : bat_iterator_end(&bi);
2116 5375 : re_like_clean(&re, &wpat);
2117 5376 : *rcnt = cnt;
2118 5376 : return msg;
2119 : }
2120 :
2121 : static str
2122 5536 : PCRElikeselect(bat *ret, const bat *bid, const bat *sid, const str *pat,
2123 : const str *esc, const bit *caseignore, const bit *anti)
2124 : {
2125 5536 : BAT *b, *s = NULL, *bn = NULL, *old_s = NULL;
2126 5536 : str msg = MAL_SUCCEED;
2127 5536 : char *ppat = NULL;
2128 5536 : bool use_re = false,
2129 5536 : use_strcmp = false,
2130 5536 : empty = false,
2131 5536 : ascii_pattern = false;
2132 5536 : bool with_strimps = false;
2133 5536 : bool with_strimps_anti = false;
2134 5536 : BUN p = 0, q = 0, rcnt = 0;
2135 5536 : struct canditer ci;
2136 :
2137 5536 : if ((b = BATdescriptor(*bid)) == NULL) {
2138 0 : msg = createException(MAL, "algebra.likeselect",
2139 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
2140 0 : goto bailout;
2141 : }
2142 5536 : if (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL) {
2143 0 : msg = createException(MAL, "algebra.likeselect",
2144 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
2145 0 : goto bailout;
2146 : }
2147 :
2148 5536 : assert(ATOMstorage(b->ttype) == TYPE_str);
2149 :
2150 5536 : if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, &ascii_pattern,
2151 : *pat, *esc)) != MAL_SUCCEED)
2152 0 : goto bailout;
2153 :
2154 5536 : if (empty) {
2155 0 : if (!(bn = BATdense(0, 0, 0)))
2156 0 : msg = createException(MAL, "algebra.likeselect",
2157 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
2158 :
2159 0 : goto bailout;
2160 : }
2161 : /* Since the strimp pre-filtering of a LIKE query produces a superset of the actual result the complement of that
2162 : * set will necessarily reject some of the matching entries in the NOT LIKE query.
2163 : *
2164 : * In this case we run the PCRElikeselect as a LIKE query with strimps and return the complement of the result,
2165 : * taking extra care to not return NULLs. This currently means that we do not run strimps for NOT LIKE queries if
2166 : * the BAT contains NULLs.
2167 : */
2168 5536 : if (BAThasstrimps(b)) {
2169 24 : if (STRMPcreate(b, NULL) == GDK_SUCCEED) {
2170 24 : BAT *tmp_s = STRMPfilter(b, s, *pat, *anti);
2171 24 : if (tmp_s) {
2172 24 : old_s = s;
2173 24 : s = tmp_s;
2174 24 : if (!*anti)
2175 : with_strimps = true;
2176 : else
2177 0 : with_strimps_anti = true;
2178 : }
2179 : } else { /* If we cannot filter with the strimp just continue normally */
2180 0 : GDKclrerr();
2181 : }
2182 : }
2183 :
2184 :
2185 5535 : MT_thread_setalgorithm(use_strcmp
2186 5535 : ? (with_strimps ?
2187 : "pcrelike: pattern matching using strcmp with strimps"
2188 : : (with_strimps_anti ?
2189 : "pcrelike: pattern matching using strcmp with strimps anti"
2190 5535 : : "pcrelike: pattern matching using strcmp")) :
2191 5445 : use_re ? (with_strimps ?
2192 : "pcrelike: pattern matching using RE with strimps"
2193 : : (with_strimps_anti ?
2194 : "pcrelike: patterm matching using RE with strimps anti"
2195 : :
2196 : "pcrelike: pattern matching using RE"))
2197 : : (with_strimps ?
2198 : "pcrelike: pattern matching using pcre with strimps"
2199 : : (with_strimps_anti ?
2200 : "pcrelike: pattermatching using pcre with strimps anti"
2201 : : "pcrelike: pattern matching using pcre")));
2202 :
2203 5536 : canditer_init(&ci, b, s);
2204 5536 : if (!(bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT))) {
2205 0 : msg = createException(MAL, "algebra.likeselect",
2206 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
2207 0 : goto bailout;
2208 : }
2209 :
2210 5535 : if (!s || BATtdense(s)) {
2211 1344 : if (s) {
2212 4108 : assert(BATtdense(s));
2213 4108 : p = (BUN) s->tseqbase;
2214 4108 : q = p + BATcount(s);
2215 4108 : if ((oid) p < b->hseqbase)
2216 : p = b->hseqbase;
2217 4108 : if ((oid) q > b->hseqbase + BATcount(b))
2218 : q = b->hseqbase + BATcount(b);
2219 : } else {
2220 1344 : p = b->hseqbase;
2221 1344 : q = BATcount(b) + b->hseqbase;
2222 : }
2223 : }
2224 :
2225 5535 : if (use_re) {
2226 5375 : msg = re_likeselect(bn, b, s, &ci, p, q, &rcnt, *pat, *caseignore, *anti
2227 774 : && !with_strimps_anti, use_strcmp,
2228 5375 : (unsigned char) **esc, with_strimps_anti,
2229 : ascii_pattern);
2230 : } else {
2231 160 : msg = pcre_likeselect(bn, b, s, &ci, p, q, &rcnt, ppat, *caseignore,
2232 160 : *anti && !with_strimps_anti, with_strimps_anti);
2233 : }
2234 :
2235 5534 : if (!msg) { /* set some properties */
2236 5534 : BATsetcount(bn, rcnt);
2237 5534 : bn->tsorted = true;
2238 5534 : bn->trevsorted = bn->batCount <= 1;
2239 5534 : bn->tkey = true;
2240 5534 : bn->tnil = false;
2241 5534 : bn->tnonil = true;
2242 5534 : bn->tseqbase = rcnt == 0 ? 0 : rcnt == 1 ? *(const oid *) Tloc(bn, 0) : rcnt == b->batCount ? b->hseqbase : oid_nil;
2243 5534 : if (with_strimps_anti) {
2244 : /* Reverse the result taking into account the original candidate list. */
2245 : // BAT *rev = BATdiffcand(BATdense(b->hseqbase, 0, b->batCount), bn);
2246 0 : BAT *rev;
2247 0 : if (old_s) {
2248 0 : rev = BATdiffcand(old_s, bn);
2249 : #ifndef NDEBUG
2250 0 : BAT *is = BATintersectcand(old_s, bn);
2251 0 : if (is) {
2252 0 : assert(is->batCount == bn->batCount);
2253 0 : BBPreclaim(is);
2254 : }
2255 0 : assert(rev->batCount == old_s->batCount - bn->batCount);
2256 : #endif
2257 : }
2258 :
2259 : else
2260 0 : rev = BATnegcands(b->batCount, bn);
2261 : /* BAT *rev = BATnegcands(b->batCount, bn); */
2262 0 : BBPunfix(bn->batCacheid);
2263 0 : bn = rev;
2264 : }
2265 : }
2266 :
2267 :
2268 5534 : bailout:
2269 5534 : BBPreclaim(b);
2270 5534 : BBPreclaim(s);
2271 5536 : BBPreclaim(old_s);
2272 5536 : GDKfree(ppat);
2273 5535 : if (bn && !msg) {
2274 5535 : *ret = bn->batCacheid;
2275 5535 : BBPkeepref(bn);
2276 0 : } else if (bn)
2277 0 : BBPreclaim(bn);
2278 5536 : return msg;
2279 : }
2280 :
/* Store oid o in the next slot of b's tail heap (accessed as an array of
 * oid) and increment b's count.  No capacity check is done here and b is
 * evaluated twice, so it must not have side effects. */
#define APPEND(b, o)		(((oid *) (b)->theap->base)[(b)->batCount++] = (o))
/* Pointer to the variable sized value at position x of the BAT whose
 * locals are named <s>vars, <s>vals and <s>i (e.g. VALUE(l, x) uses
 * lvars, lvals and li.width). */
#define VALUE(s, x)		(s##vars + VarHeapVal(s##vals, (x), s##i.width))
2283 :
/*
 * PCRE_EXEC runs the compiled pattern (pcrere / pcreex) against the
 * string vl and leaves the library's return code in retval; all of these
 * names are locals of the code that uses the macro.
 * PCRE_EXEC_COND inspects retval and is true when the value did not
 * match (or the library reported an error code covered by the test).
 */
#ifdef HAVE_LIBPCRE
#define PCRE_EXEC														\
	((void) (retval = pcre_exec(pcrere, pcreex, vl, (int) strlen(vl), 0, PCRE_NO_UTF8_CHECK, NULL, 0)))
#define PCRE_EXEC_COND	(retval < 0)
#else
#define PCRE_EXEC														\
	((void) (retval = regexec(&pcrere, vl, (size_t) 0, NULL, 0)))
#define PCRE_EXEC_COND	(retval == REG_NOMATCH || retval == REG_ENOSYS)
#endif
2297 :
/*
 * Nested loop implementation for PCRE join.
 *
 * The outer loop walks the candidates of the right (pattern) side, the
 * inner loop the candidates of the left (value) side.  For every pattern
 * a matcher is prepared once (choose_like_path followed by re_like_build
 * or pcre_like_build) and released again after the inner loop.
 *
 * The three macro arguments are REJECT conditions: a left value is
 * skipped (not joined) when the applicable one evaluates to true.
 *   STRCMP    - used when use_re and use_strcmp are set
 *   RE_MATCH  - used when use_re is set and use_strcmp is not
 *   PCRE_COND - used otherwise, evaluated after PCRE_EXEC set retval
 * Nil values on the left side are always skipped.
 *
 * The macro is not self-contained: it reads and writes locals of the
 * expanding function (lci, rci, lo, ro, vl, vr, lbase, rbase, nl, lastl,
 * rskipped, newcap, r1, r2, re, wpat, pcrepat, pcrere, pcreex, msg,
 * esc, caseignore, timeoffset, counter and the use_re/use_strcmp/empty/
 * ascii_pattern flags) and jumps to the label bailout on error.
 */
#define pcre_join_loop(STRCMP, RE_MATCH, PCRE_COND) \
	do { \
		for (BUN ridx = 0; ridx < rci.ncand; ridx++) { \
			GDK_CHECK_TIMEOUT(timeoffset, counter, \
							  GOTO_LABEL_TIMEOUT_HANDLER(bailout)); \
			ro = canditer_next(&rci); \
			vr = VALUE(r, ro - rbase); \
			nl = 0; /* number of left values joined with this pattern */ \
			ascii_pattern = use_re = use_strcmp = empty = false; \
			if ((msg = choose_like_path(&pcrepat, &use_re, &use_strcmp, &empty, &ascii_pattern, vr, esc))) \
				goto bailout; \
			if (!empty) { \
				if (use_re) { \
					if ((msg = re_like_build(&re, &wpat, vr, caseignore, use_strcmp, ascii_pattern, (unsigned char) *esc)) != MAL_SUCCEED) \
						goto bailout; \
				} else if (pcrepat) { \
					if ((msg = pcre_like_build(&pcrere, &pcreex, pcrepat, caseignore, lci.ncand)) != MAL_SUCCEED) \
						goto bailout; \
					/* the pattern text is not needed once compiled */ \
					GDKfree(pcrepat); \
					pcrepat = NULL; \
				} \
				canditer_reset(&lci); \
				for (BUN lidx = 0; lidx < lci.ncand; lidx++) { \
					lo = canditer_next(&lci); \
					vl = VALUE(l, lo - lbase); \
					if (strNil(vl)) { \
						continue; \
					} else if (use_re) { \
						if (use_strcmp) { \
							if (STRCMP) \
								continue; \
						} else { \
							assert(re); \
							if (RE_MATCH) \
								continue; \
						} \
					} else { \
						int retval; \
						PCRE_EXEC; \
						if (PCRE_COND) \
							continue; \
					} \
					/* (lo, ro) is a result pair: grow the output when full */ \
					if (BATcount(r1) == BATcapacity(r1)) { \
						newcap = BATgrows(r1); \
						BATsetcount(r1, BATcount(r1)); \
						if (r2) \
							BATsetcount(r2, BATcount(r2)); \
						if (BATextend(r1, newcap) != GDK_SUCCEED || (r2 && BATextend(r2, newcap) != GDK_SUCCEED)) { \
							msg = createException(MAL, "pcre.join", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
							goto bailout; \
						} \
						assert(!r2 || BATcapacity(r1) == BATcapacity(r2)); \
					} \
					/* update the properties of the outputs by comparing \
					 * lo with the previously appended left oid (lastl) */ \
					if (BATcount(r1) > 0) { \
						if (lastl + 1 != lo) \
							r1->tseqbase = oid_nil; \
						if (nl == 0) { \
							/* first result for a new right value */ \
							if (r2) \
								r2->trevsorted = false; \
							if (lastl > lo) { \
								r1->tsorted = false; \
								r1->tkey = false; \
							} else if (lastl < lo) { \
								r1->trevsorted = false; \
							} else { \
								r1->tkey = false; \
							} \
						} \
					} \
					APPEND(r1, lo); \
					if (r2) \
						APPEND(r2, ro); \
					lastl = lo; \
					nl++; \
				} \
				re_like_clean(&re, &wpat); \
				pcre_clean(&pcrere, &pcreex); \
			} \
			/* properties that depend on how many left values matched */ \
			if (r2) { \
				if (nl > 1) { \
					r2->tkey = false; \
					r2->tseqbase = oid_nil; \
					r1->trevsorted = false; \
				} else if (nl == 0) { \
					rskipped = BATcount(r2) > 0; \
				} else if (rskipped) { \
					r2->tseqbase = oid_nil; \
				} \
			} else if (nl > 1) { \
				r1->trevsorted = false; \
			} \
		} \
	} while (0)
2392 :
2393 : static char *
2394 43 : pcrejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, const char *esc,
2395 : bit caseignore, bit anti)
2396 : {
2397 43 : struct canditer lci, rci;
2398 43 : const char *lvals, *rvals, *lvars, *rvars, *vl, *vr;
2399 43 : int rskipped = 0; /* whether we skipped values in r */
2400 43 : oid lbase, rbase, lo, ro, lastl = 0; /* last value inserted into r1 */
2401 43 : BUN nl, newcap;
2402 43 : char *pcrepat = NULL, *msg = MAL_SUCCEED;
2403 43 : struct RE *re = NULL;
2404 43 : bool use_re = false,
2405 43 : use_strcmp = false,
2406 43 : empty = false,
2407 43 : ascii_pattern = false;
2408 43 : uint32_t *wpat = NULL;
2409 : #ifdef HAVE_LIBPCRE
2410 43 : pcre *pcrere = NULL;
2411 43 : pcre_extra *pcreex = NULL;
2412 : #else
2413 : regex_t pcrere = (regex_t) { 0 };
2414 : void *pcreex = NULL;
2415 : #endif
2416 :
2417 43 : size_t counter = 0;
2418 43 : lng timeoffset = 0;
2419 43 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
2420 43 : if (qry_ctx != NULL) {
2421 43 : timeoffset = (qry_ctx->starttime
2422 43 : && qry_ctx->querytimeout) ? (qry_ctx->starttime +
2423 43 : qry_ctx->querytimeout) : 0;
2424 : }
2425 :
2426 43 : TRC_DEBUG(ALGO,
2427 : "pcrejoin(l=%s#" BUNFMT "[%s]%s%s,"
2428 : "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
2429 : "sr=%s#" BUNFMT "%s%s)\n",
2430 : BATgetId(l), BATcount(l), ATOMname(l->ttype),
2431 : l->tsorted ? "-sorted" : "",
2432 : l->trevsorted ? "-revsorted" : "",
2433 : BATgetId(r), BATcount(r), ATOMname(r->ttype),
2434 : r->tsorted ? "-sorted" : "",
2435 : r->trevsorted ? "-revsorted" : "",
2436 : sl ? BATgetId(sl) : "NULL", sl ? BATcount(sl) : 0,
2437 : sl && sl->tsorted ? "-sorted" : "",
2438 : sl && sl->trevsorted ? "-revsorted" : "",
2439 : sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
2440 : sr && sr->tsorted ? "-sorted" : "",
2441 : sr && sr->trevsorted ? "-revsorted" : "");
2442 :
2443 129 : assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
2444 43 : assert(ATOMtype(l->ttype) == TYPE_str);
2445 :
2446 43 : canditer_init(&lci, l, sl);
2447 43 : canditer_init(&rci, r, sr);
2448 :
2449 43 : BATiter li = bat_iterator(l);
2450 43 : BATiter ri = bat_iterator(r);
2451 43 : lbase = l->hseqbase;
2452 43 : rbase = r->hseqbase;
2453 43 : lvals = (const char *) li.base;
2454 43 : rvals = (const char *) ri.base;
2455 43 : assert(ri.vh && r->ttype);
2456 43 : lvars = li.vh->base;
2457 43 : rvars = ri.vh->base;
2458 :
2459 43 : r1->tkey = true;
2460 43 : r1->tsorted = true;
2461 43 : r1->trevsorted = true;
2462 43 : r1->tnil = false;
2463 43 : r1->tnonil = true;
2464 43 : if (r2) {
2465 26 : r2->tkey = true;
2466 26 : r2->tsorted = true;
2467 26 : r2->trevsorted = true;
2468 26 : r2->tnil = false;
2469 26 : r2->tnonil = true;
2470 : }
2471 :
2472 43 : if (anti) {
2473 23 : if (caseignore) {
2474 123 : pcre_join_loop(ascii_pattern ? istrcmp(vl, vr) == 0 : mywstrcasecmp(vl, wpat) == 0,
2475 : re_match_ignore(vl, re), !PCRE_EXEC_COND);
2476 : } else {
2477 326 : pcre_join_loop(strcmp(vl, vr) == 0, re_match_no_ignore(vl, re), !PCRE_EXEC_COND);
2478 : }
2479 : } else {
2480 20 : if (caseignore) {
2481 5 : pcre_join_loop(ascii_pattern ? istrcmp(vl, vr) != 0 : mywstrcasecmp(vl, wpat) != 0,
2482 : !re_match_ignore(vl, re), PCRE_EXEC_COND);
2483 : } else {
2484 381 : pcre_join_loop(strcmp(vl, vr) != 0, !re_match_no_ignore(vl, re), PCRE_EXEC_COND);
2485 : }
2486 : }
2487 43 : bat_iterator_end(&li);
2488 43 : bat_iterator_end(&ri);
2489 :
2490 43 : assert(!r2 || BATcount(r1) == BATcount(r2));
2491 : /* also set other bits of heap to correct value to indicate size */
2492 43 : BATsetcount(r1, BATcount(r1));
2493 43 : if (r2)
2494 26 : BATsetcount(r2, BATcount(r2));
2495 43 : if (BATcount(r1) > 0) {
2496 30 : if (BATtdense(r1))
2497 7 : r1->tseqbase = ((oid *) r1->theap->base)[0];
2498 30 : if (r2 && BATtdense(r2))
2499 14 : r2->tseqbase = ((oid *) r2->theap->base)[0];
2500 : } else {
2501 13 : r1->tseqbase = 0;
2502 13 : if (r2)
2503 6 : r2->tseqbase = 0;
2504 : }
2505 20 : if (r2)
2506 26 : TRC_DEBUG(ALGO,
2507 : "pcrejoin(l=%s,r=%s)=(%s#" BUNFMT "%s%s,%s#" BUNFMT "%s%s\n",
2508 : BATgetId(l), BATgetId(r),
2509 : BATgetId(r1), BATcount(r1),
2510 : r1->tsorted ? "-sorted" : "",
2511 : r1->trevsorted ? "-revsorted" : "",
2512 : BATgetId(r2), BATcount(r2),
2513 : r2->tsorted ? "-sorted" : "",
2514 : r2->trevsorted ? "-revsorted" : "");
2515 : else
2516 17 : TRC_DEBUG(ALGO,
2517 : "pcrejoin(l=%s,r=%s)=(%s#" BUNFMT "%s%s\n",
2518 : BATgetId(l), BATgetId(r),
2519 : BATgetId(r1), BATcount(r1),
2520 : r1->tsorted ? "-sorted" : "",
2521 : r1->trevsorted ? "-revsorted" : "");
2522 : return MAL_SUCCEED;
2523 :
2524 0 : bailout:
2525 0 : bat_iterator_end(&li);
2526 0 : bat_iterator_end(&ri);
2527 0 : GDKfree(pcrepat);
2528 0 : re_like_clean(&re, &wpat);
2529 0 : pcre_clean(&pcrere, &pcreex);
2530 0 : assert(msg != MAL_SUCCEED);
2531 : return msg;
2532 : }
2533 :
2534 : static str
2535 43 : PCREjoin(bat *r1, bat *r2, bat lid, bat rid, bat slid, bat srid, bat elid,
2536 : bat ciid, bit anti)
2537 : {
2538 43 : BAT *left = NULL, *right = NULL, *escape = NULL, *caseignore = NULL,
2539 43 : *candleft = NULL, *candright = NULL;
2540 43 : BAT *result1 = NULL, *result2 = NULL;
2541 43 : char *msg = MAL_SUCCEED;
2542 43 : const char *esc = "";
2543 43 : bit ci;
2544 43 : BATiter bi;
2545 :
2546 43 : if ((left = BATdescriptor(lid)) == NULL)
2547 0 : goto fail;
2548 43 : if ((right = BATdescriptor(rid)) == NULL)
2549 0 : goto fail;
2550 43 : if ((escape = BATdescriptor(elid)) == NULL)
2551 0 : goto fail;
2552 43 : if ((caseignore = BATdescriptor(ciid)) == NULL)
2553 0 : goto fail;
2554 43 : if (!is_bat_nil(slid) && (candleft = BATdescriptor(slid)) == NULL)
2555 0 : goto fail;
2556 43 : if (!is_bat_nil(srid) && (candright = BATdescriptor(srid)) == NULL)
2557 0 : goto fail;
2558 43 : result1 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
2559 43 : if (r2)
2560 26 : result2 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
2561 43 : if (!result1 || (r2 && !result2)) {
2562 0 : msg = createException(MAL, "pcre.join",
2563 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
2564 0 : goto fail;
2565 : }
2566 43 : result1->tnil = false;
2567 43 : result1->tnonil = true;
2568 43 : result1->tkey = true;
2569 43 : result1->tsorted = true;
2570 43 : result1->trevsorted = true;
2571 43 : result1->tseqbase = 0;
2572 43 : if (r2) {
2573 26 : result2->tnil = false;
2574 26 : result2->tnonil = true;
2575 26 : result2->tkey = true;
2576 26 : result2->tsorted = true;
2577 26 : result2->trevsorted = true;
2578 26 : result2->tseqbase = 0;
2579 : }
2580 43 : if (BATcount(escape) != 1) {
2581 0 : msg = createException(MAL, "pcre.join",
2582 : SQLSTATE(42000)
2583 : "At the moment, only one value is allowed for the escape input at pcre join");
2584 0 : goto fail;
2585 : }
2586 43 : if (BATcount(caseignore) != 1) {
2587 0 : msg = createException(MAL, "pcre.join",
2588 : SQLSTATE(42000)
2589 : "At the moment, only one value is allowed for the case ignore input at pcre join");
2590 0 : goto fail;
2591 : }
2592 43 : bi = bat_iterator(caseignore);
2593 43 : ci = *(bit *) BUNtloc(bi, 0);
2594 43 : bat_iterator_end(&bi);
2595 43 : bi = bat_iterator(escape);
2596 43 : esc = BUNtvar(bi, 0);
2597 43 : msg = pcrejoin(result1, result2, left, right, candleft, candright, esc, ci,
2598 : anti);
2599 43 : bat_iterator_end(&bi);
2600 43 : if (msg)
2601 0 : goto fail;
2602 43 : *r1 = result1->batCacheid;
2603 43 : BBPkeepref(result1);
2604 43 : if (r2) {
2605 26 : *r2 = result2->batCacheid;
2606 26 : BBPkeepref(result2);
2607 : }
2608 43 : BBPunfix(left->batCacheid);
2609 43 : BBPunfix(right->batCacheid);
2610 43 : BBPreclaim(escape);
2611 43 : BBPreclaim(caseignore);
2612 43 : BBPreclaim(candleft);
2613 43 : BBPreclaim(candright);
2614 : return MAL_SUCCEED;
2615 :
2616 0 : fail:
2617 0 : BBPreclaim(left);
2618 0 : BBPreclaim(right);
2619 0 : BBPreclaim(escape);
2620 0 : BBPreclaim(caseignore);
2621 0 : BBPreclaim(candleft);
2622 0 : BBPreclaim(candright);
2623 0 : BBPreclaim(result1);
2624 0 : BBPreclaim(result2);
2625 0 : if (msg)
2626 : return msg;
2627 0 : throw(MAL, "pcre.join", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
2628 : }
2629 :
2630 : static str
2631 26 : LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *elid,
2632 : const bat *cid, const bat *slid, const bat *srid,
2633 : const bit *nil_matches, const lng *estimate, const bit *anti)
2634 : {
2635 26 : (void) nil_matches;
2636 26 : (void) estimate;
2637 26 : return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
2638 26 : *elid, *cid, *anti);
2639 : }
2640 :
2641 : static str
2642 17 : LIKEjoin1(bat *r1, const bat *lid, const bat *rid, const bat *elid,
2643 : const bat *cid, const bat *slid, const bat *srid,
2644 : const bit *nil_matches, const lng *estimate, const bit *anti)
2645 : {
2646 17 : (void) nil_matches;
2647 17 : (void) estimate;
2648 17 : return PCREjoin(r1, NULL, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
2649 17 : *elid, *cid, *anti);
2650 : }
2651 :
2652 : #include "mel.h"
/* Atoms provided by this module: a single atom named "pcre".  The list
 * ends with an entry whose .cmp is NULL; it is handed to mal_module() by
 * init_pcre_mal at the end of this file. */
mel_atom pcre_init_atoms[] = {
 { .name="pcre", }, { .cmp=NULL }
};
/* MAL functions provided by this module.  Each entry names the MAL
 * module and function, the C implementation, a flag (false for all
 * entries here; presumably the "unsafe" marker -- TODO confirm against
 * mel.h), the help text and the signature.  In args(n, m, ...) the first
 * n of the m listed arguments appear to be the return values (e.g.
 * likejoin: args(2,11,...) with two oid result BATs).  The list ends
 * with an entry whose .imp is NULL. */
mel_func pcre_init_funcs[] = {
 command("pcre", "index", PCREindex, false, "match a pattern, return matched position (or 0 when not found)", args(1,3, arg("",int),arg("pat",pcre),arg("s",str))),
 command("pcre", "match", PCREmatch, false, "Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
 command("pcre", "imatch", PCREimatch, false, "Caseless Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
 command("pcre", "patindex", PCREpatindex, false, "Location of the first POSIX pattern matching against a string", args(1,3, arg("",int),arg("pat",str),arg("s",str))),
 command("pcre", "replace", PCREreplace_wrap, false, "Replace _all_ matches of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
 command("pcre", "replace_first", PCREreplacefirst_wrap, false, "Replace _the first_ match of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
 command("pcre", "pcre_quote", PCREquote, false, "Return a PCRE pattern string that matches the argument exactly.", args(1,2, arg("",str),arg("s",str))),
 command("pcre", "sql2pcre", PCREsql2pcre, false, "Convert a SQL like pattern with the given escape character into a PCRE pattern.", args(1,3, arg("",str),arg("pat",str),arg("esc",str))),
 command("str", "replace", PCREreplace_wrap, false, "", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
 command("batpcre", "replace", PCREreplace_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
 command("batpcre", "replace_first", PCREreplacefirst_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
 command("algebra", "like", PCRElike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
 command("algebra", "not_like", PCREnotlike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
 pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
 pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
 pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
 pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
 pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
 pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
 command("algebra", "likeselect", PCRElikeselect, false, "Select all head values of the first input BAT for which the\ntail value is \"like\" the given (SQL-style) pattern and for\nwhich the head value occurs in the tail of the second input\nBAT.\nInput is a dense-headed BAT, output is a dense-headed BAT with in\nthe tail the head value of the input BAT for which the\nrelationship holds. The output BAT is sorted on the tail value.", args(1,7, batarg("",oid),batarg("b",str),batarg("s",oid),arg("pat",str),arg("esc",str),arg("caseignore",bit),arg("anti",bit))),
 command("algebra", "likejoin", LIKEjoin, false, "Join the string bat L with the pattern bat R\nwith optional candidate lists SL and SR using pattern escape string ESC\nand doing a case sensitive match.\nThe result is two aligned bats with oids of matching rows.", args(2,11, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
 command("algebra", "likejoin", LIKEjoin1, false, "The same as LIKEjoin_esc, but only produce one output", args(1,10,batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
 { .imp=NULL }
};
2681 : #include "mal_import.h"
2682 : #ifdef _MSC_VER
2683 : #undef read
2684 : #pragma section(".CRT$XCU",read)
2685 : #endif
2686 329 : LIB_STARTUP_FUNC(init_pcre_mal)
2687 329 : { mal_module("pcre", pcre_init_atoms, pcre_init_funcs); }
|