Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024, 2025 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * N. Nes
15 : * PCRE library interface
16 : * The PCRE library is a set of functions that implement regular
17 : * expression pattern matching using the same syntax and semantics as Perl,
18 : * with just a few differences. The current implementation of PCRE
19 : * (release 4.x) corresponds approximately with Perl 5.8, including support
20 : * for UTF-8 encoded strings. However, this support has to be
21 : * explicitly enabled; it is not the default.
22 : *
23 : * ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
24 : */
25 : #include "monetdb_config.h"
26 : #include <string.h>
27 :
28 : #include "mal.h"
29 : #include "mal_client.h"
30 : #include "mal_interpreter.h"
31 : #include "mal_exception.h"
32 :
33 : #include <wchar.h>
34 : #include <wctype.h>
35 :
36 : #ifdef HAVE_LIBPCRE
37 : #include <pcre.h>
38 : #ifndef PCRE_STUDY_JIT_COMPILE
39 : /* old library version on e.g. EPEL 6 */
40 : #define pcre_free_study(x) pcre_free(x)
41 : #define PCRE_STUDY_JIT_COMPILE 0
42 : #endif
43 : #define JIT_COMPILE_MIN 1024 /* when to try JIT compilation of patterns */
44 :
45 : #else
46 :
47 : #include <regex.h>
48 :
49 : typedef regex_t pcre;
50 : #endif
51 :
52 : /* current implementation assumes simple %keyword% [keyw%]* */
/* One segment of a compiled LIKE pattern.  Segments are chained through
 * n and are matched one after the other by mnre_match. */
53 : struct RE {
54 : char *k; /* literal text of this segment; mnre_destroy frees only the first node's k */
55 : bool search:1, atend:1, case_ignore:1; /* search: segment was preceded by %; atend: no wildcard follows this segment; case_ignore: compare case-insensitively */
56 : size_t skip; /* number of codepoints to skip before matching */
57 : size_t len; /* number of bytes in string */
58 : size_t ulen; /* number of codepoints in string */
59 : struct RE *n; /* next segment, NULL for the last one */
60 : };
61 :
62 : /* We cannot use strcasecmp and strncasecmp since they work byte for
63 : * byte and don't deal with multibyte encodings (such as UTF-8). */
64 :
/* Check that pattern pat does not end in a dangling escape character.
 * A NULL pattern counts as properly escaped.  Every escape character
 * consumes the byte that follows it, so an escaped escape character
 * does not start a new escape sequence. */
static inline bool
mnre_is_pattern_properly_escaped(const char *pat, unsigned char esc)
{
	bool pending = false;		/* true while an escape awaits its operand */

	if (pat == NULL)
		return true;
	for (const unsigned char *c = (const unsigned char *) pat; *c != 0; c++)
		pending = !pending && *c == esc;
	return !pending;
}
82 :
/* Tell whether pattern pat can be handled by a plain string comparison:
 * it must not contain a wildcard ('%' or '_') and, when an escape
 * string is in effect, the escape string must not occur in it. */
static inline bool
is_strcmpable(const char *pat, const char *esc)
{
	/* any wildcard rules out a plain comparison */
	if (strpbrk(pat, "%_") != NULL)
		return false;
	/* no usable escape string: nothing can be escaped */
	if (strlen(esc) == 0 || strNil(esc))
		return true;
	return strstr(pat, esc) == NULL;
}
93 :
94 : /* Match regular expression by comparing bytes.
95 : */
96 : static inline bool
97 408691 : mnre_match(const char *restrict s, const struct RE *restrict pattern)
98 : {
99 408691 : const struct RE *r;
100 :
101 484805 : for (r = pattern; r; r = r->n) {
102 436494 : for (size_t i = 0; i < r->skip; s++) {
103 25792 : if (*s == 0)
104 : return false;
105 26300 : i += (*s & 0xC0) != 0x80;
106 : }
107 410702 : if (r->search) {
108 191828 : if (r->atend) {
109 : /* we're searching for a string at the end, so just skip
110 : * over everything and just compare with the tail of the
111 : * haystack */
112 21623 : size_t slen = strlen(s);
113 21623 : if (slen < r->ulen) {
114 : /* remaining string too short: each codepoint
115 : * requires at least one byte */
116 : return false;
117 : }
118 21611 : const char *e = s + slen;
119 21611 : if (!r->case_ignore) {
120 21531 : if (slen < r->len) {
121 : /* remaining string is too short to match */
122 : return false;
123 : }
124 21528 : e -= r->len;
125 21528 : if ((*e & 0xC0) == 0x80) {
126 : /* not at start of a Unicode character, so
127 : * cannot match (this test not strictly
128 : * required: the strcmp should also return
129 : * unequal) */
130 : return false;
131 : }
132 21537 : return strcmp(e, r->k) == 0;
133 : }
134 : size_t ulen = r->ulen;
135 353 : while (e > s && ulen != 0) {
136 273 : ulen -= (*--e & 0xC0) != 0x80;
137 : }
138 : /* ulen != 0 means remaining string is too short */
139 141 : return ulen == 0 && GDKstrcasecmp(e, r->k) == 0;
140 : }
141 : /* in case we have a pattern consisting of % followed by _,
142 : * we need to backtrack, so use recursion; here we know we
143 : * have the %, look for an _ in the rest of the pattern
144 : * (note %_ and _% are equivalent and is taken care of by
145 : * the pattern construction in mnre_create) */
146 182972 : for (const struct RE *p = r->n; p; p = p->n) {
147 14907 : if (p->skip != 0) {
148 2140 : struct RE pat = *r;
149 2140 : pat.search = false;
150 2140 : pat.skip = 0;
151 160145 : do {
152 160145 : if (mnre_match(s, &pat))
153 : return true;
154 159951 : do
155 159951 : s++;
156 159952 : while (*s && (*s & 0xC0) == 0x80);
157 159952 : } while (*s != 0);
158 : return false;
159 : }
160 : }
161 : }
162 386939 : if (r->k[0] == 0 && (r->search || *s == 0))
163 : return true;
164 386897 : if (r->case_ignore) {
165 11407 : for (;;) {
166 11407 : if (r->search && (s = GDKstrcasestr(s, r->k)) == NULL)
167 : return false;
168 3763 : if (*s == '\0')
169 : return false;
170 : /* in "atend" comparison, compare whole string, else
171 : * only part */
172 3822 : if ((!r->search || r->atend) &&
173 59 : (r->atend ? GDKstrcasecmp(s, r->k) : GDKstrncasecmp(s, r->k, SIZE_MAX, r->len)) != 0) {
174 : /* no match */
175 22 : if (!r->search)
176 : return false;
177 : /* try again with next character */
178 0 : do
179 0 : s++;
180 0 : while (*s != '\0' && (*s & 0xC0) == 0x80);
181 0 : continue;
182 : }
183 : /* match; find end of match by counting codepoints */
184 58947 : for (size_t i = 0; *s && i < r->ulen; s++)
185 55206 : i += (*s & 0xC0) != 0x80;
186 : break;
187 : }
188 : } else {
189 375490 : for (;;) {
190 375490 : if (r->search && (s = strstr(s, r->k)) == NULL)
191 : return false;
192 260989 : if (*s == '\0')
193 : return false;
194 : /* in "atend" comparison, include NUL byte in the compare */
195 260498 : if ((!r->search || r->atend) &&
196 190408 : strncmp(s, r->k, r->len + r->atend) != 0) {
197 : /* no match */
198 188125 : if (!r->search)
199 : return false;
200 : /* try again with next character: have search start
201 : * after current first byte */
202 0 : if ((s = strchr(s + 1, r->k[0])) == NULL)
203 : return false;
204 0 : continue;
205 : }
206 : /* match */
207 72373 : s += r->len;
208 72373 : break;
209 : }
210 : }
211 : }
212 : return true;
213 : }
214 :
215 : static void
216 5965 : mnre_destroy(struct RE *p)
217 : {
218 5965 : if (p) {
219 5965 : GDKfree(p->k);
220 6715 : do {
221 6715 : struct RE *n = p->n;
222 :
223 6715 : GDKfree(p);
224 6715 : p = n;
225 6715 : } while (p);
226 : }
227 5965 : }
228 :
229 : /* Create a linked list of RE structures. Depending on the
230 : * caseignore and the ascii_pattern flags, the w
231 : * (if caseignore == true && ascii_pattern == false) or the k
232 : * (in every other case) field is used. These in the first
233 : * structure are allocated, whereas in all subsequent
234 : * structures the fields point into the allocated buffer of
235 : * the first.
236 : */
237 : static struct RE *
238 5965 : mnre_create(const char *pat, bool caseignore, uint32_t esc)
239 : {
240 5965 : struct RE *r = GDKmalloc(sizeof(struct RE)), *n = r;
241 5965 : bool escaped = false;
242 5965 : char *p, *q;
243 :
244 5965 : if (r == NULL)
245 : return NULL;
246 5965 : *r = (struct RE) {
247 : .atend = true,
248 : .case_ignore = caseignore,
249 : };
250 :
251 11274 : for (;;) {
252 11274 : if (esc != '%' && *pat == '%') {
253 5137 : pat++; /* skip % */
254 5137 : r->search = true;
255 6137 : } else if (esc != '_' && *pat == '_') {
256 172 : pat++;
257 172 : r->skip++;
258 : } else {
259 : break;
260 : }
261 : }
262 5965 : if ((p = GDKstrdup(pat)) == NULL) {
263 0 : GDKfree(r);
264 0 : return NULL;
265 : }
266 :
267 5965 : r->k = p;
268 5965 : q = p;
269 41628 : while (*p) {
270 35663 : if (escaped) {
271 149 : *q++ = *p;
272 149 : n->len++;
273 149 : n->ulen += (*p & 0xC0) != 0x80;
274 149 : escaped = false;
275 35514 : } else if ((unsigned char) *p == esc) {
276 : escaped = true;
277 35365 : } else if (*p == '%' || *p == '_') {
278 6006 : n->atend = false;
279 6006 : bool search = false;
280 6006 : size_t skip = 0;
281 18130 : for (;;) {
282 12068 : if (*p == '_')
283 602 : skip++;
284 11466 : else if (*p == '%')
285 : search = true;
286 : else
287 : break;
288 6062 : p++;
289 : }
290 6006 : if (*p || skip != 0) {
291 750 : n = n->n = GDKmalloc(sizeof(struct RE));
292 750 : if (n == NULL)
293 0 : goto bailout;
294 750 : *n = (struct RE) {
295 : .search = search,
296 : .atend = true,
297 : .skip = skip,
298 : .k = p,
299 : .case_ignore = caseignore,
300 : };
301 : }
302 6006 : *q = 0;
303 6006 : q = p;
304 6006 : continue; /* skip increment, we already did it */
305 : } else {
306 29359 : *q++ = *p;
307 29359 : n->len++;
308 29359 : n->ulen += (*p & 0xC0) != 0x80;
309 : }
310 29657 : p++;
311 : }
312 5965 : *q = 0;
313 5965 : return r;
314 0 : bailout:
315 0 : mnre_destroy(r);
316 0 : return NULL;
317 : }
318 :
#ifdef HAVE_LIBPCRE
/* Compile pattern as a multi-line UTF-8 regular expression, caseless
 * when insensitive is set, and store the compiled expression in *res.
 * Returns MAL_SUCCEED, or an exception describing the compile error. */
static str
pcre_compile_wrap(pcre **res, const char *pattern, bit insensitive)
{
	const char *err_p = NULL;
	int errpos = 0;
	int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE
		| (insensitive ? PCRE_CASELESS : 0);
	pcre *compiled = pcre_compile(pattern, options, &err_p, &errpos, NULL);

	if (compiled == NULL) {
		throw(MAL, "pcre.compile", OPERATION_FAILED
			  " with\n'%s'\nat %d in\n'%s'.\n", err_p, errpos, pattern);
	}
	*res = compiled;
	return MAL_SUCCEED;
}
#endif
338 :
339 : /* maximum number of back references and quoted \ or $ in replacement string */
340 : #define MAX_NR_REFS 20
341 :
/* One entry found by parse_replacement in the replacement string. */
342 : struct backref {
343 : int idx; /* capture group number; INT_MAX marks a doubled $ or backslash */
344 : int start; /* offset in the replacement string of the introducing $ or backslash */
345 : int end; /* offset in the replacement string just past the entry */
346 : };
347 :
348 : #ifdef HAVE_LIBPCRE
349 : /* fill in parameter backrefs (length maxrefs) with information about
350 : * back references in the replacement string; a back reference is a
351 : * dollar or backslash followed by a number */
352 : static int
353 104 : parse_replacement(const char *replacement, int len_replacement,
354 : struct backref *backrefs, int maxrefs)
355 : {
356 104 : int nbackrefs = 0;
357 :
358 173 : for (int i = 0; i < len_replacement && nbackrefs < maxrefs; i++) {
359 69 : if (replacement[i] == '$' || replacement[i] == '\\') {
360 10 : char *endptr;
361 10 : backrefs[nbackrefs].idx = strtol(replacement + i + 1, &endptr, 10);
362 10 : if (endptr > replacement + i + 1) {
363 10 : int k = (int) (endptr - (replacement + i + 1));
364 10 : backrefs[nbackrefs].start = i;
365 10 : backrefs[nbackrefs].end = i + k + 1;
366 10 : nbackrefs++;
367 0 : } else if (replacement[i] == replacement[i + 1]) {
368 : /* doubled $ or \, we must copy just one to the output */
369 0 : backrefs[nbackrefs].idx = INT_MAX; /* impossible value > 0 */
370 0 : backrefs[nbackrefs].start = i;
371 0 : backrefs[nbackrefs].end = i + 1;
372 0 : i++; /* don't look at second $ or \ again */
373 0 : nbackrefs++;
374 : }
375 : /* else: $ or \ followed by something we don't recognize,
376 : * so just leave it */
377 : }
378 : }
379 104 : return nbackrefs;
380 : }
381 :
382 : static char *
383 51342 : single_replace(pcre *pcre_code, pcre_extra *extra,
384 : const char *origin_str, int len_origin_str,
385 : int exec_options, int *ovector, int ovecsize,
386 : const char *replacement, int len_replacement,
387 : struct backref *backrefs, int nbackrefs,
388 : bool global, char *result, int *max_result)
389 : {
390 51342 : int offset = 0;
391 51342 : int len_result = 0;
392 51342 : int addlen;
393 51342 : int empty_match_correction = 0;
394 191251 : char *tmp;
395 :
396 191251 : do {
397 191251 : int j = pcre_exec(pcre_code, extra, origin_str, len_origin_str, offset,
398 : exec_options, ovector, ovecsize);
399 191296 : if (j <= 0)
400 : break;
401 :
402 143586 : empty_match_correction = ovector[0] == ovector[1] ? 1 : 0;
403 :
404 : // calculate the length of the string that will be appended to result
405 287172 : addlen = ovector[0] - offset
406 143586 : + (nbackrefs == 0 ? len_replacement : 0) + empty_match_correction;
407 143586 : if (len_result + addlen >= *max_result) {
408 12149 : tmp = GDKrealloc(result, len_result + addlen + 1);
409 12149 : if (tmp == NULL) {
410 0 : GDKfree(result);
411 0 : return NULL;
412 : }
413 12149 : result = tmp;
414 12149 : *max_result = len_result + addlen + 1;
415 : }
416 : // append to the result the parts of the original string that are left unchanged
417 143586 : if (ovector[0] > offset) {
418 139340 : strncpy(result + len_result, origin_str + offset,
419 139340 : ovector[0] - offset);
420 139340 : len_result += ovector[0] - offset;
421 : }
422 : // append to the result the replacement of the matched string
423 143586 : if (nbackrefs == 0) {
424 139915 : strncpy(result + len_result, replacement, len_replacement);
425 139915 : len_result += len_replacement;
426 : } else {
427 : int prevend = 0;
428 7342 : for (int i = 0; i < nbackrefs; i++) {
429 3671 : int off, len;
430 3671 : if (backrefs[i].idx >= ovecsize / 3) {
431 : /* out of bounds, replace with empty string */
432 : off = 0;
433 : len = 0;
434 : } else {
435 3671 : off = ovector[backrefs[i].idx * 2];
436 3671 : len = ovector[backrefs[i].idx * 2 + 1] - off;
437 : }
438 3671 : addlen = backrefs[i].start - prevend + len;
439 3671 : if (len_result + addlen >= *max_result) {
440 37 : tmp = GDKrealloc(result, len_result + addlen + 1);
441 37 : if (tmp == NULL) {
442 0 : GDKfree(result);
443 0 : return NULL;
444 : }
445 37 : result = tmp;
446 37 : *max_result = len_result + addlen + 1;
447 : }
448 3671 : if (backrefs[i].start > prevend) {
449 2 : strncpy(result + len_result, replacement + prevend,
450 2 : backrefs[i].start - prevend);
451 2 : len_result += backrefs[i].start - prevend;
452 : }
453 3671 : if (len > 0) {
454 3671 : strncpy(result + len_result, origin_str + off, len);
455 3671 : len_result += len;
456 : }
457 3671 : prevend = backrefs[i].end;
458 : }
459 : /* copy rest of replacement string (after last backref) */
460 3671 : addlen = len_replacement - prevend;
461 3671 : if (addlen > 0) {
462 2 : if (len_result + addlen >= *max_result) {
463 1 : tmp = GDKrealloc(result, len_result + addlen + 1);
464 1 : if (tmp == NULL) {
465 0 : GDKfree(result);
466 0 : return NULL;
467 : }
468 1 : result = tmp;
469 1 : *max_result = len_result + addlen + 1;
470 : }
471 2 : strncpy(result + len_result, replacement + prevend, addlen);
472 2 : len_result += addlen;
473 : }
474 : }
475 : // In case of an empty match just advance the offset by 1
476 143586 : offset = ovector[1] + empty_match_correction;
477 : // and copy the character that we just advanced over
478 143586 : if (empty_match_correction) {
479 14 : strncpy(result + len_result, origin_str + ovector[1], 1);
480 14 : ++len_result;
481 : }
482 : // before we loop around check with the offset - 1 if we had an empty match
483 : // since we manually advanced the offset by one. otherwise we gonna skip a
484 : // replacement at the end of the string
485 143586 : } while ((offset - empty_match_correction) < len_origin_str && global);
486 :
487 51387 : if (offset < len_origin_str) {
488 47716 : addlen = len_origin_str - offset;
489 47716 : if (len_result + addlen >= *max_result) {
490 575 : tmp = GDKrealloc(result, len_result + addlen + 1);
491 575 : if (tmp == NULL) {
492 0 : GDKfree(result);
493 0 : return NULL;
494 : }
495 575 : result = tmp;
496 575 : *max_result = len_result + addlen + 1;
497 : }
498 47716 : strncpy(result + len_result, origin_str + offset, addlen);
499 47716 : len_result += addlen;
500 : }
501 : /* null terminate string */
502 51387 : result[len_result] = '\0';
503 51387 : return result;
504 : }
505 : #endif
506 :
507 : static str
508 14 : pcre_replace(str *res, const char *origin_str, const char *pattern,
509 : const char *replacement, const char *flags, bool global)
510 : {
511 : #ifdef HAVE_LIBPCRE
512 14 : const char *err_p = NULL;
513 14 : pcre *pcre_code = NULL;
514 14 : pcre_extra *extra;
515 14 : char *tmpres;
516 14 : int max_result;
517 14 : int i, errpos = 0;
518 14 : int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
519 14 : int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
520 14 : int *ovector, ovecsize;
521 14 : int len_origin_str = (int) strlen(origin_str);
522 14 : int len_replacement = (int) strlen(replacement);
523 14 : struct backref backrefs[MAX_NR_REFS];
524 14 : int nbackrefs = 0;
525 :
526 21 : while (*flags) {
527 7 : switch (*flags) {
528 : case 'e':
529 : exec_options &= ~PCRE_NOTEMPTY;
530 : break;
531 1 : case 'i':
532 1 : compile_options |= PCRE_CASELESS;
533 1 : break;
534 1 : case 'm':
535 1 : compile_options |= PCRE_MULTILINE;
536 1 : break;
537 1 : case 's':
538 1 : compile_options |= PCRE_DOTALL;
539 1 : break;
540 1 : case 'x':
541 1 : compile_options |= PCRE_EXTENDED;
542 1 : break;
543 0 : default:
544 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
545 : ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
546 : *flags);
547 : }
548 7 : flags++;
549 : }
550 :
551 14 : if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
552 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
553 : OPERATION_FAILED
554 : ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
555 : pattern, errpos, err_p);
556 : }
557 :
558 : /* Since the compiled pattern is going to be used several times, it is
559 : * worth spending more time analyzing it in order to speed up the time
560 : * taken for matching.
561 : */
562 14 : extra = pcre_study(pcre_code, 0, &err_p);
563 14 : if (err_p != NULL) {
564 0 : pcre_free(pcre_code);
565 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
566 : OPERATION_FAILED
567 : ": pcre study of pattern (%s) failed with '%s'.\n", pattern,
568 : err_p);
569 : }
570 14 : pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
571 14 : ovecsize = (i + 1) * 3;
572 14 : if ((ovector = (int *) GDKmalloc(sizeof(int) * ovecsize)) == NULL) {
573 0 : pcre_free_study(extra);
574 0 : pcre_free(pcre_code);
575 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
576 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
577 : }
578 :
579 : /* identify back references in the replacement string */
580 14 : nbackrefs = parse_replacement(replacement, len_replacement,
581 : backrefs, MAX_NR_REFS);
582 :
583 14 : max_result = len_origin_str + 1;
584 14 : tmpres = GDKmalloc(max_result);
585 14 : if (tmpres == NULL) {
586 0 : GDKfree(ovector);
587 0 : pcre_free_study(extra);
588 0 : pcre_free(pcre_code);
589 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
590 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
591 : }
592 :
593 14 : tmpres = single_replace(pcre_code, extra, origin_str, len_origin_str,
594 : exec_options, ovector, ovecsize, replacement,
595 : len_replacement, backrefs, nbackrefs, global,
596 : tmpres, &max_result);
597 14 : GDKfree(ovector);
598 14 : pcre_free_study(extra);
599 14 : pcre_free(pcre_code);
600 14 : if (tmpres == NULL)
601 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
602 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
603 :
604 14 : *res = tmpres;
605 14 : return MAL_SUCCEED;
606 : #else
607 : (void) res;
608 : (void) origin_str;
609 : (void) pattern;
610 : (void) replacement;
611 : (void) flags;
612 : (void) global;
613 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
614 : "Database was compiled without PCRE support.");
615 : #endif
616 : }
617 :
618 : static str
619 90 : pcre_replace_bat(BAT **res, BAT *origin_strs, const char *pattern,
620 : const char *replacement, const char *flags, bool global)
621 : {
622 : #ifdef HAVE_LIBPCRE
623 90 : const char *err_p = NULL;
624 90 : char *tmpres;
625 90 : int i, errpos = 0;
626 90 : int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
627 90 : int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
628 90 : pcre *pcre_code = NULL;
629 90 : pcre_extra *extra;
630 90 : BAT *tmpbat;
631 90 : BUN p, q;
632 90 : int *ovector, ovecsize;
633 90 : int len_replacement = (int) strlen(replacement);
634 90 : struct backref backrefs[MAX_NR_REFS];
635 90 : int nbackrefs = 0;
636 90 : const char *origin_str;
637 90 : int max_dest_size = 0;
638 :
639 126 : while (*flags) {
640 36 : switch (*flags) {
641 : case 'e':
642 : exec_options &= ~PCRE_NOTEMPTY;
643 : break;
644 9 : case 'i':
645 9 : compile_options |= PCRE_CASELESS;
646 9 : break;
647 18 : case 'm':
648 18 : compile_options |= PCRE_MULTILINE;
649 18 : break;
650 9 : case 's':
651 9 : compile_options |= PCRE_DOTALL;
652 9 : break;
653 0 : case 'x':
654 0 : compile_options |= PCRE_EXTENDED;
655 0 : break;
656 0 : default:
657 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
658 : ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
659 : *flags);
660 : }
661 36 : flags++;
662 : }
663 :
664 90 : if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
665 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
666 : OPERATION_FAILED
667 : ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
668 : pattern, errpos, err_p);
669 : }
670 :
671 : /* Since the compiled pattern is going to be used several times,
672 : * it is worth spending more time analyzing it in order to speed
673 : * up the time taken for matching.
674 : */
675 180 : extra = pcre_study(pcre_code,
676 90 : BATcount(origin_strs) >
677 : JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &err_p);
678 90 : if (err_p != NULL) {
679 0 : pcre_free(pcre_code);
680 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
681 : OPERATION_FAILED);
682 : }
683 90 : pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
684 90 : ovecsize = (i + 1) * 3;
685 90 : if ((ovector = (int *) GDKzalloc(sizeof(int) * ovecsize)) == NULL) {
686 0 : pcre_free_study(extra);
687 0 : pcre_free(pcre_code);
688 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
689 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
690 : }
691 :
692 : /* identify back references in the replacement string */
693 90 : nbackrefs = parse_replacement(replacement, len_replacement,
694 : backrefs, MAX_NR_REFS);
695 :
696 90 : tmpbat = COLnew(origin_strs->hseqbase, TYPE_str, BATcount(origin_strs),
697 : TRANSIENT);
698 :
699 : /* the buffer for all destination strings is allocated only once,
700 : * and extended when needed */
701 90 : max_dest_size = len_replacement + 1;
702 90 : tmpres = GDKmalloc(max_dest_size);
703 90 : if (tmpbat == NULL || tmpres == NULL) {
704 0 : pcre_free_study(extra);
705 0 : pcre_free(pcre_code);
706 0 : GDKfree(ovector);
707 0 : BBPreclaim(tmpbat);
708 0 : GDKfree(tmpres);
709 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
710 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
711 : }
712 90 : BATiter origin_strsi = bat_iterator(origin_strs);
713 51410 : BATloop(origin_strs, p, q) {
714 51320 : origin_str = BUNtvar(origin_strsi, p);
715 102692 : tmpres = single_replace(pcre_code, extra, origin_str,
716 51320 : (int) strlen(origin_str), exec_options,
717 : ovector, ovecsize, replacement,
718 : len_replacement, backrefs, nbackrefs, global,
719 : tmpres, &max_dest_size);
720 51372 : if (tmpres == NULL || BUNappend(tmpbat, tmpres, false) != GDK_SUCCEED) {
721 0 : bat_iterator_end(&origin_strsi);
722 0 : pcre_free_study(extra);
723 0 : pcre_free(pcre_code);
724 0 : GDKfree(ovector);
725 0 : GDKfree(tmpres);
726 0 : BBPreclaim(tmpbat);
727 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
728 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
729 : }
730 : }
731 90 : bat_iterator_end(&origin_strsi);
732 90 : pcre_free_study(extra);
733 90 : pcre_free(pcre_code);
734 90 : GDKfree(ovector);
735 90 : GDKfree(tmpres);
736 90 : *res = tmpbat;
737 90 : return MAL_SUCCEED;
738 : #else
739 : (void) res;
740 : (void) origin_strs;
741 : (void) pattern;
742 : (void) replacement;
743 : (void) flags;
744 : (void) global;
745 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
746 : "Database was compiled without PCRE support.");
747 : #endif
748 : }
749 :
750 : static str
751 4 : pcre_match_with_flags(bit *ret, const char *val, const char *pat,
752 : const char *flags)
753 : {
754 4 : int pos;
755 : #ifdef HAVE_LIBPCRE
756 4 : const char *err_p = NULL;
757 4 : int errpos = 0;
758 4 : int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_DOTALL;
759 4 : pcre *re;
760 : #else
761 : int options = REG_NOSUB | REG_EXTENDED;
762 : regex_t re;
763 : int errcode;
764 : int retval;
765 : #endif
766 :
767 4 : while (*flags) {
768 0 : switch (*flags) {
769 0 : case 'i':
770 : #ifdef HAVE_LIBPCRE
771 0 : options |= PCRE_CASELESS;
772 : #else
773 : options |= REG_ICASE;
774 : #endif
775 0 : break;
776 0 : case 'm':
777 : #ifdef HAVE_LIBPCRE
778 0 : options |= PCRE_MULTILINE;
779 : #else
780 : options |= REG_NEWLINE;
781 : #endif
782 0 : break;
783 : #ifdef HAVE_LIBPCRE
784 0 : case 's':
785 0 : options |= PCRE_DOTALL;
786 0 : break;
787 : #endif
788 0 : case 'x':
789 : #ifdef HAVE_LIBPCRE
790 0 : options |= PCRE_EXTENDED;
791 : #else
792 : options |= REG_EXTENDED;
793 : #endif
794 0 : break;
795 0 : default:
796 0 : throw(MAL, "pcre.match", ILLEGAL_ARGUMENT
797 : ": unsupported flag character '%c'\n", *flags);
798 : }
799 0 : flags++;
800 : }
801 4 : if (strNil(val)) {
802 0 : *ret = FALSE;
803 0 : return MAL_SUCCEED;
804 : }
805 :
806 : #ifdef HAVE_LIBPCRE
807 4 : if ((re = pcre_compile(pat, options, &err_p, &errpos, NULL)) == NULL)
808 : #else
809 : if ((errcode = regcomp(&re, pat, options)) != 0)
810 : #endif
811 : {
812 0 : throw(MAL, "pcre.match", OPERATION_FAILED
813 : ": compilation of regular expression (%s) failed "
814 : #ifdef HAVE_LIBPCRE
815 : "at %d with '%s'", pat, errpos, err_p
816 : #else
817 : , pat
818 : #endif
819 : );
820 : }
821 : #ifdef HAVE_LIBPCRE
822 4 : pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, PCRE_NO_UTF8_CHECK,
823 : NULL, 0);
824 4 : pcre_free(re);
825 : #else
826 : retval = regexec(&re, val, (size_t) 0, NULL, 0);
827 : pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
828 : regfree(&re);
829 : #endif
830 4 : if (pos >= 0)
831 3 : *ret = TRUE;
832 1 : else if (pos == -1)
833 1 : *ret = FALSE;
834 : else
835 0 : throw(MAL, "pcre.match", OPERATION_FAILED
836 : ": matching of regular expression (%s) failed with %d", pat, pos);
837 : return MAL_SUCCEED;
838 : }
839 :
840 : #ifdef HAVE_LIBPCRE
841 : /* special characters in PCRE that need to be escaped */
842 : static const char pcre_specials[] = "$()*+.?[\\]^{|}";
843 : #else
844 : /* special characters in POSIX regular expressions that need to be
845 : * escaped (NOTE(review): this said "basic" regular expressions, but
 * pcre_match_with_flags compiles with REG_EXTENDED -- confirm which
 * flavour the translated patterns are meant for) */
846 : static const char pcre_specials[] = "$()*+.?[\\^{|";
847 : #endif
848 :
849 : /* change SQL LIKE pattern into PCRE pattern */
850 : static str
851 6 : sql2pcre(str *r, const char *pat, const char *esc_str)
852 : {
853 6 : int escaped = 0;
854 6 : int hasWildcard = 0;
855 6 : char *ppat;
856 12 : int esc = strNil(esc_str) ? 0 : esc_str[0]; /* should change to utf8_convert() */
857 6 : int specials;
858 6 : int c;
859 :
860 6 : if (strlen(esc_str) > 1)
861 0 : throw(MAL, "pcre.sql2pcre",
862 : SQLSTATE(22019) ILLEGAL_ARGUMENT
863 : ": ESCAPE string must have length 1");
864 6 : if (pat == NULL)
865 0 : throw(MAL, "pcre.sql2pcre",
866 : SQLSTATE(22019) ILLEGAL_ARGUMENT
867 : ": (I)LIKE pattern must not be NULL");
868 6 : ppat = GDKmalloc(strlen(pat) * 3 +
869 : 3 /* 3 = "^'the translated regexp'$0" */ );
870 6 : if (ppat == NULL)
871 0 : throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
872 :
873 6 : *r = ppat;
874 : /* The escape character can be a char which is special in a PCRE
875 : * expression. If the user used the "+" char as escape and has "++"
876 : * in their pattern, then replacing this with "+" is not correct and
877 : * should be "\+" instead. */
878 6 : specials = (esc && strchr(pcre_specials, esc) != NULL);
879 :
880 6 : *ppat++ = '^';
881 17 : while ((c = *pat++) != 0) {
882 11 : if (c == esc) {
883 2 : if (escaped) {
884 1 : if (specials) { /* change ++ into \+ */
885 1 : *ppat++ = esc;
886 : } else { /* do not escape simple escape symbols */
887 0 : ppat[-1] = esc; /* overwrite backslash */
888 : }
889 : escaped = 0;
890 : } else {
891 1 : *ppat++ = '\\';
892 1 : escaped = 1;
893 : }
894 : hasWildcard = 1;
895 9 : } else if (strchr(pcre_specials, c) != NULL) {
896 : /* escape PCRE special chars, avoid double backslash if the
897 : * user uses an invalid escape sequence */
898 2 : if (!escaped)
899 2 : *ppat++ = '\\';
900 2 : *ppat++ = c;
901 2 : hasWildcard = 1;
902 2 : escaped = 0;
903 7 : } else if (c == '%' && !escaped) {
904 3 : *ppat++ = '.';
905 3 : *ppat++ = '*';
906 3 : *ppat++ = '?';
907 3 : hasWildcard = 1;
908 : /* collapse multiple %, but only if it isn't the escape */
909 3 : if (esc != '%')
910 3 : while (*pat == '%')
911 0 : pat++;
912 4 : } else if (c == '_' && !escaped) {
913 3 : *ppat++ = '.';
914 3 : hasWildcard = 1;
915 : } else {
916 1 : if (escaped) {
917 0 : ppat[-1] = c; /* overwrite backslash of invalid escape */
918 : } else {
919 1 : *ppat++ = c;
920 : }
921 : escaped = 0;
922 : }
923 : }
924 : /* no wildcard or escape character at end of string */
925 6 : if (!hasWildcard || escaped) {
926 1 : GDKfree(*r);
927 1 : *r = NULL;
928 1 : if (escaped)
929 0 : throw(MAL, "pcre.sql2pcre",
930 : SQLSTATE(22019) ILLEGAL_ARGUMENT
931 : ": (I)LIKE pattern must not end with escape character");
932 1 : *r = GDKstrdup(str_nil);
933 1 : if (*r == NULL)
934 0 : throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
935 : } else {
936 5 : *ppat++ = '$';
937 5 : *ppat = 0;
938 : }
939 : return MAL_SUCCEED;
940 : }
941 :
#ifdef HAVE_LIBPCRE
/* Change SQL PATINDEX pattern pat into a PCRE pattern, returned in *r
 * as a newly allocated string.  PCRE special characters are escaped
 * and '_' becomes '.'.  The first '%' and a '%' at the very end of the
 * pattern are dropped; any other '%' becomes ".*".  The result is not
 * anchored. */
static str
pat2pcre(str *r, const char *pat)
{
	size_t len = strlen(pat);
	/* every input byte yields at most two output bytes, plus the
	 * terminating NUL */
	char *ppat = GDKmalloc(len * 2 + 3);
	bool seen_percent = false;

	if (ppat == NULL)
		throw(MAL, "pcre.pat2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
	*r = ppat;
	for (int c; (c = *pat++) != 0;) {
		if (strchr(pcre_specials, c) != NULL) {
			*ppat++ = '\\';
			*ppat++ = c;
		} else if (c == '%') {
			if (seen_percent && *pat) {
				*ppat++ = '.';
				*ppat++ = '*';
			}
			seen_percent = true;
		} else if (c == '_') {
			*ppat++ = '.';
		} else {
			*ppat++ = c;
		}
	}
	*ppat = 0;
	return MAL_SUCCEED;
}
#endif
976 :
977 : /*
978 : * @+ Wrapping
979 : */
980 :
981 : static str
982 14 : PCREreplace_wrap(str *res, const char *const *or, const char *const *pat,
983 : const char *const *repl, const char *const *flags)
984 : {
985 14 : return pcre_replace(res, *or, *pat, *repl, *flags, true);
986 : }
987 :
988 : static str
989 0 : PCREreplacefirst_wrap(str *res, const char *const *or, const char *const *pat,
990 : const char *const *repl, const char *const *flags)
991 : {
992 0 : return pcre_replace(res, *or, *pat, *repl, *flags, false);
993 : }
994 :
995 : static str
996 90 : PCREreplace_bat_wrap(bat *res, const bat *bid, const char *const *pat,
997 : const char *const *repl, const char *const *flags)
998 : {
999 90 : BAT *b, *bn = NULL;
1000 90 : str msg;
1001 90 : if ((b = BATdescriptor(*bid)) == NULL)
1002 0 : throw(MAL, "batpcre.replace", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1003 :
1004 90 : msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, true);
1005 90 : if (msg == MAL_SUCCEED) {
1006 90 : *res = bn->batCacheid;
1007 90 : BBPkeepref(bn);
1008 : }
1009 90 : BBPunfix(b->batCacheid);
1010 90 : return msg;
1011 : }
1012 :
1013 : static str
1014 0 : PCREreplacefirst_bat_wrap(bat *res, const bat *bid, const char *const *pat,
1015 : const char *const *repl, const char *const *flags)
1016 : {
1017 0 : BAT *b, *bn = NULL;
1018 0 : str msg;
1019 0 : if ((b = BATdescriptor(*bid)) == NULL)
1020 0 : throw(MAL, "batpcre.replace_first", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1021 :
1022 0 : msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, false);
1023 0 : if (msg == MAL_SUCCEED) {
1024 0 : *res = bn->batCacheid;
1025 0 : BBPkeepref(bn);
1026 : }
1027 0 : BBPunfix(b->batCacheid);
1028 0 : return msg;
1029 : }
1030 :
1031 : static str
1032 4 : PCREmatch(bit *ret, const char *const *val, const char *const *pat)
1033 : {
1034 4 : return pcre_match_with_flags(ret, *val, *pat, "");
1035 : }
1036 :
1037 : static str
1038 0 : PCREimatch(bit *ret, const char *const *val, const char *const *pat)
1039 : {
1040 0 : return pcre_match_with_flags(ret, *val, *pat, "i");
1041 : }
1042 :
1043 : static str
1044 25 : PCREindex(int *res, const pcre *pattern, const char *const *s)
1045 : {
1046 : #ifdef HAVE_LIBPCRE
1047 25 : int v[3];
1048 :
1049 25 : v[0] = v[1] = *res = 0;
1050 25 : if (pcre_exec(pattern, NULL, *s, (int) strlen(*s), 0,
1051 : PCRE_NO_UTF8_CHECK, v, 3) >= 0) {
1052 23 : *res = v[1];
1053 : }
1054 25 : return MAL_SUCCEED;
1055 : #else
1056 : (void) res;
1057 : (void) pattern;
1058 : (void) s;
1059 : throw(MAL, "pcre.index", "Database was compiled without PCRE support.");
1060 : #endif
1061 : }
1062 :
1063 : static str
1064 27 : PCREpatindex(int *ret, const char *const *pat, const char *const *val)
1065 : {
1066 : #ifdef HAVE_LIBPCRE
1067 27 : pcre *re = NULL;
1068 27 : char *ppat = NULL, *msg;
1069 :
1070 53 : if (strNil(*pat) || strNil(*val)) {
1071 2 : *ret = int_nil;
1072 2 : return MAL_SUCCEED;
1073 : }
1074 :
1075 25 : if ((msg = pat2pcre(&ppat, *pat)) != MAL_SUCCEED)
1076 : return msg;
1077 25 : if ((msg = pcre_compile_wrap(&re, ppat, FALSE)) != MAL_SUCCEED) {
1078 0 : GDKfree(ppat);
1079 0 : return msg;
1080 : }
1081 25 : GDKfree(ppat);
1082 25 : msg = PCREindex(ret, re, val);
1083 25 : pcre_free(re);
1084 25 : return msg;
1085 : #else
1086 : (void) ret;
1087 : (void) pat;
1088 : (void) val;
1089 : throw(MAL, "pcre.patindex", "Database was compiled without PCRE support.");
1090 : #endif
1091 : }
1092 :
1093 : static str
1094 0 : PCREquote(str *ret, const char *const *val)
1095 : {
1096 0 : char *p;
1097 0 : const char *s = *val;
1098 :
1099 0 : *ret = p = GDKmalloc(strlen(s) * 2 + 1); /* certainly long enough */
1100 0 : if (p == NULL)
1101 0 : throw(MAL, "pcre.quote", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1102 : /* quote all non-alphanumeric ASCII characters (i.e. leave
1103 : non-ASCII and alphanumeric alone) */
1104 0 : while (*s) {
1105 0 : if (!((*s & 0x80) != 0 ||
1106 0 : ('a' <= *s && *s <= 'z') ||
1107 0 : ('A' <= *s && *s <= 'Z') || isdigit((unsigned char) *s)))
1108 0 : *p++ = '\\';
1109 0 : *p++ = *s++;
1110 : }
1111 0 : *p = 0;
1112 0 : return MAL_SUCCEED;
1113 : }
1114 :
1115 : static str
1116 6 : PCREsql2pcre(str *ret, const char *const *pat, const char *const *esc)
1117 : {
1118 6 : return sql2pcre(ret, *pat, *esc);
1119 : }
1120 :
1121 : static inline str
1122 7341 : choose_like_path(bool *use_re, bool *use_strcmp, bool *empty,
1123 : const char *pat, const char *esc)
1124 : {
1125 7341 : str res = MAL_SUCCEED;
1126 7341 : *use_re = false;
1127 7341 : *use_strcmp = false;
1128 7341 : *empty = false;
1129 :
1130 :
1131 14192 : if (strNil(pat) || strNil(esc)) {
1132 490 : *empty = true;
1133 : } else {
1134 6851 : if (!mnre_is_pattern_properly_escaped(pat, (unsigned char) *esc))
1135 5 : throw(MAL, "pcre.sql2pcre",
1136 : SQLSTATE(22019) ILLEGAL_ARGUMENT
1137 : ": (I)LIKE pattern must not end with escape character");
1138 6845 : if (is_strcmpable(pat, esc)) {
1139 882 : *use_re = true;
1140 882 : *use_strcmp = true;
1141 : } else {
1142 5963 : *use_re = true;
1143 : }
1144 : }
1145 : return res;
1146 : }
1147 :
1148 : static str
1149 234 : PCRElike_imp(bit *ret, const char *const *s, const char *const *pat,
1150 : const char *const *esc, const bit *isens)
1151 : {
1152 234 : str res = MAL_SUCCEED;
1153 234 : bool use_re = false, use_strcmp = false, empty = false;
1154 234 : struct RE *re = NULL;
1155 :
1156 234 : if ((res = choose_like_path(&use_re, &use_strcmp, &empty,
1157 : *pat, *esc)) != MAL_SUCCEED)
1158 : return res;
1159 :
1160 459 : MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp ?
1161 225 : "pcrelike: pattern matching using strcmp" : use_re ?
1162 : "pcrelike: pattern matching using RE" :
1163 : "pcrelike: pattern matching using pcre");
1164 :
1165 468 : if (strNil(*s) || empty) {
1166 0 : *ret = bit_nil;
1167 : } else {
1168 234 : if (use_strcmp) {
1169 9 : *ret = *isens ? GDKstrcasecmp(*s, *pat) == 0
1170 7 : : strcmp(*s, *pat) == 0;
1171 : } else {
1172 225 : if (!(re = mnre_create(*pat, *isens, (unsigned char) **esc)))
1173 0 : res = createException(MAL, "pcre.like4",
1174 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1175 : else
1176 225 : *ret = mnre_match(*s, re);
1177 : }
1178 : }
1179 :
1180 234 : if (re)
1181 225 : mnre_destroy(re);
1182 : return res;
1183 : }
1184 :
1185 : static str
1186 234 : PCRElike(bit *ret, const char *const *s, const char *const *pat,
1187 : const char *const *esc, const bit *isens)
1188 : {
1189 229 : return PCRElike_imp(ret, s, pat, esc, isens);
1190 : }
1191 :
1192 : static str
1193 5 : PCREnotlike(bit *ret, const char *const *s, const char *const *pat,
1194 : const char *const *esc, const bit *isens)
1195 : {
1196 5 : str tmp;
1197 5 : bit r;
1198 :
1199 5 : rethrow("str.not_like", tmp, PCRElike(&r, s, pat, esc, isens));
1200 5 : *ret = r == bit_nil ? bit_nil : !r;
1201 5 : return MAL_SUCCEED;
1202 : }
1203 :
1204 : static inline str
1205 6613 : mnre_like_build(struct RE **re, const char *pat, bool caseignore,
1206 : bool use_strcmp, uint32_t esc)
1207 : {
1208 6613 : if (!use_strcmp) {
1209 5740 : if (!(*re = mnre_create(pat, caseignore, esc)))
1210 0 : return createException(MAL, "pcre.re_like_build",
1211 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1212 : }
1213 : return MAL_SUCCEED;
1214 : }
1215 :
1216 : static inline bit
1217 6251 : mnre_like_proj_apply(const char *s, const struct RE *restrict re,
1218 : const char *pat,
1219 : bool caseignore, bool anti, bool use_strcmp)
1220 : {
1221 6251 : if (strNil(s))
1222 446 : return bit_nil;
1223 5805 : if (use_strcmp) {
1224 1134 : if (caseignore) {
1225 492 : if (anti)
1226 461 : return GDKstrcasecmp(s, pat) != 0;
1227 : else
1228 31 : return GDKstrcasecmp(s, pat) == 0;
1229 : } else {
1230 642 : if (anti)
1231 302 : return strcmp(s, pat) != 0;
1232 : else
1233 340 : return strcmp(s, pat) == 0;
1234 : }
1235 : } else {
1236 4671 : if (anti)
1237 137 : return !mnre_match(s, re);
1238 : else
1239 4534 : return mnre_match(s, re);
1240 : }
1241 : }
1242 :
/* Destroy the matcher in *re, if any, and reset the pointer to NULL. */
static inline void
mnre_like_clean(struct RE **re)
{
	if (*re == NULL)
		return;
	mnre_destroy(*re);
	*re = NULL;
}
1251 :
/*
 * Bulk (NOT) (I)LIKE producing a bit BAT.
 *
 * MAL arguments: 0 = result BAT id, 1 = input (BAT or scalar string),
 * 2 = pattern (BAT or scalar string).  esc is the escape string, *isens
 * requests case-insensitive matching and *not negates the outcome.
 *
 * When the pattern is a BAT, the evaluation path is chosen and the
 * matcher is built and destroyed again for every row.  With a scalar
 * pattern this is done once for the whole input BAT.
 * A nil input, pattern or escape yields bit_nil for that row.
 *
 * NOTE(review): the code assumes at least one of the two arguments is a
 * BAT and, when both are, that they are aligned - confirm against the
 * MAL signatures.
 */
1252 : static str
1253 684 : BATPCRElike_imp(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci,
1254 : const char *const *esc, const bit *isens, const bit *not)
1255 : {
1256 684 : str msg = MAL_SUCCEED;
1257 684 : BAT *b = NULL, *pbn = NULL, *bn = NULL;
1258 684 : const char *input = NULL;
1259 684 : bool use_re = false,
1260 684 : use_strcmp = false,
1261 684 : empty = false,
1262 684 : isensitive = (bool) *isens,
1263 684 : anti = (bool) *not,
1264 684 : has_nil = false,
1265 684 : input_is_a_bat = isaBatType(getArgType(mb, pci, 1)),
1266 684 : pattern_is_a_bat = isaBatType(getArgType(mb, pci, 2));
1267 684 : bat *r = getArgReference_bat(stk, pci, 0);
1268 684 : BUN q = 0;
1269 684 : bit *restrict ret = NULL;
1270 684 : struct RE *mnre_simple = NULL;
1271 684 : BATiter bi = (BATiter) { 0 }, pi;
1272 :
1273 684 : (void) cntxt;
/* fix whichever of input and pattern are BATs */
1274 684 : if (input_is_a_bat) {
1275 684 : bat *bid = getArgReference_bat(stk, pci, 1);
1276 684 : if (!(b = BATdescriptor(*bid))) {
1277 0 : msg = createException(MAL, "batalgebra.batpcrelike3",
1278 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1279 0 : goto bailout;
1280 : }
1281 : }
1282 684 : if (pattern_is_a_bat) {
1283 88 : bat *pb = getArgReference_bat(stk, pci, 2);
1284 88 : if (!(pbn = BATdescriptor(*pb))) {
1285 0 : msg = createException(MAL, "batalgebra.batpcrelike3",
1286 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1287 0 : goto bailout;
1288 : }
1289 : }
1290 684 : assert((!b || ATOMstorage(b->ttype) == TYPE_str)
1291 : && (!pbn || ATOMstorage(pbn->ttype) == TYPE_str));
1292 :
/* the result has one bit per row of the input BAT (or of the pattern
 * BAT when the input is a scalar) */
1293 684 : q = BATcount(b ? b : pbn);
1294 684 : if (!(bn = COLnew(b ? b->hseqbase : pbn->hseqbase, TYPE_bit, q, TRANSIENT))) {
1295 0 : msg = createException(MAL, "batalgebra.batpcrelike3",
1296 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1297 0 : goto bailout;
1298 : }
1299 684 : ret = (bit *) Tloc(bn, 0);
1300 :
1301 684 : if (pattern_is_a_bat) {
/* per-row pattern: choose the path and build the matcher for each row */
1302 88 : pi = bat_iterator(pbn);
1303 88 : if (b)
1304 88 : bi = bat_iterator(b);
1305 : else
1306 0 : input = *getArgReference_str(stk, pci, 1);
1307 :
1308 1219 : for (BUN p = 0; p < q; p++) {
1309 1131 : const char *next_input = b ? BUNtvar(bi, p) : input,
1310 1131 : *np = BUNtvar(pi, p);
1311 :
1312 1131 : if ((msg = choose_like_path(&use_re, &use_strcmp, &empty,
1313 : np, *esc)) != MAL_SUCCEED) {
1314 0 : bat_iterator_end(&pi);
1315 0 : if (b)
1316 0 : bat_iterator_end(&bi);
1317 0 : goto bailout;
1318 : }
1319 :
1320 1132 : if (empty) {
1321 462 : ret[p] = bit_nil;
1322 : } else {
1323 670 : if ((msg = mnre_like_build(&mnre_simple, np, isensitive,
1324 : use_strcmp,
1325 670 : (unsigned char) **esc)) != MAL_SUCCEED) {
1326 0 : bat_iterator_end(&pi);
1327 0 : if (b)
1328 0 : bat_iterator_end(&bi);
1329 0 : goto bailout;
1330 : }
1331 670 : ret[p] = mnre_like_proj_apply(next_input, mnre_simple, np,
1332 : isensitive, anti, use_strcmp);
1333 670 : mnre_like_clean(&mnre_simple);
1334 : }
1335 1132 : has_nil |= is_bit_nil(ret[p]);
1336 : }
1337 88 : bat_iterator_end(&pi);
1338 88 : if (b)
1339 88 : bat_iterator_end(&bi);
1340 : } else {
/* scalar pattern: one path and one matcher for all rows of b */
1341 596 : const char *pat = *getArgReference_str(stk, pci, 2);
1342 596 : if ((msg = choose_like_path(&use_re, &use_strcmp, &empty,
1343 : pat, *esc)) != MAL_SUCCEED)
1344 5 : goto bailout;
1345 :
1346 591 : bi = bat_iterator(b);
1347 1123 : MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp
1348 : ? "pcrelike: pattern matching using strcmp" :
1349 532 : use_re ? "pcrelike: pattern matching using RE" :
1350 : "pcrelike: pattern matching using pcre");
1351 :
1352 591 : if (empty) {
1353 43 : for (BUN p = 0; p < q; p++)
1354 26 : ret[p] = bit_nil;
1355 : has_nil = true;
1356 : } else {
1357 574 : if ((msg = mnre_like_build(&mnre_simple, pat, isensitive, use_strcmp,
1358 574 : (unsigned char) **esc)) != MAL_SUCCEED) {
1359 0 : bat_iterator_end(&bi);
1360 0 : goto bailout;
1361 : }
1362 6155 : for (BUN p = 0; p < q; p++) {
1363 5582 : const char *s = BUNtvar(bi, p);
1364 5585 : ret[p] = mnre_like_proj_apply(s, mnre_simple, pat, isensitive,
1365 : anti, use_strcmp);
1366 5578 : has_nil |= is_bit_nil(ret[p]);
1367 : }
1368 : }
1369 590 : bat_iterator_end(&bi);
1370 : }
1371 :
/* common exit: on success finalise and publish the result, otherwise
 * drop it; the input BATs are released either way */
1372 684 : bailout:
1373 684 : mnre_like_clean(&mnre_simple);
1374 684 : if (bn && !msg) {
1375 679 : BATsetcount(bn, q);
1376 679 : bn->tnil = has_nil;
1377 679 : bn->tnonil = !has_nil;
1378 679 : bn->tkey = BATcount(bn) <= 1;
1379 679 : bn->tsorted = BATcount(bn) <= 1;
1380 679 : bn->trevsorted = BATcount(bn) <= 1;
1381 679 : *r = bn->batCacheid;
1382 679 : BBPkeepref(bn);
1383 5 : } else if (bn)
1384 5 : BBPreclaim(bn);
1385 684 : BBPreclaim(b);
1386 684 : BBPreclaim(pbn);
1387 684 : return msg;
1388 : }
1389 :
1390 : static str
1391 541 : BATPCRElike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
1392 : {
1393 541 : const char *esc = *getArgReference_str(stk, pci, 3);
1394 541 : const bit *ci = getArgReference_bit(stk, pci, 4);
1395 541 : bit no = FALSE;
1396 :
1397 541 : return BATPCRElike_imp(cntxt, mb, stk, pci, &esc, ci, &no);
1398 : }
1399 :
1400 : static str
1401 143 : BATPCREnotlike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
1402 : {
1403 143 : const char *esc = *getArgReference_str(stk, pci, 3);
1404 143 : const bit *ci = getArgReference_bit(stk, pci, 4);
1405 143 : bit yes = TRUE;
1406 :
1407 143 : return BATPCRElike_imp(cntxt, mb, stk, pci, &esc, ci, &yes);
1408 : }
1409 :
/*
 * pcrescanloop(TEST, KEEP_NULLS) expands inside a function that provides
 * the locals b, s, anti, bi, p, q, off, ci, ncands, vals, cnt, counter
 * and qry_ctx plus a label "bailout" (the timeout handler jumps there).
 *
 * Without a candidate list, or with a dense one, the oids p..q-1 are
 * visited directly; otherwise ncands oids are taken from ci.  For every
 * visited oid the tail value is bound to v and the oid is appended to
 * vals when TEST holds, or when KEEP_NULLS is set and v is nil.
 * TEST is also stringified into the debug trace message, so its spelling
 * is visible in trace output.
 */
1410 : /* scan select loop with or without candidates */
1411 : #define pcrescanloop(TEST, KEEP_NULLS) \
1412 : do { \
1413 : TRC_DEBUG(ALGO, \
1414 : "PCREselect(b=%s#"BUNFMT",anti=%d): " \
1415 : "scanselect %s\n", BATgetId(b), BATcount(b), \
1416 : anti, #TEST); \
1417 : if (!s || BATtdense(s)) { \
1418 : for (; p < q; p++) { \
1419 : GDK_CHECK_TIMEOUT(qry_ctx, counter, \
1420 : GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
1421 : const char *restrict v = BUNtvar(bi, p - off); \
1422 : if ((TEST) || ((KEEP_NULLS) && strNil(v))) \
1423 : vals[cnt++] = p; \
1424 : } \
1425 : } else { \
1426 : for (; p < ncands; p++) { \
1427 : GDK_CHECK_TIMEOUT(qry_ctx, counter, \
1428 : GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
1429 : oid o = canditer_next(ci); \
1430 : const char *restrict v = BUNtvar(bi, o - off); \
1431 : if ((TEST) || ((KEEP_NULLS) && strNil(v))) \
1432 : vals[cnt++] = o; \
1433 : } \
1434 : } \
1435 : } while (0)
1436 :
1437 : static str
1438 5260 : mnre_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q,
1439 : BUN *rcnt, const char *pat, bool caseignore, bool anti,
1440 : bool use_strcmp, uint32_t esc, bool keep_nulls)
1441 : {
1442 5260 : BATiter bi = bat_iterator(b);
1443 5263 : BUN cnt = 0, ncands = ci->ncand;
1444 5263 : oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
1445 5263 : struct RE *re = NULL;
1446 5263 : str msg = MAL_SUCCEED;
1447 :
1448 5263 : size_t counter = 0;
1449 5263 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
1450 :
1451 5263 : if ((msg = mnre_like_build(&re, pat, caseignore, use_strcmp,
1452 : esc)) != MAL_SUCCEED)
1453 0 : goto bailout;
1454 :
1455 5262 : if (use_strcmp) {
1456 85 : if (caseignore) {
1457 27 : if (anti)
1458 58 : pcrescanloop(!strNil(v)
1459 : && GDKstrcasecmp(v, pat) != 0, keep_nulls);
1460 : else
1461 671 : pcrescanloop(!strNil(v)
1462 : && GDKstrcasecmp(v, pat) == 0, keep_nulls);
1463 : } else {
1464 58 : if (anti)
1465 5 : pcrescanloop(!strNil(v) && strcmp(v, pat) != 0, keep_nulls);
1466 : else
1467 8481 : pcrescanloop(!strNil(v) && strcmp(v, pat) == 0, keep_nulls);
1468 : }
1469 : } else {
1470 5177 : if (caseignore) {
1471 74 : if (anti) {
1472 44 : pcrescanloop(!strNil(v)
1473 : && !mnre_match(v, re), keep_nulls);
1474 : } else {
1475 11485 : pcrescanloop(!strNil(v)
1476 : && mnre_match(v, re), keep_nulls);
1477 : }
1478 : } else {
1479 5103 : if (anti)
1480 55258 : pcrescanloop(!strNil(v)
1481 : && !mnre_match(v, re), keep_nulls);
1482 : else
1483 174899 : pcrescanloop(!strNil(v)
1484 : && mnre_match(v, re), keep_nulls);
1485 : }
1486 : }
1487 :
1488 41 : bailout:
1489 5259 : bat_iterator_end(&bi);
1490 5263 : mnre_like_clean(&re);
1491 5263 : *rcnt = cnt;
1492 5263 : return msg;
1493 : }
1494 :
1495 : static str
1496 5263 : PCRElikeselect(bat *ret, const bat *bid, const bat *sid, const char *const *pat,
1497 : const char *const *esc, const bit *caseignore, const bit *anti)
1498 : {
1499 5263 : BAT *b, *s = NULL, *bn = NULL, *old_s = NULL;
1500 5263 : str msg = MAL_SUCCEED;
1501 5263 : bool use_re = false,
1502 5263 : use_strcmp = false,
1503 5263 : empty = false;
1504 5263 : bool with_strimps = false;
1505 5263 : bool with_strimps_anti = false;
1506 5263 : BUN p = 0, q = 0, rcnt = 0;
1507 5263 : struct canditer ci;
1508 :
1509 5263 : if ((b = BATdescriptor(*bid)) == NULL) {
1510 0 : msg = createException(MAL, "algebra.likeselect",
1511 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1512 0 : goto bailout;
1513 : }
1514 5263 : if (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL) {
1515 0 : msg = createException(MAL, "algebra.likeselect",
1516 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1517 0 : goto bailout;
1518 : }
1519 :
1520 5262 : assert(ATOMstorage(b->ttype) == TYPE_str);
1521 :
1522 5262 : if ((msg = choose_like_path(&use_re, &use_strcmp, &empty,
1523 : *pat, *esc)) != MAL_SUCCEED)
1524 0 : goto bailout;
1525 :
1526 5248 : if (empty) {
1527 0 : if (!(bn = BATdense(0, 0, 0)))
1528 0 : msg = createException(MAL, "algebra.likeselect",
1529 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1530 :
1531 0 : goto bailout;
1532 : }
1533 : /* Since the strimp pre-filtering of a LIKE query produces a superset of the actual result the complement of that
1534 : * set will necessarily reject some of the matching entries in the NOT LIKE query.
1535 : *
1536 : * In this case we run the PCRElikeselect as a LIKE query with strimps and return the complement of the result,
1537 : * taking extra care to not return NULLs. This currently means that we do not run strimps for NOT LIKE queries if
1538 : * the BAT contains NULLs.
1539 : */
1540 5248 : if (BAThasstrimps(b)) {
1541 24 : if (STRMPcreate(b, NULL) == GDK_SUCCEED) {
1542 24 : BAT *tmp_s = STRMPfilter(b, s, *pat, *anti);
1543 24 : if (tmp_s) {
1544 24 : old_s = s;
1545 24 : s = tmp_s;
1546 24 : if (!*anti)
1547 : with_strimps = true;
1548 : else
1549 0 : with_strimps_anti = true;
1550 : }
1551 : } else { /* If we cannot filter with the strimp just continue normally */
1552 0 : GDKclrerr();
1553 : }
1554 : }
1555 :
1556 :
1557 5261 : MT_thread_setalgorithm(use_strcmp
1558 5261 : ? (with_strimps ?
1559 : "pcrelike: pattern matching using strcmp with strimps"
1560 : : (with_strimps_anti ?
1561 : "pcrelike: pattern matching using strcmp with strimps anti"
1562 5261 : : "pcrelike: pattern matching using strcmp")) :
1563 5176 : use_re ? (with_strimps ?
1564 : "pcrelike: pattern matching using RE with strimps"
1565 : : (with_strimps_anti ?
1566 : "pcrelike: patterm matching using RE with strimps anti"
1567 : :
1568 : "pcrelike: pattern matching using RE"))
1569 : : (with_strimps ?
1570 : "pcrelike: pattern matching using pcre with strimps"
1571 : : (with_strimps_anti ?
1572 : "pcrelike: pattermatching using pcre with strimps anti"
1573 : : "pcrelike: pattern matching using pcre")));
1574 :
1575 5263 : canditer_init(&ci, b, s);
1576 5263 : if (!(bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT))) {
1577 0 : msg = createException(MAL, "algebra.likeselect",
1578 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1579 0 : goto bailout;
1580 : }
1581 :
1582 5261 : if (!s || BATtdense(s)) {
1583 1125 : if (s) {
1584 4095 : assert(BATtdense(s));
1585 4095 : p = (BUN) s->tseqbase;
1586 4095 : q = p + BATcount(s);
1587 4095 : if ((oid) p < b->hseqbase)
1588 : p = b->hseqbase;
1589 4095 : if ((oid) q > b->hseqbase + BATcount(b))
1590 : q = b->hseqbase + BATcount(b);
1591 : } else {
1592 1125 : p = b->hseqbase;
1593 1125 : q = BATcount(b) + b->hseqbase;
1594 : }
1595 : }
1596 :
1597 5261 : msg = mnre_likeselect(bn, b, s, &ci, p, q, &rcnt, *pat, *caseignore, *anti
1598 961 : && !with_strimps_anti, use_strcmp,
1599 5261 : (unsigned char) **esc, with_strimps_anti);
1600 :
1601 5263 : if (!msg) { /* set some properties */
1602 5263 : BATsetcount(bn, rcnt);
1603 5263 : bn->tsorted = true;
1604 5263 : bn->trevsorted = bn->batCount <= 1;
1605 5263 : bn->tkey = true;
1606 5263 : bn->tnil = false;
1607 5263 : bn->tnonil = true;
1608 5263 : bn->tseqbase = rcnt == 0 ? 0 : rcnt == 1 ? *(const oid *) Tloc(bn, 0) : rcnt == b->batCount ? b->hseqbase : oid_nil;
1609 5263 : if (with_strimps_anti) {
1610 : /* Reverse the result taking into account the original candidate list. */
1611 : // BAT *rev = BATdiffcand(BATdense(b->hseqbase, 0, b->batCount), bn);
1612 0 : BAT *rev;
1613 0 : if (old_s) {
1614 0 : rev = BATdiffcand(old_s, bn);
1615 : #ifndef NDEBUG
1616 0 : BAT *is = BATintersectcand(old_s, bn);
1617 0 : if (is) {
1618 0 : assert(is->batCount == bn->batCount);
1619 0 : BBPreclaim(is);
1620 : }
1621 0 : assert(rev->batCount == old_s->batCount - bn->batCount);
1622 : #endif
1623 : }
1624 :
1625 : else
1626 0 : rev = BATnegcands(0, b->batCount, bn);
1627 : /* BAT *rev = BATnegcands(0, b->batCount, bn); */
1628 0 : BBPunfix(bn->batCacheid);
1629 0 : bn = rev;
1630 : }
1631 : }
1632 :
1633 :
1634 5263 : bailout:
1635 5263 : BBPreclaim(b);
1636 5262 : BBPreclaim(s);
1637 5263 : BBPreclaim(old_s);
1638 5263 : if (bn && !msg) {
1639 5263 : *ret = bn->batCacheid;
1640 5263 : BBPkeepref(bn);
1641 0 : } else if (bn)
1642 0 : BBPreclaim(bn);
1643 5262 : return msg;
1644 : }
1645 :
/* APPEND(b, o): store oid o at position b->batCount of b's tail heap and
 * increment b->batCount; no capacity check is performed here.
 * VALUE(s, x): with s being l or r, address of string number x, found by
 * looking up its offset in <s>vals (entry width <s>i.width) and adding
 * it to the heap base <s>vars. */
1646 : #define APPEND(b, o) (((oid *) b->theap->base)[b->batCount++] = (o))
1647 : #define VALUE(s, x) (s##vars + VarHeapVal(s##vals, (x), s##i.width))
1648 :
1649 : /* nested loop implementation for PCRE join
 *
 * Expands inside pcrejoin() and relies on its locals (rci, lci, ro, lo,
 * vr, vl, nl, use_re, use_strcmp, empty, msg, esc, re, qry_ctx, r1, r2,
 * newcap, lastl, rskipped, lbase, rbase and the variables behind VALUE)
 * and on its "bailout" label.
 *
 * Every right-hand candidate value vr is treated as a LIKE pattern: the
 * evaluation path is chosen, and unless the pattern is nil a matcher is
 * built and all left-hand candidates are scanned.  Nil left values are
 * skipped.  STRCMP and MNRE_MATCH are "skip" conditions: when the one
 * that applies evaluates to true the pair is NOT emitted.  Emitted pairs
 * are appended to r1 (left oid) and, if present, r2 (right oid); the
 * result BATs are extended when full and their key/sorted/dense
 * properties are downgraded as soon as the appended oids violate them.
 */
1650 : #define pcre_join_loop(STRCMP, MNRE_MATCH) \
1651 : do { \
1652 : for (BUN ridx = 0; ridx < rci.ncand; ridx++) { \
1653 : ro = canditer_next(&rci); \
1654 : vr = VALUE(r, ro - rbase); \
1655 : nl = 0; \
1656 : use_re = use_strcmp = empty = false; \
1657 : if ((msg = choose_like_path(&use_re, &use_strcmp, &empty, vr, esc))) \
1658 : goto bailout; \
1659 : if (!empty) { \
1660 : if ((msg = mnre_like_build(&re, vr, false, use_strcmp, (unsigned char) *esc)) != MAL_SUCCEED) \
1661 : goto bailout; \
1662 : canditer_reset(&lci); \
1663 : TIMEOUT_LOOP_IDX_DECL(lidx, lci.ncand, qry_ctx) { \
1664 : lo = canditer_next(&lci); \
1665 : vl = VALUE(l, lo - lbase); \
1666 : if (strNil(vl)) { \
1667 : continue; \
1668 : } else { \
1669 : if (use_strcmp) { \
1670 : if (STRCMP) \
1671 : continue; \
1672 : } else { \
1673 : assert(re); \
1674 : if (MNRE_MATCH) \
1675 : continue; \
1676 : } \
1677 : } \
1678 : if (BATcount(r1) == BATcapacity(r1)) { \
1679 : newcap = BATgrows(r1); \
1680 : BATsetcount(r1, BATcount(r1)); \
1681 : if (r2) \
1682 : BATsetcount(r2, BATcount(r2)); \
1683 : if (BATextend(r1, newcap) != GDK_SUCCEED || (r2 && BATextend(r2, newcap) != GDK_SUCCEED)) { \
1684 : msg = createException(MAL, "pcre.join", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
1685 : goto bailout; \
1686 : } \
1687 : assert(!r2 || BATcapacity(r1) == BATcapacity(r2)); \
1688 : } \
1689 : if (BATcount(r1) > 0) { \
1690 : if (lastl + 1 != lo) \
1691 : r1->tseqbase = oid_nil; \
1692 : if (nl == 0) { \
1693 : if (r2) \
1694 : r2->trevsorted = false; \
1695 : if (lastl > lo) { \
1696 : r1->tsorted = false; \
1697 : r1->tkey = false; \
1698 : } else if (lastl < lo) { \
1699 : r1->trevsorted = false; \
1700 : } else { \
1701 : r1->tkey = false; \
1702 : } \
1703 : } \
1704 : } \
1705 : APPEND(r1, lo); \
1706 : if (r2) \
1707 : APPEND(r2, ro); \
1708 : lastl = lo; \
1709 : nl++; \
1710 : } \
1711 : mnre_like_clean(&re); \
1712 : TIMEOUT_CHECK(qry_ctx, \
1713 : GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
1714 : } \
1715 : if (r2) { \
1716 : if (nl > 1) { \
1717 : r2->tkey = false; \
1718 : r2->tseqbase = oid_nil; \
1719 : r1->trevsorted = false; \
1720 : } else if (nl == 0) { \
1721 : rskipped = BATcount(r2) > 0; \
1722 : } else if (rskipped) { \
1723 : r2->tseqbase = oid_nil; \
1724 : } \
1725 : } else if (nl > 1) { \
1726 : r1->trevsorted = false; \
1727 : } \
1728 : } \
1729 : } while (0)
1730 :
1731 : static char *
1732 39 : pcrejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, const char *esc,
1733 : bit caseignore, bit anti)
1734 : {
1735 39 : struct canditer lci, rci;
1736 39 : const char *lvals, *rvals, *lvars, *rvars, *vl, *vr;
1737 39 : int rskipped = 0; /* whether we skipped values in r */
1738 39 : oid lbase, rbase, lo, ro, lastl = 0; /* last value inserted into r1 */
1739 39 : BUN nl, newcap;
1740 39 : char *msg = MAL_SUCCEED;
1741 39 : struct RE *re = NULL;
1742 39 : bool use_re = false,
1743 39 : use_strcmp = false,
1744 39 : empty = false;
1745 39 : lng t0 = 0;
1746 :
1747 39 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
1748 :
1749 39 : TRC_DEBUG_IF(ALGO) t0 = GDKusec();
1750 :
1751 117 : assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
1752 39 : assert(ATOMtype(l->ttype) == TYPE_str);
1753 :
1754 39 : BAT *ol = NULL, *or = NULL;
1755 39 : if (caseignore) {
1756 3 : ol = l;
1757 3 : or = r;
1758 3 : l = BATcasefold(l, NULL);
1759 3 : r = BATcasefold(r, NULL);
1760 3 : if (l == NULL || r == NULL) {
1761 0 : BBPreclaim(l);
1762 0 : BBPreclaim(r);
1763 0 : throw(MAL, "pcre.join", GDK_EXCEPTION);
1764 : }
1765 : }
1766 :
1767 39 : canditer_init(&lci, l, sl);
1768 39 : canditer_init(&rci, r, sr);
1769 :
1770 39 : BATiter li = bat_iterator(l);
1771 39 : BATiter ri = bat_iterator(r);
1772 39 : lbase = l->hseqbase;
1773 39 : rbase = r->hseqbase;
1774 39 : lvals = (const char *) li.base;
1775 39 : rvals = (const char *) ri.base;
1776 39 : assert(ri.vh && r->ttype);
1777 39 : lvars = li.vh->base;
1778 39 : rvars = ri.vh->base;
1779 :
1780 39 : r1->tkey = true;
1781 39 : r1->tsorted = true;
1782 39 : r1->trevsorted = true;
1783 39 : r1->tnil = false;
1784 39 : r1->tnonil = true;
1785 39 : if (r2) {
1786 17 : r2->tkey = true;
1787 17 : r2->tsorted = true;
1788 17 : r2->trevsorted = true;
1789 17 : r2->tnil = false;
1790 17 : r2->tnonil = true;
1791 : }
1792 :
1793 39 : if (anti) {
1794 482 : pcre_join_loop(strcmp(vl, vr) == 0, mnre_match(vl, re));
1795 : } else {
1796 456 : pcre_join_loop(strcmp(vl, vr) != 0, !mnre_match(vl, re));
1797 : }
1798 39 : bat_iterator_end(&li);
1799 39 : bat_iterator_end(&ri);
1800 39 : if (ol) {
1801 3 : BBPreclaim(l);
1802 3 : BBPreclaim(r);
1803 3 : l = ol;
1804 3 : r = or;
1805 : }
1806 :
1807 39 : assert(!r2 || BATcount(r1) == BATcount(r2));
1808 : /* also set other bits of heap to correct value to indicate size */
1809 39 : BATsetcount(r1, BATcount(r1));
1810 39 : if (r2)
1811 17 : BATsetcount(r2, BATcount(r2));
1812 39 : if (BATcount(r1) > 0) {
1813 26 : if (BATtdense(r1))
1814 7 : r1->tseqbase = ((oid *) r1->theap->base)[0];
1815 26 : if (r2 && BATtdense(r2))
1816 8 : r2->tseqbase = ((oid *) r2->theap->base)[0];
1817 : } else {
1818 13 : r1->tseqbase = 0;
1819 13 : if (r2)
1820 6 : r2->tseqbase = 0;
1821 : }
1822 :
1823 14 : if (r2)
1824 17 : TRC_DEBUG(ALGO,
1825 : "l=%s#" BUNFMT "[%s]%s%s,"
1826 : "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
1827 : "sr=%s#" BUNFMT "%s%s -> "
1828 : "%s#" BUNFMT "%s%s,%s#" BUNFMT "%s%s (" LLFMT " usec)\n",
1829 : BATgetId(l), BATcount(l), ATOMname(l->ttype),
1830 : l->tsorted ? "-sorted" : "",
1831 : l->trevsorted ? "-revsorted" : "",
1832 : BATgetId(r), BATcount(r), ATOMname(r->ttype),
1833 : r->tsorted ? "-sorted" : "",
1834 : r->trevsorted ? "-revsorted" : "",
1835 : sl ? BATgetId(sl) : "NULL", sl ? BATcount(sl) : 0,
1836 : sl && sl->tsorted ? "-sorted" : "",
1837 : sl && sl->trevsorted ? "-revsorted" : "",
1838 : sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
1839 : sr && sr->tsorted ? "-sorted" : "",
1840 : sr && sr->trevsorted ? "-revsorted" : "",
1841 : BATgetId(r1), BATcount(r1),
1842 : r1->tsorted ? "-sorted" : "",
1843 : r1->trevsorted ? "-revsorted" : "",
1844 : BATgetId(r2), BATcount(r2),
1845 : r2->tsorted ? "-sorted" : "",
1846 : r2->trevsorted ? "-revsorted" : "", GDKusec() - t0);
1847 : else
1848 22 : TRC_DEBUG(ALGO,
1849 : "l=%s#" BUNFMT "[%s]%s%s,"
1850 : "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
1851 : "sr=%s#" BUNFMT "%s%s -> "
1852 : "%s#" BUNFMT "%s%s (" LLFMT " usec)\n",
1853 : BATgetId(l), BATcount(l), ATOMname(l->ttype),
1854 : l->tsorted ? "-sorted" : "",
1855 : l->trevsorted ? "-revsorted" : "",
1856 : BATgetId(r), BATcount(r), ATOMname(r->ttype),
1857 : r->tsorted ? "-sorted" : "",
1858 : r->trevsorted ? "-revsorted" : "",
1859 : sl ? BATgetId(sl) : "NULL", sl ? BATcount(sl) : 0,
1860 : sl && sl->tsorted ? "-sorted" : "",
1861 : sl && sl->trevsorted ? "-revsorted" : "",
1862 : sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
1863 : sr && sr->tsorted ? "-sorted" : "",
1864 : sr && sr->trevsorted ? "-revsorted" : "",
1865 : BATgetId(r1), BATcount(r1),
1866 : r1->tsorted ? "-sorted" : "",
1867 : r1->trevsorted ? "-revsorted" : "", GDKusec() - t0);
1868 : return MAL_SUCCEED;
1869 :
1870 0 : bailout:
1871 0 : bat_iterator_end(&li);
1872 0 : bat_iterator_end(&ri);
1873 0 : mnre_like_clean(&re);
1874 0 : assert(msg != MAL_SUCCEED);
1875 : return msg;
1876 : }
1877 :
1878 : static str
1879 39 : PCREjoin(bat *r1, bat *r2, bat lid, bat rid, bat slid, bat srid, bat elid,
1880 : bat ciid, bit anti)
1881 : {
1882 39 : BAT *left = NULL, *right = NULL, *escape = NULL, *caseignore = NULL,
1883 39 : *candleft = NULL, *candright = NULL;
1884 39 : BAT *result1 = NULL, *result2 = NULL;
1885 39 : char *msg = MAL_SUCCEED;
1886 39 : const char *esc = "";
1887 39 : bit ci;
1888 39 : BATiter bi;
1889 :
1890 39 : if ((left = BATdescriptor(lid)) == NULL)
1891 0 : goto fail;
1892 39 : if ((right = BATdescriptor(rid)) == NULL)
1893 0 : goto fail;
1894 39 : if ((escape = BATdescriptor(elid)) == NULL)
1895 0 : goto fail;
1896 39 : if ((caseignore = BATdescriptor(ciid)) == NULL)
1897 0 : goto fail;
1898 39 : if (!is_bat_nil(slid) && (candleft = BATdescriptor(slid)) == NULL)
1899 0 : goto fail;
1900 39 : if (!is_bat_nil(srid) && (candright = BATdescriptor(srid)) == NULL)
1901 0 : goto fail;
1902 39 : result1 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
1903 39 : if (r2)
1904 17 : result2 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
1905 39 : if (!result1 || (r2 && !result2)) {
1906 0 : msg = createException(MAL, "pcre.join",
1907 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1908 0 : goto fail;
1909 : }
1910 39 : result1->tnil = false;
1911 39 : result1->tnonil = true;
1912 39 : result1->tkey = true;
1913 39 : result1->tsorted = true;
1914 39 : result1->trevsorted = true;
1915 39 : result1->tseqbase = 0;
1916 39 : if (r2) {
1917 17 : result2->tnil = false;
1918 17 : result2->tnonil = true;
1919 17 : result2->tkey = true;
1920 17 : result2->tsorted = true;
1921 17 : result2->trevsorted = true;
1922 17 : result2->tseqbase = 0;
1923 : }
1924 39 : if (BATcount(escape) != 1) {
1925 0 : msg = createException(MAL, "pcre.join",
1926 : SQLSTATE(42000)
1927 : "At the moment, only one value is allowed for the escape input at pcre join");
1928 0 : goto fail;
1929 : }
1930 39 : if (BATcount(caseignore) != 1) {
1931 0 : msg = createException(MAL, "pcre.join",
1932 : SQLSTATE(42000)
1933 : "At the moment, only one value is allowed for the case ignore input at pcre join");
1934 0 : goto fail;
1935 : }
1936 39 : bi = bat_iterator(caseignore);
1937 39 : ci = *(bit *) BUNtloc(bi, 0);
1938 39 : bat_iterator_end(&bi);
1939 39 : bi = bat_iterator(escape);
1940 39 : esc = BUNtvar(bi, 0);
1941 39 : msg = pcrejoin(result1, result2, left, right, candleft, candright, esc, ci,
1942 : anti);
1943 39 : bat_iterator_end(&bi);
1944 39 : if (msg)
1945 0 : goto fail;
1946 39 : *r1 = result1->batCacheid;
1947 39 : BBPkeepref(result1);
1948 38 : if (r2) {
1949 17 : *r2 = result2->batCacheid;
1950 17 : BBPkeepref(result2);
1951 : }
1952 38 : BBPunfix(left->batCacheid);
1953 39 : BBPunfix(right->batCacheid);
1954 39 : BBPreclaim(escape);
1955 39 : BBPreclaim(caseignore);
1956 39 : BBPreclaim(candleft);
1957 39 : BBPreclaim(candright);
1958 : return MAL_SUCCEED;
1959 :
1960 0 : fail:
1961 0 : BBPreclaim(left);
1962 0 : BBPreclaim(right);
1963 0 : BBPreclaim(escape);
1964 0 : BBPreclaim(caseignore);
1965 0 : BBPreclaim(candleft);
1966 0 : BBPreclaim(candright);
1967 0 : BBPreclaim(result1);
1968 0 : BBPreclaim(result2);
1969 0 : if (msg)
1970 : return msg;
1971 0 : throw(MAL, "pcre.join", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1972 : }
1973 :
1974 : static str
1975 17 : LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *elid,
1976 : const bat *cid, const bat *slid, const bat *srid,
1977 : const bit *nil_matches, const lng *estimate, const bit *anti)
1978 : {
1979 17 : (void) nil_matches;
1980 17 : (void) estimate;
1981 17 : return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
1982 17 : *elid, *cid, *anti);
1983 : }
1984 :
1985 : static str
1986 22 : LIKEjoin1(bat *r1, const bat *lid, const bat *rid, const bat *elid,
1987 : const bat *cid, const bat *slid, const bat *srid,
1988 : const bit *nil_matches, const lng *estimate, const bit *anti)
1989 : {
1990 22 : (void) nil_matches;
1991 22 : (void) estimate;
1992 22 : return PCREjoin(r1, NULL, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
1993 22 : *elid, *cid, *anti);
1994 : }
1995 :
1996 : #include "mel.h"
1997 : mel_atom pcre_init_atoms[] = {
1998 : { .name="pcre", }, { .cmp=NULL }
1999 : };
2000 : mel_func pcre_init_funcs[] = {
2001 : command("pcre", "index", PCREindex, false, "match a pattern, return matched position (or 0 when not found)", args(1,3, arg("",int),arg("pat",pcre),arg("s",str))),
2002 : command("pcre", "match", PCREmatch, false, "Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
2003 : command("pcre", "imatch", PCREimatch, false, "Caseless Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
2004 : command("pcre", "patindex", PCREpatindex, false, "Location of the first POSIX pattern matching against a string", args(1,3, arg("",int),arg("pat",str),arg("s",str))),
2005 : command("pcre", "replace", PCREreplace_wrap, false, "Replace _all_ matches of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
2006 : command("pcre", "replace_first", PCREreplacefirst_wrap, false, "Replace _the first_ match of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
2007 : command("pcre", "pcre_quote", PCREquote, false, "Return a PCRE pattern string that matches the argument exactly.", args(1,2, arg("",str),arg("s",str))),
2008 : command("pcre", "sql2pcre", PCREsql2pcre, false, "Convert a SQL like pattern with the given escape character into a PCRE pattern.", args(1,3, arg("",str),arg("pat",str),arg("esc",str))),
2009 : command("str", "replace", PCREreplace_wrap, false, "", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
2010 : command("batpcre", "replace", PCREreplace_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
2011 : command("batpcre", "replace_first", PCREreplacefirst_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
2012 : command("algebra", "like", PCRElike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2013 : command("algebra", "not_like", PCREnotlike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2014 : pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2015 : pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2016 : pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2017 : pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2018 : pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2019 : pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2020 : command("algebra", "likeselect", PCRElikeselect, false, "Select all head values of the first input BAT for which the\ntail value is \"like\" the given (SQL-style) pattern and for\nwhich the head value occurs in the tail of the second input\nBAT.\nInput is a dense-headed BAT, output is a dense-headed BAT with in\nthe tail the head value of the input BAT for which the\nrelationship holds. The output BAT is sorted on the tail value.", args(1,7, batarg("",oid),batarg("b",str),batarg("s",oid),arg("pat",str),arg("esc",str),arg("caseignore",bit),arg("anti",bit))),
2021 : command("algebra", "likejoin", LIKEjoin, false, "Join the string bat L with the pattern bat R\nwith optional candidate lists SL and SR using pattern escape string ESC\nand doing a case sensitive match.\nThe result is two aligned bats with oids of matching rows.", args(2,11, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
2022 : command("algebra", "likejoin", LIKEjoin1, false, "The same as LIKEjoin_esc, but only produce one output", args(1,10,batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
2023 : { .imp=NULL }
2024 : };
2025 : #include "mal_import.h"
2026 : #ifdef _MSC_VER
2027 : #undef read
2028 : #pragma section(".CRT$XCU",read)
2029 : #endif
2030 350 : LIB_STARTUP_FUNC(init_pcre_mal)
2031 350 : { mal_module("pcre", pcre_init_atoms, pcre_init_funcs); }
|