Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * N. Nes
15 : * PCRE library interface
16 : * The PCRE library is a set of functions that implement regular
17 : * expression pattern matching using the same syntax and semantics as Perl,
18 : * with just a few differences. The current implementation of PCRE
19 : * (release 4.x) corresponds approximately with Perl 5.8, including support
20 : * for UTF-8 encoded strings. However, this support has to be
21 : * explicitly enabled; it is not the default.
22 : *
23 : * ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
24 : */
25 : #include "monetdb_config.h"
26 : #include <string.h>
27 :
28 : #include "mal.h"
29 : #include "mal_client.h"
30 : #include "mal_interpreter.h"
31 : #include "mal_exception.h"
32 :
33 : #include <wchar.h>
34 : #include <wctype.h>
35 :
36 : #ifdef HAVE_LIBPCRE
37 : #include <pcre.h>
38 : #ifndef PCRE_STUDY_JIT_COMPILE
39 : /* old library version on e.g. EPEL 6 */
40 : #define pcre_free_study(x) pcre_free(x)
41 : #define PCRE_STUDY_JIT_COMPILE 0
42 : #endif
43 : #define JIT_COMPILE_MIN 1024 /* when to try JIT compilation of patterns */
44 :
45 : #else
46 :
47 : #include <regex.h>
48 :
49 : typedef regex_t pcre;
50 : #endif
51 :
/* One segment of a compiled LIKE pattern.  The current implementation
 * assumes simple %keyword% [keyw%]* patterns; a pattern is stored as a
 * singly linked list of these segments. */
struct RE {
	char *k;			/* literal text this segment has to match */
	bool search:1;		/* segment is preceded by '%': text may start anywhere */
	bool atend:1;		/* no wildcard follows: text must reach the end of the string */
	bool case_ignore:1;	/* compare with the GDKstr*case* functions */
	size_t skip;		/* number of codepoints to skip before matching */
	size_t len;			/* number of bytes in string */
	size_t ulen;		/* number of codepoints in string */
	struct RE *n;		/* next segment, NULL for the last one */
};
61 :
62 : /* We cannot use strcasecmp and strncasecmp since they work byte for
63 : * byte and don't deal with multibyte encodings (such as UTF-8). */
64 :
/* Check that the pattern does not end in the middle of an escape
 * sequence.  An escape character protects the single byte that follows
 * it (which may itself be the escape character).  Returns false only
 * when the last byte of pat is an unprotected escape character; a NULL
 * pattern is accepted. */
static inline bool
mnre_is_pattern_properly_escaped(const char *pat, unsigned char esc)
{
	/* true when the previous byte was an escape character that is
	 * still waiting for the byte it protects */
	bool pending = false;

	if (pat == NULL)
		return true;
	for (const char *cur = pat; *cur != '\0'; cur++)
		pending = !pending && (unsigned char) *cur == esc;
	return !pending;
}
82 :
/* Tell whether a LIKE pattern can be evaluated with a plain string
 * compare: returns true if the pattern contains no wildcard characters
 * ('%' or '_') and the escape string (when there is one) does not occur
 * in it.
 */
static inline bool
is_strcmpable(const char *pat, const char *esc)
{
	/* any wildcard rules out a plain compare */
	if (strpbrk(pat, "%_") != NULL)
		return false;
	/* without an escape string there is nothing left to check */
	if (esc[0] == '\0' || strNil(esc))
		return true;
	return strstr(pat, esc) == NULL;
}
93 :
94 : /* Match regular expression by comparing bytes.
95 : */
96 : static inline bool
97 370095 : mnre_match(const char *restrict s, const struct RE *restrict pattern)
98 : {
99 370095 : const struct RE *r;
100 :
101 439525 : for (r = pattern; r; r = r->n) {
102 399892 : for (size_t i = 0; i < r->skip; s++) {
103 27166 : if (*s == 0)
104 : return false;
105 28074 : i += (*s & 0xC0) != 0x80;
106 : }
107 372726 : if (r->search) {
108 183903 : if (r->atend) {
109 : /* we're searching for a string at the end, so just skip
110 : * over everything and just compare with the tail of the
111 : * haystack */
112 22111 : size_t slen = strlen(s);
113 22111 : if (slen < r->ulen) {
114 : /* remaining string too short: each codepoint
115 : * requires at least one byte */
116 : return false;
117 : }
118 22082 : const char *e = s + slen;
119 22082 : if (!r->case_ignore) {
120 22000 : if (slen < r->len) {
121 : /* remaining string is too short to match */
122 : return false;
123 : }
124 22014 : e -= r->len;
125 22014 : if ((*e & 0xC0) == 0x80) {
126 : /* not at start of a Unicode character, so
127 : * cannot match (this test not strictly
128 : * required: the strcmp should also return
129 : * unequal) */
130 : return false;
131 : }
132 22018 : return strcmp(e, r->k) == 0;
133 : }
134 : size_t ulen = r->ulen;
135 363 : while (e > s && ulen != 0) {
136 281 : ulen -= (*--e & 0xC0) != 0x80;
137 : }
138 : /* ulen != 0 means remaining string is too short */
139 144 : return ulen == 0 && GDKstrcasecmp(e, r->k) == 0;
140 : }
141 : /* in case we have a pattern consisting of % followed by _,
142 : * we need to backtrack, so use recursion; here we know we
143 : * have the %, look for an _ in the rest of the pattern
144 : * (note %_ and _% are equivalent and is taken care of by
145 : * the pattern construction in mnre_create) */
146 170715 : for (const struct RE *p = r->n; p; p = p->n) {
147 11007 : if (p->skip != 0) {
148 2084 : struct RE pat = *r;
149 2084 : pat.search = false;
150 2084 : pat.skip = 0;
151 128659 : do {
152 128659 : if (mnre_match(s, &pat))
153 : return true;
154 128517 : do
155 128517 : s++;
156 128518 : while (*s && (*s & 0xC0) == 0x80);
157 128518 : } while (*s != 0);
158 : return false;
159 : }
160 : }
161 : }
162 348531 : if (r->k[0] == 0 && (r->search || *s == 0))
163 : return true;
164 348489 : if (r->case_ignore) {
165 1360 : for (;;) {
166 1360 : if (r->search && (s = GDKstrcasestr(s, r->k)) == NULL)
167 : return false;
168 527 : if (*s == '\0')
169 : return false;
170 : /* in "atend" comparison, compare whole string, else
171 : * only part */
172 583 : if ((!r->search || r->atend) &&
173 56 : (r->atend ? GDKstrcasecmp(s, r->k) : GDKstrncasecmp(s, r->k, SIZE_MAX, r->len)) != 0) {
174 : /* no match */
175 19 : if (!r->search)
176 : return false;
177 : /* try again with next character */
178 0 : do
179 0 : s++;
180 0 : while (*s != '\0' && (*s & 0xC0) == 0x80);
181 0 : continue;
182 : }
183 : /* match; find end of match by counting codepoints */
184 7219 : for (size_t i = 0; *s && i < r->ulen; s++)
185 6711 : i += (*s & 0xC0) != 0x80;
186 : break;
187 : }
188 : } else {
189 347129 : for (;;) {
190 347129 : if (r->search && (s = strstr(s, r->k)) == NULL)
191 : return false;
192 227273 : if (*s == '\0')
193 : return false;
194 : /* in "atend" comparison, include NUL byte in the compare */
195 226790 : if ((!r->search || r->atend) &&
196 159768 : strncmp(s, r->k, r->len + r->atend) != 0) {
197 : /* no match */
198 157868 : if (!r->search)
199 : return false;
200 : /* try again with next character: have search start
201 : * after current first byte */
202 0 : if ((s = strchr(s + 1, r->k[0])) == NULL)
203 : return false;
204 0 : continue;
205 : }
206 : /* match */
207 68922 : s += r->len;
208 68922 : break;
209 : }
210 : }
211 : }
212 : return true;
213 : }
214 :
215 : static void
216 4838 : mnre_destroy(struct RE *p)
217 : {
218 4838 : if (p) {
219 4838 : GDKfree(p->k);
220 5476 : do {
221 5476 : struct RE *n = p->n;
222 :
223 5476 : GDKfree(p);
224 5476 : p = n;
225 5476 : } while (p);
226 : }
227 4838 : }
228 :
229 : /* Create a linked list of RE structures. Depending on the
230 : * caseignore and the ascii_pattern flags, the w
231 : * (if caseignore == true && ascii_pattern == false) or the k
232 : * (in every other case) field is used. These in the first
233 : * structure are allocated, whereas in all subsequent
234 : * structures the fields point into the allocated buffer of
235 : * the first.
236 : */
237 : static struct RE *
238 4838 : mnre_create(const char *pat, bool caseignore, uint32_t esc)
239 : {
240 4838 : struct RE *r = GDKmalloc(sizeof(struct RE)), *n = r;
241 4838 : bool escaped = false;
242 4838 : char *p, *q;
243 :
244 4838 : if (r == NULL)
245 : return NULL;
246 4838 : *r = (struct RE) {
247 : .atend = true,
248 : .case_ignore = caseignore,
249 : };
250 :
251 9030 : for (;;) {
252 9030 : if (esc != '%' && *pat == '%') {
253 4021 : pat++; /* skip % */
254 4021 : r->search = true;
255 5009 : } else if (esc != '_' && *pat == '_') {
256 171 : pat++;
257 171 : r->skip++;
258 : } else {
259 : break;
260 : }
261 : }
262 4838 : if ((p = GDKstrdup(pat)) == NULL) {
263 0 : GDKfree(r);
264 0 : return NULL;
265 : }
266 :
267 4838 : r->k = p;
268 4838 : q = p;
269 31508 : while (*p) {
270 26670 : if (escaped) {
271 149 : *q++ = *p;
272 149 : n->len++;
273 149 : n->ulen += (*p & 0xC0) != 0x80;
274 149 : escaped = false;
275 26521 : } else if ((unsigned char) *p == esc) {
276 : escaped = true;
277 26372 : } else if (*p == '%' || *p == '_') {
278 4773 : n->atend = false;
279 4773 : bool search = false;
280 4773 : size_t skip = 0;
281 14435 : for (;;) {
282 9604 : if (*p == '_')
283 492 : skip++;
284 9112 : else if (*p == '%')
285 : search = true;
286 : else
287 : break;
288 4831 : p++;
289 : }
290 4773 : if (*p || skip != 0) {
291 638 : n = n->n = GDKmalloc(sizeof(struct RE));
292 638 : if (n == NULL)
293 0 : goto bailout;
294 638 : *n = (struct RE) {
295 : .search = search,
296 : .atend = true,
297 : .skip = skip,
298 : .k = p,
299 : .case_ignore = caseignore,
300 : };
301 : }
302 4773 : *q = 0;
303 4773 : q = p;
304 4773 : continue; /* skip increment, we already did it */
305 : } else {
306 21599 : *q++ = *p;
307 21599 : n->len++;
308 21599 : n->ulen += (*p & 0xC0) != 0x80;
309 : }
310 21897 : p++;
311 : }
312 4838 : *q = 0;
313 4838 : return r;
314 0 : bailout:
315 0 : mnre_destroy(r);
316 0 : return NULL;
317 : }
318 :
319 : #ifdef HAVE_LIBPCRE
320 : static str
321 25 : pcre_compile_wrap(pcre **res, const char *pattern, bit insensitive)
322 : {
323 25 : pcre *r;
324 25 : const char *err_p = NULL;
325 25 : int errpos = 0;
326 25 : int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE;
327 25 : if (insensitive)
328 0 : options |= PCRE_CASELESS;
329 :
330 25 : if ((r = pcre_compile(pattern, options, &err_p, &errpos, NULL)) == NULL) {
331 0 : throw(MAL, "pcre.compile", OPERATION_FAILED
332 : " with\n'%s'\nat %d in\n'%s'.\n", err_p, errpos, pattern);
333 : }
334 25 : *res = r;
335 25 : return MAL_SUCCEED;
336 : }
337 : #endif
338 :
339 : /* maximum number of back references and quoted \ or $ in replacement string */
340 : #define MAX_NR_REFS 20
341 :
/* position of one back reference (or doubled $ or \) in a replacement
 * string */
struct backref {
	int idx;			/* capture group number; INT_MAX for a doubled $ or \ */
	int start;			/* offset in the replacement string of the $ or \ */
	int end;			/* offset where copying of the replacement resumes */
};
347 :
348 : #ifdef HAVE_LIBPCRE
349 : /* fill in parameter backrefs (length maxrefs) with information about
350 : * back references in the replacement string; a back reference is a
351 : * dollar or backslash followed by a number */
352 : static int
353 24 : parse_replacement(const char *replacement, int len_replacement,
354 : struct backref *backrefs, int maxrefs)
355 : {
356 24 : int nbackrefs = 0;
357 :
358 61 : for (int i = 0; i < len_replacement && nbackrefs < maxrefs; i++) {
359 37 : if (replacement[i] == '$' || replacement[i] == '\\') {
360 2 : char *endptr;
361 2 : backrefs[nbackrefs].idx = strtol(replacement + i + 1, &endptr, 10);
362 2 : if (endptr > replacement + i + 1) {
363 2 : int k = (int) (endptr - (replacement + i + 1));
364 2 : backrefs[nbackrefs].start = i;
365 2 : backrefs[nbackrefs].end = i + k + 1;
366 2 : nbackrefs++;
367 0 : } else if (replacement[i] == replacement[i + 1]) {
368 : /* doubled $ or \, we must copy just one to the output */
369 0 : backrefs[nbackrefs].idx = INT_MAX; /* impossible value > 0 */
370 0 : backrefs[nbackrefs].start = i;
371 0 : backrefs[nbackrefs].end = i + 1;
372 0 : i++; /* don't look at second $ or \ again */
373 0 : nbackrefs++;
374 : }
375 : /* else: $ or \ followed by something we don't recognize,
376 : * so just leave it */
377 : }
378 : }
379 24 : return nbackrefs;
380 : }
381 :
382 : static char *
383 5735 : single_replace(pcre *pcre_code, pcre_extra *extra,
384 : const char *origin_str, int len_origin_str,
385 : int exec_options, int *ovector, int ovecsize,
386 : const char *replacement, int len_replacement,
387 : struct backref *backrefs, int nbackrefs,
388 : bool global, char *result, int *max_result)
389 : {
390 5735 : int offset = 0;
391 5735 : int len_result = 0;
392 5735 : int addlen;
393 5735 : int empty_match_correction = 0;
394 21178 : char *tmp;
395 :
396 21178 : do {
397 21178 : int j = pcre_exec(pcre_code, extra, origin_str, len_origin_str, offset,
398 : exec_options, ovector, ovecsize);
399 21185 : if (j <= 0)
400 : break;
401 :
402 15886 : empty_match_correction = ovector[0] == ovector[1] ? 1 : 0;
403 :
404 : // calculate the length of the string that will be appended to result
405 31772 : addlen = ovector[0] - offset
406 15886 : + (nbackrefs == 0 ? len_replacement : 0) + empty_match_correction;
407 15886 : if (len_result + addlen >= *max_result) {
408 1385 : tmp = GDKrealloc(result, len_result + addlen + 1);
409 1385 : if (tmp == NULL) {
410 0 : GDKfree(result);
411 0 : return NULL;
412 : }
413 1385 : result = tmp;
414 1385 : *max_result = len_result + addlen + 1;
415 : }
416 : // append to the result the parts of the original string that are left unchanged
417 15886 : if (ovector[0] > offset) {
418 15430 : strncpy(result + len_result, origin_str + offset,
419 15430 : ovector[0] - offset);
420 15430 : len_result += ovector[0] - offset;
421 : }
422 : // append to the result the replacement of the matched string
423 15886 : if (nbackrefs == 0) {
424 15449 : strncpy(result + len_result, replacement, len_replacement);
425 15449 : len_result += len_replacement;
426 : } else {
427 : int prevend = 0;
428 874 : for (int i = 0; i < nbackrefs; i++) {
429 437 : int off, len;
430 437 : if (backrefs[i].idx >= ovecsize / 3) {
431 : /* out of bounds, replace with empty string */
432 : off = 0;
433 : len = 0;
434 : } else {
435 437 : off = ovector[backrefs[i].idx * 2];
436 437 : len = ovector[backrefs[i].idx * 2 + 1] - off;
437 : }
438 437 : addlen = backrefs[i].start - prevend + len;
439 437 : if (len_result + addlen >= *max_result) {
440 4 : tmp = GDKrealloc(result, len_result + addlen + 1);
441 4 : if (tmp == NULL) {
442 0 : GDKfree(result);
443 0 : return NULL;
444 : }
445 4 : result = tmp;
446 4 : *max_result = len_result + addlen + 1;
447 : }
448 437 : if (backrefs[i].start > prevend) {
449 2 : strncpy(result + len_result, replacement + prevend,
450 2 : backrefs[i].start - prevend);
451 2 : len_result += backrefs[i].start - prevend;
452 : }
453 437 : if (len > 0) {
454 437 : strncpy(result + len_result, origin_str + off, len);
455 437 : len_result += len;
456 : }
457 437 : prevend = backrefs[i].end;
458 : }
459 : /* copy rest of replacement string (after last backref) */
460 437 : addlen = len_replacement - prevend;
461 437 : if (addlen > 0) {
462 2 : if (len_result + addlen >= *max_result) {
463 1 : tmp = GDKrealloc(result, len_result + addlen + 1);
464 1 : if (tmp == NULL) {
465 0 : GDKfree(result);
466 0 : return NULL;
467 : }
468 1 : result = tmp;
469 1 : *max_result = len_result + addlen + 1;
470 : }
471 2 : strncpy(result + len_result, replacement + prevend, addlen);
472 2 : len_result += addlen;
473 : }
474 : }
475 : // In case of an empty match just advance the offset by 1
476 15886 : offset = ovector[1] + empty_match_correction;
477 : // and copy the character that we just advanced over
478 15886 : if (empty_match_correction) {
479 14 : strncpy(result + len_result, origin_str + ovector[1], 1);
480 14 : ++len_result;
481 : }
482 : // before we loop around check with the offset - 1 if we had an empty match
483 : // since we manually advanced the offset by one. otherwise we gonna skip a
484 : // replacement at the end of the string
485 15886 : } while ((offset - empty_match_correction) < len_origin_str && global);
486 :
487 5742 : if (offset < len_origin_str) {
488 5296 : addlen = len_origin_str - offset;
489 5296 : if (len_result + addlen >= *max_result) {
490 66 : tmp = GDKrealloc(result, len_result + addlen + 1);
491 66 : if (tmp == NULL) {
492 0 : GDKfree(result);
493 0 : return NULL;
494 : }
495 66 : result = tmp;
496 66 : *max_result = len_result + addlen + 1;
497 : }
498 5296 : strncpy(result + len_result, origin_str + offset, addlen);
499 5296 : len_result += addlen;
500 : }
501 : /* null terminate string */
502 5742 : result[len_result] = '\0';
503 5742 : return result;
504 : }
505 : #endif
506 :
507 : static str
508 14 : pcre_replace(str *res, const char *origin_str, const char *pattern,
509 : const char *replacement, const char *flags, bool global)
510 : {
511 : #ifdef HAVE_LIBPCRE
512 14 : const char *err_p = NULL;
513 14 : pcre *pcre_code = NULL;
514 14 : pcre_extra *extra;
515 14 : char *tmpres;
516 14 : int max_result;
517 14 : int i, errpos = 0;
518 14 : int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
519 14 : int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
520 14 : int *ovector, ovecsize;
521 14 : int len_origin_str = (int) strlen(origin_str);
522 14 : int len_replacement = (int) strlen(replacement);
523 14 : struct backref backrefs[MAX_NR_REFS];
524 14 : int nbackrefs = 0;
525 :
526 21 : while (*flags) {
527 7 : switch (*flags) {
528 : case 'e':
529 : exec_options &= ~PCRE_NOTEMPTY;
530 : break;
531 1 : case 'i':
532 1 : compile_options |= PCRE_CASELESS;
533 1 : break;
534 1 : case 'm':
535 1 : compile_options |= PCRE_MULTILINE;
536 1 : break;
537 1 : case 's':
538 1 : compile_options |= PCRE_DOTALL;
539 1 : break;
540 1 : case 'x':
541 1 : compile_options |= PCRE_EXTENDED;
542 1 : break;
543 0 : default:
544 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
545 : ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
546 : *flags);
547 : }
548 7 : flags++;
549 : }
550 :
551 14 : if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
552 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
553 : OPERATION_FAILED
554 : ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
555 : pattern, errpos, err_p);
556 : }
557 :
558 : /* Since the compiled pattern is going to be used several times, it is
559 : * worth spending more time analyzing it in order to speed up the time
560 : * taken for matching.
561 : */
562 14 : extra = pcre_study(pcre_code, 0, &err_p);
563 14 : if (err_p != NULL) {
564 0 : pcre_free(pcre_code);
565 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
566 : OPERATION_FAILED
567 : ": pcre study of pattern (%s) failed with '%s'.\n", pattern,
568 : err_p);
569 : }
570 14 : pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
571 14 : ovecsize = (i + 1) * 3;
572 14 : if ((ovector = (int *) GDKmalloc(sizeof(int) * ovecsize)) == NULL) {
573 0 : pcre_free_study(extra);
574 0 : pcre_free(pcre_code);
575 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
576 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
577 : }
578 :
579 : /* identify back references in the replacement string */
580 14 : nbackrefs = parse_replacement(replacement, len_replacement,
581 : backrefs, MAX_NR_REFS);
582 :
583 14 : max_result = len_origin_str + 1;
584 14 : tmpres = GDKmalloc(max_result);
585 14 : if (tmpres == NULL) {
586 0 : GDKfree(ovector);
587 0 : pcre_free_study(extra);
588 0 : pcre_free(pcre_code);
589 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
590 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
591 : }
592 :
593 14 : tmpres = single_replace(pcre_code, extra, origin_str, len_origin_str,
594 : exec_options, ovector, ovecsize, replacement,
595 : len_replacement, backrefs, nbackrefs, global,
596 : tmpres, &max_result);
597 14 : GDKfree(ovector);
598 14 : pcre_free_study(extra);
599 14 : pcre_free(pcre_code);
600 14 : if (tmpres == NULL)
601 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
602 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
603 :
604 14 : *res = tmpres;
605 14 : return MAL_SUCCEED;
606 : #else
607 : (void) res;
608 : (void) origin_str;
609 : (void) pattern;
610 : (void) replacement;
611 : (void) flags;
612 : (void) global;
613 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
614 : "Database was compiled without PCRE support.");
615 : #endif
616 : }
617 :
618 : static str
619 10 : pcre_replace_bat(BAT **res, BAT *origin_strs, const char *pattern,
620 : const char *replacement, const char *flags, bool global)
621 : {
622 : #ifdef HAVE_LIBPCRE
623 10 : const char *err_p = NULL;
624 10 : char *tmpres;
625 10 : int i, errpos = 0;
626 10 : int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
627 10 : int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
628 10 : pcre *pcre_code = NULL;
629 10 : pcre_extra *extra;
630 10 : BAT *tmpbat;
631 10 : BUN p, q;
632 10 : int *ovector, ovecsize;
633 10 : int len_replacement = (int) strlen(replacement);
634 10 : struct backref backrefs[MAX_NR_REFS];
635 10 : int nbackrefs = 0;
636 10 : const char *origin_str;
637 10 : int max_dest_size = 0;
638 :
639 14 : while (*flags) {
640 4 : switch (*flags) {
641 : case 'e':
642 : exec_options &= ~PCRE_NOTEMPTY;
643 : break;
644 1 : case 'i':
645 1 : compile_options |= PCRE_CASELESS;
646 1 : break;
647 2 : case 'm':
648 2 : compile_options |= PCRE_MULTILINE;
649 2 : break;
650 1 : case 's':
651 1 : compile_options |= PCRE_DOTALL;
652 1 : break;
653 0 : case 'x':
654 0 : compile_options |= PCRE_EXTENDED;
655 0 : break;
656 0 : default:
657 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
658 : ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
659 : *flags);
660 : }
661 4 : flags++;
662 : }
663 :
664 10 : if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
665 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
666 : OPERATION_FAILED
667 : ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
668 : pattern, errpos, err_p);
669 : }
670 :
671 : /* Since the compiled pattern is going to be used several times,
672 : * it is worth spending more time analyzing it in order to speed
673 : * up the time taken for matching.
674 : */
675 20 : extra = pcre_study(pcre_code,
676 10 : BATcount(origin_strs) >
677 : JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &err_p);
678 10 : if (err_p != NULL) {
679 0 : pcre_free(pcre_code);
680 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
681 : OPERATION_FAILED);
682 : }
683 10 : pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
684 10 : ovecsize = (i + 1) * 3;
685 10 : if ((ovector = (int *) GDKzalloc(sizeof(int) * ovecsize)) == NULL) {
686 0 : pcre_free_study(extra);
687 0 : pcre_free(pcre_code);
688 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
689 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
690 : }
691 :
692 : /* identify back references in the replacement string */
693 10 : nbackrefs = parse_replacement(replacement, len_replacement,
694 : backrefs, MAX_NR_REFS);
695 :
696 10 : tmpbat = COLnew(origin_strs->hseqbase, TYPE_str, BATcount(origin_strs),
697 : TRANSIENT);
698 :
699 : /* the buffer for all destination strings is allocated only once,
700 : * and extended when needed */
701 10 : max_dest_size = len_replacement + 1;
702 10 : tmpres = GDKmalloc(max_dest_size);
703 10 : if (tmpbat == NULL || tmpres == NULL) {
704 0 : pcre_free_study(extra);
705 0 : pcre_free(pcre_code);
706 0 : GDKfree(ovector);
707 0 : BBPreclaim(tmpbat);
708 0 : GDKfree(tmpres);
709 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
710 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
711 : }
712 10 : BATiter origin_strsi = bat_iterator(origin_strs);
713 5731 : BATloop(origin_strs, p, q) {
714 5721 : origin_str = BUNtvar(origin_strsi, p);
715 11443 : tmpres = single_replace(pcre_code, extra, origin_str,
716 5721 : (int) strlen(origin_str), exec_options,
717 : ovector, ovecsize, replacement,
718 : len_replacement, backrefs, nbackrefs, global,
719 : tmpres, &max_dest_size);
720 5722 : if (tmpres == NULL || BUNappend(tmpbat, tmpres, false) != GDK_SUCCEED) {
721 0 : bat_iterator_end(&origin_strsi);
722 0 : pcre_free_study(extra);
723 0 : pcre_free(pcre_code);
724 0 : GDKfree(ovector);
725 0 : GDKfree(tmpres);
726 0 : BBPreclaim(tmpbat);
727 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
728 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
729 : }
730 : }
731 10 : bat_iterator_end(&origin_strsi);
732 10 : pcre_free_study(extra);
733 10 : pcre_free(pcre_code);
734 10 : GDKfree(ovector);
735 10 : GDKfree(tmpres);
736 10 : *res = tmpbat;
737 10 : return MAL_SUCCEED;
738 : #else
739 : (void) res;
740 : (void) origin_strs;
741 : (void) pattern;
742 : (void) replacement;
743 : (void) flags;
744 : (void) global;
745 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
746 : "Database was compiled without PCRE support.");
747 : #endif
748 : }
749 :
750 : static str
751 4 : pcre_match_with_flags(bit *ret, const char *val, const char *pat,
752 : const char *flags)
753 : {
754 4 : int pos;
755 : #ifdef HAVE_LIBPCRE
756 4 : const char *err_p = NULL;
757 4 : int errpos = 0;
758 4 : int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_DOTALL;
759 4 : pcre *re;
760 : #else
761 : int options = REG_NOSUB | REG_EXTENDED;
762 : regex_t re;
763 : int errcode;
764 : int retval;
765 : #endif
766 :
767 4 : while (*flags) {
768 0 : switch (*flags) {
769 0 : case 'i':
770 : #ifdef HAVE_LIBPCRE
771 0 : options |= PCRE_CASELESS;
772 : #else
773 : options |= REG_ICASE;
774 : #endif
775 0 : break;
776 0 : case 'm':
777 : #ifdef HAVE_LIBPCRE
778 0 : options |= PCRE_MULTILINE;
779 : #else
780 : options |= REG_NEWLINE;
781 : #endif
782 0 : break;
783 : #ifdef HAVE_LIBPCRE
784 0 : case 's':
785 0 : options |= PCRE_DOTALL;
786 0 : break;
787 : #endif
788 0 : case 'x':
789 : #ifdef HAVE_LIBPCRE
790 0 : options |= PCRE_EXTENDED;
791 : #else
792 : options |= REG_EXTENDED;
793 : #endif
794 0 : break;
795 0 : default:
796 0 : throw(MAL, "pcre.match", ILLEGAL_ARGUMENT
797 : ": unsupported flag character '%c'\n", *flags);
798 : }
799 0 : flags++;
800 : }
801 4 : if (strNil(val)) {
802 0 : *ret = FALSE;
803 0 : return MAL_SUCCEED;
804 : }
805 :
806 : #ifdef HAVE_LIBPCRE
807 4 : if ((re = pcre_compile(pat, options, &err_p, &errpos, NULL)) == NULL)
808 : #else
809 : if ((errcode = regcomp(&re, pat, options)) != 0)
810 : #endif
811 : {
812 0 : throw(MAL, "pcre.match", OPERATION_FAILED
813 : ": compilation of regular expression (%s) failed "
814 : #ifdef HAVE_LIBPCRE
815 : "at %d with '%s'", pat, errpos, err_p
816 : #else
817 : , pat
818 : #endif
819 : );
820 : }
821 : #ifdef HAVE_LIBPCRE
822 4 : pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, PCRE_NO_UTF8_CHECK,
823 : NULL, 0);
824 4 : pcre_free(re);
825 : #else
826 : retval = regexec(&re, val, (size_t) 0, NULL, 0);
827 : pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
828 : regfree(&re);
829 : #endif
830 4 : if (pos >= 0)
831 3 : *ret = TRUE;
832 1 : else if (pos == -1)
833 1 : *ret = FALSE;
834 : else
835 0 : throw(MAL, "pcre.match", OPERATION_FAILED
836 : ": matching of regular expression (%s) failed with %d", pat, pos);
837 : return MAL_SUCCEED;
838 : }
839 :
840 : #ifdef HAVE_LIBPCRE
841 : /* special characters in PCRE that need to be escaped */
842 : static const char pcre_specials[] = "$()*+.?[\\]^{|}";
843 : #else
844 : /* special characters in POSIX basic regular expressions that need to
845 : * be escaped */
846 : static const char pcre_specials[] = "$()*+.?[\\^{|";
847 : #endif
848 :
849 : /* change SQL LIKE pattern into PCRE pattern */
850 : static str
851 6 : sql2pcre(str *r, const char *pat, const char *esc_str)
852 : {
853 6 : int escaped = 0;
854 6 : int hasWildcard = 0;
855 6 : char *ppat;
856 12 : int esc = strNil(esc_str) ? 0 : esc_str[0]; /* should change to utf8_convert() */
857 6 : int specials;
858 6 : int c;
859 :
860 6 : if (strlen(esc_str) > 1)
861 0 : throw(MAL, "pcre.sql2pcre",
862 : SQLSTATE(22019) ILLEGAL_ARGUMENT
863 : ": ESCAPE string must have length 1");
864 6 : if (pat == NULL)
865 0 : throw(MAL, "pcre.sql2pcre",
866 : SQLSTATE(22019) ILLEGAL_ARGUMENT
867 : ": (I)LIKE pattern must not be NULL");
868 6 : ppat = GDKmalloc(strlen(pat) * 3 +
869 : 3 /* 3 = "^'the translated regexp'$0" */ );
870 6 : if (ppat == NULL)
871 0 : throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
872 :
873 6 : *r = ppat;
874 : /* The escape character can be a char which is special in a PCRE
875 : * expression. If the user used the "+" char as escape and has "++"
876 : * in their pattern, then replacing this with "+" is not correct and
877 : * should be "\+" instead. */
878 6 : specials = (esc && strchr(pcre_specials, esc) != NULL);
879 :
880 6 : *ppat++ = '^';
881 17 : while ((c = *pat++) != 0) {
882 11 : if (c == esc) {
883 2 : if (escaped) {
884 1 : if (specials) { /* change ++ into \+ */
885 1 : *ppat++ = esc;
886 : } else { /* do not escape simple escape symbols */
887 0 : ppat[-1] = esc; /* overwrite backslash */
888 : }
889 : escaped = 0;
890 : } else {
891 1 : *ppat++ = '\\';
892 1 : escaped = 1;
893 : }
894 : hasWildcard = 1;
895 9 : } else if (strchr(pcre_specials, c) != NULL) {
896 : /* escape PCRE special chars, avoid double backslash if the
897 : * user uses an invalid escape sequence */
898 2 : if (!escaped)
899 2 : *ppat++ = '\\';
900 2 : *ppat++ = c;
901 2 : hasWildcard = 1;
902 2 : escaped = 0;
903 7 : } else if (c == '%' && !escaped) {
904 3 : *ppat++ = '.';
905 3 : *ppat++ = '*';
906 3 : *ppat++ = '?';
907 3 : hasWildcard = 1;
908 : /* collapse multiple %, but only if it isn't the escape */
909 3 : if (esc != '%')
910 3 : while (*pat == '%')
911 0 : pat++;
912 4 : } else if (c == '_' && !escaped) {
913 3 : *ppat++ = '.';
914 3 : hasWildcard = 1;
915 : } else {
916 1 : if (escaped) {
917 0 : ppat[-1] = c; /* overwrite backslash of invalid escape */
918 : } else {
919 1 : *ppat++ = c;
920 : }
921 : escaped = 0;
922 : }
923 : }
924 : /* no wildcard or escape character at end of string */
925 6 : if (!hasWildcard || escaped) {
926 1 : GDKfree(*r);
927 1 : *r = NULL;
928 1 : if (escaped)
929 0 : throw(MAL, "pcre.sql2pcre",
930 : SQLSTATE(22019) ILLEGAL_ARGUMENT
931 : ": (I)LIKE pattern must not end with escape character");
932 1 : *r = GDKstrdup(str_nil);
933 1 : if (*r == NULL)
934 0 : throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
935 : } else {
936 5 : *ppat++ = '$';
937 5 : *ppat = 0;
938 : }
939 : return MAL_SUCCEED;
940 : }
941 :
942 : #ifdef HAVE_LIBPCRE
943 : /* change SQL PATINDEX pattern into PCRE pattern */
944 : static str
945 25 : pat2pcre(str *r, const char *pat)
946 : {
947 25 : size_t len = strlen(pat);
948 25 : char *ppat = GDKmalloc(len * 2 + 3 /* 3 = "^'the translated regexp'$0" */ );
949 25 : int start = 0;
950 :
951 25 : if (ppat == NULL)
952 0 : throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
953 25 : *r = ppat;
954 77 : while (*pat) {
955 52 : int c = *pat++;
956 :
957 52 : if (strchr(pcre_specials, c) != NULL) {
958 17 : *ppat++ = '\\';
959 17 : *ppat++ = c;
960 35 : } else if (c == '%') {
961 3 : if (start && *pat) {
962 0 : *ppat++ = '.';
963 0 : *ppat++ = '*';
964 : }
965 3 : start++;
966 32 : } else if (c == '_') {
967 0 : *ppat++ = '.';
968 : } else {
969 32 : *ppat++ = c;
970 : }
971 : }
972 25 : *ppat = 0;
973 25 : return MAL_SUCCEED;
974 : }
975 : #endif
976 :
977 : /*
978 : * @+ Wrapping
979 : */
980 :
981 : static str
982 14 : PCREreplace_wrap(str *res, const char *const *or, const char *const *pat,
983 : const char *const *repl, const char *const *flags)
984 : {
985 14 : return pcre_replace(res, *or, *pat, *repl, *flags, true);
986 : }
987 :
988 : static str
989 0 : PCREreplacefirst_wrap(str *res, const char *const *or, const char *const *pat,
990 : const char *const *repl, const char *const *flags)
991 : {
992 0 : return pcre_replace(res, *or, *pat, *repl, *flags, false);
993 : }
994 :
995 : static str
996 10 : PCREreplace_bat_wrap(bat *res, const bat *bid, const char *const *pat,
997 : const char *const *repl, const char *const *flags)
998 : {
999 10 : BAT *b, *bn = NULL;
1000 10 : str msg;
1001 10 : if ((b = BATdescriptor(*bid)) == NULL)
1002 0 : throw(MAL, "batpcre.replace", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1003 :
1004 10 : msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, true);
1005 10 : if (msg == MAL_SUCCEED) {
1006 10 : *res = bn->batCacheid;
1007 10 : BBPkeepref(bn);
1008 : }
1009 10 : BBPunfix(b->batCacheid);
1010 10 : return msg;
1011 : }
1012 :
1013 : static str
1014 0 : PCREreplacefirst_bat_wrap(bat *res, const bat *bid, const char *const *pat,
1015 : const char *const *repl, const char *const *flags)
1016 : {
1017 0 : BAT *b, *bn = NULL;
1018 0 : str msg;
1019 0 : if ((b = BATdescriptor(*bid)) == NULL)
1020 0 : throw(MAL, "batpcre.replace_first", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1021 :
1022 0 : msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, false);
1023 0 : if (msg == MAL_SUCCEED) {
1024 0 : *res = bn->batCacheid;
1025 0 : BBPkeepref(bn);
1026 : }
1027 0 : BBPunfix(b->batCacheid);
1028 0 : return msg;
1029 : }
1030 :
1031 : static str
1032 4 : PCREmatch(bit *ret, const char *const *val, const char *const *pat)
1033 : {
1034 4 : return pcre_match_with_flags(ret, *val, *pat, "");
1035 : }
1036 :
1037 : static str
1038 0 : PCREimatch(bit *ret, const char *const *val, const char *const *pat)
1039 : {
1040 0 : return pcre_match_with_flags(ret, *val, *pat, "i");
1041 : }
1042 :
1043 : static str
1044 25 : PCREindex(int *res, const pcre *pattern, const char *const *s)
1045 : {
1046 : #ifdef HAVE_LIBPCRE
1047 25 : int v[3];
1048 :
1049 25 : v[0] = v[1] = *res = 0;
1050 25 : if (pcre_exec(pattern, NULL, *s, (int) strlen(*s), 0,
1051 : PCRE_NO_UTF8_CHECK, v, 3) >= 0) {
1052 23 : *res = v[1];
1053 : }
1054 25 : return MAL_SUCCEED;
1055 : #else
1056 : (void) res;
1057 : (void) pattern;
1058 : (void) s;
1059 : throw(MAL, "pcre.index", "Database was compiled without PCRE support.");
1060 : #endif
1061 : }
1062 :
1063 : static str
1064 27 : PCREpatindex(int *ret, const char *const *pat, const char *const *val)
1065 : {
1066 : #ifdef HAVE_LIBPCRE
1067 27 : pcre *re = NULL;
1068 27 : char *ppat = NULL, *msg;
1069 :
1070 53 : if (strNil(*pat) || strNil(*val)) {
1071 2 : *ret = int_nil;
1072 2 : return MAL_SUCCEED;
1073 : }
1074 :
1075 25 : if ((msg = pat2pcre(&ppat, *pat)) != MAL_SUCCEED)
1076 : return msg;
1077 25 : if ((msg = pcre_compile_wrap(&re, ppat, FALSE)) != MAL_SUCCEED) {
1078 0 : GDKfree(ppat);
1079 0 : return msg;
1080 : }
1081 25 : GDKfree(ppat);
1082 25 : msg = PCREindex(ret, re, val);
1083 25 : pcre_free(re);
1084 25 : return msg;
1085 : #else
1086 : (void) ret;
1087 : (void) pat;
1088 : (void) val;
1089 : throw(MAL, "pcre.patindex", "Database was compiled without PCRE support.");
1090 : #endif
1091 : }
1092 :
1093 : static str
1094 0 : PCREquote(str *ret, const char *const *val)
1095 : {
1096 0 : char *p;
1097 0 : const char *s = *val;
1098 :
1099 0 : *ret = p = GDKmalloc(strlen(s) * 2 + 1); /* certainly long enough */
1100 0 : if (p == NULL)
1101 0 : throw(MAL, "pcre.quote", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1102 : /* quote all non-alphanumeric ASCII characters (i.e. leave
1103 : non-ASCII and alphanumeric alone) */
1104 0 : while (*s) {
1105 0 : if (!((*s & 0x80) != 0 ||
1106 0 : ('a' <= *s && *s <= 'z') ||
1107 0 : ('A' <= *s && *s <= 'Z') || isdigit((unsigned char) *s)))
1108 0 : *p++ = '\\';
1109 0 : *p++ = *s++;
1110 : }
1111 0 : *p = 0;
1112 0 : return MAL_SUCCEED;
1113 : }
1114 :
1115 : static str
1116 6 : PCREsql2pcre(str *ret, const char *const *pat, const char *const *esc)
1117 : {
1118 6 : return sql2pcre(ret, *pat, *esc);
1119 : }
1120 :
1121 : static inline str
1122 6185 : choose_like_path(bool *use_re, bool *use_strcmp, bool *empty,
1123 : const char *pat, const char *esc)
1124 : {
1125 6185 : str res = MAL_SUCCEED;
1126 6185 : *use_re = false;
1127 6185 : *use_strcmp = false;
1128 6185 : *empty = false;
1129 :
1130 :
1131 11882 : if (strNil(pat) || strNil(esc)) {
1132 488 : *empty = true;
1133 : } else {
1134 5697 : if (!mnre_is_pattern_properly_escaped(pat, (unsigned char) *esc))
1135 5 : throw(MAL, "pcre.sql2pcre",
1136 : SQLSTATE(22019) ILLEGAL_ARGUMENT
1137 : ": (I)LIKE pattern must not end with escape character");
1138 5683 : if (is_strcmpable(pat, esc)) {
1139 854 : *use_re = true;
1140 854 : *use_strcmp = true;
1141 : } else {
1142 4829 : *use_re = true;
1143 : }
1144 : }
1145 : return res;
1146 : }
1147 :
1148 : static str
1149 234 : PCRElike_imp(bit *ret, const char *const *s, const char *const *pat,
1150 : const char *const *esc, const bit *isens)
1151 : {
1152 234 : str res = MAL_SUCCEED;
1153 234 : bool use_re = false, use_strcmp = false, empty = false;
1154 234 : struct RE *re = NULL;
1155 :
1156 234 : if ((res = choose_like_path(&use_re, &use_strcmp, &empty,
1157 : *pat, *esc)) != MAL_SUCCEED)
1158 : return res;
1159 :
1160 459 : MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp ?
1161 225 : "pcrelike: pattern matching using strcmp" : use_re ?
1162 : "pcrelike: pattern matching using RE" :
1163 : "pcrelike: pattern matching using pcre");
1164 :
1165 468 : if (strNil(*s) || empty) {
1166 0 : *ret = bit_nil;
1167 : } else {
1168 234 : if (use_strcmp) {
1169 9 : *ret = *isens ? GDKstrcasecmp(*s, *pat) == 0
1170 7 : : strcmp(*s, *pat) == 0;
1171 : } else {
1172 225 : if (!(re = mnre_create(*pat, *isens, (unsigned char) **esc)))
1173 0 : res = createException(MAL, "pcre.like4",
1174 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1175 : else
1176 225 : *ret = mnre_match(*s, re);
1177 : }
1178 : }
1179 :
1180 234 : if (re)
1181 225 : mnre_destroy(re);
1182 : return res;
1183 : }
1184 :
1185 : static str
1186 234 : PCRElike(bit *ret, const char *const *s, const char *const *pat,
1187 : const char *const *esc, const bit *isens)
1188 : {
1189 229 : return PCRElike_imp(ret, s, pat, esc, isens);
1190 : }
1191 :
1192 : static str
1193 5 : PCREnotlike(bit *ret, const char *const *s, const char *const *pat,
1194 : const char *const *esc, const bit *isens)
1195 : {
1196 5 : str tmp;
1197 5 : bit r;
1198 :
1199 5 : rethrow("str.not_like", tmp, PCRElike(&r, s, pat, esc, isens));
1200 5 : *ret = r == bit_nil ? bit_nil : !r;
1201 5 : return MAL_SUCCEED;
1202 : }
1203 :
1204 : static inline str
1205 5457 : mnre_like_build(struct RE **re, const char *pat, bool caseignore,
1206 : bool use_strcmp, uint32_t esc)
1207 : {
1208 5457 : if (!use_strcmp) {
1209 4613 : if (!(*re = mnre_create(pat, caseignore, esc)))
1210 0 : return createException(MAL, "pcre.re_like_build",
1211 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1212 : }
1213 : return MAL_SUCCEED;
1214 : }
1215 :
1216 : static inline bit
1217 6203 : mnre_like_proj_apply(const char *s, const struct RE *restrict re,
1218 : const char *pat,
1219 : bool caseignore, bool anti, bool use_strcmp)
1220 : {
1221 6203 : if (strNil(s))
1222 408 : return bit_nil;
1223 5795 : if (use_strcmp) {
1224 1139 : if (caseignore) {
1225 525 : if (anti)
1226 494 : return GDKstrcasecmp(s, pat) != 0;
1227 : else
1228 31 : return GDKstrcasecmp(s, pat) == 0;
1229 : } else {
1230 614 : if (anti)
1231 302 : return strcmp(s, pat) != 0;
1232 : else
1233 312 : return strcmp(s, pat) == 0;
1234 : }
1235 : } else {
1236 4656 : if (anti)
1237 136 : return !mnre_match(s, re);
1238 : else
1239 4520 : return mnre_match(s, re);
1240 : }
1241 : }
1242 :
/* Destroy the RE held in *re, if any, and reset the pointer to NULL. */
static inline void
mnre_like_clean(struct RE **re)
{
	if (*re == NULL)
		return;
	mnre_destroy(*re);
	*re = NULL;
}
1251 :
/*
 * Bulk (NOT) LIKE producing a bit BAT.
 *
 * MAL argument 0 is the result BAT id, argument 1 the input and
 * argument 2 the pattern; either of the latter two may be a BAT or a
 * scalar string, as determined from the argument types.
 *
 * esc    escape string for the pattern
 * isens  passed on as the case-ignore flag to the RE builder and the
 *        string comparison
 * not    when true the match outcome is inverted
 *
 * Nil inputs, nil patterns and a nil escape produce bit_nil entries.
 * Returns MAL_SUCCEED or an exception; on failure the partially built
 * result BAT is reclaimed.
 *
 * NOTE(review): the code dereferences b or pbn unconditionally in
 * places (e.g. BATcount(b ? b : pbn)), so it appears to assume that at
 * least one of the two arguments is a BAT -- confirm against the MAL
 * signatures.
 */
static str
BATPCRElike_imp(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci,
				const char *const *esc, const bit *isens, const bit *not)
{
	str msg = MAL_SUCCEED;
	BAT *b = NULL, *pbn = NULL, *bn = NULL;
	const char *input = NULL;
	bool use_re = false,
		use_strcmp = false,
		empty = false,
		isensitive = (bool) *isens,
		anti = (bool) *not,
		has_nil = false,
		input_is_a_bat = isaBatType(getArgType(mb, pci, 1)),
		pattern_is_a_bat = isaBatType(getArgType(mb, pci, 2));
	bat *r = getArgReference_bat(stk, pci, 0);
	BUN q = 0;
	bit *restrict ret = NULL;
	struct RE *mnre_simple = NULL;
	BATiter bi = (BATiter) { 0 }, pi;

	(void) cntxt;
	if (input_is_a_bat) {
		bat *bid = getArgReference_bat(stk, pci, 1);
		if (!(b = BATdescriptor(*bid))) {
			msg = createException(MAL, "batalgebra.batpcrelike3",
								  SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
			goto bailout;
		}
	}
	if (pattern_is_a_bat) {
		bat *pb = getArgReference_bat(stk, pci, 2);
		if (!(pbn = BATdescriptor(*pb))) {
			msg = createException(MAL, "batalgebra.batpcrelike3",
								  SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
			goto bailout;
		}
	}
	assert((!b || ATOMstorage(b->ttype) == TYPE_str)
		   && (!pbn || ATOMstorage(pbn->ttype) == TYPE_str));

	/* the result has one entry per row of the input BAT (or of the
	 * pattern BAT when the input is a scalar) */
	q = BATcount(b ? b : pbn);
	if (!(bn = COLnew(b ? b->hseqbase : pbn->hseqbase, TYPE_bit, q, TRANSIENT))) {
		msg = createException(MAL, "batalgebra.batpcrelike3",
							  SQLSTATE(HY013) MAL_MALLOC_FAIL);
		goto bailout;
	}
	ret = (bit *) Tloc(bn, 0);

	if (pattern_is_a_bat) {
		/* a different pattern per row: the evaluation path is chosen
		 * and the RE is built and destroyed for every row */
		pi = bat_iterator(pbn);
		if (b)
			bi = bat_iterator(b);
		else
			input = *getArgReference_str(stk, pci, 1);

		for (BUN p = 0; p < q; p++) {
			const char *next_input = b ? BUNtvar(bi, p) : input,
				*np = BUNtvar(pi, p);

			if ((msg = choose_like_path(&use_re, &use_strcmp, &empty,
										np, *esc)) != MAL_SUCCEED) {
				bat_iterator_end(&pi);
				if (b)
					bat_iterator_end(&bi);
				goto bailout;
			}

			if (empty) {
				ret[p] = bit_nil;
			} else {
				if ((msg = mnre_like_build(&mnre_simple, np, isensitive,
										   use_strcmp,
										   (unsigned char) **esc)) != MAL_SUCCEED) {
					bat_iterator_end(&pi);
					if (b)
						bat_iterator_end(&bi);
					goto bailout;
				}
				ret[p] = mnre_like_proj_apply(next_input, mnre_simple, np,
											  isensitive, anti, use_strcmp);
				mnre_like_clean(&mnre_simple);
			}
			has_nil |= is_bit_nil(ret[p]);
		}
		bat_iterator_end(&pi);
		if (b)
			bat_iterator_end(&bi);
	} else {
		/* a single scalar pattern: choose the path and build the RE
		 * once, then apply it to every row */
		const char *pat = *getArgReference_str(stk, pci, 2);
		if ((msg = choose_like_path(&use_re, &use_strcmp, &empty,
									pat, *esc)) != MAL_SUCCEED)
			goto bailout;

		bi = bat_iterator(b);
		MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp
							   ? "pcrelike: pattern matching using strcmp" :
							   use_re ? "pcrelike: pattern matching using RE" :
							   "pcrelike: pattern matching using pcre");

		if (empty) {
			for (BUN p = 0; p < q; p++)
				ret[p] = bit_nil;
			has_nil = true;
		} else {
			if ((msg = mnre_like_build(&mnre_simple, pat, isensitive, use_strcmp,
									   (unsigned char) **esc)) != MAL_SUCCEED) {
				bat_iterator_end(&bi);
				goto bailout;
			}
			for (BUN p = 0; p < q; p++) {
				const char *s = BUNtvar(bi, p);
				ret[p] = mnre_like_proj_apply(s, mnre_simple, pat, isensitive,
											  anti, use_strcmp);
				has_nil |= is_bit_nil(ret[p]);
			}
		}
		bat_iterator_end(&bi);
	}

  bailout:
	/* shared exit: release the RE, publish or drop the result and give
	 * back the input references */
	mnre_like_clean(&mnre_simple);
	if (bn && !msg) {
		BATsetcount(bn, q);
		bn->tnil = has_nil;
		bn->tnonil = !has_nil;
		/* ordering and key properties are only claimed for results of
		 * at most one row */
		bn->tkey = BATcount(bn) <= 1;
		bn->tsorted = BATcount(bn) <= 1;
		bn->trevsorted = BATcount(bn) <= 1;
		*r = bn->batCacheid;
		BBPkeepref(bn);
	} else if (bn)
		BBPreclaim(bn);
	BBPreclaim(b);
	BBPreclaim(pbn);
	return msg;
}
1389 :
1390 : static str
1391 537 : BATPCRElike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
1392 : {
1393 537 : const char *esc = *getArgReference_str(stk, pci, 3);
1394 537 : const bit *ci = getArgReference_bit(stk, pci, 4);
1395 537 : bit no = FALSE;
1396 :
1397 537 : return BATPCRElike_imp(cntxt, mb, stk, pci, &esc, ci, &no);
1398 : }
1399 :
1400 : static str
1401 139 : BATPCREnotlike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
1402 : {
1403 139 : const char *esc = *getArgReference_str(stk, pci, 3);
1404 139 : const bit *ci = getArgReference_bit(stk, pci, 4);
1405 139 : bit yes = TRUE;
1406 :
1407 139 : return BATPCRElike_imp(cntxt, mb, stk, pci, &esc, ci, &yes);
1408 : }
1409 :
/* scan select loop with or without candidates
 *
 * TEST is an expression over the current string value `v'; KEEP_NULLS
 * is an expression that, when true, also selects nil values.  Every
 * selected oid is stored in vals[cnt++].
 *
 * The macro is not self-contained: it expects the names b, s, ci, bi,
 * p, q, ncands, off, vals, cnt, anti, counter and qry_ctx to be in
 * scope at the point of use, as well as a label `bailout' that the
 * timeout check jumps to.
 *
 * Without a candidate list, or with a dense one, the oid range [p, q)
 * is walked directly; otherwise ncands oids are taken from ci using
 * canditer_next.  TEST is also stringified into the debug trace
 * message. */
#define pcrescanloop(TEST, KEEP_NULLS)									\
	do {																\
		TRC_DEBUG(ALGO,													\
				  "PCREselect(b=%s#"BUNFMT",anti=%d): "					\
				  "scanselect %s\n", BATgetId(b), BATcount(b),			\
				  anti, #TEST);											\
		if (!s || BATtdense(s)) {										\
			for (; p < q; p++) {										\
				GDK_CHECK_TIMEOUT(qry_ctx, counter,						\
								  GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
				const char *restrict v = BUNtvar(bi, p - off);			\
				if ((TEST) || ((KEEP_NULLS) && strNil(v)))				\
					vals[cnt++] = p;									\
			}															\
		} else {														\
			for (; p < ncands; p++) {									\
				GDK_CHECK_TIMEOUT(qry_ctx, counter,						\
								  GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
				oid o = canditer_next(ci);								\
				const char *restrict v = BUNtvar(bi, o - off);			\
				if ((TEST) || ((KEEP_NULLS) && strNil(v)))				\
					vals[cnt++] = o;									\
			}															\
		}																\
	} while (0)
1436 :
/*
 * Scan b and store the oids of the values that satisfy the (NOT) LIKE
 * test in bn.
 *
 * bn          result BAT; its tail is written directly through Tloc
 * b           string BAT being scanned
 * s, ci       optional candidate list and its initialised iterator
 * p, q        oid range to scan when there is no candidate list or
 *             it is dense
 * rcnt        receives the number of oids stored in bn
 * pat, esc    pattern and escape character
 * caseignore  compare case-insensitively
 * anti        select values that do NOT match
 * use_strcmp  compare to pat directly instead of building an RE
 * keep_nulls  also select nil values
 *
 * The eight scan loops differ only in the test expression, so that the
 * choice between the variants is made once, outside the loop.
 *
 * NOTE(review): bn's tail must be large enough for all selected oids;
 * the visible caller sizes it to ci->ncand.
 */
static str
mnre_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q,
				BUN *rcnt, const char *pat, bool caseignore, bool anti,
				bool use_strcmp, uint32_t esc, bool keep_nulls)
{
	BATiter bi = bat_iterator(b);
	BUN cnt = 0, ncands = ci->ncand;
	oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
	struct RE *re = NULL;
	str msg = MAL_SUCCEED;

	size_t counter = 0;
	QryCtx *qry_ctx = MT_thread_get_qry_ctx();

	/* leaves re NULL when use_strcmp is set */
	if ((msg = mnre_like_build(&re, pat, caseignore, use_strcmp,
							   esc)) != MAL_SUCCEED)
		goto bailout;

	if (use_strcmp) {
		if (caseignore) {
			if (anti)
				pcrescanloop(!strNil(v)
							 && GDKstrcasecmp(v, pat) != 0, keep_nulls);
			else
				pcrescanloop(!strNil(v)
							 && GDKstrcasecmp(v, pat) == 0, keep_nulls);
		} else {
			if (anti)
				pcrescanloop(!strNil(v) && strcmp(v, pat) != 0, keep_nulls);
			else
				pcrescanloop(!strNil(v) && strcmp(v, pat) == 0, keep_nulls);
		}
	} else {
		if (caseignore) {
			if (anti) {
				pcrescanloop(!strNil(v)
							 && !mnre_match(v, re), keep_nulls);
			} else {
				pcrescanloop(!strNil(v)
							 && mnre_match(v, re), keep_nulls);
			}
		} else {
			if (anti)
				pcrescanloop(!strNil(v)
							 && !mnre_match(v, re), keep_nulls);
			else
				pcrescanloop(!strNil(v)
							 && mnre_match(v, re), keep_nulls);
		}
	}

  bailout:
	/* reached on success, on build failure and from the timeout check
	 * inside the scan loops */
	bat_iterator_end(&bi);
	mnre_like_clean(&re);
	*rcnt = cnt;
	return msg;
}
1494 :
1495 : static str
1496 4146 : PCRElikeselect(bat *ret, const bat *bid, const bat *sid, const char *const *pat,
1497 : const char *const *esc, const bit *caseignore, const bit *anti)
1498 : {
1499 4146 : BAT *b, *s = NULL, *bn = NULL, *old_s = NULL;
1500 4146 : str msg = MAL_SUCCEED;
1501 4146 : bool use_re = false,
1502 4146 : use_strcmp = false,
1503 4146 : empty = false;
1504 4146 : bool with_strimps = false;
1505 4146 : bool with_strimps_anti = false;
1506 4146 : BUN p = 0, q = 0, rcnt = 0;
1507 4146 : struct canditer ci;
1508 :
1509 4146 : if ((b = BATdescriptor(*bid)) == NULL) {
1510 0 : msg = createException(MAL, "algebra.likeselect",
1511 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1512 0 : goto bailout;
1513 : }
1514 4146 : if (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL) {
1515 0 : msg = createException(MAL, "algebra.likeselect",
1516 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1517 0 : goto bailout;
1518 : }
1519 :
1520 4147 : assert(ATOMstorage(b->ttype) == TYPE_str);
1521 :
1522 4147 : if ((msg = choose_like_path(&use_re, &use_strcmp, &empty,
1523 : *pat, *esc)) != MAL_SUCCEED)
1524 0 : goto bailout;
1525 :
1526 4140 : if (empty) {
1527 0 : if (!(bn = BATdense(0, 0, 0)))
1528 0 : msg = createException(MAL, "algebra.likeselect",
1529 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1530 :
1531 0 : goto bailout;
1532 : }
1533 : /* Since the strimp pre-filtering of a LIKE query produces a superset of the actual result the complement of that
1534 : * set will necessarily reject some of the matching entries in the NOT LIKE query.
1535 : *
1536 : * In this case we run the PCRElikeselect as a LIKE query with strimps and return the complement of the result,
1537 : * taking extra care to not return NULLs. This currently means that we do not run strimps for NOT LIKE queries if
1538 : * the BAT contains NULLs.
1539 : */
1540 4140 : if (BAThasstrimps(b)) {
1541 24 : if (STRMPcreate(b, NULL) == GDK_SUCCEED) {
1542 24 : BAT *tmp_s = STRMPfilter(b, s, *pat, *anti);
1543 24 : if (tmp_s) {
1544 24 : old_s = s;
1545 24 : s = tmp_s;
1546 24 : if (!*anti)
1547 : with_strimps = true;
1548 : else
1549 0 : with_strimps_anti = true;
1550 : }
1551 : } else { /* If we cannot filter with the strimp just continue normally */
1552 0 : GDKclrerr();
1553 : }
1554 : }
1555 :
1556 :
1557 4145 : MT_thread_setalgorithm(use_strcmp
1558 4145 : ? (with_strimps ?
1559 : "pcrelike: pattern matching using strcmp with strimps"
1560 : : (with_strimps_anti ?
1561 : "pcrelike: pattern matching using strcmp with strimps anti"
1562 4145 : : "pcrelike: pattern matching using strcmp")) :
1563 4057 : use_re ? (with_strimps ?
1564 : "pcrelike: pattern matching using RE with strimps"
1565 : : (with_strimps_anti ?
1566 : "pcrelike: patterm matching using RE with strimps anti"
1567 : :
1568 : "pcrelike: pattern matching using RE"))
1569 : : (with_strimps ?
1570 : "pcrelike: pattern matching using pcre with strimps"
1571 : : (with_strimps_anti ?
1572 : "pcrelike: pattermatching using pcre with strimps anti"
1573 : : "pcrelike: pattern matching using pcre")));
1574 :
1575 4146 : canditer_init(&ci, b, s);
1576 4146 : if (!(bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT))) {
1577 0 : msg = createException(MAL, "algebra.likeselect",
1578 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1579 0 : goto bailout;
1580 : }
1581 :
1582 4146 : if (!s || BATtdense(s)) {
1583 600 : if (s) {
1584 3521 : assert(BATtdense(s));
1585 3521 : p = (BUN) s->tseqbase;
1586 3521 : q = p + BATcount(s);
1587 3521 : if ((oid) p < b->hseqbase)
1588 : p = b->hseqbase;
1589 3521 : if ((oid) q > b->hseqbase + BATcount(b))
1590 : q = b->hseqbase + BATcount(b);
1591 : } else {
1592 600 : p = b->hseqbase;
1593 600 : q = BATcount(b) + b->hseqbase;
1594 : }
1595 : }
1596 :
1597 4146 : msg = mnre_likeselect(bn, b, s, &ci, p, q, &rcnt, *pat, *caseignore, *anti
1598 536 : && !with_strimps_anti, use_strcmp,
1599 4146 : (unsigned char) **esc, with_strimps_anti);
1600 :
1601 4146 : if (!msg) { /* set some properties */
1602 4146 : BATsetcount(bn, rcnt);
1603 4146 : bn->tsorted = true;
1604 4146 : bn->trevsorted = bn->batCount <= 1;
1605 4146 : bn->tkey = true;
1606 4146 : bn->tnil = false;
1607 4146 : bn->tnonil = true;
1608 4146 : bn->tseqbase = rcnt == 0 ? 0 : rcnt == 1 ? *(const oid *) Tloc(bn, 0) : rcnt == b->batCount ? b->hseqbase : oid_nil;
1609 4146 : if (with_strimps_anti) {
1610 : /* Reverse the result taking into account the original candidate list. */
1611 : // BAT *rev = BATdiffcand(BATdense(b->hseqbase, 0, b->batCount), bn);
1612 0 : BAT *rev;
1613 0 : if (old_s) {
1614 0 : rev = BATdiffcand(old_s, bn);
1615 : #ifndef NDEBUG
1616 0 : BAT *is = BATintersectcand(old_s, bn);
1617 0 : if (is) {
1618 0 : assert(is->batCount == bn->batCount);
1619 0 : BBPreclaim(is);
1620 : }
1621 0 : assert(rev->batCount == old_s->batCount - bn->batCount);
1622 : #endif
1623 : }
1624 :
1625 : else
1626 0 : rev = BATnegcands(0, b->batCount, bn);
1627 : /* BAT *rev = BATnegcands(0, b->batCount, bn); */
1628 0 : BBPunfix(bn->batCacheid);
1629 0 : bn = rev;
1630 : }
1631 : }
1632 :
1633 :
1634 4146 : bailout:
1635 4146 : BBPreclaim(b);
1636 4146 : BBPreclaim(s);
1637 4146 : BBPreclaim(old_s);
1638 4146 : if (bn && !msg) {
1639 4146 : *ret = bn->batCacheid;
1640 4146 : BBPkeepref(bn);
1641 0 : } else if (bn)
1642 0 : BBPreclaim(bn);
1643 4145 : return msg;
1644 : }
1645 :
/* Store oid `o' at the end of BAT `b' and bump its count.  The macro
 * does no capacity check and evaluates `b' twice. */
#define APPEND(b, o)	(((oid *) (b)->theap->base)[(b)->batCount++] = (o))
/* Address of string number `x' of the side named by prefix `s': expects
 * variables <s>vars, <s>vals and <s>i to be in scope. */
#define VALUE(s, x)		(s##vars + VarHeapVal(s##vals, (x), s##i.width))
1648 :
/* nested loop implementation for PCRE join
 *
 * The outer loop walks the right-hand candidates; each right value vr
 * is used as the LIKE pattern.  For every non-nil pattern the matching
 * strategy is chosen, the RE is built if needed, and the inner loop
 * walks all left-hand candidates.  A left value is skipped when it is
 * nil, when STRCMP holds (string comparison path) or when MNRE_MATCH
 * holds (RE path); otherwise the pair (lo, ro) is appended to r1 (and
 * r2 when present), growing the result BATs on demand and downgrading
 * their sortedness/key/density properties as needed.
 *
 * The macro relies on the locals of the function it is expanded in
 * (rci, lci, ro, lo, vr, vl, nl, lastl, rskipped, newcap, re, msg,
 * esc, qry_ctx, r1, r2, the rbase/lbase offsets and the variables
 * needed by VALUE) and on a label `bailout'. */
#define pcre_join_loop(STRCMP, MNRE_MATCH)								\
	do {																\
		for (BUN ridx = 0; ridx < rci.ncand; ridx++) {					\
			ro = canditer_next(&rci);									\
			vr = VALUE(r, ro - rbase);									\
			nl = 0;														\
			use_re = use_strcmp = empty = false;						\
			if ((msg = choose_like_path(&use_re, &use_strcmp, &empty, vr, esc))) \
				goto bailout;											\
			if (!empty) {												\
				if ((msg = mnre_like_build(&re, vr, false, use_strcmp, (unsigned char) *esc)) != MAL_SUCCEED) \
					goto bailout;										\
				canditer_reset(&lci);									\
				TIMEOUT_LOOP_IDX_DECL(lidx, lci.ncand, qry_ctx) {		\
					lo = canditer_next(&lci);							\
					vl = VALUE(l, lo - lbase);							\
					if (strNil(vl)) {									\
						continue;										\
					} else {											\
						if (use_strcmp) {								\
							if (STRCMP)									\
								continue;								\
						} else {										\
							assert(re);									\
							if (MNRE_MATCH)								\
								continue;								\
						}												\
					}													\
					if (BATcount(r1) == BATcapacity(r1)) {				\
						newcap = BATgrows(r1);							\
						BATsetcount(r1, BATcount(r1));					\
						if (r2)											\
							BATsetcount(r2, BATcount(r2));				\
						if (BATextend(r1, newcap) != GDK_SUCCEED || (r2 && BATextend(r2, newcap) != GDK_SUCCEED)) { \
							msg = createException(MAL, "pcre.join", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
							goto bailout;								\
						}												\
						assert(!r2 || BATcapacity(r1) == BATcapacity(r2)); \
					}													\
					if (BATcount(r1) > 0) {								\
						if (lastl + 1 != lo)							\
							r1->tseqbase = oid_nil;						\
						if (nl == 0) {									\
							if (r2)										\
								r2->trevsorted = false;					\
							if (lastl > lo) {							\
								r1->tsorted = false;					\
								r1->tkey = false;						\
							} else if (lastl < lo) {					\
								r1->trevsorted = false;					\
							} else {									\
								r1->tkey = false;						\
							}											\
						}												\
					}													\
					APPEND(r1, lo);										\
					if (r2)												\
						APPEND(r2, ro);									\
					lastl = lo;											\
					nl++;												\
				}														\
				mnre_like_clean(&re);									\
				TIMEOUT_CHECK(qry_ctx,									\
							  GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
			}															\
			if (r2) {													\
				if (nl > 1) {											\
					r2->tkey = false;									\
					r2->tseqbase = oid_nil;								\
					r1->trevsorted = false;								\
				} else if (nl == 0) {									\
					rskipped = BATcount(r2) > 0;						\
				} else if (rskipped) {									\
					r2->tseqbase = oid_nil;								\
				}														\
			} else if (nl > 1) {										\
				r1->trevsorted = false;									\
			}															\
		}																\
	} while (0)
1730 :
1731 : static char *
1732 43 : pcrejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, const char *esc,
1733 : bit caseignore, bit anti)
1734 : {
1735 43 : struct canditer lci, rci;
1736 43 : const char *lvals, *rvals, *lvars, *rvars, *vl, *vr;
1737 43 : int rskipped = 0; /* whether we skipped values in r */
1738 43 : oid lbase, rbase, lo, ro, lastl = 0; /* last value inserted into r1 */
1739 43 : BUN nl, newcap;
1740 43 : char *msg = MAL_SUCCEED;
1741 43 : struct RE *re = NULL;
1742 43 : bool use_re = false,
1743 43 : use_strcmp = false,
1744 43 : empty = false;
1745 43 : lng t0 = 0;
1746 :
1747 43 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
1748 :
1749 43 : TRC_DEBUG_IF(ALGO) t0 = GDKusec();
1750 :
1751 129 : assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
1752 43 : assert(ATOMtype(l->ttype) == TYPE_str);
1753 :
1754 43 : BAT *ol = NULL, *or = NULL;
1755 43 : if (caseignore) {
1756 7 : ol = l;
1757 7 : or = r;
1758 7 : l = BATcasefold(l, NULL);
1759 7 : r = BATcasefold(r, NULL);
1760 7 : if (l == NULL || r == NULL) {
1761 0 : BBPreclaim(l);
1762 0 : BBPreclaim(r);
1763 0 : throw(MAL, "pcre.join", GDK_EXCEPTION);
1764 : }
1765 : }
1766 :
1767 43 : canditer_init(&lci, l, sl);
1768 43 : canditer_init(&rci, r, sr);
1769 :
1770 43 : BATiter li = bat_iterator(l);
1771 43 : BATiter ri = bat_iterator(r);
1772 43 : lbase = l->hseqbase;
1773 43 : rbase = r->hseqbase;
1774 43 : lvals = (const char *) li.base;
1775 43 : rvals = (const char *) ri.base;
1776 43 : assert(ri.vh && r->ttype);
1777 43 : lvars = li.vh->base;
1778 43 : rvars = ri.vh->base;
1779 :
1780 43 : r1->tkey = true;
1781 43 : r1->tsorted = true;
1782 43 : r1->trevsorted = true;
1783 43 : r1->tnil = false;
1784 43 : r1->tnonil = true;
1785 43 : if (r2) {
1786 26 : r2->tkey = true;
1787 26 : r2->tsorted = true;
1788 26 : r2->trevsorted = true;
1789 26 : r2->tnil = false;
1790 26 : r2->tnonil = true;
1791 : }
1792 :
1793 43 : if (anti) {
1794 642 : pcre_join_loop(strcmp(vl, vr) == 0, mnre_match(vl, re));
1795 : } else {
1796 456 : pcre_join_loop(strcmp(vl, vr) != 0, !mnre_match(vl, re));
1797 : }
1798 43 : bat_iterator_end(&li);
1799 43 : bat_iterator_end(&ri);
1800 43 : if (ol) {
1801 7 : BBPreclaim(l);
1802 7 : BBPreclaim(r);
1803 7 : l = ol;
1804 7 : r = or;
1805 : }
1806 :
1807 43 : assert(!r2 || BATcount(r1) == BATcount(r2));
1808 : /* also set other bits of heap to correct value to indicate size */
1809 43 : BATsetcount(r1, BATcount(r1));
1810 43 : if (r2)
1811 26 : BATsetcount(r2, BATcount(r2));
1812 43 : if (BATcount(r1) > 0) {
1813 30 : if (BATtdense(r1))
1814 7 : r1->tseqbase = ((oid *) r1->theap->base)[0];
1815 30 : if (r2 && BATtdense(r2))
1816 14 : r2->tseqbase = ((oid *) r2->theap->base)[0];
1817 : } else {
1818 13 : r1->tseqbase = 0;
1819 13 : if (r2)
1820 6 : r2->tseqbase = 0;
1821 : }
1822 :
1823 20 : if (r2)
1824 26 : TRC_DEBUG(ALGO,
1825 : "l=%s#" BUNFMT "[%s]%s%s,"
1826 : "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
1827 : "sr=%s#" BUNFMT "%s%s -> "
1828 : "%s#" BUNFMT "%s%s,%s#" BUNFMT "%s%s (" LLFMT " usec)\n",
1829 : BATgetId(l), BATcount(l), ATOMname(l->ttype),
1830 : l->tsorted ? "-sorted" : "",
1831 : l->trevsorted ? "-revsorted" : "",
1832 : BATgetId(r), BATcount(r), ATOMname(r->ttype),
1833 : r->tsorted ? "-sorted" : "",
1834 : r->trevsorted ? "-revsorted" : "",
1835 : sl ? BATgetId(sl) : "NULL", sl ? BATcount(sl) : 0,
1836 : sl && sl->tsorted ? "-sorted" : "",
1837 : sl && sl->trevsorted ? "-revsorted" : "",
1838 : sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
1839 : sr && sr->tsorted ? "-sorted" : "",
1840 : sr && sr->trevsorted ? "-revsorted" : "",
1841 : BATgetId(r1), BATcount(r1),
1842 : r1->tsorted ? "-sorted" : "",
1843 : r1->trevsorted ? "-revsorted" : "",
1844 : BATgetId(r2), BATcount(r2),
1845 : r2->tsorted ? "-sorted" : "",
1846 : r2->trevsorted ? "-revsorted" : "", GDKusec() - t0);
1847 : else
1848 17 : TRC_DEBUG(ALGO,
1849 : "l=%s#" BUNFMT "[%s]%s%s,"
1850 : "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
1851 : "sr=%s#" BUNFMT "%s%s -> "
1852 : "%s#" BUNFMT "%s%s (" LLFMT " usec)\n",
1853 : BATgetId(l), BATcount(l), ATOMname(l->ttype),
1854 : l->tsorted ? "-sorted" : "",
1855 : l->trevsorted ? "-revsorted" : "",
1856 : BATgetId(r), BATcount(r), ATOMname(r->ttype),
1857 : r->tsorted ? "-sorted" : "",
1858 : r->trevsorted ? "-revsorted" : "",
1859 : sl ? BATgetId(sl) : "NULL", sl ? BATcount(sl) : 0,
1860 : sl && sl->tsorted ? "-sorted" : "",
1861 : sl && sl->trevsorted ? "-revsorted" : "",
1862 : sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
1863 : sr && sr->tsorted ? "-sorted" : "",
1864 : sr && sr->trevsorted ? "-revsorted" : "",
1865 : BATgetId(r1), BATcount(r1),
1866 : r1->tsorted ? "-sorted" : "",
1867 : r1->trevsorted ? "-revsorted" : "", GDKusec() - t0);
1868 : return MAL_SUCCEED;
1869 :
1870 0 : bailout:
1871 0 : bat_iterator_end(&li);
1872 0 : bat_iterator_end(&ri);
1873 0 : mnre_like_clean(&re);
1874 0 : assert(msg != MAL_SUCCEED);
1875 : return msg;
1876 : }
1877 :
1878 : static str
1879 43 : PCREjoin(bat *r1, bat *r2, bat lid, bat rid, bat slid, bat srid, bat elid,
1880 : bat ciid, bit anti)
1881 : {
1882 43 : BAT *left = NULL, *right = NULL, *escape = NULL, *caseignore = NULL,
1883 43 : *candleft = NULL, *candright = NULL;
1884 43 : BAT *result1 = NULL, *result2 = NULL;
1885 43 : char *msg = MAL_SUCCEED;
1886 43 : const char *esc = "";
1887 43 : bit ci;
1888 43 : BATiter bi;
1889 :
1890 43 : if ((left = BATdescriptor(lid)) == NULL)
1891 0 : goto fail;
1892 43 : if ((right = BATdescriptor(rid)) == NULL)
1893 0 : goto fail;
1894 43 : if ((escape = BATdescriptor(elid)) == NULL)
1895 0 : goto fail;
1896 43 : if ((caseignore = BATdescriptor(ciid)) == NULL)
1897 0 : goto fail;
1898 43 : if (!is_bat_nil(slid) && (candleft = BATdescriptor(slid)) == NULL)
1899 0 : goto fail;
1900 43 : if (!is_bat_nil(srid) && (candright = BATdescriptor(srid)) == NULL)
1901 0 : goto fail;
1902 43 : result1 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
1903 43 : if (r2)
1904 26 : result2 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
1905 43 : if (!result1 || (r2 && !result2)) {
1906 0 : msg = createException(MAL, "pcre.join",
1907 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1908 0 : goto fail;
1909 : }
1910 43 : result1->tnil = false;
1911 43 : result1->tnonil = true;
1912 43 : result1->tkey = true;
1913 43 : result1->tsorted = true;
1914 43 : result1->trevsorted = true;
1915 43 : result1->tseqbase = 0;
1916 43 : if (r2) {
1917 26 : result2->tnil = false;
1918 26 : result2->tnonil = true;
1919 26 : result2->tkey = true;
1920 26 : result2->tsorted = true;
1921 26 : result2->trevsorted = true;
1922 26 : result2->tseqbase = 0;
1923 : }
1924 43 : if (BATcount(escape) != 1) {
1925 0 : msg = createException(MAL, "pcre.join",
1926 : SQLSTATE(42000)
1927 : "At the moment, only one value is allowed for the escape input at pcre join");
1928 0 : goto fail;
1929 : }
1930 43 : if (BATcount(caseignore) != 1) {
1931 0 : msg = createException(MAL, "pcre.join",
1932 : SQLSTATE(42000)
1933 : "At the moment, only one value is allowed for the case ignore input at pcre join");
1934 0 : goto fail;
1935 : }
1936 43 : bi = bat_iterator(caseignore);
1937 43 : ci = *(bit *) BUNtloc(bi, 0);
1938 43 : bat_iterator_end(&bi);
1939 43 : bi = bat_iterator(escape);
1940 43 : esc = BUNtvar(bi, 0);
1941 43 : msg = pcrejoin(result1, result2, left, right, candleft, candright, esc, ci,
1942 : anti);
1943 43 : bat_iterator_end(&bi);
1944 43 : if (msg)
1945 0 : goto fail;
1946 43 : *r1 = result1->batCacheid;
1947 43 : BBPkeepref(result1);
1948 43 : if (r2) {
1949 26 : *r2 = result2->batCacheid;
1950 26 : BBPkeepref(result2);
1951 : }
1952 43 : BBPunfix(left->batCacheid);
1953 43 : BBPunfix(right->batCacheid);
1954 43 : BBPreclaim(escape);
1955 43 : BBPreclaim(caseignore);
1956 43 : BBPreclaim(candleft);
1957 43 : BBPreclaim(candright);
1958 : return MAL_SUCCEED;
1959 :
1960 0 : fail:
1961 0 : BBPreclaim(left);
1962 0 : BBPreclaim(right);
1963 0 : BBPreclaim(escape);
1964 0 : BBPreclaim(caseignore);
1965 0 : BBPreclaim(candleft);
1966 0 : BBPreclaim(candright);
1967 0 : BBPreclaim(result1);
1968 0 : BBPreclaim(result2);
1969 0 : if (msg)
1970 : return msg;
1971 0 : throw(MAL, "pcre.join", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1972 : }
1973 :
1974 : static str
1975 26 : LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *elid,
1976 : const bat *cid, const bat *slid, const bat *srid,
1977 : const bit *nil_matches, const lng *estimate, const bit *anti)
1978 : {
1979 26 : (void) nil_matches;
1980 26 : (void) estimate;
1981 26 : return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
1982 26 : *elid, *cid, *anti);
1983 : }
1984 :
/*
 * LIKEjoin1 -- implementation registered below as "algebra.likejoin" in its
 * single-result form.
 *
 * Forwards to PCREjoin() with NULL as the second result pointer, so only
 * r1 is filled in (PCREjoin() stores through r2 only when it is non-NULL).
 *
 *   r1           handed to PCREjoin() unchanged; it stores the result bat
 *                id through it
 *   lid, rid     ids of the two input bats (left and right side of the join)
 *   elid, cid    ids of the escape and case-ignore bats
 *   slid, srid   ids of the candidate-list bats; either pointer may be NULL
 *   nil_matches  accepted but not used
 *   estimate     accepted but not used
 *   anti         forwarded unchanged to PCREjoin()
 *
 * Returns whatever PCREjoin() returns.
 */
1985 : static str
1986 17 : LIKEjoin1(bat *r1, const bat *lid, const bat *rid, const bat *elid,
1987 : const bat *cid, const bat *slid, const bat *srid,
1988 : const bit *nil_matches, const lng *estimate, const bit *anti)
1989 : {
/* nil_matches and estimate are part of the signature but are ignored here */
1990 17 : (void) nil_matches;
1991 17 : (void) estimate;
/* NULL second result pointer; a NULL candidate-list pointer becomes bat id 0
 * (presumably "no candidate list" -- confirm against PCREjoin) */
1992 17 : return PCREjoin(r1, NULL, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
1993 17 : *elid, *cid, *anti);
1994 : }
1995 :
1996 : #include "mel.h"
/*
 * Atom table handed to mal_module() when the "pcre" module is registered.
 * It holds a single atom named "pcre", followed by an entry with
 * .cmp=NULL (presumably the end-of-table marker expected by mel.h --
 * confirm there).
 */
1997 : mel_atom pcre_init_atoms[] = {
1998 : { .name="pcre", }, { .cmp=NULL }
1999 : };
/*
 * Function table handed to mal_module() when the "pcre" module is
 * registered.
 *
 * Each entry is written with the command(...) or pattern(...) macro and
 * lists, in order: the MAL module name, the MAL function name, the C
 * implementation, a flag that is `false` for every entry here (its meaning
 * is defined by the macros, not visible in this file section), the help
 * text, and the signature as args(R, N, ...).  In every entry below R
 * equals the number of leading unnamed (result) arguments and N the total
 * number of arg()/batarg() items.
 *
 * Functions are registered in the "pcre", "str", "batpcre", "algebra" and
 * "batalgebra" modules; "str.replace" shares its implementation with
 * "pcre.replace".  The table is closed by an entry with .imp=NULL.
 */
2000 : mel_func pcre_init_funcs[] = {
2001 : command("pcre", "index", PCREindex, false, "match a pattern, return matched position (or 0 when not found)", args(1,3, arg("",int),arg("pat",pcre),arg("s",str))),
2002 : command("pcre", "match", PCREmatch, false, "Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
2003 : command("pcre", "imatch", PCREimatch, false, "Caseless Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
2004 : command("pcre", "patindex", PCREpatindex, false, "Location of the first POSIX pattern matching against a string", args(1,3, arg("",int),arg("pat",str),arg("s",str))),
/* NOTE(review): the help text of the two replace functions below is
 * internally inconsistent: its first sentence names the flags 'i', 'm',
 * 's' and 'x', the per-flag list describes 'e', 'i', 'm' and 's' (no 'x'),
 * and the closing sentence allows 'e', 'i', 'm', 's' and 'x'.  Confirm the
 * accepted flag set against the implementation before rewording. */
2005 : command("pcre", "replace", PCREreplace_wrap, false, "Replace _all_ matches of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
2006 : command("pcre", "replace_first", PCREreplacefirst_wrap, false, "Replace _the first_ match of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
2007 : command("pcre", "pcre_quote", PCREquote, false, "Return a PCRE pattern string that matches the argument exactly.", args(1,2, arg("",str),arg("s",str))),
2008 : command("pcre", "sql2pcre", PCREsql2pcre, false, "Convert a SQL like pattern with the given escape character into a PCRE pattern.", args(1,3, arg("",str),arg("pat",str),arg("esc",str))),
/* same implementation as "pcre.replace", registered without help text */
2009 : command("str", "replace", PCREreplace_wrap, false, "", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
2010 : command("batpcre", "replace", PCREreplace_bat_wrap, false, "", args(1,5, batarg("",bit),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
2011 : command("batpcre", "replace_first", PCREreplacefirst_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
2012 : command("algebra", "like", PCRElike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2013 : command("algebra", "not_like", PCREnotlike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
/* batalgebra.like / batalgebra.not_like: three signatures each, covering
 * bat/scalar, scalar/bat and bat/bat combinations of "s" and "pat" */
2014 : pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2015 : pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2016 : pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2017 : pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2018 : pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2019 : pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2020 : command("algebra", "likeselect", PCRElikeselect, false, "Select all head values of the first input BAT for which the\ntail value is \"like\" the given (SQL-style) pattern and for\nwhich the head value occurs in the tail of the second input\nBAT.\nInput is a dense-headed BAT, output is a dense-headed BAT with in\nthe tail the head value of the input BAT for which the\nrelationship holds. The output BAT is sorted on the tail value.", args(1,7, batarg("",oid),batarg("b",str),batarg("s",oid),arg("pat",str),arg("esc",str),arg("caseignore",bit),arg("anti",bit))),
/* NOTE(review): the help text below says "doing a case sensitive match",
 * yet the signature takes a "caseignore" bat -- the text looks stale;
 * confirm before rewording. */
2021 : command("algebra", "likejoin", LIKEjoin, false, "Join the string bat L with the pattern bat R\nwith optional candidate lists SL and SR using pattern escape string ESC\nand doing a case sensitive match.\nThe result is two aligned bats with oids of matching rows.", args(2,11, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
/* NOTE(review): the help text below refers to "LIKEjoin_esc", a name that
 * does not occur in this part of the file; the two-result counterpart
 * registered just above is LIKEjoin -- confirm and update. */
2022 : command("algebra", "likejoin", LIKEjoin1, false, "The same as LIKEjoin_esc, but only produce one output", args(1,10,batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
2023 : { .imp=NULL }
2024 : };
2025 : #include "mal_import.h"
/*
 * Module start-up hook: init_pcre_mal registers the "pcre" module by
 * passing the atom table and the function table to mal_module().
 *
 * LIB_STARTUP_FUNC is a macro defined elsewhere; presumably it arranges
 * for the function to run when the library is loaded -- confirm in its
 * definition.
 *
 * With MSVC, `read` is first undefined so that it can be used as the
 * section attribute keyword in the #pragma below (presumably some header
 * defines `read` as a macro), and the ".CRT$XCU" section is declared
 * (presumably required by the MSVC expansion of LIB_STARTUP_FUNC).
 */
2026 : #ifdef _MSC_VER
2027 : #undef read
2028 : #pragma section(".CRT$XCU",read)
2029 : #endif
2030 308 : LIB_STARTUP_FUNC(init_pcre_mal)
2031 308 : { mal_module("pcre", pcre_init_atoms, pcre_init_funcs); }
|