Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024, 2025 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * N. Nes
15 : * PCRE library interface
16 : * The PCRE library is a set of functions that implement regular
17 : * expression pattern matching using the same syntax and semantics as Perl,
18 : * with just a few differences. The current implementation of PCRE
19 : * (release 4.x) corresponds approximately with Perl 5.8, including support
20 : * for UTF-8 encoded strings. However, this support has to be
21 : * explicitly enabled; it is not the default.
22 : *
23 : * ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
24 : */
25 : #include "monetdb_config.h"
26 : #include <string.h>
27 :
28 : #include "mal.h"
29 : #include "mal_client.h"
30 : #include "mal_interpreter.h"
31 : #include "mal_exception.h"
32 :
33 : #include <wchar.h>
34 : #include <wctype.h>
35 :
36 : #ifdef HAVE_LIBPCRE
37 : #include <pcre.h>
38 : #ifndef PCRE_STUDY_JIT_COMPILE
39 : /* old library version on e.g. EPEL 6 */
40 : #define pcre_free_study(x) pcre_free(x)
41 : #define PCRE_STUDY_JIT_COMPILE 0
42 : #endif
43 : #define JIT_COMPILE_MIN 1024 /* when to try JIT compilation of patterns */
44 :
45 : #else
46 :
47 : #include <regex.h>
48 :
49 : typedef regex_t pcre;
50 : #endif
51 :
/* One segment of a compiled SQL LIKE pattern.  A pattern is a singly
 * linked list of these segments; the current implementation assumes
 * simple %keyword% [keyw%]* patterns. */
struct RE {
	char *k;			/* literal text this segment has to match */
	bool search:1;		/* segment was preceded by a '%' */
	bool atend:1;		/* no wildcard follows this segment */
	bool case_ignore:1;	/* compare without regard to case */
	size_t skip;		/* number of codepoints to skip before matching */
	size_t len;			/* number of bytes in string */
	size_t ulen;		/* number of codepoints in string */
	struct RE *n;		/* next segment, NULL for the last one */
};
61 :
62 : /* We cannot use strcasecmp and strncasecmp since they work byte for
63 : * byte and don't deal with multibyte encodings (such as UTF-8). */
64 :
/* Check that the pattern does not end in the middle of an escape
 * sequence.  Returns true for a NULL pattern and for any pattern in
 * which every unescaped occurrence of esc is followed by another byte;
 * returns false when the last byte is a dangling escape character. */
static inline bool
mnre_is_pattern_properly_escaped(const char *pat, unsigned char esc)
{
	if (pat == NULL)
		return true;

	bool pending = false;		/* previous byte was an unescaped esc */
	for (const char *cp = pat; *cp != '\0'; cp++) {
		/* a byte following an escape is consumed by it; otherwise an
		 * esc byte opens a new escape sequence */
		pending = !pending && (unsigned char) *cp == esc;
	}
	return !pending;
}
82 :
/* Tell whether a LIKE pattern can be handled by a plain string
 * comparison: it must not contain a wildcard character ('%' or '_')
 * and the escape string (when there is a non-empty, non-nil one) must
 * not occur in it. */
static inline bool
is_strcmpable(const char *pat, const char *esc)
{
	const size_t first_wildcard = strcspn(pat, "%_");

	if (pat[first_wildcard] != '\0')
		return false;
	if (strlen(esc) == 0)
		return true;
	if (strNil(esc))
		return true;
	return strstr(pat, esc) == NULL;
}
93 :
94 : /* Match regular expression by comparing bytes.
95 : */
96 : static inline bool
97 376693 : mnre_match(const char *restrict s, const struct RE *restrict pattern)
98 : {
99 376693 : const struct RE *r;
100 :
101 434284 : for (r = pattern; r; r = r->n) {
102 400581 : for (size_t i = 0; i < r->skip; s++) {
103 22615 : if (*s == 0)
104 : return false;
105 22759 : i += (*s & 0xC0) != 0x80;
106 : }
107 377966 : if (r->search) {
108 151271 : if (r->atend) {
109 : /* we're searching for a string at the end, so just skip
110 : * over everything and just compare with the tail of the
111 : * haystack */
112 22169 : size_t slen = strlen(s);
113 22169 : if (slen < r->ulen) {
114 : /* remaining string too short: each codepoint
115 : * requires at least one byte */
116 : return false;
117 : }
118 22413 : const char *e = s + slen;
119 22413 : if (!r->case_ignore) {
120 22331 : if (slen < r->len) {
121 : /* remaining string is too short to match */
122 : return false;
123 : }
124 22468 : e -= r->len;
125 22468 : if ((*e & 0xC0) == 0x80) {
126 : /* not at start of a Unicode character, so
127 : * cannot match (this test not strictly
128 : * required: the strcmp should also return
129 : * unequal) */
130 : return false;
131 : }
132 22976 : return strcmp(e, r->k) == 0;
133 : }
134 : size_t ulen = r->ulen;
135 363 : while (e > s && ulen != 0) {
136 281 : ulen -= (*--e & 0xC0) != 0x80;
137 : }
138 : /* ulen != 0 means remaining string is too short */
139 144 : return ulen == 0 && GDKstrcasecmp(e, r->k) == 0;
140 : }
141 : /* in case we have a pattern consisting of % followed by _,
142 : * we need to backtrack, so use recursion; here we know we
143 : * have the %, look for an _ in the rest of the pattern
144 : * (note %_ and _% are equivalent and is taken care of by
145 : * the pattern construction in mnre_create) */
146 134125 : for (const struct RE *p = r->n; p; p = p->n) {
147 7165 : if (p->skip != 0) {
148 2142 : struct RE pat = *r;
149 2142 : pat.search = false;
150 2142 : pat.skip = 0;
151 160183 : do {
152 160183 : if (mnre_match(s, &pat))
153 : return true;
154 159980 : do
155 159980 : s++;
156 159996 : while (*s && (*s & 0xC0) == 0x80);
157 159996 : } while (*s != 0);
158 : return false;
159 : }
160 : }
161 : }
162 353655 : if (r->k[0] == 0 && (r->search || *s == 0))
163 : return true;
164 353613 : if (r->case_ignore) {
165 11394 : for (;;) {
166 11394 : if (r->search && (s = GDKstrcasestr(s, r->k)) == NULL)
167 : return false;
168 3735 : if (*s == '\0')
169 : return false;
170 : /* in "atend" comparison, compare whole string, else
171 : * only part */
172 3794 : if ((!r->search || r->atend) &&
173 59 : (r->atend ? GDKstrcasecmp(s, r->k) : GDKstrncasecmp(s, r->k, SIZE_MAX, r->len)) != 0) {
174 : /* no match */
175 22 : if (!r->search)
176 : return false;
177 : /* try again with next character */
178 0 : do
179 0 : s++;
180 0 : while (*s != '\0' && (*s & 0xC0) == 0x80);
181 0 : continue;
182 : }
183 : /* match; find end of match by counting codepoints */
184 58479 : for (size_t i = 0; *s && i < r->ulen; s++)
185 54766 : i += (*s & 0xC0) != 0x80;
186 : break;
187 : }
188 : } else {
189 342219 : for (;;) {
190 342219 : if (r->search && (s = strstr(s, r->k)) == NULL)
191 : return false;
192 243517 : if (*s == '\0')
193 : return false;
194 : /* in "atend" comparison, include NUL byte in the compare */
195 243034 : if ((!r->search || r->atend) &&
196 191694 : strncmp(s, r->k, r->len + r->atend) != 0) {
197 : /* no match */
198 189156 : if (!r->search)
199 : return false;
200 : /* try again with next character: have search start
201 : * after current first byte */
202 0 : if ((s = strchr(s + 1, r->k[0])) == NULL)
203 : return false;
204 0 : continue;
205 : }
206 : /* match */
207 53878 : s += r->len;
208 53878 : break;
209 : }
210 : }
211 : }
212 : return true;
213 : }
214 :
215 : static void
216 14310 : mnre_destroy(struct RE *p)
217 : {
218 14310 : if (p) {
219 14310 : GDKfree(p->k);
220 15325 : do {
221 15325 : struct RE *n = p->n;
222 :
223 15325 : GDKfree(p);
224 15319 : p = n;
225 15319 : } while (p);
226 : }
227 14326 : }
228 :
229 : /* Create a linked list of RE structures. Depending on the
230 : * caseignore and the ascii_pattern flags, the w
231 : * (if caseignore == true && ascii_pattern == false) or the k
232 : * (in every other case) field is used. These in the first
233 : * structure are allocated, whereas in all subsequent
234 : * structures the fields point into the allocated buffer of
235 : * the first.
236 : */
237 : static struct RE *
238 14315 : mnre_create(const char *pat, bool caseignore, uint32_t esc)
239 : {
240 14315 : struct RE *r = GDKmalloc(sizeof(struct RE)), *n = r;
241 14333 : bool escaped = false;
242 14333 : char *p, *q;
243 :
244 14333 : if (r == NULL)
245 : return NULL;
246 14333 : *r = (struct RE) {
247 : .atend = true,
248 : .case_ignore = caseignore,
249 : };
250 :
251 21132 : for (;;) {
252 21132 : if (esc != '%' && *pat == '%') {
253 6545 : pat++; /* skip % */
254 6545 : r->search = true;
255 14587 : } else if (esc != '_' && *pat == '_') {
256 254 : pat++;
257 254 : r->skip++;
258 : } else {
259 : break;
260 : }
261 : }
262 14333 : if ((p = GDKstrdup(pat)) == NULL) {
263 0 : GDKfree(r);
264 0 : return NULL;
265 : }
266 :
267 14322 : r->k = p;
268 14322 : q = p;
269 104410 : while (*p) {
270 90106 : if (escaped) {
271 13119 : *q++ = *p;
272 13119 : n->len++;
273 13119 : n->ulen += (*p & 0xC0) != 0x80;
274 13119 : escaped = false;
275 76987 : } else if ((unsigned char) *p == esc) {
276 : escaped = true;
277 63870 : } else if (*p == '%' || *p == '_') {
278 14223 : n->atend = false;
279 14223 : bool search = false;
280 14223 : size_t skip = 0;
281 42831 : for (;;) {
282 28527 : if (*p == '_')
283 782 : skip++;
284 27745 : else if (*p == '%')
285 : search = true;
286 : else
287 : break;
288 14304 : p++;
289 : }
290 14223 : if (*p || skip != 0) {
291 1002 : n = n->n = GDKmalloc(sizeof(struct RE));
292 984 : if (n == NULL)
293 0 : goto bailout;
294 984 : *n = (struct RE) {
295 : .search = search,
296 : .atend = true,
297 : .skip = skip,
298 : .k = p,
299 : .case_ignore = caseignore,
300 : };
301 : }
302 14205 : *q = 0;
303 14205 : q = p;
304 14205 : continue; /* skip increment, we already did it */
305 : } else {
306 49647 : *q++ = *p;
307 49647 : n->len++;
308 49647 : n->ulen += (*p & 0xC0) != 0x80;
309 : }
310 75883 : p++;
311 : }
312 14304 : *q = 0;
313 14304 : return r;
314 0 : bailout:
315 0 : mnre_destroy(r);
316 0 : return NULL;
317 : }
318 :
319 : #ifdef HAVE_LIBPCRE
320 : static str
321 25 : pcre_compile_wrap(pcre **res, const char *pattern, bit insensitive)
322 : {
323 25 : pcre *r;
324 25 : const char *err_p = NULL;
325 25 : int errpos = 0;
326 25 : int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE;
327 25 : if (insensitive)
328 0 : options |= PCRE_CASELESS;
329 :
330 25 : if ((r = pcre_compile(pattern, options, &err_p, &errpos, NULL)) == NULL) {
331 0 : throw(MAL, "pcre.compile", OPERATION_FAILED
332 : " with\n'%s'\nat %d in\n'%s'.\n", err_p, errpos, pattern);
333 : }
334 25 : *res = r;
335 25 : return MAL_SUCCEED;
336 : }
337 : #endif
338 :
/* maximum number of back references and quoted \ or $ in replacement string */
#define MAX_NR_REFS 20

/* One back reference (or doubled \ or $) found in a replacement string. */
struct backref {
	int idx;		/* capture group number; INT_MAX for a doubled \ or $ */
	int start;		/* offset in the replacement string of the \ or $ */
	int end;		/* offset just past the reference */
};
347 :
348 : #ifdef HAVE_LIBPCRE
349 : /* fill in parameter backrefs (length maxrefs) with information about
350 : * back references in the replacement string; a back reference is a
351 : * dollar or backslash followed by a number */
352 : static int
353 140 : parse_replacement(const char *replacement, int len_replacement,
354 : struct backref *backrefs, int maxrefs)
355 : {
356 140 : int nbackrefs = 0;
357 :
358 209 : for (int i = 0; i < len_replacement && nbackrefs < maxrefs; i++) {
359 70 : if (replacement[i] == '$' || replacement[i] == '\\') {
360 11 : char *endptr;
361 11 : backrefs[nbackrefs].idx = strtol(replacement + i + 1, &endptr, 10);
362 10 : if (endptr > replacement + i + 1) {
363 10 : int k = (int) (endptr - (replacement + i + 1));
364 10 : backrefs[nbackrefs].start = i;
365 10 : backrefs[nbackrefs].end = i + k + 1;
366 10 : nbackrefs++;
367 0 : } else if (replacement[i] == replacement[i + 1]) {
368 : /* doubled $ or \, we must copy just one to the output */
369 0 : backrefs[nbackrefs].idx = INT_MAX; /* impossible value > 0 */
370 0 : backrefs[nbackrefs].start = i;
371 0 : backrefs[nbackrefs].end = i + 1;
372 0 : i++; /* don't look at second $ or \ again */
373 0 : nbackrefs++;
374 : }
375 : /* else: $ or \ followed by something we don't recognize,
376 : * so just leave it */
377 : }
378 : }
379 139 : return nbackrefs;
380 : }
381 :
382 : static char *
383 51089 : single_replace(pcre *pcre_code, pcre_extra *extra,
384 : const char *origin_str, int len_origin_str,
385 : int exec_options, int *ovector, int ovecsize,
386 : const char *replacement, int len_replacement,
387 : struct backref *backrefs, int nbackrefs,
388 : bool global, char *result, int *max_result)
389 : {
390 51089 : int offset = 0;
391 51089 : int len_result = 0;
392 51089 : int addlen;
393 51089 : int empty_match_correction = 0;
394 191007 : char *tmp;
395 :
396 191007 : do {
397 191007 : int j = pcre_exec(pcre_code, extra, origin_str, len_origin_str, offset,
398 : exec_options, ovector, ovecsize);
399 191230 : if (j <= 0)
400 : break;
401 :
402 143595 : empty_match_correction = ovector[0] == ovector[1] ? 1 : 0;
403 :
404 : // calculate the length of the string that will be appended to result
405 287190 : addlen = ovector[0] - offset
406 143595 : + (nbackrefs == 0 ? len_replacement : 0) + empty_match_correction;
407 143595 : if (len_result + addlen >= *max_result) {
408 12149 : tmp = GDKrealloc(result, len_result + addlen + 1);
409 12149 : if (tmp == NULL) {
410 0 : GDKfree(result);
411 0 : return NULL;
412 : }
413 12149 : result = tmp;
414 12149 : *max_result = len_result + addlen + 1;
415 : }
416 : // append to the result the parts of the original string that are left unchanged
417 143595 : if (ovector[0] > offset) {
418 139349 : strncpy(result + len_result, origin_str + offset,
419 139349 : ovector[0] - offset);
420 139349 : len_result += ovector[0] - offset;
421 : }
422 : // append to the result the replacement of the matched string
423 143595 : if (nbackrefs == 0) {
424 139924 : strncpy(result + len_result, replacement, len_replacement);
425 139924 : len_result += len_replacement;
426 : } else {
427 : int prevend = 0;
428 7342 : for (int i = 0; i < nbackrefs; i++) {
429 3671 : int off, len;
430 3671 : if (backrefs[i].idx >= ovecsize / 3) {
431 : /* out of bounds, replace with empty string */
432 : off = 0;
433 : len = 0;
434 : } else {
435 3671 : off = ovector[backrefs[i].idx * 2];
436 3671 : len = ovector[backrefs[i].idx * 2 + 1] - off;
437 : }
438 3671 : addlen = backrefs[i].start - prevend + len;
439 3671 : if (len_result + addlen >= *max_result) {
440 37 : tmp = GDKrealloc(result, len_result + addlen + 1);
441 37 : if (tmp == NULL) {
442 0 : GDKfree(result);
443 0 : return NULL;
444 : }
445 37 : result = tmp;
446 37 : *max_result = len_result + addlen + 1;
447 : }
448 3671 : if (backrefs[i].start > prevend) {
449 2 : strncpy(result + len_result, replacement + prevend,
450 2 : backrefs[i].start - prevend);
451 2 : len_result += backrefs[i].start - prevend;
452 : }
453 3671 : if (len > 0) {
454 3671 : strncpy(result + len_result, origin_str + off, len);
455 3671 : len_result += len;
456 : }
457 3671 : prevend = backrefs[i].end;
458 : }
459 : /* copy rest of replacement string (after last backref) */
460 3671 : addlen = len_replacement - prevend;
461 3671 : if (addlen > 0) {
462 2 : if (len_result + addlen >= *max_result) {
463 1 : tmp = GDKrealloc(result, len_result + addlen + 1);
464 1 : if (tmp == NULL) {
465 0 : GDKfree(result);
466 0 : return NULL;
467 : }
468 1 : result = tmp;
469 1 : *max_result = len_result + addlen + 1;
470 : }
471 2 : strncpy(result + len_result, replacement + prevend, addlen);
472 2 : len_result += addlen;
473 : }
474 : }
475 : // In case of an empty match just advance the offset by 1
476 143595 : offset = ovector[1] + empty_match_correction;
477 : // and copy the character that we just advanced over
478 143595 : if (empty_match_correction) {
479 14 : strncpy(result + len_result, origin_str + ovector[1], 1);
480 14 : ++len_result;
481 : }
482 : // before we loop around check with the offset - 1 if we had an empty match
483 : // since we manually advanced the offset by one. otherwise we gonna skip a
484 : // replacement at the end of the string
485 143595 : } while ((offset - empty_match_correction) < len_origin_str && global);
486 :
487 51312 : if (offset < len_origin_str) {
488 47381 : addlen = len_origin_str - offset;
489 47381 : if (len_result + addlen >= *max_result) {
490 684 : tmp = GDKrealloc(result, len_result + addlen + 1);
491 685 : if (tmp == NULL) {
492 0 : GDKfree(result);
493 0 : return NULL;
494 : }
495 685 : result = tmp;
496 685 : *max_result = len_result + addlen + 1;
497 : }
498 47382 : strncpy(result + len_result, origin_str + offset, addlen);
499 47382 : len_result += addlen;
500 : }
501 : /* null terminate string */
502 51313 : result[len_result] = '\0';
503 51313 : return result;
504 : }
505 : #endif
506 :
507 : static str
508 14 : pcre_replace(str *res, const char *origin_str, const char *pattern,
509 : const char *replacement, const char *flags, bool global)
510 : {
511 : #ifdef HAVE_LIBPCRE
512 14 : const char *err_p = NULL;
513 14 : pcre *pcre_code = NULL;
514 14 : pcre_extra *extra;
515 14 : char *tmpres;
516 14 : int max_result;
517 14 : int i, errpos = 0;
518 14 : int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
519 14 : int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
520 14 : int *ovector, ovecsize;
521 14 : int len_origin_str = (int) strlen(origin_str);
522 14 : int len_replacement = (int) strlen(replacement);
523 14 : struct backref backrefs[MAX_NR_REFS];
524 14 : int nbackrefs = 0;
525 :
526 21 : while (*flags) {
527 7 : switch (*flags) {
528 : case 'e':
529 : exec_options &= ~PCRE_NOTEMPTY;
530 : break;
531 1 : case 'i':
532 1 : compile_options |= PCRE_CASELESS;
533 1 : break;
534 1 : case 'm':
535 1 : compile_options |= PCRE_MULTILINE;
536 1 : break;
537 1 : case 's':
538 1 : compile_options |= PCRE_DOTALL;
539 1 : break;
540 1 : case 'x':
541 1 : compile_options |= PCRE_EXTENDED;
542 1 : break;
543 0 : default:
544 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
545 : ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
546 : *flags);
547 : }
548 7 : flags++;
549 : }
550 :
551 14 : if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
552 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
553 : OPERATION_FAILED
554 : ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
555 : pattern, errpos, err_p);
556 : }
557 :
558 : /* Since the compiled pattern is going to be used several times, it is
559 : * worth spending more time analyzing it in order to speed up the time
560 : * taken for matching.
561 : */
562 14 : extra = pcre_study(pcre_code, 0, &err_p);
563 14 : if (err_p != NULL) {
564 0 : pcre_free(pcre_code);
565 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
566 : OPERATION_FAILED
567 : ": pcre study of pattern (%s) failed with '%s'.\n", pattern,
568 : err_p);
569 : }
570 14 : pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
571 14 : ovecsize = (i + 1) * 3;
572 14 : if ((ovector = (int *) GDKmalloc(sizeof(int) * ovecsize)) == NULL) {
573 0 : pcre_free_study(extra);
574 0 : pcre_free(pcre_code);
575 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
576 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
577 : }
578 :
579 : /* identify back references in the replacement string */
580 14 : nbackrefs = parse_replacement(replacement, len_replacement,
581 : backrefs, MAX_NR_REFS);
582 :
583 14 : max_result = len_origin_str + 1;
584 14 : tmpres = GDKmalloc(max_result);
585 14 : if (tmpres == NULL) {
586 0 : GDKfree(ovector);
587 0 : pcre_free_study(extra);
588 0 : pcre_free(pcre_code);
589 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
590 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
591 : }
592 :
593 14 : tmpres = single_replace(pcre_code, extra, origin_str, len_origin_str,
594 : exec_options, ovector, ovecsize, replacement,
595 : len_replacement, backrefs, nbackrefs, global,
596 : tmpres, &max_result);
597 14 : GDKfree(ovector);
598 14 : pcre_free_study(extra);
599 14 : pcre_free(pcre_code);
600 14 : if (tmpres == NULL)
601 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
602 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
603 :
604 14 : *res = tmpres;
605 14 : return MAL_SUCCEED;
606 : #else
607 : (void) res;
608 : (void) origin_str;
609 : (void) pattern;
610 : (void) replacement;
611 : (void) flags;
612 : (void) global;
613 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
614 : "Database was compiled without PCRE support.");
615 : #endif
616 : }
617 :
618 : static str
619 126 : pcre_replace_bat(BAT **res, BAT *origin_strs, const char *pattern,
620 : const char *replacement, const char *flags, bool global)
621 : {
622 : #ifdef HAVE_LIBPCRE
623 126 : const char *err_p = NULL;
624 126 : char *tmpres;
625 126 : int i, errpos = 0;
626 126 : int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
627 126 : int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
628 126 : pcre *pcre_code = NULL;
629 126 : pcre_extra *extra;
630 126 : BAT *tmpbat;
631 126 : BUN p, q;
632 126 : int *ovector, ovecsize;
633 126 : int len_replacement = (int) strlen(replacement);
634 126 : struct backref backrefs[MAX_NR_REFS];
635 126 : int nbackrefs = 0;
636 126 : const char *origin_str;
637 126 : int max_dest_size = 0;
638 :
639 162 : while (*flags) {
640 36 : switch (*flags) {
641 : case 'e':
642 : exec_options &= ~PCRE_NOTEMPTY;
643 : break;
644 9 : case 'i':
645 9 : compile_options |= PCRE_CASELESS;
646 9 : break;
647 18 : case 'm':
648 18 : compile_options |= PCRE_MULTILINE;
649 18 : break;
650 9 : case 's':
651 9 : compile_options |= PCRE_DOTALL;
652 9 : break;
653 0 : case 'x':
654 0 : compile_options |= PCRE_EXTENDED;
655 0 : break;
656 0 : default:
657 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
658 : ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
659 : *flags);
660 : }
661 36 : flags++;
662 : }
663 :
664 126 : if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
665 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
666 : OPERATION_FAILED
667 : ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
668 : pattern, errpos, err_p);
669 : }
670 :
671 : /* Since the compiled pattern is going to be used several times,
672 : * it is worth spending more time analyzing it in order to speed
673 : * up the time taken for matching.
674 : */
675 252 : extra = pcre_study(pcre_code,
676 126 : BATcount(origin_strs) >
677 : JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &err_p);
678 126 : if (err_p != NULL) {
679 0 : pcre_free(pcre_code);
680 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
681 : OPERATION_FAILED);
682 : }
683 126 : pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
684 126 : ovecsize = (i + 1) * 3;
685 126 : if ((ovector = (int *) GDKzalloc(sizeof(int) * ovecsize)) == NULL) {
686 0 : pcre_free_study(extra);
687 0 : pcre_free(pcre_code);
688 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
689 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
690 : }
691 :
692 : /* identify back references in the replacement string */
693 126 : nbackrefs = parse_replacement(replacement, len_replacement,
694 : backrefs, MAX_NR_REFS);
695 :
696 125 : tmpbat = COLnew(origin_strs->hseqbase, TYPE_str, BATcount(origin_strs),
697 : TRANSIENT);
698 :
699 : /* the buffer for all destination strings is allocated only once,
700 : * and extended when needed */
701 126 : max_dest_size = len_replacement + 1;
702 126 : tmpres = GDKmalloc(max_dest_size);
703 126 : if (tmpbat == NULL || tmpres == NULL) {
704 0 : pcre_free_study(extra);
705 0 : pcre_free(pcre_code);
706 0 : GDKfree(ovector);
707 0 : BBPreclaim(tmpbat);
708 0 : GDKfree(tmpres);
709 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
710 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
711 : }
712 126 : BATiter origin_strsi = bat_iterator(origin_strs);
713 51425 : BATloop(origin_strs, p, q) {
714 51299 : origin_str = BUNtvar(origin_strsi, p);
715 102341 : tmpres = single_replace(pcre_code, extra, origin_str,
716 51299 : (int) strlen(origin_str), exec_options,
717 : ovector, ovecsize, replacement,
718 : len_replacement, backrefs, nbackrefs, global,
719 : tmpres, &max_dest_size);
720 51042 : if (tmpres == NULL || BUNappend(tmpbat, tmpres, false) != GDK_SUCCEED) {
721 0 : bat_iterator_end(&origin_strsi);
722 0 : pcre_free_study(extra);
723 0 : pcre_free(pcre_code);
724 0 : GDKfree(ovector);
725 0 : GDKfree(tmpres);
726 0 : BBPreclaim(tmpbat);
727 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
728 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
729 : }
730 : }
731 126 : bat_iterator_end(&origin_strsi);
732 126 : pcre_free_study(extra);
733 126 : pcre_free(pcre_code);
734 126 : GDKfree(ovector);
735 126 : GDKfree(tmpres);
736 126 : *res = tmpbat;
737 126 : return MAL_SUCCEED;
738 : #else
739 : (void) res;
740 : (void) origin_strs;
741 : (void) pattern;
742 : (void) replacement;
743 : (void) flags;
744 : (void) global;
745 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
746 : "Database was compiled without PCRE support.");
747 : #endif
748 : }
749 :
750 : static str
751 5 : pcre_match_with_flags(bit *ret, const char *val, const char *pat,
752 : const char *flags)
753 : {
754 5 : int pos;
755 : #ifdef HAVE_LIBPCRE
756 5 : const char *err_p = NULL;
757 5 : int errpos = 0;
758 5 : int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_DOTALL;
759 5 : pcre *re;
760 : #else
761 : int options = REG_NOSUB | REG_EXTENDED;
762 : regex_t re;
763 : int errcode;
764 : int retval;
765 : #endif
766 :
767 5 : while (*flags) {
768 0 : switch (*flags) {
769 0 : case 'i':
770 : #ifdef HAVE_LIBPCRE
771 0 : options |= PCRE_CASELESS;
772 : #else
773 : options |= REG_ICASE;
774 : #endif
775 0 : break;
776 0 : case 'm':
777 : #ifdef HAVE_LIBPCRE
778 0 : options |= PCRE_MULTILINE;
779 : #else
780 : options |= REG_NEWLINE;
781 : #endif
782 0 : break;
783 : #ifdef HAVE_LIBPCRE
784 0 : case 's':
785 0 : options |= PCRE_DOTALL;
786 0 : break;
787 : #endif
788 0 : case 'x':
789 : #ifdef HAVE_LIBPCRE
790 0 : options |= PCRE_EXTENDED;
791 : #else
792 : options |= REG_EXTENDED;
793 : #endif
794 0 : break;
795 0 : default:
796 0 : throw(MAL, "pcre.match", ILLEGAL_ARGUMENT
797 : ": unsupported flag character '%c'\n", *flags);
798 : }
799 0 : flags++;
800 : }
801 5 : if (strNil(val)) {
802 0 : *ret = FALSE;
803 0 : return MAL_SUCCEED;
804 : }
805 :
806 : #ifdef HAVE_LIBPCRE
807 5 : if ((re = pcre_compile(pat, options, &err_p, &errpos, NULL)) == NULL)
808 : #else
809 : if ((errcode = regcomp(&re, pat, options)) != 0)
810 : #endif
811 : {
812 0 : throw(MAL, "pcre.match", OPERATION_FAILED
813 : ": compilation of regular expression (%s) failed "
814 : #ifdef HAVE_LIBPCRE
815 : "at %d with '%s'", pat, errpos, err_p
816 : #else
817 : , pat
818 : #endif
819 : );
820 : }
821 : #ifdef HAVE_LIBPCRE
822 5 : pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, PCRE_NO_UTF8_CHECK,
823 : NULL, 0);
824 5 : pcre_free(re);
825 : #else
826 : retval = regexec(&re, val, (size_t) 0, NULL, 0);
827 : pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
828 : regfree(&re);
829 : #endif
830 5 : if (pos >= 0)
831 3 : *ret = TRUE;
832 2 : else if (pos == -1)
833 2 : *ret = FALSE;
834 : else
835 0 : throw(MAL, "pcre.match", OPERATION_FAILED
836 : ": matching of regular expression (%s) failed with %d", pat, pos);
837 : return MAL_SUCCEED;
838 : }
839 :
#ifdef HAVE_LIBPCRE
/* characters that have a special meaning in a PCRE pattern and
 * therefore need to be escaped to match literally */
static const char pcre_specials[] = "$()*+.?[\\]^{|}";
#else
/* special characters in POSIX basic regular expressions that need to
 * be escaped */
static const char pcre_specials[] = "$()*+.?[\\^{|";
#endif
848 :
849 : /* change SQL LIKE pattern into PCRE pattern */
850 : static str
851 6 : sql2pcre(str *r, const char *pat, const char *esc_str)
852 : {
853 6 : int escaped = 0;
854 6 : int hasWildcard = 0;
855 6 : char *ppat;
856 12 : int esc = strNil(esc_str) ? 0 : esc_str[0]; /* should change to utf8_convert() */
857 6 : int specials;
858 6 : int c;
859 :
860 6 : if (strlen(esc_str) > 1)
861 0 : throw(MAL, "pcre.sql2pcre",
862 : SQLSTATE(22019) ILLEGAL_ARGUMENT
863 : ": ESCAPE string must have length 1");
864 6 : if (pat == NULL)
865 0 : throw(MAL, "pcre.sql2pcre",
866 : SQLSTATE(22019) ILLEGAL_ARGUMENT
867 : ": (I)LIKE pattern must not be NULL");
868 6 : ppat = GDKmalloc(strlen(pat) * 3 +
869 : 3 /* 3 = "^'the translated regexp'$0" */ );
870 6 : if (ppat == NULL)
871 0 : throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
872 :
873 6 : *r = ppat;
874 : /* The escape character can be a char which is special in a PCRE
875 : * expression. If the user used the "+" char as escape and has "++"
876 : * in their pattern, then replacing this with "+" is not correct and
877 : * should be "\+" instead. */
878 6 : specials = (esc && strchr(pcre_specials, esc) != NULL);
879 :
880 6 : *ppat++ = '^';
881 17 : while ((c = *pat++) != 0) {
882 11 : if (c == esc) {
883 2 : if (escaped) {
884 1 : if (specials) { /* change ++ into \+ */
885 1 : *ppat++ = esc;
886 : } else { /* do not escape simple escape symbols */
887 0 : ppat[-1] = esc; /* overwrite backslash */
888 : }
889 : escaped = 0;
890 : } else {
891 1 : *ppat++ = '\\';
892 1 : escaped = 1;
893 : }
894 : hasWildcard = 1;
895 9 : } else if (strchr(pcre_specials, c) != NULL) {
896 : /* escape PCRE special chars, avoid double backslash if the
897 : * user uses an invalid escape sequence */
898 2 : if (!escaped)
899 2 : *ppat++ = '\\';
900 2 : *ppat++ = c;
901 2 : hasWildcard = 1;
902 2 : escaped = 0;
903 7 : } else if (c == '%' && !escaped) {
904 3 : *ppat++ = '.';
905 3 : *ppat++ = '*';
906 3 : *ppat++ = '?';
907 3 : hasWildcard = 1;
908 : /* collapse multiple %, but only if it isn't the escape */
909 3 : if (esc != '%')
910 3 : while (*pat == '%')
911 0 : pat++;
912 4 : } else if (c == '_' && !escaped) {
913 3 : *ppat++ = '.';
914 3 : hasWildcard = 1;
915 : } else {
916 1 : if (escaped) {
917 0 : ppat[-1] = c; /* overwrite backslash of invalid escape */
918 : } else {
919 1 : *ppat++ = c;
920 : }
921 : escaped = 0;
922 : }
923 : }
924 : /* no wildcard or escape character at end of string */
925 6 : if (!hasWildcard || escaped) {
926 1 : GDKfree(*r);
927 1 : *r = NULL;
928 1 : if (escaped)
929 0 : throw(MAL, "pcre.sql2pcre",
930 : SQLSTATE(22019) ILLEGAL_ARGUMENT
931 : ": (I)LIKE pattern must not end with escape character");
932 1 : *r = GDKstrdup(str_nil);
933 1 : if (*r == NULL)
934 0 : throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
935 : } else {
936 5 : *ppat++ = '$';
937 5 : *ppat = 0;
938 : }
939 : return MAL_SUCCEED;
940 : }
941 :
942 : #ifdef HAVE_LIBPCRE
943 : /* change SQL PATINDEX pattern into PCRE pattern */
944 : static str
945 25 : pat2pcre(str *r, const char *pat)
946 : {
947 25 : size_t len = strlen(pat);
948 25 : char *ppat = GDKmalloc(len * 2 + 3 /* 3 = "^'the translated regexp'$0" */ );
949 25 : int start = 0;
950 :
951 25 : if (ppat == NULL)
952 0 : throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
953 25 : *r = ppat;
954 77 : while (*pat) {
955 52 : int c = *pat++;
956 :
957 52 : if (strchr(pcre_specials, c) != NULL) {
958 17 : *ppat++ = '\\';
959 17 : *ppat++ = c;
960 35 : } else if (c == '%') {
961 3 : if (start && *pat) {
962 0 : *ppat++ = '.';
963 0 : *ppat++ = '*';
964 : }
965 3 : start++;
966 32 : } else if (c == '_') {
967 0 : *ppat++ = '.';
968 : } else {
969 32 : *ppat++ = c;
970 : }
971 : }
972 25 : *ppat = 0;
973 25 : return MAL_SUCCEED;
974 : }
975 : #endif
976 :
977 : /*
978 : * @+ Wrapping
979 : */
980 :
981 : static str
982 14 : PCREreplace_wrap(str *res, const char *const *or, const char *const *pat,
983 : const char *const *repl, const char *const *flags)
984 : {
985 14 : return pcre_replace(res, *or, *pat, *repl, *flags, true);
986 : }
987 :
988 : static str
989 0 : PCREreplacefirst_wrap(str *res, const char *const *or, const char *const *pat,
990 : const char *const *repl, const char *const *flags)
991 : {
992 0 : return pcre_replace(res, *or, *pat, *repl, *flags, false);
993 : }
994 :
995 : static str
996 126 : PCREreplace_bat_wrap(bat *res, const bat *bid, const char *const *pat,
997 : const char *const *repl, const char *const *flags)
998 : {
999 126 : BAT *b, *bn = NULL;
1000 126 : str msg;
1001 126 : if ((b = BATdescriptor(*bid)) == NULL)
1002 0 : throw(MAL, "batpcre.replace", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1003 :
1004 126 : msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, true);
1005 126 : if (msg == MAL_SUCCEED) {
1006 126 : *res = bn->batCacheid;
1007 126 : BBPkeepref(bn);
1008 : }
1009 126 : BBPunfix(b->batCacheid);
1010 126 : return msg;
1011 : }
1012 :
1013 : static str
1014 0 : PCREreplacefirst_bat_wrap(bat *res, const bat *bid, const char *const *pat,
1015 : const char *const *repl, const char *const *flags)
1016 : {
1017 0 : BAT *b, *bn = NULL;
1018 0 : str msg;
1019 0 : if ((b = BATdescriptor(*bid)) == NULL)
1020 0 : throw(MAL, "batpcre.replace_first", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1021 :
1022 0 : msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, false);
1023 0 : if (msg == MAL_SUCCEED) {
1024 0 : *res = bn->batCacheid;
1025 0 : BBPkeepref(bn);
1026 : }
1027 0 : BBPunfix(b->batCacheid);
1028 0 : return msg;
1029 : }
1030 :
1031 : static str
1032 5 : PCREmatch(bit *ret, const char *const *val, const char *const *pat)
1033 : {
1034 5 : return pcre_match_with_flags(ret, *val, *pat, "");
1035 : }
1036 :
1037 : static str
1038 0 : PCREimatch(bit *ret, const char *const *val, const char *const *pat)
1039 : {
1040 0 : return pcre_match_with_flags(ret, *val, *pat, "i");
1041 : }
1042 :
1043 : static str
1044 25 : PCREindex(int *res, const pcre *pattern, const char *const *s)
1045 : {
1046 : #ifdef HAVE_LIBPCRE
1047 25 : int v[3];
1048 :
1049 25 : v[0] = v[1] = *res = 0;
1050 25 : if (pcre_exec(pattern, NULL, *s, (int) strlen(*s), 0,
1051 : PCRE_NO_UTF8_CHECK, v, 3) >= 0) {
1052 23 : *res = v[1];
1053 : }
1054 25 : return MAL_SUCCEED;
1055 : #else
1056 : (void) res;
1057 : (void) pattern;
1058 : (void) s;
1059 : throw(MAL, "pcre.index", "Database was compiled without PCRE support.");
1060 : #endif
1061 : }
1062 :
1063 : static str
1064 27 : PCREpatindex(int *ret, const char *const *pat, const char *const *val)
1065 : {
1066 : #ifdef HAVE_LIBPCRE
1067 27 : pcre *re = NULL;
1068 27 : char *ppat = NULL, *msg;
1069 :
1070 53 : if (strNil(*pat) || strNil(*val)) {
1071 2 : *ret = int_nil;
1072 2 : return MAL_SUCCEED;
1073 : }
1074 :
1075 25 : if ((msg = pat2pcre(&ppat, *pat)) != MAL_SUCCEED)
1076 : return msg;
1077 25 : if ((msg = pcre_compile_wrap(&re, ppat, FALSE)) != MAL_SUCCEED) {
1078 0 : GDKfree(ppat);
1079 0 : return msg;
1080 : }
1081 25 : GDKfree(ppat);
1082 25 : msg = PCREindex(ret, re, val);
1083 25 : pcre_free(re);
1084 25 : return msg;
1085 : #else
1086 : (void) ret;
1087 : (void) pat;
1088 : (void) val;
1089 : throw(MAL, "pcre.patindex", "Database was compiled without PCRE support.");
1090 : #endif
1091 : }
1092 :
1093 : static str
1094 0 : PCREquote(str *ret, const char *const *val)
1095 : {
1096 0 : char *p;
1097 0 : const char *s = *val;
1098 :
1099 0 : *ret = p = GDKmalloc(strlen(s) * 2 + 1); /* certainly long enough */
1100 0 : if (p == NULL)
1101 0 : throw(MAL, "pcre.quote", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1102 : /* quote all non-alphanumeric ASCII characters (i.e. leave
1103 : non-ASCII and alphanumeric alone) */
1104 0 : while (*s) {
1105 0 : if (!((*s & 0x80) != 0 ||
1106 0 : ('a' <= *s && *s <= 'z') ||
1107 0 : ('A' <= *s && *s <= 'Z') || isdigit((unsigned char) *s)))
1108 0 : *p++ = '\\';
1109 0 : *p++ = *s++;
1110 : }
1111 0 : *p = 0;
1112 0 : return MAL_SUCCEED;
1113 : }
1114 :
1115 : static str
1116 6 : PCREsql2pcre(str *ret, const char *const *pat, const char *const *esc)
1117 : {
1118 6 : return sql2pcre(ret, *pat, *esc);
1119 : }
1120 :
1121 : static inline str
1122 15696 : choose_like_path(bool *use_re, bool *use_strcmp, bool *empty,
1123 : const char *pat, const char *esc)
1124 : {
1125 15696 : str res = MAL_SUCCEED;
1126 15696 : *use_re = false;
1127 15696 : *use_strcmp = false;
1128 15696 : *empty = false;
1129 :
1130 :
1131 30961 : if (strNil(pat) || strNil(esc)) {
1132 431 : *empty = true;
1133 : } else {
1134 15265 : if (!mnre_is_pattern_properly_escaped(pat, (unsigned char) *esc))
1135 5 : throw(MAL, "pcre.sql2pcre",
1136 : SQLSTATE(22019) ILLEGAL_ARGUMENT
1137 : ": (I)LIKE pattern must not end with escape character");
1138 15169 : if (is_strcmpable(pat, esc)) {
1139 941 : *use_re = true;
1140 941 : *use_strcmp = true;
1141 : } else {
1142 14228 : *use_re = true;
1143 : }
1144 : }
1145 : return res;
1146 : }
1147 :
1148 : static str
1149 234 : PCRElike_imp(bit *ret, const char *const *s, const char *const *pat,
1150 : const char *const *esc, const bit *isens)
1151 : {
1152 234 : str res = MAL_SUCCEED;
1153 234 : bool use_re = false, use_strcmp = false, empty = false;
1154 234 : struct RE *re = NULL;
1155 :
1156 234 : if ((res = choose_like_path(&use_re, &use_strcmp, &empty,
1157 : *pat, *esc)) != MAL_SUCCEED)
1158 : return res;
1159 :
1160 459 : MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp ?
1161 225 : "pcrelike: pattern matching using strcmp" : use_re ?
1162 : "pcrelike: pattern matching using RE" :
1163 : "pcrelike: pattern matching using pcre");
1164 :
1165 468 : if (strNil(*s) || empty) {
1166 0 : *ret = bit_nil;
1167 : } else {
1168 234 : if (use_strcmp) {
1169 9 : *ret = *isens ? GDKstrcasecmp(*s, *pat) == 0
1170 7 : : strcmp(*s, *pat) == 0;
1171 : } else {
1172 225 : if (!(re = mnre_create(*pat, *isens, (unsigned char) **esc)))
1173 0 : res = createException(MAL, "pcre.like4",
1174 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1175 : else
1176 225 : *ret = mnre_match(*s, re);
1177 : }
1178 : }
1179 :
1180 234 : if (re)
1181 225 : mnre_destroy(re);
1182 : return res;
1183 : }
1184 :
1185 : static str
1186 234 : PCRElike(bit *ret, const char *const *s, const char *const *pat,
1187 : const char *const *esc, const bit *isens)
1188 : {
1189 229 : return PCRElike_imp(ret, s, pat, esc, isens);
1190 : }
1191 :
1192 : static str
1193 5 : PCREnotlike(bit *ret, const char *const *s, const char *const *pat,
1194 : const char *const *esc, const bit *isens)
1195 : {
1196 5 : str tmp;
1197 5 : bit r;
1198 :
1199 5 : rethrow("str.not_like", tmp, PCRElike(&r, s, pat, esc, isens));
1200 5 : *ret = r == bit_nil ? bit_nil : !r;
1201 5 : return MAL_SUCCEED;
1202 : }
1203 :
1204 : static inline str
1205 15036 : mnre_like_build(struct RE **re, const char *pat, bool caseignore,
1206 : bool use_strcmp, uint32_t esc)
1207 : {
1208 15036 : if (!use_strcmp) {
1209 14102 : if (!(*re = mnre_create(pat, caseignore, esc)))
1210 0 : return createException(MAL, "pcre.re_like_build",
1211 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1212 : }
1213 : return MAL_SUCCEED;
1214 : }
1215 :
1216 : static inline bit
1217 6320 : mnre_like_proj_apply(const char *s, const struct RE *restrict re,
1218 : const char *pat,
1219 : bool caseignore, bool anti, bool use_strcmp)
1220 : {
1221 6320 : if (strNil(s))
1222 450 : return bit_nil;
1223 5870 : if (use_strcmp) {
1224 1196 : if (caseignore) {
1225 542 : if (anti)
1226 511 : return GDKstrcasecmp(s, pat) != 0;
1227 : else
1228 31 : return GDKstrcasecmp(s, pat) == 0;
1229 : } else {
1230 654 : if (anti)
1231 301 : return strcmp(s, pat) != 0;
1232 : else
1233 353 : return strcmp(s, pat) == 0;
1234 : }
1235 : } else {
1236 4674 : if (anti)
1237 139 : return !mnre_match(s, re);
1238 : else
1239 4535 : return mnre_match(s, re);
1240 : }
1241 : }
1242 :
/*
 * Destroy the RE stored in *re, if any, and reset the pointer so the
 * function can safely be called again.
 */
static inline void
mnre_like_clean(struct RE **re)
{
	if (*re == NULL)
		return;
	mnre_destroy(*re);
	*re = NULL;
}
1251 :
/*
 * Bulk (NOT) (I)LIKE producing a bit BAT.
 *
 * Argument 1 of the instruction is the input (string BAT or scalar),
 * argument 2 the pattern (string BAT or scalar); argument 0 receives
 * the id of the result BAT.
 *
 * esc:   escape character (as a string).
 * isens: forwarded as the case-ignore argument of mnre_like_build()
 *        and mnre_like_proj_apply().
 * not:   when true the test is negated (NOT LIKE).
 *
 * With a pattern BAT every row gets its own pattern analysis (and RE);
 * with a scalar pattern the analysis is done once for all rows.  Rows
 * whose string, pattern or escape is nil produce bit_nil.
 *
 * Returns MAL_SUCCEED or an exception; on failure the partial result
 * is discarded.
 */
static str
BATPCRElike_imp(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci,
				const char *const *esc, const bit *isens, const bit *not)
{
	str msg = MAL_SUCCEED;
	BAT *b = NULL, *pbn = NULL, *bn = NULL;
	const char *input = NULL;
	bool use_re = false,
		use_strcmp = false,
		empty = false,
		isensitive = (bool) *isens,
		anti = (bool) *not,
		has_nil = false,
		input_is_a_bat = isaBatType(getArgType(mb, pci, 1)),
		pattern_is_a_bat = isaBatType(getArgType(mb, pci, 2));
	bat *r = getArgReference_bat(stk, pci, 0);
	BUN q = 0;
	bit *restrict ret = NULL;
	struct RE *mnre_simple = NULL;
	BATiter bi = (BATiter) { 0 }, pi;

	(void) cntxt;
	if (input_is_a_bat) {
		bat *bid = getArgReference_bat(stk, pci, 1);
		if (!(b = BATdescriptor(*bid))) {
			msg = createException(MAL, "batalgebra.batpcrelike3",
								  SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
			goto bailout;
		}
	}
	if (pattern_is_a_bat) {
		bat *pb = getArgReference_bat(stk, pci, 2);
		if (!(pbn = BATdescriptor(*pb))) {
			msg = createException(MAL, "batalgebra.batpcrelike3",
								  SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
			goto bailout;
		}
	}
	assert((!b || ATOMstorage(b->ttype) == TYPE_str)
		   && (!pbn || ATOMstorage(pbn->ttype) == TYPE_str));

	/* the result has one row per row of the input BAT (or of the
	 * pattern BAT when the input is a scalar) */
	q = BATcount(b ? b : pbn);
	if (!(bn = COLnew(b ? b->hseqbase : pbn->hseqbase, TYPE_bit, q, TRANSIENT))) {
		msg = createException(MAL, "batalgebra.batpcrelike3",
							  SQLSTATE(HY013) MAL_MALLOC_FAIL);
		goto bailout;
	}
	ret = (bit *) Tloc(bn, 0);

	if (pattern_is_a_bat) {
		/* per-row pattern: analyse, build, apply and clean for each row */
		pi = bat_iterator(pbn);
		if (b)
			bi = bat_iterator(b);
		else
			input = *getArgReference_str(stk, pci, 1);

		for (BUN p = 0; p < q; p++) {
			const char *next_input = b ? BUNtvar(bi, p) : input,
				*np = BUNtvar(pi, p);

			if ((msg = choose_like_path(&use_re, &use_strcmp, &empty,
										np, *esc)) != MAL_SUCCEED) {
				/* iterators must be ended before leaving the loop */
				bat_iterator_end(&pi);
				if (b)
					bat_iterator_end(&bi);
				goto bailout;
			}

			if (empty) {
				ret[p] = bit_nil;
			} else {
				if ((msg = mnre_like_build(&mnre_simple, np, isensitive,
										   use_strcmp,
										   (unsigned char) **esc)) != MAL_SUCCEED) {
					bat_iterator_end(&pi);
					if (b)
						bat_iterator_end(&bi);
					goto bailout;
				}
				ret[p] = mnre_like_proj_apply(next_input, mnre_simple, np,
											  isensitive, anti, use_strcmp);
				mnre_like_clean(&mnre_simple);
			}
			has_nil |= is_bit_nil(ret[p]);
		}
		bat_iterator_end(&pi);
		if (b)
			bat_iterator_end(&bi);
	} else {
		/* scalar pattern: analyse and build once, apply to every row */
		const char *pat = *getArgReference_str(stk, pci, 2);
		if ((msg = choose_like_path(&use_re, &use_strcmp, &empty,
									pat, *esc)) != MAL_SUCCEED)
			goto bailout;

		bi = bat_iterator(b);
		MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp
							   ? "pcrelike: pattern matching using strcmp" :
							   use_re ? "pcrelike: pattern matching using RE" :
							   "pcrelike: pattern matching using pcre");

		if (empty) {
			for (BUN p = 0; p < q; p++)
				ret[p] = bit_nil;
			/* NOTE(review): has_nil is also set when q == 0, i.e. an
			 * empty result is flagged as containing nils -- confirm
			 * this is intended */
			has_nil = true;
		} else {
			if ((msg = mnre_like_build(&mnre_simple, pat, isensitive, use_strcmp,
									   (unsigned char) **esc)) != MAL_SUCCEED) {
				bat_iterator_end(&bi);
				goto bailout;
			}
			for (BUN p = 0; p < q; p++) {
				const char *s = BUNtvar(bi, p);
				ret[p] = mnre_like_proj_apply(s, mnre_simple, pat, isensitive,
											  anti, use_strcmp);
				has_nil |= is_bit_nil(ret[p]);
			}
		}
		bat_iterator_end(&bi);
	}

  bailout:
	/* common exit: drop a leftover RE, publish or discard the result,
	 * and release the input BATs */
	mnre_like_clean(&mnre_simple);
	if (bn && !msg) {
		BATsetcount(bn, q);
		bn->tnil = has_nil;
		bn->tnonil = !has_nil;
		bn->tkey = BATcount(bn) <= 1;
		bn->tsorted = BATcount(bn) <= 1;
		bn->trevsorted = BATcount(bn) <= 1;
		*r = bn->batCacheid;
		BBPkeepref(bn);
	} else if (bn)
		BBPreclaim(bn);
	BBPreclaim(b);
	BBPreclaim(pbn);
	return msg;
}
1389 :
1390 : static str
1391 937 : BATPCRElike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
1392 : {
1393 937 : const char *esc = *getArgReference_str(stk, pci, 3);
1394 937 : const bit *ci = getArgReference_bit(stk, pci, 4);
1395 937 : bit no = FALSE;
1396 :
1397 937 : return BATPCRElike_imp(cntxt, mb, stk, pci, &esc, ci, &no);
1398 : }
1399 :
1400 : static str
1401 167 : BATPCREnotlike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
1402 : {
1403 167 : const char *esc = *getArgReference_str(stk, pci, 3);
1404 167 : const bit *ci = getArgReference_bit(stk, pci, 4);
1405 167 : bit yes = TRUE;
1406 :
1407 167 : return BATPCRElike_imp(cntxt, mb, stk, pci, &esc, ci, &yes);
1408 : }
1409 :
/*
 * Scan select loop, with or without candidates.
 *
 * TEST is an expression over the current string `v`; KEEP_NULLS, when
 * true, additionally selects nil strings.  Every selected oid is
 * appended to vals[cnt++].  TEST is also stringified into the debug
 * trace, so its spelling is visible at run time.
 *
 * The macro relies on these names from the expansion site: b, s, bi,
 * ci, p, q, ncands, off, vals, cnt, anti, qry_ctx, counter and the
 * label bailout (jumped to when the timeout check fires).
 *
 * Without a candidate list, or with a dense one, positions p..q-1 are
 * scanned; otherwise ncands oids are drawn from the candidate
 * iterator ci.
 */
#define pcrescanloop(TEST, KEEP_NULLS)									\
	do {																\
		TRC_DEBUG(ALGO,													\
				  "PCREselect(b=%s#"BUNFMT",anti=%d): "					\
				  "scanselect %s\n", BATgetId(b), BATcount(b),			\
				  anti, #TEST);											\
		if (!s || BATtdense(s)) {										\
			for (; p < q; p++) {										\
				GDK_CHECK_TIMEOUT(qry_ctx, counter,						\
								  GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
				const char *restrict v = BUNtvar(bi, p - off);			\
				if ((TEST) || ((KEEP_NULLS) && strNil(v)))				\
					vals[cnt++] = p;									\
			}															\
		} else {														\
			for (; p < ncands; p++) {									\
				GDK_CHECK_TIMEOUT(qry_ctx, counter,						\
								  GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
				oid o = canditer_next(ci);								\
				const char *restrict v = BUNtvar(bi, o - off);			\
				if ((TEST) || ((KEEP_NULLS) && strNil(v)))				\
					vals[cnt++] = o;									\
			}															\
		}																\
	} while (0)
1436 :
1437 : static str
1438 13258 : mnre_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q,
1439 : BUN *rcnt, const char *pat, bool caseignore, bool anti,
1440 : bool use_strcmp, uint32_t esc, bool keep_nulls)
1441 : {
1442 13258 : BATiter bi = bat_iterator(b);
1443 13293 : BUN cnt = 0, ncands = ci->ncand;
1444 13293 : oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
1445 13293 : struct RE *re = NULL;
1446 13293 : str msg = MAL_SUCCEED;
1447 :
1448 13293 : size_t counter = 0;
1449 13293 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
1450 :
1451 13289 : if ((msg = mnre_like_build(&re, pat, caseignore, use_strcmp,
1452 : esc)) != MAL_SUCCEED)
1453 0 : goto bailout;
1454 :
1455 13258 : if (use_strcmp) {
1456 125 : if (caseignore) {
1457 47 : if (anti)
1458 58 : pcrescanloop(!strNil(v)
1459 : && GDKstrcasecmp(v, pat) != 0, keep_nulls);
1460 : else
1461 685 : pcrescanloop(!strNil(v)
1462 : && GDKstrcasecmp(v, pat) == 0, keep_nulls);
1463 : } else {
1464 78 : if (anti)
1465 5 : pcrescanloop(!strNil(v) && strcmp(v, pat) != 0, keep_nulls);
1466 : else
1467 10196 : pcrescanloop(!strNil(v) && strcmp(v, pat) == 0, keep_nulls);
1468 : }
1469 : } else {
1470 13133 : if (caseignore) {
1471 142 : if (anti) {
1472 44 : pcrescanloop(!strNil(v)
1473 : && !mnre_match(v, re), keep_nulls);
1474 : } else {
1475 11531 : pcrescanloop(!strNil(v)
1476 : && mnre_match(v, re), keep_nulls);
1477 : }
1478 : } else {
1479 12991 : if (anti)
1480 56504 : pcrescanloop(!strNil(v)
1481 : && !mnre_match(v, re), keep_nulls);
1482 : else
1483 149012 : pcrescanloop(!strNil(v)
1484 : && mnre_match(v, re), keep_nulls);
1485 : }
1486 : }
1487 :
1488 53 : bailout:
1489 13258 : bat_iterator_end(&bi);
1490 13280 : mnre_like_clean(&re);
1491 13285 : *rcnt = cnt;
1492 13285 : return msg;
1493 : }
1494 :
1495 : static str
1496 13241 : PCRElikeselect(bat *ret, const bat *bid, const bat *sid, const char *const *pat,
1497 : const char *const *esc, const bit *caseignore, const bit *anti)
1498 : {
1499 13241 : BAT *b, *s = NULL, *bn = NULL, *old_s = NULL;
1500 13241 : str msg = MAL_SUCCEED;
1501 13241 : bool use_re = false,
1502 13241 : use_strcmp = false,
1503 13241 : empty = false;
1504 13241 : bool with_strimps = false;
1505 13241 : bool with_strimps_anti = false;
1506 13241 : BUN p = 0, q = 0, rcnt = 0;
1507 13241 : struct canditer ci;
1508 :
1509 13241 : if ((b = BATdescriptor(*bid)) == NULL) {
1510 0 : msg = createException(MAL, "algebra.likeselect",
1511 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1512 0 : goto bailout;
1513 : }
1514 13291 : if (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL) {
1515 0 : msg = createException(MAL, "algebra.likeselect",
1516 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1517 0 : goto bailout;
1518 : }
1519 :
1520 13293 : assert(ATOMstorage(b->ttype) == TYPE_str);
1521 :
1522 13293 : if ((msg = choose_like_path(&use_re, &use_strcmp, &empty,
1523 : *pat, *esc)) != MAL_SUCCEED)
1524 0 : goto bailout;
1525 :
1526 13195 : if (empty) {
1527 0 : if (!(bn = BATdense(0, 0, 0)))
1528 0 : msg = createException(MAL, "algebra.likeselect",
1529 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1530 :
1531 0 : goto bailout;
1532 : }
1533 : /* Since the strimp pre-filtering of a LIKE query produces a superset of the actual result the complement of that
1534 : * set will necessarily reject some of the matching entries in the NOT LIKE query.
1535 : *
1536 : * In this case we run the PCRElikeselect as a LIKE query with strimps and return the complement of the result,
1537 : * taking extra care to not return NULLs. This currently means that we do not run strimps for NOT LIKE queries if
1538 : * the BAT contains NULLs.
1539 : */
1540 13195 : if (BAThasstrimps(b)) {
1541 48 : if (STRMPcreate(b, NULL) == GDK_SUCCEED) {
1542 48 : BAT *tmp_s = STRMPfilter(b, s, *pat, *anti);
1543 48 : if (tmp_s) {
1544 48 : old_s = s;
1545 48 : s = tmp_s;
1546 48 : if (!*anti)
1547 : with_strimps = true;
1548 : else
1549 0 : with_strimps_anti = true;
1550 : }
1551 : } else { /* If we cannot filter with the strimp just continue normally */
1552 0 : GDKclrerr();
1553 : }
1554 : }
1555 :
1556 :
1557 13291 : MT_thread_setalgorithm(use_strcmp
1558 13291 : ? (with_strimps ?
1559 : "pcrelike: pattern matching using strcmp with strimps"
1560 : : (with_strimps_anti ?
1561 : "pcrelike: pattern matching using strcmp with strimps anti"
1562 13291 : : "pcrelike: pattern matching using strcmp")) :
1563 13166 : use_re ? (with_strimps ?
1564 : "pcrelike: pattern matching using RE with strimps"
1565 : : (with_strimps_anti ?
1566 : "pcrelike: patterm matching using RE with strimps anti"
1567 : :
1568 : "pcrelike: pattern matching using RE"))
1569 : : (with_strimps ?
1570 : "pcrelike: pattern matching using pcre with strimps"
1571 : : (with_strimps_anti ?
1572 : "pcrelike: pattermatching using pcre with strimps anti"
1573 : : "pcrelike: pattern matching using pcre")));
1574 :
1575 13288 : canditer_init(&ci, b, s);
1576 13289 : if (!(bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT))) {
1577 0 : msg = createException(MAL, "algebra.likeselect",
1578 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1579 0 : goto bailout;
1580 : }
1581 :
1582 13272 : if (!s || BATtdense(s)) {
1583 4890 : if (s) {
1584 8329 : assert(BATtdense(s));
1585 8329 : p = (BUN) s->tseqbase;
1586 8329 : q = p + BATcount(s);
1587 8329 : if ((oid) p < b->hseqbase)
1588 : p = b->hseqbase;
1589 8329 : if ((oid) q > b->hseqbase + BATcount(b))
1590 : q = b->hseqbase + BATcount(b);
1591 : } else {
1592 4890 : p = b->hseqbase;
1593 4890 : q = BATcount(b) + b->hseqbase;
1594 : }
1595 : }
1596 :
1597 13272 : msg = mnre_likeselect(bn, b, s, &ci, p, q, &rcnt, *pat, *caseignore, *anti
1598 8359 : && !with_strimps_anti, use_strcmp,
1599 13272 : (unsigned char) **esc, with_strimps_anti);
1600 :
1601 13284 : if (!msg) { /* set some properties */
1602 13287 : BATsetcount(bn, rcnt);
1603 13259 : bn->tsorted = true;
1604 13259 : bn->trevsorted = bn->batCount <= 1;
1605 13259 : bn->tkey = true;
1606 13259 : bn->tnil = false;
1607 13259 : bn->tnonil = true;
1608 13259 : bn->tseqbase = rcnt == 0 ? 0 : rcnt == 1 ? *(const oid *) Tloc(bn, 0) : rcnt == b->batCount ? b->hseqbase : oid_nil;
1609 13259 : if (with_strimps_anti) {
1610 : /* Reverse the result taking into account the original candidate list. */
1611 : // BAT *rev = BATdiffcand(BATdense(b->hseqbase, 0, b->batCount), bn);
1612 0 : BAT *rev;
1613 0 : if (old_s) {
1614 0 : rev = BATdiffcand(old_s, bn);
1615 : #ifndef NDEBUG
1616 0 : BAT *is = BATintersectcand(old_s, bn);
1617 0 : if (is) {
1618 0 : assert(is->batCount == bn->batCount);
1619 0 : BBPreclaim(is);
1620 : }
1621 0 : assert(rev->batCount == old_s->batCount - bn->batCount);
1622 : #endif
1623 : }
1624 :
1625 : else
1626 0 : rev = BATnegcands(0, b->batCount, bn);
1627 : /* BAT *rev = BATnegcands(0, b->batCount, bn); */
1628 0 : BBPunfix(bn->batCacheid);
1629 0 : bn = rev;
1630 : }
1631 : }
1632 :
1633 :
1634 13256 : bailout:
1635 13256 : BBPreclaim(b);
1636 13287 : BBPreclaim(s);
1637 13283 : BBPreclaim(old_s);
1638 13283 : if (bn && !msg) {
1639 13283 : *ret = bn->batCacheid;
1640 13283 : BBPkeepref(bn);
1641 0 : } else if (bn)
1642 0 : BBPreclaim(bn);
1643 13288 : return msg;
1644 : }
1645 :
/* Store oid o in the next free slot of b's heap and increment
 * b->batCount; the macro itself performs no capacity check. */
#define APPEND(b, o)	(((oid *) b->theap->base)[b->batCount++] = (o))
/* Address of string number x on side s, where s is token-pasted into
 * the caller's variables s##vars, s##vals and s##i (e.g. lvars, lvals
 * and li for s = l). */
#define VALUE(s, x)		(s##vars + VarHeapVal(s##vals, (x), s##i.width))
1648 :
/*
 * Nested loop implementation for PCRE join.
 *
 * The outer loop walks the right-hand candidates; each right value
 * `vr` is treated as a LIKE pattern (analysed with choose_like_path
 * and, unless a plain comparison suffices, compiled into `re`).  The
 * inner loop walks all left-hand candidates and skips a left value
 * `vl` when it is nil or when the supplied skip condition holds:
 * STRCMP for the plain-comparison case, MNRE_MATCH for the RE case.
 * Every pair that is not skipped is appended to r1 (left oid) and,
 * when r2 is present, to r2 (right oid); the result BATs are grown on
 * demand and their sortedness/key/density properties are adjusted as
 * pairs are added.
 *
 * The macro relies on these names from the expansion site: lci, rci,
 * lo, ro, vl, vr, lbase, rbase, nl, newcap, lastl, rskipped, r1, r2,
 * re, esc, msg, use_re, use_strcmp, empty, qry_ctx, the li/ri iterator
 * data used by VALUE, and the label bailout.
 */
#define pcre_join_loop(STRCMP, MNRE_MATCH)								\
	do {																\
		for (BUN ridx = 0; ridx < rci.ncand; ridx++) {					\
			ro = canditer_next(&rci);									\
			vr = VALUE(r, ro - rbase);									\
			nl = 0;														\
			use_re = use_strcmp = empty = false;						\
			if ((msg = choose_like_path(&use_re, &use_strcmp, &empty, vr, esc))) \
				goto bailout;											\
			if (!empty) {												\
				if ((msg = mnre_like_build(&re, vr, false, use_strcmp, (unsigned char) *esc)) != MAL_SUCCEED) \
					goto bailout;										\
				canditer_reset(&lci);									\
				TIMEOUT_LOOP_IDX_DECL(lidx, lci.ncand, qry_ctx) {		\
					lo = canditer_next(&lci);							\
					vl = VALUE(l, lo - lbase);							\
					if (strNil(vl)) {									\
						continue;										\
					} else {											\
						if (use_strcmp) {								\
							if (STRCMP)									\
								continue;								\
						} else {										\
							assert(re);									\
							if (MNRE_MATCH)								\
								continue;								\
						}												\
					}													\
					if (BATcount(r1) == BATcapacity(r1)) {				\
						newcap = BATgrows(r1);							\
						BATsetcount(r1, BATcount(r1));					\
						if (r2)											\
							BATsetcount(r2, BATcount(r2));				\
						if (BATextend(r1, newcap) != GDK_SUCCEED || (r2 && BATextend(r2, newcap) != GDK_SUCCEED)) { \
							msg = createException(MAL, "pcre.join", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
							goto bailout;								\
						}												\
						assert(!r2 || BATcapacity(r1) == BATcapacity(r2)); \
					}													\
					if (BATcount(r1) > 0) {								\
						if (lastl + 1 != lo)							\
							r1->tseqbase = oid_nil;						\
						if (nl == 0) {									\
							if (r2)										\
								r2->trevsorted = false;					\
							if (lastl > lo) {							\
								r1->tsorted = false;					\
								r1->tkey = false;						\
							} else if (lastl < lo) {					\
								r1->trevsorted = false;					\
							} else {									\
								r1->tkey = false;						\
							}											\
						}												\
					}													\
					APPEND(r1, lo);										\
					if (r2)												\
						APPEND(r2, ro);									\
					lastl = lo;											\
					nl++;												\
				}														\
				mnre_like_clean(&re);									\
				TIMEOUT_CHECK(qry_ctx,									\
							  GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
			}															\
			if (r2) {													\
				if (nl > 1) {											\
					r2->tkey = false;									\
					r2->tseqbase = oid_nil;								\
					r1->trevsorted = false;								\
				} else if (nl == 0) {									\
					rskipped = BATcount(r2) > 0;						\
				} else if (rskipped) {									\
					r2->tseqbase = oid_nil;								\
				}														\
			} else if (nl > 1) {										\
				r1->trevsorted = false;									\
			}															\
		}																\
	} while (0)
1730 :
1731 : static char *
1732 51 : pcrejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, const char *esc,
1733 : bit caseignore, bit anti)
1734 : {
1735 51 : struct canditer lci, rci;
1736 51 : const char *lvals, *rvals, *lvars, *rvars, *vl, *vr;
1737 51 : int rskipped = 0; /* whether we skipped values in r */
1738 51 : oid lbase, rbase, lo, ro, lastl = 0; /* last value inserted into r1 */
1739 51 : BUN nl, newcap;
1740 51 : char *msg = MAL_SUCCEED;
1741 51 : struct RE *re = NULL;
1742 51 : bool use_re = false,
1743 51 : use_strcmp = false,
1744 51 : empty = false;
1745 51 : lng t0 = 0;
1746 :
1747 51 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
1748 :
1749 51 : TRC_DEBUG_IF(ALGO) t0 = GDKusec();
1750 :
1751 153 : assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
1752 51 : assert(ATOMtype(l->ttype) == TYPE_str);
1753 :
1754 51 : BAT *ol = NULL, *or = NULL;
1755 51 : if (caseignore) {
1756 3 : ol = l;
1757 3 : or = r;
1758 3 : l = BATcasefold(l, NULL);
1759 3 : r = BATcasefold(r, NULL);
1760 3 : if (l == NULL || r == NULL) {
1761 0 : BBPreclaim(l);
1762 0 : BBPreclaim(r);
1763 0 : throw(MAL, "pcre.join", GDK_EXCEPTION);
1764 : }
1765 : }
1766 :
1767 51 : canditer_init(&lci, l, sl);
1768 51 : canditer_init(&rci, r, sr);
1769 :
1770 51 : BATiter li = bat_iterator(l);
1771 51 : BATiter ri = bat_iterator(r);
1772 51 : lbase = l->hseqbase;
1773 51 : rbase = r->hseqbase;
1774 51 : lvals = (const char *) li.base;
1775 51 : rvals = (const char *) ri.base;
1776 51 : assert(ri.vh && r->ttype);
1777 51 : lvars = li.vh->base;
1778 51 : rvars = ri.vh->base;
1779 :
1780 51 : r1->tkey = true;
1781 51 : r1->tsorted = true;
1782 51 : r1->trevsorted = true;
1783 51 : r1->tnil = false;
1784 51 : r1->tnonil = true;
1785 51 : if (r2) {
1786 25 : r2->tkey = true;
1787 25 : r2->tsorted = true;
1788 25 : r2->trevsorted = true;
1789 25 : r2->tnil = false;
1790 25 : r2->tnonil = true;
1791 : }
1792 :
1793 51 : if (anti) {
1794 490 : pcre_join_loop(strcmp(vl, vr) == 0, mnre_match(vl, re));
1795 : } else {
1796 471 : pcre_join_loop(strcmp(vl, vr) != 0, !mnre_match(vl, re));
1797 : }
1798 51 : bat_iterator_end(&li);
1799 51 : bat_iterator_end(&ri);
1800 50 : if (ol) {
1801 3 : BBPreclaim(l);
1802 3 : BBPreclaim(r);
1803 3 : l = ol;
1804 3 : r = or;
1805 : }
1806 :
1807 51 : assert(!r2 || BATcount(r1) == BATcount(r2));
1808 : /* also set other bits of heap to correct value to indicate size */
1809 51 : BATsetcount(r1, BATcount(r1));
1810 50 : if (r2)
1811 24 : BATsetcount(r2, BATcount(r2));
1812 51 : if (BATcount(r1) > 0) {
1813 36 : if (BATtdense(r1))
1814 13 : r1->tseqbase = ((oid *) r1->theap->base)[0];
1815 36 : if (r2 && BATtdense(r2))
1816 17 : r2->tseqbase = ((oid *) r2->theap->base)[0];
1817 : } else {
1818 15 : r1->tseqbase = 0;
1819 15 : if (r2)
1820 8 : r2->tseqbase = 0;
1821 : }
1822 :
1823 25 : if (r2)
1824 25 : TRC_DEBUG(ALGO,
1825 : "l=%s#" BUNFMT "[%s]%s%s,"
1826 : "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
1827 : "sr=%s#" BUNFMT "%s%s -> "
1828 : "%s#" BUNFMT "%s%s,%s#" BUNFMT "%s%s (" LLFMT " usec)\n",
1829 : BATgetId(l), BATcount(l), ATOMname(l->ttype),
1830 : l->tsorted ? "-sorted" : "",
1831 : l->trevsorted ? "-revsorted" : "",
1832 : BATgetId(r), BATcount(r), ATOMname(r->ttype),
1833 : r->tsorted ? "-sorted" : "",
1834 : r->trevsorted ? "-revsorted" : "",
1835 : sl ? BATgetId(sl) : "NULL", sl ? BATcount(sl) : 0,
1836 : sl && sl->tsorted ? "-sorted" : "",
1837 : sl && sl->trevsorted ? "-revsorted" : "",
1838 : sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
1839 : sr && sr->tsorted ? "-sorted" : "",
1840 : sr && sr->trevsorted ? "-revsorted" : "",
1841 : BATgetId(r1), BATcount(r1),
1842 : r1->tsorted ? "-sorted" : "",
1843 : r1->trevsorted ? "-revsorted" : "",
1844 : BATgetId(r2), BATcount(r2),
1845 : r2->tsorted ? "-sorted" : "",
1846 : r2->trevsorted ? "-revsorted" : "", GDKusec() - t0);
1847 : else
1848 26 : TRC_DEBUG(ALGO,
1849 : "l=%s#" BUNFMT "[%s]%s%s,"
1850 : "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
1851 : "sr=%s#" BUNFMT "%s%s -> "
1852 : "%s#" BUNFMT "%s%s (" LLFMT " usec)\n",
1853 : BATgetId(l), BATcount(l), ATOMname(l->ttype),
1854 : l->tsorted ? "-sorted" : "",
1855 : l->trevsorted ? "-revsorted" : "",
1856 : BATgetId(r), BATcount(r), ATOMname(r->ttype),
1857 : r->tsorted ? "-sorted" : "",
1858 : r->trevsorted ? "-revsorted" : "",
1859 : sl ? BATgetId(sl) : "NULL", sl ? BATcount(sl) : 0,
1860 : sl && sl->tsorted ? "-sorted" : "",
1861 : sl && sl->trevsorted ? "-revsorted" : "",
1862 : sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
1863 : sr && sr->tsorted ? "-sorted" : "",
1864 : sr && sr->trevsorted ? "-revsorted" : "",
1865 : BATgetId(r1), BATcount(r1),
1866 : r1->tsorted ? "-sorted" : "",
1867 : r1->trevsorted ? "-revsorted" : "", GDKusec() - t0);
1868 : return MAL_SUCCEED;
1869 :
1870 0 : bailout:
1871 0 : bat_iterator_end(&li);
1872 0 : bat_iterator_end(&ri);
1873 0 : mnre_like_clean(&re);
1874 0 : assert(msg != MAL_SUCCEED);
1875 : return msg;
1876 : }
1877 :
1878 : static str
1879 48 : PCREjoin(bat *r1, bat *r2, bat lid, bat rid, bat slid, bat srid, bat elid,
1880 : bat ciid, bit anti)
1881 : {
1882 48 : BAT *left = NULL, *right = NULL, *escape = NULL, *caseignore = NULL,
1883 48 : *candleft = NULL, *candright = NULL;
1884 48 : BAT *result1 = NULL, *result2 = NULL;
1885 48 : char *msg = MAL_SUCCEED;
1886 48 : const char *esc = "";
1887 48 : bit ci;
1888 48 : BATiter bi;
1889 :
1890 48 : if ((left = BATdescriptor(lid)) == NULL)
1891 0 : goto fail;
1892 51 : if ((right = BATdescriptor(rid)) == NULL)
1893 0 : goto fail;
1894 51 : if ((escape = BATdescriptor(elid)) == NULL)
1895 0 : goto fail;
1896 51 : if ((caseignore = BATdescriptor(ciid)) == NULL)
1897 0 : goto fail;
1898 51 : if (!is_bat_nil(slid) && (candleft = BATdescriptor(slid)) == NULL)
1899 0 : goto fail;
1900 51 : if (!is_bat_nil(srid) && (candright = BATdescriptor(srid)) == NULL)
1901 0 : goto fail;
1902 51 : result1 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
1903 51 : if (r2)
1904 25 : result2 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
1905 51 : if (!result1 || (r2 && !result2)) {
1906 0 : msg = createException(MAL, "pcre.join",
1907 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1908 0 : goto fail;
1909 : }
1910 51 : result1->tnil = false;
1911 51 : result1->tnonil = true;
1912 51 : result1->tkey = true;
1913 51 : result1->tsorted = true;
1914 51 : result1->trevsorted = true;
1915 51 : result1->tseqbase = 0;
1916 51 : if (r2) {
1917 25 : result2->tnil = false;
1918 25 : result2->tnonil = true;
1919 25 : result2->tkey = true;
1920 25 : result2->tsorted = true;
1921 25 : result2->trevsorted = true;
1922 25 : result2->tseqbase = 0;
1923 : }
1924 51 : if (BATcount(escape) != 1) {
1925 0 : msg = createException(MAL, "pcre.join",
1926 : SQLSTATE(42000)
1927 : "At the moment, only one value is allowed for the escape input at pcre join");
1928 0 : goto fail;
1929 : }
1930 51 : if (BATcount(caseignore) != 1) {
1931 0 : msg = createException(MAL, "pcre.join",
1932 : SQLSTATE(42000)
1933 : "At the moment, only one value is allowed for the case ignore input at pcre join");
1934 0 : goto fail;
1935 : }
1936 51 : bi = bat_iterator(caseignore);
1937 51 : ci = *(bit *) BUNtloc(bi, 0);
1938 51 : bat_iterator_end(&bi);
1939 51 : bi = bat_iterator(escape);
1940 51 : esc = BUNtvar(bi, 0);
1941 51 : msg = pcrejoin(result1, result2, left, right, candleft, candright, esc, ci,
1942 : anti);
1943 50 : bat_iterator_end(&bi);
1944 51 : if (msg)
1945 0 : goto fail;
1946 51 : *r1 = result1->batCacheid;
1947 51 : BBPkeepref(result1);
1948 51 : if (r2) {
1949 25 : *r2 = result2->batCacheid;
1950 25 : BBPkeepref(result2);
1951 : }
1952 51 : BBPunfix(left->batCacheid);
1953 51 : BBPunfix(right->batCacheid);
1954 51 : BBPreclaim(escape);
1955 51 : BBPreclaim(caseignore);
1956 51 : BBPreclaim(candleft);
1957 51 : BBPreclaim(candright);
1958 : return MAL_SUCCEED;
1959 :
1960 0 : fail:
1961 0 : BBPreclaim(left);
1962 0 : BBPreclaim(right);
1963 0 : BBPreclaim(escape);
1964 0 : BBPreclaim(caseignore);
1965 0 : BBPreclaim(candleft);
1966 0 : BBPreclaim(candright);
1967 0 : BBPreclaim(result1);
1968 0 : BBPreclaim(result2);
1969 0 : if (msg)
1970 : return msg;
1971 0 : throw(MAL, "pcre.join", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1972 : }
1973 :
1974 : static str
1975 22 : LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *elid,
1976 : const bat *cid, const bat *slid, const bat *srid,
1977 : const bit *nil_matches, const lng *estimate, const bit *anti)
1978 : {
1979 22 : (void) nil_matches;
1980 22 : (void) estimate;
1981 22 : return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
1982 22 : *elid, *cid, *anti);
1983 : }
1984 :
1985 : static str
1986 26 : LIKEjoin1(bat *r1, const bat *lid, const bat *rid, const bat *elid,
1987 : const bat *cid, const bat *slid, const bat *srid,
1988 : const bit *nil_matches, const lng *estimate, const bit *anti)
1989 : {
1990 26 : (void) nil_matches;
1991 26 : (void) estimate;
1992 26 : return PCREjoin(r1, NULL, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
1993 26 : *elid, *cid, *anti);
1994 : }
1995 :
1996 : #include "mel.h"
1997 : mel_atom pcre_init_atoms[] = {
1998 : { .name="pcre", }, { .cmp=NULL }
1999 : };
2000 : mel_func pcre_init_funcs[] = {
2001 : command("pcre", "index", PCREindex, false, "match a pattern, return matched position (or 0 when not found)", args(1,3, arg("",int),arg("pat",pcre),arg("s",str))),
2002 : command("pcre", "match", PCREmatch, false, "Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
2003 : command("pcre", "imatch", PCREimatch, false, "Caseless Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
2004 : command("pcre", "patindex", PCREpatindex, false, "Location of the first POSIX pattern matching against a string", args(1,3, arg("",int),arg("pat",str),arg("s",str))),
2005 : command("pcre", "replace", PCREreplace_wrap, false, "Replace _all_ matches of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
2006 : command("pcre", "replace_first", PCREreplacefirst_wrap, false, "Replace _the first_ match of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
2007 : command("pcre", "pcre_quote", PCREquote, false, "Return a PCRE pattern string that matches the argument exactly.", args(1,2, arg("",str),arg("s",str))),
2008 : command("pcre", "sql2pcre", PCREsql2pcre, false, "Convert a SQL like pattern with the given escape character into a PCRE pattern.", args(1,3, arg("",str),arg("pat",str),arg("esc",str))),
2009 : command("str", "replace", PCREreplace_wrap, false, "", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
2010 : command("batpcre", "replace", PCREreplace_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
2011 : command("batpcre", "replace_first", PCREreplacefirst_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
2012 : command("algebra", "like", PCRElike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2013 : command("algebra", "not_like", PCREnotlike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2014 : pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2015 : pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2016 : pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2017 : pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2018 : pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2019 : pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2020 : command("algebra", "likeselect", PCRElikeselect, false, "Select all head values of the first input BAT for which the\ntail value is \"like\" the given (SQL-style) pattern and for\nwhich the head value occurs in the tail of the second input\nBAT.\nInput is a dense-headed BAT, output is a dense-headed BAT with in\nthe tail the head value of the input BAT for which the\nrelationship holds. The output BAT is sorted on the tail value.", args(1,7, batarg("",oid),batarg("b",str),batarg("s",oid),arg("pat",str),arg("esc",str),arg("caseignore",bit),arg("anti",bit))),
2021 : command("algebra", "likejoin", LIKEjoin, false, "Join the string bat L with the pattern bat R\nwith optional candidate lists SL and SR using pattern escape string ESC\nand doing a case sensitive match.\nThe result is two aligned bats with oids of matching rows.", args(2,11, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
2022 : command("algebra", "likejoin", LIKEjoin1, false, "The same as LIKEjoin_esc, but only produce one output", args(1,10,batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
2023 : { .imp=NULL }
2024 : };
2025 : #include "mal_import.h"
2026 : #ifdef _MSC_VER
2027 : #undef read
2028 : #pragma section(".CRT$XCU",read)
2029 : #endif
2030 351 : LIB_STARTUP_FUNC(init_pcre_mal)
2031 351 : { mal_module("pcre", pcre_init_atoms, pcre_init_funcs); }
|