Mercurial > hg > MonetDB-extend
comparison regexp/regexp.c @ 33:a8ffdbc388ce
Ported to Jun2020 branch.
author | Sjoerd Mullender <sjoerd@acm.org> |
---|---|
date | Tue, 09 Jun 2020 10:30:35 +0200 (2020-06-09) |
parents | 25cd8af6fa82 |
children | e5d2d0c9b7b3 |
comparison
equal
deleted
inserted
replaced
32:25cd8af6fa82 | 33:a8ffdbc388ce |
---|---|
9 #include <monetdb_config.h> | 9 #include <monetdb_config.h> |
10 | 10 |
11 /* mal_exception.h actually contains everything we need */ | 11 /* mal_exception.h actually contains everything we need */ |
12 #include <mal_exception.h> | 12 #include <mal_exception.h> |
13 | 13 |
14 /* for the CANDINIT macro */ | 14 /* for the candidate iterator */ |
15 #include <gdk_cand.h> /* pre Jul2017-SP4, replace with CANDINIT definition */ | 15 #include <gdk_cand.h> |
16 | 16 |
17 /* system include files */ | 17 /* system include files */ |
18 #include <string.h> | 18 #include <string.h> |
19 | 19 |
20 /* we use the PCRE library to do regular expression matching */ | 20 /* we use the PCRE library to do regular expression matching */ |
72 int options; | 72 int options; |
73 int pos = 0; | 73 int pos = 0; |
74 pcre *re; | 74 pcre *re; |
75 | 75 |
76 /* if any of the input values are nil, the result is no match */ | 76 /* if any of the input values are nil, the result is no match */ |
77 if (GDK_STRNIL(val) || | 77 if (strNil(val) || strNil(pat) || strNil(flags)) { |
78 GDK_STRNIL(pat) || | |
79 GDK_STRNIL(flags)) { | |
80 /* special case for NIL inputs: NILs don't match anything */ | 78 /* special case for NIL inputs: NILs don't match anything */ |
81 *ret = 0; | 79 *ret = 0; |
82 return MAL_SUCCEED; | 80 return MAL_SUCCEED; |
83 } | 81 } |
84 | 82 |
145 BBPunfix(b->batCacheid); | 143 BBPunfix(b->batCacheid); |
146 throw(MAL, "batregexp.rematch", SEMANTIC_TYPE_MISMATCH); | 144 throw(MAL, "batregexp.rematch", SEMANTIC_TYPE_MISMATCH); |
147 } | 145 } |
148 | 146 |
149 /* if any of the input values are nil, the result is no match */ | 147 /* if any of the input values are nil, the result is no match */ |
150 if (GDK_STRNIL(pat) || GDK_STRNIL(flags)) { | 148 if (strNil(pat) || strNil(flags)) { |
151 /* no matches when the pattern or the flags is NIL | 149 /* no matches when the pattern or the flags is NIL |
152 * we return an a BAT with all NIL values */ | 150 * we return an a BAT with all NIL values */ |
153 bit f = bit_nil; | 151 bit f = bit_nil; |
154 if ((bn = BATconstant(b->hseqbase, TYPE_bit, &f, BATcount(b), TRANSIENT)) == NULL) | 152 if ((bn = BATconstant(b->hseqbase, TYPE_bit, &f, BATcount(b), TRANSIENT)) == NULL) |
155 throw(MAL, "batregexp.rematch", GDK_EXCEPTION); | 153 throw(MAL, "batregexp.rematch", GDK_EXCEPTION); |
208 bn->tnil = false; | 206 bn->tnil = false; |
209 bn->tnonil = true; | 207 bn->tnonil = true; |
210 for (start = 0, end = BATcount(b); start < end; start++) { | 208 for (start = 0, end = BATcount(b); start < end; start++) { |
211 const char *val = BUNtvar(bi, start); | 209 const char *val = BUNtvar(bi, start); |
212 /* nil values never match */ | 210 /* nil values never match */ |
213 if (GDK_STRNIL(val)) { | 211 if (strNil(val)) { |
214 *outp++ = bit_nil; | 212 *outp++ = bit_nil; |
215 bn->tnil = true; | 213 bn->tnil = true; |
216 bn->tnonil = false; | 214 bn->tnonil = false; |
217 } else { | 215 } else { |
218 pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); | 216 pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); |
275 static char * | 273 static char * |
276 do_select(bat *ret, bat bid, bat sid, const char *pat, const char *flags, bit anti) | 274 do_select(bat *ret, bat bid, bat sid, const char *pat, const char *flags, bit anti) |
277 { | 275 { |
278 BAT *b, *s = NULL; /* input BAT and optional candidate list */ | 276 BAT *b, *s = NULL; /* input BAT and optional candidate list */ |
279 | 277 |
280 BUN start, end; /* start and end index */ | 278 struct canditer ci; /* candidate iterator */ |
281 BUN cnt; /* difference between start and end (unused) */ | |
282 const oid *cand, *candend; /* start and end of candidate list */ | |
283 BATiter bi; /* helper to loop through values */ | 279 BATiter bi; /* helper to loop through values */ |
284 | 280 |
285 BAT *bn; /* result BAT */ | 281 BAT *bn; /* result BAT */ |
286 oid *outp; /* pointer through which we add to result */ | 282 oid *outp; /* pointer through which we add to result */ |
287 | 283 |
290 int options; /* PCRE options */ | 286 int options; /* PCRE options */ |
291 pcre *re; /* compiled regular expression */ | 287 pcre *re; /* compiled regular expression */ |
292 pcre_extra *sd; /* studied regular expression */ | 288 pcre_extra *sd; /* studied regular expression */ |
293 | 289 |
294 /* if any of the input values are nil, the result is no match */ | 290 /* if any of the input values are nil, the result is no match */ |
295 if (GDK_STRNIL(pat) || GDK_STRNIL(flags)) { | 291 if (strNil(pat) || strNil(flags)) { |
296 /* no matches when the pattern or the flags is NIL | 292 /* no matches when the pattern or the flags is NIL |
297 * we return an empty BAT of the correct type */ | 293 * we return an empty BAT of the correct type */ |
298 if ((bn = BATdense(0, 0, 0)) == NULL) | 294 if ((bn = BATdense(0, 0, 0)) == NULL) |
299 throw(MAL, "regexp.rematchselect", GDK_EXCEPTION); | 295 throw(MAL, "regexp.rematchselect", GDK_EXCEPTION); |
300 *ret = bn->batCacheid; | 296 *ret = bn->batCacheid; |
324 (s = BATdescriptor(sid)) == NULL) { | 320 (s = BATdescriptor(sid)) == NULL) { |
325 BBPunfix(b->batCacheid); | 321 BBPunfix(b->batCacheid); |
326 throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING); | 322 throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING); |
327 } | 323 } |
328 | 324 |
325 if (canditer_init(&ci, b, s) == 0) { | |
326 /* trivially empty result */ | |
327 BBPunfix(b->batCacheid); | |
328 if (s) | |
329 BBPunfix(s->batCacheid); | |
330 bn = BATdense(0, 0, 0); | |
331 *ret = bn->batCacheid; | |
332 BBPkeepref(*ret); | |
333 return MAL_SUCCEED; | |
334 } | |
335 | |
329 /* allocate a result BAT; the capacity we ask for is the | 336 /* allocate a result BAT; the capacity we ask for is the |
330 * maximum potential result size (i.e. the size of the | 337 * maximum potential result size (i.e. the size of the |
331 * candidate list if there is one, else the size of the input | 338 * candidate list if there is one, else the size of the input |
332 * BAT b) */ | 339 * BAT b) */ |
333 bn = COLnew(0, TYPE_oid, s ? BATcount(s) : BATcount(b), TRANSIENT); | 340 bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT); |
334 if (bn == NULL) { | 341 if (bn == NULL) { |
335 BBPunfix(b->batCacheid); | 342 BBPunfix(b->batCacheid); |
336 if (s) | 343 if (s) |
337 BBPunfix(s->batCacheid); | 344 BBPunfix(s->batCacheid); |
338 throw(MAL, "regexp.rematchselect", GDK_EXCEPTION); | 345 throw(MAL, "regexp.rematchselect", GDK_EXCEPTION); |
365 throw(MAL, "regexp.rematchselect", | 372 throw(MAL, "regexp.rematchselect", |
366 "study of regular expression (%s) failed with %s", | 373 "study of regular expression (%s) failed with %s", |
367 pat, err); | 374 pat, err); |
368 } | 375 } |
369 | 376 |
370 /* initialize the start, end, cnt, cand, and candend variables | |
371 * from b and s*/ | |
372 CANDINIT(b, s, start, end, cnt, cand, candend); | |
373 | |
374 /* now, start and end are the limits in b that we need to look | |
375 * at, and if set, cand and candend are the beginning and end | |
376 * of the list of OIDs of b that we need to consider */ | |
377 | |
378 bi = bat_iterator(b); | 377 bi = bat_iterator(b); |
379 | 378 |
380 if (cand) { | 379 /* iterate through the candidates */ |
381 /* there is a non-dense candidate list, we run through | 380 for (BUN i = 0; i < ci.ncand; i++) { |
382 * it and only check values in b that are referenced | 381 /* get the next candidate */ |
383 * in this candidate list */ | 382 oid o = canditer_next(&ci); |
384 while (cand < candend) { | 383 |
385 /* the candidate list has a list of OIDs which | 384 /* the candidate list has a list of OIDs which are |
386 * are relative to b->hseqbase, we need to | 385 * relative to b->hseqbase, we need to convert that to |
387 * convert that to an index relative to the | 386 * an index relative to the start of the array in the |
388 * start of the array in the (tail) heap */ | 387 * (tail) heap */ |
389 const char *val = BUNtvar(bi, *cand - b->hseqbase); | 388 const char *val = BUNtvar(bi, o - b->hseqbase); |
390 /* nil values never match */ | 389 |
391 if (!GDK_STRNIL(val)) { | 390 /* nil values never match */ |
392 pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); | 391 if (!strNil(val)) { |
393 if (pos >= 0) { | 392 pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); |
394 /* regular expression matched */ | 393 if (pos >= 0) { |
395 if (!anti) | 394 /* regular expression matched */ |
396 *outp++ = *cand; | 395 if (!anti) |
397 } else if (pos == PCRE_ERROR_NOMATCH) { | 396 *outp++ = o; |
398 /* regular expression didn't match */ | 397 } else if (pos == PCRE_ERROR_NOMATCH) { |
399 if (anti) | 398 /* regular expression didn't match */ |
400 *outp++ = *cand; | 399 if (anti) |
401 } else { | 400 *outp++ = o; |
402 /* error during processing */ | 401 } else { |
403 BBPunfix(b->batCacheid); | 402 /* error during processing */ |
404 BBPunfix(s->batCacheid); | 403 BBPunfix(b->batCacheid); |
405 BBPreclaim(bn); | 404 BBPunfix(s->batCacheid); |
406 pcre_free_study(sd); | 405 BBPreclaim(bn); |
407 pcre_free(re); | 406 pcre_free_study(sd); |
408 throw(MAL, "regexp.rematchselect", | 407 pcre_free(re); |
409 "matching of regular expression (%s) failed with %d", | 408 throw(MAL, "regexp.rematchselect", |
410 pat, pos); | 409 "matching of regular expression (%s) failed with %d", |
411 } | 410 pat, pos); |
412 } | 411 } |
413 cand++; | |
414 } | 412 } |
415 /* we're done with s */ | 413 } |
414 /* we're done with b, s, and re */ | |
415 BBPunfix(b->batCacheid); | |
416 if (s) | |
416 BBPunfix(s->batCacheid); | 417 BBPunfix(s->batCacheid); |
417 } else { | |
418 /* we're done with s */ | |
419 if (s) { | |
420 /* there is a dense candidate list, we don't | |
421 * need it anymore since we extracted start | |
422 * and end */ | |
423 BBPunfix(s->batCacheid); | |
424 } | |
425 while (start < end) { | |
426 const char *val = BUNtvar(bi, start); | |
427 /* nil values never match */ | |
428 if (!GDK_STRNIL(val)) { | |
429 pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); | |
430 /* note that we need to convert the | |
431 * index relative to the start of the | |
432 * array to a value relative to | |
433 * b->hseqbase (opposite from what | |
434 * happens above) */ | |
435 if (pos >= 0) { | |
436 /* regular expression matched */ | |
437 if (!anti) | |
438 *outp++ = b->hseqbase + start; | |
439 } else if (pos == PCRE_ERROR_NOMATCH) { | |
440 /* regular expression didn't match */ | |
441 if (anti) | |
442 *outp++ = b->hseqbase + start; | |
443 } else { | |
444 /* error during processing */ | |
445 BBPunfix(b->batCacheid); | |
446 BBPreclaim(bn); | |
447 pcre_free_study(sd); | |
448 pcre_free(re); | |
449 throw(MAL, "regexp.rematchselect", | |
450 "matching of regular expression (%s) failed with %d", | |
451 pat, pos); | |
452 } | |
453 } | |
454 start++; | |
455 } | |
456 } | |
457 | |
458 /* we're done with b and re */ | |
459 BBPunfix(b->batCacheid); | |
460 pcre_free_study(sd); | 418 pcre_free_study(sd); |
461 pcre_free(re); | 419 pcre_free(re); |
462 | 420 |
463 /* set properties and size of result BAT */ | 421 /* set properties and size of result BAT */ |
464 BATsetcount(bn, (BUN) (outp - (oid *) Tloc(bn, 0))); /* size is pointer difference */ | 422 BATsetcount(bn, (BUN) (outp - (oid *) Tloc(bn, 0))); /* size is pointer difference */ |
517 bat slid, bat srid, bit nil_matches, lng estimate) | 475 bat slid, bat srid, bit nil_matches, lng estimate) |
518 { | 476 { |
519 BAT *l, *r, *sl = NULL, *sr = NULL; /* input BATs */ | 477 BAT *l, *r, *sl = NULL, *sr = NULL; /* input BATs */ |
520 BAT *bn1, *bn2; /* output BATs */ | 478 BAT *bn1, *bn2; /* output BATs */ |
521 | 479 |
522 BUN lstart, lend; /* start and end index */ | 480 struct canditer lci; /* candidate iterator for l */ |
523 BUN lcnt; /* difference between start and end (unused) */ | 481 struct canditer rci; /* candidate iterator for r */ |
524 const oid *lcand, *lcandend; /* start and end of candidate list */ | 482 |
525 BATiter li; /* helper to loop through values */ | 483 BATiter li; /* helper to loop through values */ |
526 | |
527 BUN rstart, rend; /* start and end index */ | |
528 BUN rcnt; /* difference between start and end (unused) */ | |
529 const oid *rcand, *rcandend; /* start and end of candidate list */ | |
530 BATiter ri; /* helper to loop through values */ | 484 BATiter ri; /* helper to loop through values */ |
531 oid ro; /* right OID being matched */ | 485 oid ro; /* right OID being matched */ |
532 | 486 |
533 const char *pat; /* the regular expression being matched */ | 487 const char *pat; /* the regular expression being matched */ |
534 const char *err; /* error message from PCRE library */ | 488 const char *err; /* error message from PCRE library */ |
537 pcre *re; /* compiled regular expression */ | 491 pcre *re; /* compiled regular expression */ |
538 pcre_extra *sd; /* studied regular expression */ | 492 pcre_extra *sd; /* studied regular expression */ |
539 | 493 |
540 (void) nil_matches; /* only relevant for equi-join */ | 494 (void) nil_matches; /* only relevant for equi-join */ |
541 | 495 |
542 if (GDK_STRNIL(flags)) { | 496 if (strNil(flags)) { |
543 /* no matches when the flags is NIL | 497 /* no matches when the flags is NIL |
544 * we return two empty BATs */ | 498 * we return two empty BATs */ |
545 bn1 = BATdense(0, 0, 0); | 499 bn1 = BATdense(0, 0, 0); |
546 bn2 = BATdense(0, 0, 0); | 500 bn2 = BATdense(0, 0, 0); |
547 if (bn1 == NULL || bn2 == NULL) { | 501 if (bn1 == NULL || bn2 == NULL) { |
590 if (sr) | 544 if (sr) |
591 BBPunfix(sr->batCacheid); | 545 BBPunfix(sr->batCacheid); |
592 throw(MAL, "regexp.rematchjoin", SEMANTIC_TYPE_MISMATCH); | 546 throw(MAL, "regexp.rematchjoin", SEMANTIC_TYPE_MISMATCH); |
593 } | 547 } |
594 | 548 |
549 if (canditer_init(&lci, l, sl) == 0 || | |
550 canditer_init(&rci, r, sr) == 0) { | |
551 /* if either side is empty (or no candidates) the | |
552 * result is empty */ | |
553 BBPunfix(l->batCacheid); | |
554 BBPunfix(r->batCacheid); | |
555 if (sl) | |
556 BBPunfix(sl->batCacheid); | |
557 if (sr) | |
558 BBPunfix(sr->batCacheid); | |
559 bn1 = BATdense(0, 0, 0); | |
560 bn2 = BATdense(0, 0, 0); | |
561 if (bn1 == NULL || bn2 == NULL) { | |
562 BBPreclaim(bn1); | |
563 BBPreclaim(bn2); | |
564 throw(MAL, "regexp.rematchjoin", GDK_EXCEPTION); | |
565 } | |
566 return MAL_SUCCEED; | |
567 } | |
568 | |
595 /* if there is no valid estimate, use the size of the left | 569 /* if there is no valid estimate, use the size of the left |
596 * input as size estimate */ | 570 * input as size estimate */ |
597 if (is_lng_nil(estimate) || estimate == 0) | 571 if (is_lng_nil(estimate) || estimate == 0) |
598 estimate = sl ? BATcount(sl) : BATcount(l); | 572 estimate = lci.ncand; |
599 | 573 |
600 /* create the output BATs */ | 574 /* create the output BATs */ |
601 bn1 = COLnew(0, TYPE_oid, estimate, TRANSIENT); | 575 bn1 = COLnew(0, TYPE_oid, estimate, TRANSIENT); |
602 bn2 = COLnew(0, TYPE_oid, estimate, TRANSIENT); | 576 bn2 = COLnew(0, TYPE_oid, estimate, TRANSIENT); |
603 if (bn1 == NULL || bn2 == NULL) { | 577 if (bn1 == NULL || bn2 == NULL) { |
612 if (sr) | 586 if (sr) |
613 BBPunfix(sr->batCacheid); | 587 BBPunfix(sr->batCacheid); |
614 throw(MAL, "regexp.rematchjoin", GDK_EXCEPTION); | 588 throw(MAL, "regexp.rematchjoin", GDK_EXCEPTION); |
615 } | 589 } |
616 | 590 |
617 /* extract boundaries and candidate lists from inputs */ | |
618 CANDINIT(l, sl, lstart, lend, lcnt, lcand, lcandend); | |
619 CANDINIT(r, sr, rstart, rend, rcnt, rcand, rcandend); | |
620 | |
621 li = bat_iterator(l); | 591 li = bat_iterator(l); |
622 ri = bat_iterator(r); | 592 ri = bat_iterator(r); |
623 | 593 |
624 /* if there is a candidate list (sl or sr is not NULL), but it | 594 for (BUN i = 0; i < rci.ncand; i++) { |
625 * is dense (lcand/rcand is NULL), we don't need the BAT | 595 ro = canditer_next(&rci); |
626 * descriptor anymore, the start and end values suffice */ | 596 pat = BUNtvar(ri, ro - r->hseqbase); |
627 if (sl && lcand == NULL) { | 597 |
628 BBPunfix(sl->batCacheid); | |
629 sl = NULL; | |
630 } | |
631 if (sr && rcand == NULL) { | |
632 BBPunfix(sr->batCacheid); | |
633 sr = NULL; | |
634 } | |
635 | |
636 for (;;) { | |
637 if (rcand) { | |
638 if (rcand == rcandend) | |
639 break; | |
640 ro = *rcand++; | |
641 pat = BUNtvar(ri, ro - r->hseqbase); | |
642 } else { | |
643 if (rstart == rend) | |
644 break; | |
645 ro = rstart + r->hseqbase; | |
646 pat = BUNtvar(ri, rstart); | |
647 rstart++; | |
648 } | |
649 /* nil regular expressions don't match (despite | 598 /* nil regular expressions don't match (despite |
650 * nil_matches) */ | 599 * nil_matches) */ |
651 if (GDK_STRNIL(pat)) | 600 if (strNil(pat)) |
652 continue; | 601 continue; |
653 re = pcre_compile(pat, options, &err, &pos, NULL); | 602 re = pcre_compile(pat, options, &err, &pos, NULL); |
654 sd = NULL; | 603 sd = NULL; |
655 if (re == NULL) | 604 if (re == NULL) |
656 goto bailout; | 605 goto bailout; |
657 sd = pcre_study(re, 0, &err); | 606 sd = pcre_study(re, 0, &err); |
658 if (err != NULL) | 607 if (err != NULL) |
659 goto bailout; | 608 goto bailout; |
660 /* inner loop: try to be more efficient than outer | 609 |
661 * loop, so test for lcand once instead of each | 610 /* inner loop: reset iterator, then iterate over it */ |
662 * iteration */ | 611 canditer_reset(&lci); |
663 if (lcand) { | 612 for (BUN j = 0; j < lci.ncand; j++) { |
664 for (const oid *cand = lcand; cand < lcandend; cand++) { | 613 oid lo = canditer_next(&lci); |
665 const char *val = BUNtvar(li, *cand - l->hseqbase); | 614 const char *val = BUNtvar(li, lo - l->hseqbase); |
666 if (GDK_STRNIL(val)) | 615 if (strNil(val)) |
667 continue; | 616 continue; |
668 pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); | 617 pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); |
669 if (pos >= 0) { | 618 if (pos >= 0) { |
670 /* regular expression matched */ | 619 /* regular expression matched */ |
671 if (BUNappend(bn1, cand, false) != GDK_SUCCEED || | 620 if (BUNappend(bn1, &lo, false) != GDK_SUCCEED || |
672 BUNappend(bn2, &ro, false) != GDK_SUCCEED) | 621 BUNappend(bn2, &ro, false) != GDK_SUCCEED) |
673 goto bailout; | |
674 } else if (pos != PCRE_ERROR_NOMATCH) { | |
675 /* error during processing */ | |
676 err = "matching of regular expression failed"; | |
677 goto bailout; | 622 goto bailout; |
678 } | 623 } else if (pos != PCRE_ERROR_NOMATCH) { |
679 } | 624 /* error during processing */ |
680 } else { | 625 err = "matching of regular expression failed"; |
681 for (BUN start = lstart; start < lend; start++) { | 626 goto bailout; |
682 const char *val = BUNtvar(li, start); | |
683 if (GDK_STRNIL(val)) | |
684 continue; | |
685 pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); | |
686 if (pos >= 0) { | |
687 /* regular expression matched */ | |
688 oid lo = start + l->hseqbase; | |
689 if (BUNappend(bn1, &lo, false) != GDK_SUCCEED || | |
690 BUNappend(bn2, &ro, false) != GDK_SUCCEED) | |
691 goto bailout; | |
692 } else if (pos != PCRE_ERROR_NOMATCH) { | |
693 /* error during processing */ | |
694 err = "matching of regular expression failed"; | |
695 goto bailout; | |
696 } | |
697 } | 627 } |
698 } | 628 } |
699 pcre_free_study(sd); | 629 pcre_free_study(sd); |
700 pcre_free(re); | 630 pcre_free(re); |
701 } | 631 } |