comparison regexp/regexp.c @ 33:a8ffdbc388ce

Ported to Jun2020 branch.
author Sjoerd Mullender <sjoerd@acm.org>
date Tue, 09 Jun 2020 10:30:35 +0200 (2020-06-09)
parents 25cd8af6fa82
children e5d2d0c9b7b3
comparison
equal deleted inserted replaced
32:25cd8af6fa82 33:a8ffdbc388ce
9 #include <monetdb_config.h> 9 #include <monetdb_config.h>
10 10
11 /* mal_exception.h actually contains everything we need */ 11 /* mal_exception.h actually contains everything we need */
12 #include <mal_exception.h> 12 #include <mal_exception.h>
13 13
14 /* for the CANDINIT macro */ 14 /* for the candidate iterator */
15 #include <gdk_cand.h> /* pre Jul2017-SP4, replace with CANDINIT definition */ 15 #include <gdk_cand.h>
16 16
17 /* system include files */ 17 /* system include files */
18 #include <string.h> 18 #include <string.h>
19 19
20 /* we use the PCRE library to do regular expression matching */ 20 /* we use the PCRE library to do regular expression matching */
72 int options; 72 int options;
73 int pos = 0; 73 int pos = 0;
74 pcre *re; 74 pcre *re;
75 75
76 /* if any of the input values are nil, the result is no match */ 76 /* if any of the input values are nil, the result is no match */
77 if (GDK_STRNIL(val) || 77 if (strNil(val) || strNil(pat) || strNil(flags)) {
78 GDK_STRNIL(pat) ||
79 GDK_STRNIL(flags)) {
80 /* special case for NIL inputs: NILs don't match anything */ 78 /* special case for NIL inputs: NILs don't match anything */
81 *ret = 0; 79 *ret = 0;
82 return MAL_SUCCEED; 80 return MAL_SUCCEED;
83 } 81 }
84 82
145 BBPunfix(b->batCacheid); 143 BBPunfix(b->batCacheid);
146 throw(MAL, "batregexp.rematch", SEMANTIC_TYPE_MISMATCH); 144 throw(MAL, "batregexp.rematch", SEMANTIC_TYPE_MISMATCH);
147 } 145 }
148 146
149 /* if any of the input values are nil, the result is no match */ 147 /* if any of the input values are nil, the result is no match */
150 if (GDK_STRNIL(pat) || GDK_STRNIL(flags)) { 148 if (strNil(pat) || strNil(flags)) {
151 /* no matches when the pattern or the flags is NIL 149 /* no matches when the pattern or the flags is NIL
152 * we return an a BAT with all NIL values */ 150 * we return an a BAT with all NIL values */
153 bit f = bit_nil; 151 bit f = bit_nil;
154 if ((bn = BATconstant(b->hseqbase, TYPE_bit, &f, BATcount(b), TRANSIENT)) == NULL) 152 if ((bn = BATconstant(b->hseqbase, TYPE_bit, &f, BATcount(b), TRANSIENT)) == NULL)
155 throw(MAL, "batregexp.rematch", GDK_EXCEPTION); 153 throw(MAL, "batregexp.rematch", GDK_EXCEPTION);
208 bn->tnil = false; 206 bn->tnil = false;
209 bn->tnonil = true; 207 bn->tnonil = true;
210 for (start = 0, end = BATcount(b); start < end; start++) { 208 for (start = 0, end = BATcount(b); start < end; start++) {
211 const char *val = BUNtvar(bi, start); 209 const char *val = BUNtvar(bi, start);
212 /* nil values never match */ 210 /* nil values never match */
213 if (GDK_STRNIL(val)) { 211 if (strNil(val)) {
214 *outp++ = bit_nil; 212 *outp++ = bit_nil;
215 bn->tnil = true; 213 bn->tnil = true;
216 bn->tnonil = false; 214 bn->tnonil = false;
217 } else { 215 } else {
218 pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); 216 pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0);
275 static char * 273 static char *
276 do_select(bat *ret, bat bid, bat sid, const char *pat, const char *flags, bit anti) 274 do_select(bat *ret, bat bid, bat sid, const char *pat, const char *flags, bit anti)
277 { 275 {
278 BAT *b, *s = NULL; /* input BAT and optional candidate list */ 276 BAT *b, *s = NULL; /* input BAT and optional candidate list */
279 277
280 BUN start, end; /* start and end index */ 278 struct canditer ci; /* candidate iterator */
281 BUN cnt; /* difference between start and end (unused) */
282 const oid *cand, *candend; /* start and end of candidate list */
283 BATiter bi; /* helper to loop through values */ 279 BATiter bi; /* helper to loop through values */
284 280
285 BAT *bn; /* result BAT */ 281 BAT *bn; /* result BAT */
286 oid *outp; /* pointer through which we add to result */ 282 oid *outp; /* pointer through which we add to result */
287 283
290 int options; /* PCRE options */ 286 int options; /* PCRE options */
291 pcre *re; /* compiled regular expression */ 287 pcre *re; /* compiled regular expression */
292 pcre_extra *sd; /* studied regular expression */ 288 pcre_extra *sd; /* studied regular expression */
293 289
294 /* if any of the input values are nil, the result is no match */ 290 /* if any of the input values are nil, the result is no match */
295 if (GDK_STRNIL(pat) || GDK_STRNIL(flags)) { 291 if (strNil(pat) || strNil(flags)) {
296 /* no matches when the pattern or the flags is NIL 292 /* no matches when the pattern or the flags is NIL
297 * we return an empty BAT of the correct type */ 293 * we return an empty BAT of the correct type */
298 if ((bn = BATdense(0, 0, 0)) == NULL) 294 if ((bn = BATdense(0, 0, 0)) == NULL)
299 throw(MAL, "regexp.rematchselect", GDK_EXCEPTION); 295 throw(MAL, "regexp.rematchselect", GDK_EXCEPTION);
300 *ret = bn->batCacheid; 296 *ret = bn->batCacheid;
324 (s = BATdescriptor(sid)) == NULL) { 320 (s = BATdescriptor(sid)) == NULL) {
325 BBPunfix(b->batCacheid); 321 BBPunfix(b->batCacheid);
326 throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING); 322 throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING);
327 } 323 }
328 324
325 if (canditer_init(&ci, b, s) == 0) {
326 /* trivially empty result */
327 BBPunfix(b->batCacheid);
328 if (s)
329 BBPunfix(s->batCacheid);
330 bn = BATdense(0, 0, 0);
331 *ret = bn->batCacheid;
332 BBPkeepref(*ret);
333 return MAL_SUCCEED;
334 }
335
329 /* allocate a result BAT; the capacity we ask for is the 336 /* allocate a result BAT; the capacity we ask for is the
330 * maximum potential result size (i.e. the size of the 337 * maximum potential result size (i.e. the size of the
331 * candidate list if there is one, else the size of the input 338 * candidate list if there is one, else the size of the input
332 * BAT b) */ 339 * BAT b) */
333 bn = COLnew(0, TYPE_oid, s ? BATcount(s) : BATcount(b), TRANSIENT); 340 bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT);
334 if (bn == NULL) { 341 if (bn == NULL) {
335 BBPunfix(b->batCacheid); 342 BBPunfix(b->batCacheid);
336 if (s) 343 if (s)
337 BBPunfix(s->batCacheid); 344 BBPunfix(s->batCacheid);
338 throw(MAL, "regexp.rematchselect", GDK_EXCEPTION); 345 throw(MAL, "regexp.rematchselect", GDK_EXCEPTION);
365 throw(MAL, "regexp.rematchselect", 372 throw(MAL, "regexp.rematchselect",
366 "study of regular expression (%s) failed with %s", 373 "study of regular expression (%s) failed with %s",
367 pat, err); 374 pat, err);
368 } 375 }
369 376
370 /* initialize the start, end, cnt, cand, and candend variables
371 * from b and s*/
372 CANDINIT(b, s, start, end, cnt, cand, candend);
373
374 /* now, start and end are the limits in b that we need to look
375 * at, and if set, cand and candend are the beginning and end
376 * of the list of OIDs of b that we need to consider */
377
378 bi = bat_iterator(b); 377 bi = bat_iterator(b);
379 378
380 if (cand) { 379 /* iterate through the candidates */
381 /* there is a non-dense candidate list, we run through 380 for (BUN i = 0; i < ci.ncand; i++) {
382 * it and only check values in b that are referenced 381 /* get the next candidate */
383 * in this candidate list */ 382 oid o = canditer_next(&ci);
384 while (cand < candend) { 383
385 /* the candidate list has a list of OIDs which 384 /* the candidate list has a list of OIDs which are
386 * are relative to b->hseqbase, we need to 385 * relative to b->hseqbase, we need to convert that to
387 * convert that to an index relative to the 386 * an index relative to the start of the array in the
388 * start of the array in the (tail) heap */ 387 * (tail) heap */
389 const char *val = BUNtvar(bi, *cand - b->hseqbase); 388 const char *val = BUNtvar(bi, o - b->hseqbase);
390 /* nil values never match */ 389
391 if (!GDK_STRNIL(val)) { 390 /* nil values never match */
392 pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); 391 if (!strNil(val)) {
393 if (pos >= 0) { 392 pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0);
394 /* regular expression matched */ 393 if (pos >= 0) {
395 if (!anti) 394 /* regular expression matched */
396 *outp++ = *cand; 395 if (!anti)
397 } else if (pos == PCRE_ERROR_NOMATCH) { 396 *outp++ = o;
398 /* regular expression didn't match */ 397 } else if (pos == PCRE_ERROR_NOMATCH) {
399 if (anti) 398 /* regular expression didn't match */
400 *outp++ = *cand; 399 if (anti)
401 } else { 400 *outp++ = o;
402 /* error during processing */ 401 } else {
403 BBPunfix(b->batCacheid); 402 /* error during processing */
404 BBPunfix(s->batCacheid); 403 BBPunfix(b->batCacheid);
405 BBPreclaim(bn); 404 BBPunfix(s->batCacheid);
406 pcre_free_study(sd); 405 BBPreclaim(bn);
407 pcre_free(re); 406 pcre_free_study(sd);
408 throw(MAL, "regexp.rematchselect", 407 pcre_free(re);
409 "matching of regular expression (%s) failed with %d", 408 throw(MAL, "regexp.rematchselect",
410 pat, pos); 409 "matching of regular expression (%s) failed with %d",
411 } 410 pat, pos);
412 } 411 }
413 cand++;
414 } 412 }
415 /* we're done with s */ 413 }
414 /* we're done with b, s, and re */
415 BBPunfix(b->batCacheid);
416 if (s)
416 BBPunfix(s->batCacheid); 417 BBPunfix(s->batCacheid);
417 } else {
418 /* we're done with s */
419 if (s) {
420 /* there is a dense candidate list, we don't
421 * need it anymore since we extracted start
422 * and end */
423 BBPunfix(s->batCacheid);
424 }
425 while (start < end) {
426 const char *val = BUNtvar(bi, start);
427 /* nil values never match */
428 if (!GDK_STRNIL(val)) {
429 pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0);
430 /* note that we need to convert the
431 * index relative to the start of the
432 * array to a value relative to
433 * b->hseqbase (opposite from what
434 * happens above) */
435 if (pos >= 0) {
436 /* regular expression matched */
437 if (!anti)
438 *outp++ = b->hseqbase + start;
439 } else if (pos == PCRE_ERROR_NOMATCH) {
440 /* regular expression didn't match */
441 if (anti)
442 *outp++ = b->hseqbase + start;
443 } else {
444 /* error during processing */
445 BBPunfix(b->batCacheid);
446 BBPreclaim(bn);
447 pcre_free_study(sd);
448 pcre_free(re);
449 throw(MAL, "regexp.rematchselect",
450 "matching of regular expression (%s) failed with %d",
451 pat, pos);
452 }
453 }
454 start++;
455 }
456 }
457
458 /* we're done with b and re */
459 BBPunfix(b->batCacheid);
460 pcre_free_study(sd); 418 pcre_free_study(sd);
461 pcre_free(re); 419 pcre_free(re);
462 420
463 /* set properties and size of result BAT */ 421 /* set properties and size of result BAT */
464 BATsetcount(bn, (BUN) (outp - (oid *) Tloc(bn, 0))); /* size is pointer difference */ 422 BATsetcount(bn, (BUN) (outp - (oid *) Tloc(bn, 0))); /* size is pointer difference */
517 bat slid, bat srid, bit nil_matches, lng estimate) 475 bat slid, bat srid, bit nil_matches, lng estimate)
518 { 476 {
519 BAT *l, *r, *sl = NULL, *sr = NULL; /* input BATs */ 477 BAT *l, *r, *sl = NULL, *sr = NULL; /* input BATs */
520 BAT *bn1, *bn2; /* output BATs */ 478 BAT *bn1, *bn2; /* output BATs */
521 479
522 BUN lstart, lend; /* start and end index */ 480 struct canditer lci; /* candidate iterator for l */
523 BUN lcnt; /* difference between start and end (unused) */ 481 struct canditer rci; /* candidate iterator for r */
524 const oid *lcand, *lcandend; /* start and end of candidate list */ 482
525 BATiter li; /* helper to loop through values */ 483 BATiter li; /* helper to loop through values */
526
527 BUN rstart, rend; /* start and end index */
528 BUN rcnt; /* difference between start and end (unused) */
529 const oid *rcand, *rcandend; /* start and end of candidate list */
530 BATiter ri; /* helper to loop through values */ 484 BATiter ri; /* helper to loop through values */
531 oid ro; /* right OID being matched */ 485 oid ro; /* right OID being matched */
532 486
533 const char *pat; /* the regular expression being matched */ 487 const char *pat; /* the regular expression being matched */
534 const char *err; /* error message from PCRE library */ 488 const char *err; /* error message from PCRE library */
537 pcre *re; /* compiled regular expression */ 491 pcre *re; /* compiled regular expression */
538 pcre_extra *sd; /* studied regular expression */ 492 pcre_extra *sd; /* studied regular expression */
539 493
540 (void) nil_matches; /* only relevant for equi-join */ 494 (void) nil_matches; /* only relevant for equi-join */
541 495
542 if (GDK_STRNIL(flags)) { 496 if (strNil(flags)) {
543 /* no matches when the flags is NIL 497 /* no matches when the flags is NIL
544 * we return two empty BATs */ 498 * we return two empty BATs */
545 bn1 = BATdense(0, 0, 0); 499 bn1 = BATdense(0, 0, 0);
546 bn2 = BATdense(0, 0, 0); 500 bn2 = BATdense(0, 0, 0);
547 if (bn1 == NULL || bn2 == NULL) { 501 if (bn1 == NULL || bn2 == NULL) {
590 if (sr) 544 if (sr)
591 BBPunfix(sr->batCacheid); 545 BBPunfix(sr->batCacheid);
592 throw(MAL, "regexp.rematchjoin", SEMANTIC_TYPE_MISMATCH); 546 throw(MAL, "regexp.rematchjoin", SEMANTIC_TYPE_MISMATCH);
593 } 547 }
594 548
549 if (canditer_init(&lci, l, sl) == 0 ||
550 canditer_init(&rci, r, sr) == 0) {
551 /* if either side is empty (or no candidates) the
552 * result is empty */
553 BBPunfix(l->batCacheid);
554 BBPunfix(r->batCacheid);
555 if (sl)
556 BBPunfix(sl->batCacheid);
557 if (sr)
558 BBPunfix(sr->batCacheid);
559 bn1 = BATdense(0, 0, 0);
560 bn2 = BATdense(0, 0, 0);
561 if (bn1 == NULL || bn2 == NULL) {
562 BBPreclaim(bn1);
563 BBPreclaim(bn2);
564 throw(MAL, "regexp.rematchjoin", GDK_EXCEPTION);
565 }
566 return MAL_SUCCEED;
567 }
568
595 /* if there is no valid estimate, use the size of the left 569 /* if there is no valid estimate, use the size of the left
596 * input as size estimate */ 570 * input as size estimate */
597 if (is_lng_nil(estimate) || estimate == 0) 571 if (is_lng_nil(estimate) || estimate == 0)
598 estimate = sl ? BATcount(sl) : BATcount(l); 572 estimate = lci.ncand;
599 573
600 /* create the output BATs */ 574 /* create the output BATs */
601 bn1 = COLnew(0, TYPE_oid, estimate, TRANSIENT); 575 bn1 = COLnew(0, TYPE_oid, estimate, TRANSIENT);
602 bn2 = COLnew(0, TYPE_oid, estimate, TRANSIENT); 576 bn2 = COLnew(0, TYPE_oid, estimate, TRANSIENT);
603 if (bn1 == NULL || bn2 == NULL) { 577 if (bn1 == NULL || bn2 == NULL) {
612 if (sr) 586 if (sr)
613 BBPunfix(sr->batCacheid); 587 BBPunfix(sr->batCacheid);
614 throw(MAL, "regexp.rematchjoin", GDK_EXCEPTION); 588 throw(MAL, "regexp.rematchjoin", GDK_EXCEPTION);
615 } 589 }
616 590
617 /* extract boundaries and candidate lists from inputs */
618 CANDINIT(l, sl, lstart, lend, lcnt, lcand, lcandend);
619 CANDINIT(r, sr, rstart, rend, rcnt, rcand, rcandend);
620
621 li = bat_iterator(l); 591 li = bat_iterator(l);
622 ri = bat_iterator(r); 592 ri = bat_iterator(r);
623 593
624 /* if there is a candidate list (sl or sr is not NULL), but it 594 for (BUN i = 0; i < rci.ncand; i++) {
625 * is dense (lcand/rcand is NULL), we don't need the BAT 595 ro = canditer_next(&rci);
626 * descriptor anymore, the start and end values suffice */ 596 pat = BUNtvar(ri, ro - r->hseqbase);
627 if (sl && lcand == NULL) { 597
628 BBPunfix(sl->batCacheid);
629 sl = NULL;
630 }
631 if (sr && rcand == NULL) {
632 BBPunfix(sr->batCacheid);
633 sr = NULL;
634 }
635
636 for (;;) {
637 if (rcand) {
638 if (rcand == rcandend)
639 break;
640 ro = *rcand++;
641 pat = BUNtvar(ri, ro - r->hseqbase);
642 } else {
643 if (rstart == rend)
644 break;
645 ro = rstart + r->hseqbase;
646 pat = BUNtvar(ri, rstart);
647 rstart++;
648 }
649 /* nil regular expressions don't match (despite 598 /* nil regular expressions don't match (despite
650 * nil_matches) */ 599 * nil_matches) */
651 if (GDK_STRNIL(pat)) 600 if (strNil(pat))
652 continue; 601 continue;
653 re = pcre_compile(pat, options, &err, &pos, NULL); 602 re = pcre_compile(pat, options, &err, &pos, NULL);
654 sd = NULL; 603 sd = NULL;
655 if (re == NULL) 604 if (re == NULL)
656 goto bailout; 605 goto bailout;
657 sd = pcre_study(re, 0, &err); 606 sd = pcre_study(re, 0, &err);
658 if (err != NULL) 607 if (err != NULL)
659 goto bailout; 608 goto bailout;
660 /* inner loop: try to be more efficient than outer 609
661 * loop, so test for lcand once instead of each 610 /* inner loop: reset iterator, then iterate over it */
662 * iteration */ 611 canditer_reset(&lci);
663 if (lcand) { 612 for (BUN j = 0; j < lci.ncand; j++) {
664 for (const oid *cand = lcand; cand < lcandend; cand++) { 613 oid lo = canditer_next(&lci);
665 const char *val = BUNtvar(li, *cand - l->hseqbase); 614 const char *val = BUNtvar(li, lo - l->hseqbase);
666 if (GDK_STRNIL(val)) 615 if (strNil(val))
667 continue; 616 continue;
668 pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); 617 pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0);
669 if (pos >= 0) { 618 if (pos >= 0) {
670 /* regular expression matched */ 619 /* regular expression matched */
671 if (BUNappend(bn1, cand, false) != GDK_SUCCEED || 620 if (BUNappend(bn1, &lo, false) != GDK_SUCCEED ||
672 BUNappend(bn2, &ro, false) != GDK_SUCCEED) 621 BUNappend(bn2, &ro, false) != GDK_SUCCEED)
673 goto bailout;
674 } else if (pos != PCRE_ERROR_NOMATCH) {
675 /* error during processing */
676 err = "matching of regular expression failed";
677 goto bailout; 622 goto bailout;
678 } 623 } else if (pos != PCRE_ERROR_NOMATCH) {
679 } 624 /* error during processing */
680 } else { 625 err = "matching of regular expression failed";
681 for (BUN start = lstart; start < lend; start++) { 626 goto bailout;
682 const char *val = BUNtvar(li, start);
683 if (GDK_STRNIL(val))
684 continue;
685 pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0);
686 if (pos >= 0) {
687 /* regular expression matched */
688 oid lo = start + l->hseqbase;
689 if (BUNappend(bn1, &lo, false) != GDK_SUCCEED ||
690 BUNappend(bn2, &ro, false) != GDK_SUCCEED)
691 goto bailout;
692 } else if (pos != PCRE_ERROR_NOMATCH) {
693 /* error during processing */
694 err = "matching of regular expression failed";
695 goto bailout;
696 }
697 } 627 }
698 } 628 }
699 pcre_free_study(sd); 629 pcre_free_study(sd);
700 pcre_free(re); 630 pcre_free(re);
701 } 631 }