Mercurial > hg > MonetDB-extend
changeset 33:a8ffdbc388ce
Ported to Jun2020 branch.
author | Sjoerd Mullender <sjoerd@acm.org> |
---|---|
date | Tue, 09 Jun 2020 10:30:35 +0200 (2020-06-09) |
parents | 25cd8af6fa82 |
children | f712fa63c6cc |
files | regexp/regexp.c reverse/reverse.c |
diffstat | 2 files changed, 101 insertions(+), 171 deletions(-) [+] |
line wrap: on
line diff
--- a/regexp/regexp.c Tue Jun 18 19:00:56 2019 +0200 +++ b/regexp/regexp.c Tue Jun 09 10:30:35 2020 +0200 @@ -11,8 +11,8 @@ /* mal_exception.h actually contains everything we need */ #include <mal_exception.h> -/* for the CANDINIT macro */ -#include <gdk_cand.h> /* pre Jul2017-SP4, replace with CANDINIT definition */ +/* for the candidate iterator */ +#include <gdk_cand.h> /* system include files */ #include <string.h> @@ -74,9 +74,7 @@ pcre *re; /* if any of the input values are nil, the result is no match */ - if (GDK_STRNIL(val) || - GDK_STRNIL(pat) || - GDK_STRNIL(flags)) { + if (strNil(val) || strNil(pat) || strNil(flags)) { /* special case for NIL inputs: NILs don't match anything */ *ret = 0; return MAL_SUCCEED; @@ -147,7 +145,7 @@ } /* if any of the input values are nil, the result is no match */ - if (GDK_STRNIL(pat) || GDK_STRNIL(flags)) { + if (strNil(pat) || strNil(flags)) { /* no matches when the pattern or the flags is NIL * we return an a BAT with all NIL values */ bit f = bit_nil; @@ -210,7 +208,7 @@ for (start = 0, end = BATcount(b); start < end; start++) { const char *val = BUNtvar(bi, start); /* nil values never match */ - if (GDK_STRNIL(val)) { + if (strNil(val)) { *outp++ = bit_nil; bn->tnil = true; bn->tnonil = false; @@ -277,9 +275,7 @@ { BAT *b, *s = NULL; /* input BAT and optional candidate list */ - BUN start, end; /* start and end index */ - BUN cnt; /* difference between start and end (unused) */ - const oid *cand, *candend; /* start and end of candidate list */ + struct canditer ci; /* candidate iterator */ BATiter bi; /* helper to loop through values */ BAT *bn; /* result BAT */ @@ -292,7 +288,7 @@ pcre_extra *sd; /* studied regular expression */ /* if any of the input values are nil, the result is no match */ - if (GDK_STRNIL(pat) || GDK_STRNIL(flags)) { + if (strNil(pat) || strNil(flags)) { /* no matches when the pattern or the flags is NIL * we return an empty BAT of the correct type */ if ((bn = BATdense(0, 0, 0)) == NULL) @@ -326,11 +322,22 @@ throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING); } + if (canditer_init(&ci, b, s) == 0) { + /* trivially empty result */ + BBPunfix(b->batCacheid); + if (s) + BBPunfix(s->batCacheid); + bn = BATdense(0, 0, 0); + *ret = bn->batCacheid; + BBPkeepref(*ret); + return MAL_SUCCEED; + } + /* allocate a result BAT; the capacity we ask for is the * maximum potential result size (i.e. the size of the * candidate list if there is one, else the size of the input * BAT b) */ - bn = COLnew(0, TYPE_oid, s ? BATcount(s) : BATcount(b), TRANSIENT); + bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT); if (bn == NULL) { BBPunfix(b->batCacheid); if (s) @@ -367,96 +374,47 @@ pat, err); } - /* initialize the start, end, cnt, cand, and candend variables - * from b and s*/ - CANDINIT(b, s, start, end, cnt, cand, candend); - - /* now, start and end are the limits in b that we need to look - * at, and if set, cand and candend are the beginning and end - * of the list of OIDs of b that we need to consider */ - bi = bat_iterator(b); - if (cand) { - /* there is a non-dense candidate list, we run through - * it and only check values in b that are referenced - * in this candidate list */ - while (cand < candend) { - /* the candidate list has a list of OIDs which - * are relative to b->hseqbase, we need to - * convert that to an index relative to the - * start of the array in the (tail) heap */ - const char *val = BUNtvar(bi, *cand - b->hseqbase); - /* nil values never match */ - if (!GDK_STRNIL(val)) { - pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); - if (pos >= 0) { - /* regular expression matched */ - if (!anti) - *outp++ = *cand; - } else if (pos == PCRE_ERROR_NOMATCH) { - /* regular expression didn't match */ - if (anti) - *outp++ = *cand; - } else { - /* error during processing */ - BBPunfix(b->batCacheid); - BBPunfix(s->batCacheid); - BBPreclaim(bn); - pcre_free_study(sd); - pcre_free(re); - throw(MAL, "regexp.rematchselect", - "matching of regular expression (%s) failed with %d", - pat, pos); - } + /* iterate through the candidates */ + for (BUN i = 0; i < ci.ncand; i++) { + /* get the next candidate */ + oid o = canditer_next(&ci); + + /* the candidate list has a list of OIDs which are + * relative to b->hseqbase, we need to convert that to + * an index relative to the start of the array in the + * (tail) heap */ + const char *val = BUNtvar(bi, o - b->hseqbase); + + /* nil values never match */ + if (!strNil(val)) { + pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); + if (pos >= 0) { + /* regular expression matched */ + if (!anti) + *outp++ = o; + } else if (pos == PCRE_ERROR_NOMATCH) { + /* regular expression didn't match */ + if (anti) + *outp++ = o; + } else { + /* error during processing */ + BBPunfix(b->batCacheid); + BBPunfix(s->batCacheid); + BBPreclaim(bn); + pcre_free_study(sd); + pcre_free(re); + throw(MAL, "regexp.rematchselect", + "matching of regular expression (%s) failed with %d", + pat, pos); } - cand++; - } - /* we're done with s */ - BBPunfix(s->batCacheid); - } else { - /* we're done with s */ - if (s) { - /* there is a dense candidate list, we don't - * need it anymore since we extracted start - * and end */ - BBPunfix(s->batCacheid); - } - while (start < end) { - const char *val = BUNtvar(bi, start); - /* nil values never match */ - if (!GDK_STRNIL(val)) { - pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); - /* note that we need to convert the - * index relative to the start of the - * array to a value relative to - * b->hseqbase (opposite from what - * happens above) */ - if (pos >= 0) { - /* regular expression matched */ - if (!anti) - *outp++ = b->hseqbase + start; - } else if (pos == PCRE_ERROR_NOMATCH) { - /* regular expression didn't match */ - if (anti) - *outp++ = b->hseqbase + start; - } else { - /* error during processing */ - BBPunfix(b->batCacheid); - BBPreclaim(bn); - pcre_free_study(sd); - pcre_free(re); - throw(MAL, "regexp.rematchselect", - "matching of regular expression (%s) failed with %d", - pat, pos); - } - } - start++; } } - - /* we're done with b and re */ + /* we're done with b, s, and re */ BBPunfix(b->batCacheid); + if (s) + BBPunfix(s->batCacheid); pcre_free_study(sd); pcre_free(re); @@ -519,14 +477,10 @@ BAT *l, *r, *sl = NULL, *sr = NULL; /* input BATs */ BAT *bn1, *bn2; /* output BATs */ - BUN lstart, lend; /* start and end index */ - BUN lcnt; /* difference between start and end (unused) */ - const oid *lcand, *lcandend; /* start and end of candidate list */ + struct canditer lci; /* candidate iterator for l */ + struct canditer rci; /* candidate iterator for r */ + BATiter li; /* helper to loop through values */ - - BUN rstart, rend; /* start and end index */ - BUN rcnt; /* difference between start and end (unused) */ - const oid *rcand, *rcandend; /* start and end of candidate list */ BATiter ri; /* helper to loop through values */ oid ro; /* right OID being matched */ @@ -539,7 +493,7 @@ (void) nil_matches; /* only relevant for equi-join */ - if (GDK_STRNIL(flags)) { + if (strNil(flags)) { /* no matches when the flags is NIL * we return two empty BATs */ bn1 = BATdense(0, 0, 0); @@ -592,10 +546,30 @@ throw(MAL, "regexp.rematchjoin", SEMANTIC_TYPE_MISMATCH); } + if (canditer_init(&lci, l, sl) == 0 || + canditer_init(&rci, r, sr) == 0) { + /* if either side is empty (or no candidates) the + * result is empty */ + BBPunfix(l->batCacheid); + BBPunfix(r->batCacheid); + if (sl) + BBPunfix(sl->batCacheid); + if (sr) + BBPunfix(sr->batCacheid); + bn1 = BATdense(0, 0, 0); + bn2 = BATdense(0, 0, 0); + if (bn1 == NULL || bn2 == NULL) { + BBPreclaim(bn1); + BBPreclaim(bn2); + throw(MAL, "regexp.rematchjoin", GDK_EXCEPTION); + } + return MAL_SUCCEED; + } + /* if there is no valid estimate, use the size of the left * input as size estimate */ if (is_lng_nil(estimate) || estimate == 0) - estimate = sl ? BATcount(sl) : BATcount(l); + estimate = lci.ncand; /* create the output BATs */ bn1 = COLnew(0, TYPE_oid, estimate, TRANSIENT); @@ -614,41 +588,16 @@ throw(MAL, "regexp.rematchjoin", GDK_EXCEPTION); } - /* extract boundaries and candidate lists from inputs */ - CANDINIT(l, sl, lstart, lend, lcnt, lcand, lcandend); - CANDINIT(r, sr, rstart, rend, rcnt, rcand, rcandend); - li = bat_iterator(l); ri = bat_iterator(r); - /* if there is a candidate list (sl or sr is not NULL), but it - * is dense (lcand/rcand is NULL), we don't need the BAT - * descriptor anymore, the start and end values suffice */ - if (sl && lcand == NULL) { - BBPunfix(sl->batCacheid); - sl = NULL; - } - if (sr && rcand == NULL) { - BBPunfix(sr->batCacheid); - sr = NULL; - } + for (BUN i = 0; i < rci.ncand; i++) { + ro = canditer_next(&rci); + pat = BUNtvar(ri, ro - r->hseqbase); - for (;;) { - if (rcand) { - if (rcand == rcandend) - break; - ro = *rcand++; - pat = BUNtvar(ri, ro - r->hseqbase); - } else { - if (rstart == rend) - break; - ro = rstart + r->hseqbase; - pat = BUNtvar(ri, rstart); - rstart++; - } /* nil regular expressions don't match (despite * nil_matches) */ - if (GDK_STRNIL(pat)) + if (strNil(pat)) continue; re = pcre_compile(pat, options, &err, &pos, NULL); sd = NULL; @@ -657,43 +606,24 @@ sd = pcre_study(re, 0, &err); if (err != NULL) goto bailout; - /* inner loop: try to be more efficient than outer - * loop, so test for lcand once instead of each - * iteration */ - if (lcand) { - for (const oid *cand = lcand; cand < lcandend; cand++) { - const char *val = BUNtvar(li, *cand - l->hseqbase); - if (GDK_STRNIL(val)) - continue; - pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); - if (pos >= 0) { - /* regular expression matched */ - if (BUNappend(bn1, cand, false) != GDK_SUCCEED || - BUNappend(bn2, &ro, false) != GDK_SUCCEED) - goto bailout; - } else if (pos != PCRE_ERROR_NOMATCH) { - /* error during processing */ - err = "matching of regular expression failed"; + + /* inner loop: reset iterator, then iterate over it */ + canditer_reset(&lci); + for (BUN j = 0; j < lci.ncand; j++) { + oid lo = canditer_next(&lci); + const char *val = BUNtvar(li, lo - l->hseqbase); + if (strNil(val)) + continue; + pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); + if (pos >= 0) { + /* regular expression matched */ + if (BUNappend(bn1, &lo, false) != GDK_SUCCEED || + BUNappend(bn2, &ro, false) != GDK_SUCCEED) goto bailout; - } - } - } else { - for (BUN start = lstart; start < lend; start++) { - const char *val = BUNtvar(li, start); - if (GDK_STRNIL(val)) - continue; - pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); - if (pos >= 0) { - /* regular expression matched */ - oid lo = start + l->hseqbase; - if (BUNappend(bn1, &lo, false) != GDK_SUCCEED || - BUNappend(bn2, &ro, false) != GDK_SUCCEED) - goto bailout; - } else if (pos != PCRE_ERROR_NOMATCH) { - /* error during processing */ - err = "matching of regular expression failed"; - goto bailout; - } + } else if (pos != PCRE_ERROR_NOMATCH) { + /* error during processing */ + err = "matching of regular expression failed"; + goto bailout; } } pcre_free_study(sd);
--- a/reverse/reverse.c Tue Jun 18 19:00:56 2019 +0200 +++ b/reverse/reverse.c Tue Jun 09 10:30:35 2020 +0200 @@ -21,7 +21,7 @@ do_reverse(char *dst, const char *src, size_t len) { dst[len] = 0; - if (GDK_STRNIL(src)) { + if (strNil(src)) { /* special case for nil:str */ assert(len == strlen(str_nil)); strcpy(dst, str_nil);