changeset 33:a8ffdbc388ce

Ported to Jun2020 branch.
author Sjoerd Mullender <sjoerd@acm.org>
date Tue, 09 Jun 2020 10:30:35 +0200 (2020-06-09)
parents 25cd8af6fa82
children f712fa63c6cc
files regexp/regexp.c reverse/reverse.c
diffstat 2 files changed, 101 insertions(+), 171 deletions(-) [+]
line wrap: on
line diff
--- a/regexp/regexp.c	Tue Jun 18 19:00:56 2019 +0200
+++ b/regexp/regexp.c	Tue Jun 09 10:30:35 2020 +0200
@@ -11,8 +11,8 @@
 /* mal_exception.h actually contains everything we need */
 #include <mal_exception.h>
 
-/* for the CANDINIT macro */
-#include <gdk_cand.h> /* pre Jul2017-SP4, replace with CANDINIT definition */
+/* for the candidate iterator */
+#include <gdk_cand.h>
 
 /* system include files */
 #include <string.h>
@@ -74,9 +74,7 @@
 	pcre *re;
 
 	/* if any of the input values are nil, the result is no match */
-	if (GDK_STRNIL(val) ||
-	    GDK_STRNIL(pat) ||
-	    GDK_STRNIL(flags)) {
+	if (strNil(val) || strNil(pat) || strNil(flags)) {
 		/* special case for NIL inputs: NILs don't match anything */
 		*ret = 0;
 		return MAL_SUCCEED;
@@ -147,7 +145,7 @@
 	}
 
 	/* if any of the input values are nil, the result is no match */
-	if (GDK_STRNIL(pat) || GDK_STRNIL(flags)) {
+	if (strNil(pat) || strNil(flags)) {
 		/* no matches when the pattern or the flags is NIL
 		 * we return an a BAT with all NIL values */
 		bit f = bit_nil;
@@ -210,7 +208,7 @@
 	for (start = 0, end = BATcount(b); start < end; start++) {
 		const char *val = BUNtvar(bi, start);
 		/* nil values never match */
-		if (GDK_STRNIL(val)) {
+		if (strNil(val)) {
 			*outp++ = bit_nil;
 			bn->tnil = true;
 			bn->tnonil = false;
@@ -277,9 +275,7 @@
 {
 	BAT *b, *s = NULL;	/* input BAT and optional candidate list */
 
-	BUN start, end;		/* start and end index */
-	BUN cnt;		/* difference between start and end (unused) */
-	const oid *cand, *candend; /* start and end of candidate list */
+	struct canditer ci;	/* candidate iterator */
 	BATiter bi;		/* helper to loop through values */
 
 	BAT *bn;		/* result BAT */
@@ -292,7 +288,7 @@
 	pcre_extra *sd;		/* studied regular expression */
 
 	/* if any of the input values are nil, the result is no match */
-	if (GDK_STRNIL(pat) || GDK_STRNIL(flags)) {
+	if (strNil(pat) || strNil(flags)) {
 		/* no matches when the pattern or the flags is NIL
 		 * we return an empty BAT of the correct type */
 		if ((bn = BATdense(0, 0, 0)) == NULL)
@@ -326,11 +322,22 @@
 		throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING);
 	}
 
+	if (canditer_init(&ci, b, s) == 0) {
+		/* trivially empty result */
+		BBPunfix(b->batCacheid);
+		if (s)
+			BBPunfix(s->batCacheid);
+		bn = BATdense(0, 0, 0);
+		*ret = bn->batCacheid;
+		BBPkeepref(*ret);
+		return MAL_SUCCEED;
+	}
+
 	/* allocate a result BAT; the capacity we ask for is the
 	 * maximum potential result size (i.e. the size of the
 	 * candidate list if there is one, else the size of the input
 	 * BAT b) */
-	bn = COLnew(0, TYPE_oid, s ? BATcount(s) : BATcount(b), TRANSIENT);
+	bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT);
 	if (bn == NULL) {
 		BBPunfix(b->batCacheid);
 		if (s)
@@ -367,96 +374,47 @@
 		      pat, err);
 	}
 
-	/* initialize the start, end, cnt, cand, and candend variables
-	 * from b and s*/
-	CANDINIT(b, s, start, end, cnt, cand, candend);
-
-	/* now, start and end are the limits in b that we need to look
-	 * at, and if set, cand and candend are the beginning and end
-	 * of the list of OIDs of b that we need to consider */
-
 	bi = bat_iterator(b);
 
-	if (cand) {
-		/* there is a non-dense candidate list, we run through
-		 * it and only check values in b that are referenced
-		 * in this candidate list */
-		while (cand < candend) {
-			/* the candidate list has a list of OIDs which
-			 * are relative to b->hseqbase, we need to
-			 * convert that to an index relative to the
-			 * start of the array in the (tail) heap */
-			const char *val = BUNtvar(bi, *cand - b->hseqbase);
-			/* nil values never match */
-			if (!GDK_STRNIL(val)) {
-				pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0);
-				if (pos >= 0) {
-					/* regular expression matched */
-					if (!anti)
-						*outp++ = *cand;
-				} else if (pos == PCRE_ERROR_NOMATCH) {
-					/* regular expression didn't match */
-					if (anti)
-						*outp++ = *cand;
-				} else {
-					/* error during processing */
-					BBPunfix(b->batCacheid);
-					BBPunfix(s->batCacheid);
-					BBPreclaim(bn);
-					pcre_free_study(sd);
-					pcre_free(re);
-					throw(MAL, "regexp.rematchselect",
-					      "matching of regular expression (%s) failed with %d",
-					      pat, pos);
-				}
+	/* iterate through the candidates */
+	for (BUN i = 0; i < ci.ncand; i++) {
+		/* get the next candidate */
+		oid o = canditer_next(&ci);
+
+		/* the candidate list has a list of OIDs which are
+		 * relative to b->hseqbase, we need to convert that to
+		 * an index relative to the start of the array in the
+		 * (tail) heap */
+		const char *val = BUNtvar(bi, o - b->hseqbase);
+
+		/* nil values never match */
+		if (!strNil(val)) {
+			pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0);
+			if (pos >= 0) {
+				/* regular expression matched */
+				if (!anti)
+					*outp++ = o;
+			} else if (pos == PCRE_ERROR_NOMATCH) {
+				/* regular expression didn't match */
+				if (anti)
+					*outp++ = o;
+			} else {
+				/* error during processing */
+				BBPunfix(b->batCacheid);
+				BBPunfix(s->batCacheid);
+				BBPreclaim(bn);
+				pcre_free_study(sd);
+				pcre_free(re);
+				throw(MAL, "regexp.rematchselect",
+				      "matching of regular expression (%s) failed with %d",
+				      pat, pos);
 			}
-			cand++;
-		}
-		/* we're done with s */
-		BBPunfix(s->batCacheid);
-	} else {
-		/* we're done with s */
-		if (s) {
-			/* there is a dense candidate list, we don't
-			 * need it anymore since we extracted start
-			 * and end */
-			BBPunfix(s->batCacheid);
-		}
-		while (start < end) {
-			const char *val = BUNtvar(bi, start);
-			/* nil values never match */
-			if (!GDK_STRNIL(val)) {
-				pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0);
-				/* note that we need to convert the
-				 * index relative to the start of the
-				 * array to a value relative to
-				 * b->hseqbase (opposite from what
-				 * happens above) */
-				if (pos >= 0) {
-					/* regular expression matched */
-					if (!anti)
-						*outp++ = b->hseqbase + start;
-				} else if (pos == PCRE_ERROR_NOMATCH) {
-					/* regular expression didn't match */
-					if (anti)
-						*outp++ = b->hseqbase + start;
-				} else {
-					/* error during processing */
-					BBPunfix(b->batCacheid);
-					BBPreclaim(bn);
-					pcre_free_study(sd);
-					pcre_free(re);
-					throw(MAL, "regexp.rematchselect",
-					      "matching of regular expression (%s) failed with %d",
-					      pat, pos);
-				}
-			}
-			start++;
 		}
 	}
-
-	/* we're done with b and re */
+	/* we're done with b, s, and re */
 	BBPunfix(b->batCacheid);
+	if (s)
+		BBPunfix(s->batCacheid);
 	pcre_free_study(sd);
 	pcre_free(re);
 
@@ -519,14 +477,10 @@
 	BAT *l, *r, *sl = NULL, *sr = NULL; /* input BATs */
 	BAT *bn1, *bn2;		/* output BATs */
 
-	BUN lstart, lend;	/* start and end index */
-	BUN lcnt;		/* difference between start and end (unused) */
-	const oid *lcand, *lcandend; /* start and end of candidate list */
+	struct canditer lci;	/* candidate iterator for l */
+	struct canditer rci;	/* candidate iterator for r */
+
 	BATiter li;		/* helper to loop through values */
-
-	BUN rstart, rend;	/* start and end index */
-	BUN rcnt;		/* difference between start and end (unused) */
-	const oid *rcand, *rcandend; /* start and end of candidate list */
 	BATiter ri;		/* helper to loop through values */
 	oid ro;			/* right OID being matched */
 
@@ -539,7 +493,7 @@
 
 	(void) nil_matches;	/* only relevant for equi-join */
 
-	if (GDK_STRNIL(flags)) {
+	if (strNil(flags)) {
 		/* no matches when the flags is NIL
 		 * we return two empty BATs */
 		bn1 = BATdense(0, 0, 0);
@@ -592,10 +546,30 @@
 		throw(MAL, "regexp.rematchjoin", SEMANTIC_TYPE_MISMATCH);
 	}
 
+	if (canditer_init(&lci, l, sl) == 0 ||
+	    canditer_init(&rci, r, sr) == 0) {
+		/* if either side is empty (or no candidates) the
+		 * result is empty */
+		BBPunfix(l->batCacheid);
+		BBPunfix(r->batCacheid);
+		if (sl)
+			BBPunfix(sl->batCacheid);
+		if (sr)
+			BBPunfix(sr->batCacheid);
+		bn1 = BATdense(0, 0, 0);
+		bn2 = BATdense(0, 0, 0);
+		if (bn1 == NULL || bn2 == NULL) {
+			BBPreclaim(bn1);
+			BBPreclaim(bn2);
+			throw(MAL, "regexp.rematchjoin", GDK_EXCEPTION);
+		}
+		return MAL_SUCCEED;
+	}
+
 	/* if there is no valid estimate, use the size of the left
 	 * input as size estimate */
 	if (is_lng_nil(estimate) || estimate == 0)
-		estimate = sl ? BATcount(sl) : BATcount(l);
+		estimate = lci.ncand;
 
 	/* create the output BATs */
 	bn1 = COLnew(0, TYPE_oid, estimate, TRANSIENT);
@@ -614,41 +588,16 @@
 		throw(MAL, "regexp.rematchjoin", GDK_EXCEPTION);
 	}
 
-	/* extract boundaries and candidate lists from inputs */
-	CANDINIT(l, sl, lstart, lend, lcnt, lcand, lcandend);
-	CANDINIT(r, sr, rstart, rend, rcnt, rcand, rcandend);
-
 	li = bat_iterator(l);
 	ri = bat_iterator(r);
 
-	/* if there is a candidate list (sl or sr is not NULL), but it
-	 * is dense (lcand/rcand is NULL), we don't need the BAT
-	 * descriptor anymore, the start and end values suffice */
-	if (sl && lcand == NULL) {
-		BBPunfix(sl->batCacheid);
-		sl = NULL;
-	}
-	if (sr && rcand == NULL) {
-		BBPunfix(sr->batCacheid);
-		sr = NULL;
-	}
+	for (BUN i = 0; i < rci.ncand; i++) {
+		ro = canditer_next(&rci);
+		pat = BUNtvar(ri, ro - r->hseqbase);
 
-	for (;;) {
-		if (rcand) {
-			if (rcand == rcandend)
-				break;
-			ro = *rcand++;
-			pat = BUNtvar(ri, ro - r->hseqbase);
-		} else {
-			if (rstart == rend)
-				break;
-			ro = rstart + r->hseqbase;
-			pat = BUNtvar(ri, rstart);
-			rstart++;
-		}
 		/* nil regular expressions don't match (despite
 		 * nil_matches) */
-		if (GDK_STRNIL(pat))
+		if (strNil(pat))
 			continue;
 		re = pcre_compile(pat, options, &err, &pos, NULL);
 		sd = NULL;
@@ -657,43 +606,24 @@
 		sd = pcre_study(re, 0, &err);
 		if (err != NULL)
 			goto bailout;
-		/* inner loop: try to be more efficient than outer
-		 * loop, so test for lcand once instead of each
-		 * iteration */
-		if (lcand) {
-			for (const oid *cand = lcand; cand < lcandend; cand++) {
-				const char *val = BUNtvar(li, *cand - l->hseqbase);
-				if (GDK_STRNIL(val))
-					continue;
-				pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0);
-				if (pos >= 0) {
-					/* regular expression matched */
-					if (BUNappend(bn1, cand, false) != GDK_SUCCEED ||
-					    BUNappend(bn2, &ro, false) != GDK_SUCCEED)
-						goto bailout;
-				} else if (pos != PCRE_ERROR_NOMATCH) {
-					/* error during processing */
-					err = "matching of regular expression failed";
+
+		/* inner loop: reset iterator, then iterate over it */
+		canditer_reset(&lci);
+		for (BUN j = 0; j < lci.ncand; j++) {
+			oid lo = canditer_next(&lci);
+			const char *val = BUNtvar(li, lo - l->hseqbase);
+			if (strNil(val))
+				continue;
+			pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0);
+			if (pos >= 0) {
+				/* regular expression matched */
+				if (BUNappend(bn1, &lo, false) != GDK_SUCCEED ||
+				    BUNappend(bn2, &ro, false) != GDK_SUCCEED)
 					goto bailout;
-				}
-			}
-		} else {
-			for (BUN start = lstart; start < lend; start++) {
-				const char *val = BUNtvar(li, start);
-				if (GDK_STRNIL(val))
-					continue;
-				pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0);
-				if (pos >= 0) {
-					/* regular expression matched */
-					oid lo = start + l->hseqbase;
-					if (BUNappend(bn1, &lo, false) != GDK_SUCCEED ||
-					    BUNappend(bn2, &ro, false) != GDK_SUCCEED)
-						goto bailout;
-				} else if (pos != PCRE_ERROR_NOMATCH) {
-					/* error during processing */
-					err = "matching of regular expression failed";
-					goto bailout;
-				}
+			} else if (pos != PCRE_ERROR_NOMATCH) {
+				/* error during processing */
+				err = "matching of regular expression failed";
+				goto bailout;
 			}
 		}
 		pcre_free_study(sd);
--- a/reverse/reverse.c	Tue Jun 18 19:00:56 2019 +0200
+++ b/reverse/reverse.c	Tue Jun 09 10:30:35 2020 +0200
@@ -21,7 +21,7 @@
 do_reverse(char *dst, const char *src, size_t len)
 {
 	dst[len] = 0;
-	if (GDK_STRNIL(src)) {
+	if (strNil(src)) {
 		/* special case for nil:str */
 		assert(len == strlen(str_nil));
 		strcpy(dst, str_nil);