changeset 29:e44cffee8312

Implemented bulk variant of match function.
author Sjoerd Mullender <sjoerd@acm.org>
date Mon, 06 Aug 2018 20:59:23 +0200 (2018-08-06)
parents e925d55b369b
children 543dccbc169b
files regexp/README.rst regexp/regexp.c regexp/regexp.mal
diffstat 3 files changed, 194 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/regexp/README.rst	Thu Aug 02 21:15:48 2018 +0200
+++ b/regexp/README.rst	Mon Aug 06 20:59:23 2018 +0200
@@ -122,9 +122,18 @@
 MAL
 ...
 
-The MAL interface consists of three functions whose names are based on
-the names specified in the SQL interface.  The interface looks like
-this.  First the variant without the ``flags`` argument::
+The MAL interface consists of three or four functions whose names are
+based on the names specified in the SQL interface.  The three
+functions that need to be implemented have names that are equal to the
+name given as the ``EXTERNAL NAME`` in the SQL interface, plus that
+same name with ``join`` and ``select`` appended.  A fourth function
+can be optionally implemented.  It is the *bulk* variant of the first
+function.  This bulk function has a name that is equal to the name
+given in SQL with ``bat`` prepended.  The bulk variant returns a BAT
+with a single value for each input value.  See the *reverse* tutorial.
+
+The interface looks like this.  First the variant without the
+``flags`` argument::
 
   module regexp;
 
@@ -141,6 +150,12 @@
   comment "Return the matching pairs from the 'val' and 'pat'
   columns";
 
+  module batregexp;
+
+  command rematch(val:bat[:str], pat:str) :bat[:bit]
+  address regexpmatchbulk
+  comment "Return a BAT with true for match and false for no match";
+
 The variant with the ``flags`` argument looks like this::
 
   module regexp;
@@ -158,6 +173,12 @@
   comment "Return the matching pairs from the 'val' and 'pat'
   columns";
 
+  module batregexp;
+
+  command rematch(val:bat[:str], pat:str, flags:str) :bat[:bit]
+  address regexpmatchfbulk
+  comment "Return a BAT with true for match and false for no match";
+
 We put these MAL commands in the file
 ``$libdir/monetdb5/regexp.mal``.  In addition we create a file
 ``$libdir/monetdb5/autoload/81_regexp.mal`` that just contains::
@@ -236,6 +257,9 @@
       return MAL_SUCCEED;
   }
 
+We will not describe the bulk variant here.  The code is in the source
+file, though.
+
 Select
 ``````
 
--- a/regexp/regexp.c	Thu Aug 02 21:15:48 2018 +0200
+++ b/regexp/regexp.c	Mon Aug 06 20:59:23 2018 +0200
@@ -110,6 +110,163 @@
 }
 
 static char *
+do_matchbulk(bat *ret, bat bid, const char *pat, const char *flags, bit anti)
+{
+	/* Bulk regexp match: produce a bit BAT, aligned with BAT bid,
+	 * holding TRUE where the value matches pat, FALSE where it
+	 * does not, and NIL where the value is NIL.  If pat or flags
+	 * is NIL, all results are NIL.  On success the ID of the
+	 * result BAT is stored in *ret; errors are reported through
+	 * throw.  The anti argument is currently not used. */
+	BAT *b;			/* input BAT */
+	BATiter bi;		/* helper to loop through values */
+	BAT *bn;		/* result BAT */
+	bit *outp;		/* pointer through which we add to result */
+	BUN start, end;		/* iteration variables */
+
+	const char *err = NULL;	/* error message from PCRE library */
+	int pos = 0;		/* error position from PCRE library */
+	int options;		/* PCRE options */
+	pcre *re;		/* compiled regular expression */
+	pcre_extra *sd;		/* studied regular expression */
+
+	/* from the BAT ID we need to get the BAT descriptor, making
+	 * sure that the data of the BAT is loaded into memory */
+	if ((b = BATdescriptor(bid)) == NULL) {
+		throw(MAL, "batregexp.rematch", RUNTIME_OBJECT_MISSING);
+	}
+	/* check that the BAT has the expected type: we expect str or
+	 * something compatible with str (if we only want str, we need
+	 * to compare b->ttype with TYPE_str and not use ATOMstorage).
+	 * Note, the MAL interpreter will only call this function with
+	 * a str BAT because that is the only interface that is
+	 * defined in the MAL file, so this check is superfluous. */
+	if (ATOMstorage(b->ttype) != TYPE_str) {
+		BBPunfix(b->batCacheid);
+		throw(MAL, "batregexp.rematch", SEMANTIC_TYPE_MISMATCH);
+	}
+
+	/* if the pattern or the flags is NIL, nothing can be matched:
+	 * we return a BAT with all NIL values */
+	if (GDK_STRNIL(pat) || GDK_STRNIL(flags)) {
+		bit f = bit_nil;
+		bn = BATconstant(b->hseqbase, TYPE_bit, &f, BATcount(b), TRANSIENT);
+		/* we're done with b, also if the allocation failed */
+		BBPunfix(b->batCacheid);
+		if (bn == NULL)
+			throw(MAL, "batregexp.rematch", GDK_EXCEPTION);
+		*ret = bn->batCacheid;
+		BBPkeepref(*ret);
+		return MAL_SUCCEED;
+	}
+	options = parseflags(flags);
+	if (options == -1) {
+		BBPunfix(b->batCacheid);
+		throw(MAL, "batregexp.rematch", "bad flag character");
+	}
+
+	/* allocate a result BAT; the capacity we ask for is the
+	 * size of the input BAT since we produce a value for each
+	 * input value */
+	bn = COLnew(b->hseqbase, TYPE_bit, BATcount(b), TRANSIENT);
+	if (bn == NULL) {
+		BBPunfix(b->batCacheid);
+		throw(MAL, "batregexp.rematch", GDK_EXCEPTION);
+	}
+
+	/* Position outp at the start of the result array; it has room
+	 * for exactly the one value per input value that we write. */
+	outp = (bit *) Tloc(bn, 0);
+
+	/* compile the regular expression */
+	re = pcre_compile(pat, options, &err, &pos, NULL);
+	if (re == NULL) {
+		BBPunfix(b->batCacheid);
+		BBPreclaim(bn);
+		throw(MAL, "batregexp.rematch",
+		      "compilation of regular expression (%s) failed at %d with %s",
+		      pat, pos, err);
+	}
+	/* invest in study of the r.e. */
+	sd = pcre_study(re, 0, &err);
+	if (err != NULL) {
+		pcre_free(re);
+		BBPunfix(b->batCacheid);
+		BBPreclaim(bn);
+		throw(MAL, "batregexp.rematch",
+		      "study of regular expression (%s) failed with %s",
+		      pat, err);
+	}
+
+	bi = bat_iterator(b);
+
+	/* we will change these if we add a NIL */
+	bn->tnil = false;
+	bn->tnonil = true;
+	for (start = 0, end = BATcount(b); start < end; start++) {
+		const char *val = BUNtvar(bi, start);
+		/* nil values never match */
+		if (GDK_STRNIL(val)) {
+			*outp++ = bit_nil;
+			bn->tnil = true;
+			bn->tnonil = false;
+		} else {
+			pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0);
+			if (pos < 0 && pos != PCRE_ERROR_NOMATCH) {
+				/* error during processing */
+				BBPunfix(b->batCacheid);
+				BBPreclaim(bn);
+				pcre_free_study(sd);
+				pcre_free(re);
+				throw(MAL, "batregexp.rematch",
+				      "matching of regular expression (%s) failed with %d",
+				      pat, pos);
+			}
+			*outp++ = pos >= 0; /* TRUE if match, FALSE if not */
+		}
+	}
+
+	/* set properties and size of result BAT */
+	BATsetcount(bn, BATcount(b));
+
+	if (BATcount(bn) > 1) {
+		bn->tsorted = false;	/* probably not sorted */
+		bn->trevsorted = false;	/* probably not reverse sorted */
+		bn->tkey = false;	/* probably not key */
+	} else {
+		/* empty or a single result: sorted, reverse sorted, key */
+		bn->tsorted = true;
+		bn->trevsorted = true;
+		bn->tkey = true;
+	}
+	bn->tnosorted = 0;	/* we don't know for sure */
+	bn->tnorevsorted = 0;	/* we don't know for sure */
+	bn->tnokey[0] = bn->tnokey[1] = 0;
+
+	/* we're done with b and re */
+	BBPunfix(b->batCacheid);
+	pcre_free_study(sd);
+	pcre_free(re);
+
+	*ret = bn->batCacheid;
+	BBPkeepref(*ret);
+	return MAL_SUCCEED;
+}
+
+/* MAL entry points for batregexp.rematch (without/with flags argument) */
+char *
+regexpmatchbulk(bat *ret, const bat *bid, const char **pat)
+{
+	return do_matchbulk(ret, *bid, *pat, "", 0); /* MAL passes no anti */
+}
+
+char *
+regexpmatchfbulk(bat *ret, const bat *bid, const char **pat, const char **flags)
+{
+	return do_matchbulk(ret, *bid, *pat, *flags, 0); /* MAL passes no anti */
+}
+
+static char *
 do_select(bat *ret, bat bid, bat sid, const char *pat, const char *flags, bit anti)
 {
 	BAT *b, *s = NULL;	/* input BAT and optional candidate list */
--- a/regexp/regexp.mal	Thu Aug 02 21:15:48 2018 +0200
+++ b/regexp/regexp.mal	Mon Aug 06 20:59:23 2018 +0200
@@ -34,3 +34,13 @@
 address regexpmatchfjoin
 comment "Return the matching pairs from the 'val' and 'pat'
 columns";
+
+module batregexp;
+
+command rematch(val:bat[:str], pat:str) :bat[:bit]
+address regexpmatchbulk
+comment "Return a BAT with true for match and false for no match";
+
+command rematch(val:bat[:str], pat:str, flags:str) :bat[:bit]
+address regexpmatchfbulk
+comment "Return a BAT with true for match and false for no match";