Mercurial > hg > MonetDB-extend
changeset 29:e44cffee8312
Implemented bulk variant of match function.
author | Sjoerd Mullender <sjoerd@acm.org> |
---|---|
date | Mon, 06 Aug 2018 20:59:23 +0200 (2018-08-06) |
parents | e925d55b369b |
children | 543dccbc169b |
files | regexp/README.rst regexp/regexp.c regexp/regexp.mal |
diffstat | 3 files changed, 194 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
--- a/regexp/README.rst Thu Aug 02 21:15:48 2018 +0200 +++ b/regexp/README.rst Mon Aug 06 20:59:23 2018 +0200 @@ -122,9 +122,18 @@ MAL ... -The MAL interface consists of three functions whose names are based on -the names specified in the SQL interface. The interface looks like -this. First the variant without the ``flags`` argument:: +The MAL interface consists of three or four functions whose names are +based on the names specified in the SQL interface. The three +functions that need to be implemented have names that are equal to the +name given as the ``EXTERNAL NAME`` in the SQL interface, plus that +same name with ``join`` and ``select`` appended. A fourth function +can be optionally implemented. It is the *bulk* variant of the first +function. This bulk function has a name that is equal to the name +given in SQL with ``bat`` prepended. The bulk variant returns a BAT +with a single value for each input value. See the *reverse* tutorial. + +The interface looks like this. First the variant without the +``flags`` argument:: module regexp; @@ -141,6 +150,12 @@ comment "Return the matching pairs from the 'val' and 'pat' columns"; + module batregexp; + + command rematch(val:bat[:str], pat:str) :bat[:bit] + address regexpmatchbulk + comment "Return a BAT with true for match and false for no match"; + The variant with the ``flags`` argument looks like this:: module regexp; @@ -158,6 +173,12 @@ comment "Return the matching pairs from the 'val' and 'pat' columns"; + module batregexp; + + command rematch(val:bat[:str], pat:str, flags:str) :bat[:bit] + address regexpmatchfbulk + comment "Return a BAT with true for match and false for no match"; + We put these MAL commands in the file ``$libdir/monetdb5/regexp.mal``. In addition we create a file ``$libdir/monetdb5/autoload/81_regexp.mal`` that just contains:: @@ -236,6 +257,9 @@ return MAL_SUCCEED; } +We will not here describe the bulk variant. The code is in the source +file, though. + Select ``````
--- a/regexp/regexp.c Thu Aug 02 21:15:48 2018 +0200 +++ b/regexp/regexp.c Mon Aug 06 20:59:23 2018 +0200 @@ -110,6 +110,163 @@ } static char * +do_matchbulk(bat *ret, bat bid, const char *pat, const char *flags, bit anti) +{ + BAT *b; /* input BAT */ + BATiter bi; /* helper to loop through values */ + BAT *bn; /* result BAT */ + bit *outp; /* pointer through which we add to result */ + BUN start, end; /* iteration variables */ + + const char *err = NULL; /* error message from PCRE library */ + int pos = 0; /* error position from PCRE library */ + int options; /* PCRE options */ + pcre *re; /* compiled regular expression */ + pcre_extra *sd; /* studied regular expression */ + + /* from the BAT ID we need to get the BAT descriptor, making + * sure that the data of the BAT is loaded into memory */ + if ((b = BATdescriptor(bid)) == NULL) { + throw(MAL, "batregexp.rematch", RUNTIME_OBJECT_MISSING); + } + /* check that the BAT has the expected type: we expect str or + * something compatible with str (if we only want str, we need + * to compare b->ttype with TYPE_str and not use ATOMstorage). + * Note, the MAL interpreter will only call this function with + * a str BAT because that is the only interface that is + * defined in the MAL file, so this check is superfluous. */ + if (ATOMstorage(b->ttype) != TYPE_str) { + BBPunfix(b->batCacheid); + throw(MAL, "batregexp.rematch", SEMANTIC_TYPE_MISMATCH); + } + + /* if any of the input values are nil, the result is no match */ + if (GDK_STRNIL(pat) || GDK_STRNIL(flags)) { + /* no matches when the pattern or the flags is NIL + * we return an a BAT with all NIL values */ + bit f = bit_nil; + if ((bn = BATconstant(b->hseqbase, TYPE_bit, &f, BATcount(b), TRANSIENT)) == NULL) + throw(MAL, "batregexp.rematch", GDK_EXCEPTION); + *ret = bn->batCacheid; + BBPkeepref(*ret); + BBPunfix(b->batCacheid); + return MAL_SUCCEED; + } + options = parseflags(flags); + if (options == -1) { + BBPunfix(b->batCacheid); + throw(MAL, "batregexp.rematch", "bad flag character"); + } + + /* allocate a result BAT; the capacity we ask for is the the + * size of the input BAT since we produce a value for each + * input value */ + bn = COLnew(b->hseqbase, TYPE_bit, BATcount(b), TRANSIENT); + if (bn == NULL) { + BBPunfix(b->batCacheid); + throw(MAL, "batregexp.rematch", GDK_EXCEPTION); + } + + /* Position outp at the start of the result array. + * We know the array is large enough even if every value were + * to match, so we don't need to check for that. */ + outp = (bit *) Tloc(bn, 0); + + /* compile the regular expression */ + re = pcre_compile(pat, options, &err, &pos, NULL); + if (re == NULL) { + BBPunfix(b->batCacheid); + BBPreclaim(bn); + throw(MAL, "batregexp.rematch", + "compilation of regular expression (%s) failed at %d with %s", + pat, pos, err); + } + /* invest in study of the r.e. */ + sd = pcre_study(re, 0, &err); + if (err != NULL) { + pcre_free(re); + BBPunfix(b->batCacheid); + BBPreclaim(bn); + throw(MAL, "batregexp.rematch", + "study of regular expression (%s) failed with %s", + pat, err); + } + + /* now, start and end are the limits in b that we need to look + * at, and if set, cand and candend are the beginning and end + * of the list of OIDs of b that we need to consider */ + + bi = bat_iterator(b); + + /* we will change these if we add a NIL */ + bn->tnil = false; + bn->tnonil = true; + for (start = 0, end = BATcount(b); start < end; start++) { + const char *val = BUNtvar(bi, start); + /* nil values never match */ + if (GDK_STRNIL(val)) { + *outp++ = bit_nil; + bn->tnil = true; + bn->tnonil = false; + } else { + pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); + if (pos < 0 && pos != PCRE_ERROR_NOMATCH) { + /* error during processing */ + BBPunfix(b->batCacheid); + BBPreclaim(bn); + pcre_free_study(sd); + pcre_free(re); + throw(MAL, "batregexp.rematch", + "matching of regular expression (%s) failed with %d", + pat, pos); + } + *outp++ = pos >= 0; /* TRUE if match, FALSE if not */ + } + } + + /* set properties and size of result BAT */ + BATsetcount(bn, BATcount(b)); + + if (BATcount(bn) > 1) { + /* if more than 1 result, it is not reverse sorted */ + bn->tsorted = false; /* probably not sorted */ + bn->trevsorted = false; /* probably not reverse sorted */ + bn->tkey = false; /* probably not key */ + } else { + /* if empty or a single result, it is sorted, reverse + * sorted, and key */ + bn->tsorted = true; + bn->trevsorted = true; + bn->tkey = true; + } + bn->tnosorted = 0; /* we don't know for sure */ + bn->tnorevsorted = 0; /* we don't know for sure */ + bn->tnokey[0] = bn->tnokey[1] = 0; + + /* we're done with b and re */ + BBPunfix(b->batCacheid); + pcre_free_study(sd); + pcre_free(re); + + *ret = bn->batCacheid; + BBPkeepref(*ret); + return MAL_SUCCEED; +} + +char * +regexpmatchbulk(bat *ret, const bat *bid, const char **pat, const bit *anti) +{ + return do_matchbulk(ret, *bid, *pat, "", *anti); +} + + +char * +regexpmatchfbulk(bat *ret, const bat *bid, const char **pat, const char **flags, const bit *anti) +{ + return do_matchbulk(ret, *bid, *pat, *flags, *anti); +} + +static char * do_select(bat *ret, bat bid, bat sid, const char *pat, const char *flags, bit anti) { BAT *b, *s = NULL; /* input BAT and optional candidate list */
--- a/regexp/regexp.mal Thu Aug 02 21:15:48 2018 +0200 +++ b/regexp/regexp.mal Mon Aug 06 20:59:23 2018 +0200 @@ -34,3 +34,13 @@ address regexpmatchfjoin comment "Return the matching pairs from the 'val' and 'pat' columns"; + +module batregexp; + +command rematch(val:bat[:str], pat:str) :bat[:bit] +address regexpmatchbulk +comment "Return a BAT with true for match and false for no match"; + +command rematch(val:bat[:str], pat:str, flags:str) :bat[:bit] +address regexpmatchfbulk +comment "Return a BAT with true for match and false for no match";