Mercurial > hg > MonetDB-extend
changeset 15:59bbfa0096b3
Added a tutorial for creating a FILTER FUNCTION.
author | Sjoerd Mullender <sjoerd@acm.org> |
---|---|
date | Mon, 22 Jan 2018 17:56:06 +0100 (2018-01-22) |
parents | 5a5167adae4a |
children | e799d117c5b1 |
files | README.rst regexp/81_regexp.mal regexp/81_regexp.sql regexp/Makefile regexp/README.rst regexp/regexp.c regexp/regexp.mal |
diffstat | 7 files changed, 1189 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/README.rst Thu Dec 07 17:44:37 2017 +0100 +++ b/README.rst Mon Jan 22 17:56:06 2018 +0100 @@ -49,3 +49,16 @@ tutorial you will create a simple user-defined function (UDF) that takes one (string) value as input and produces one (string) value as output. + +regexp +...... + +The tutorial in the ``regexp`` subdirectory contains an example of a +FILTER FUNCTION. These functions are used to filter values in a +column. The simplest example of a filter is (given a table ``t`` that +has an integer column ``c``):: + + SELECT c FROM t WHERE c = 0; + +This query filter the table t and select all rows where the value of +column ``c`` is equal to ``0``.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/regexp/81_regexp.mal Mon Jan 22 17:56:06 2018 +0100 @@ -0,0 +1,1 @@ +include regexp;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/regexp/81_regexp.sql Mon Jan 22 17:56:06 2018 +0100 @@ -0,0 +1,8 @@ +-- This Source Code Form is subject to the terms of the Mozilla Public +-- License, v. 2.0. If a copy of the MPL was not distributed with this +-- file, You can obtain one at http://mozilla.org/MPL/2.0/. +-- +-- Copyright 2013-2018 MonetDB B.V. + +CREATE FILTER FUNCTION rematch(val STRING, pat STRING) + EXTERNAL NAME regexp.rematch;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/regexp/Makefile Mon Jan 22 17:56:06 2018 +0100 @@ -0,0 +1,38 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# Copyright 2013-2017 MonetDB B.V. + +LIBDIR = $(shell pkg-config --variable=libdir monetdb5) + +CFLAGS += $(shell pkg-config --cflags monetdb5 libpcre) +LDFLAGS += $(shell pkg-config --libs monetdb5 libpcre) + +all: lib_regexp.so + +lib_regexp.so: regexp.o + $(CC) -fPIC -DPIC -o lib_regexp.so -shared regexp.o $(LDFLAGS) -Wl,-soname -Wl,lib_regexp.so + +regexp.o: regexp.c + $(CC) -fPIC -DPIC $(CFLAGS) -c regexp.c + +doc: README.html README.pdf + +README.html: README.rst + rst2html -d README.rst > README.html + +README.pdf: README.rst + rst2pdf README.rst + +clean: + rm -f README.html README.pdf *.o *.so + +install: lib_regexp.so + cp regexp.mal lib_regexp.so $(DESTDIR)$(LIBDIR)/monetdb5 + cp 81_regexp.sql $(DESTDIR)$(LIBDIR)/monetdb5/createdb + cp 81_regexp.mal $(DESTDIR)$(LIBDIR)/monetdb5/autoload + +tar: MonetDB-regexp-1.1.tar.bz2 +MonetDB-regexp-1.1.tar.bz2: README.rst Makefile 81_regexp.mal regexp.mal 81_regexp.sql regexp.c + tar -cjf MonetDB-regexp-1.1.tar.bz2 --transform='s|^|MonetDB-regexp-1.1/|' README.rst Makefile 81_regexp.mal regexp.mal 81_regexp.sql regexp.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/regexp/README.rst Mon Jan 22 17:56:06 2018 +0100 @@ -0,0 +1,546 @@ +.. This Source Code Form is subject to the terms of the Mozilla Public +.. License, v. 2.0. If a copy of the MPL was not distributed with this +.. file, You can obtain one at http://mozilla.org/MPL/2.0/. +.. +.. Copyright 2018 MonetDB B.V. + +.. This document is written in reStructuredText (see + http://docutils.sourceforge.net/ for more information). + Use ``rst2html.py`` to convert this file to HTML. + +============================== +Implementing a Filter Function +============================== + +:Author: Sjoerd Mullender +:Organization: CWI, MonetDB B.V. +:Abstract: In this short tutorial we describe how to create a filter + function for MonetDB/SQL. + +Introduction +------------ + +We want to create a function or set of functions that allows us to +filter a column based on whether a regular expression matches. When +we're done, we will be able to issue a query such as this:: + + SELECT * FROM t WHERE [name] rematch ['^mue?ller$']; + +What this query does is select all rows from table ``t`` where the +column ``name`` matches the regular expression ``^mue?ller$``. This +tutorial is not about how to create regular expressions or what they +mean. This tutorial is about how to extend MonetDB/SQL with functions +so that a query like this is possible. + +Note that this query is similar to queries using the LIKE predicate, +although regular expressions are more powerful than the patterns +allowed by LIKE. The following three queries are equivalent:: + + SELECT * FROM t WHERE name LIKE '%foo%'; + SELECT * FROM t WHERE [name] sys."like" ['%foo%']; + SELECT * FROM t WHERE [name] sys.rematch ['.*foo.*']; + +The first two of these queries work out of the box (given a table +``t`` witch a ``STRING`` column ``name``). + +We need to write ``"like"`` instead of ``LIKE`` or ``like`` since +``LIKE`` is an SQL keyword and in this case we want to use the SQL +function called ``sys.like``. + +The function ``rematch`` that we will create, and also the function +``like`` that already exists are FILTER functions. We can also use +the filter function for simple (scalar) queries, such as:: + + SELECT ['some string'] sys."like" ['%foo%']; + +The value on the left is matched with the pattern on the right and the +result is a Boolean value indicating whether the value matches the +pattern (``false`` in this case). + +In addition to this simple filter function that matches a value with a +pattern, we will create a variant that also has a ``flags`` argument. +It can be useful to pass some extra flags to the matching operator to +tell it to do case insensitive matching, or always match the whole +string vs. finding a matching substring. The way we will be able to +call this variant is:: + + SELECT * FROM t WHERE [name] sys.rematch ['.*foo.*', 'i']; + +This particular query will be equivalent to both:: + + SELECT * FROM t WHERE name ILIKE '%foo%'; + SELECT * FROM t WHERE [name] sys."ilike" ['%foo%']; + +Implementation +-------------- + +Three things need to be done. We need to add a function definition to +the SQL catalog, we need to describe the MAL interface for the MAL +interpreter, and we need to create the C implementation of the +function. We will describe the three steps here in this order. + +SQL +... + +The SQL interface is simple:: + + CREATE FILTER FUNCTION rematch(val STRING, pat STRING) + EXTERNAL NAME regexp.rematch; + CREATE FILTER FUNCTION rematch(val STRING, pat STRING, flags STRING) + EXTERNAL NAME regexp.rematch; + +When we create a ``FILTER FUNCTION`` in SQL, the SQL system expects +three MAL functions whose names are based on the external name given. +The functions are named ``regexp.rematch``, ``regexp.rematchselect``, +and ``regexp.rematchjoin``. In other words, the name as given, and +the name with the string ``select`` and ``join`` appended. These +functions are used to implement a simple scalar version of the filter, +a version that is used in simple select queries, and one that is used +in joins. + +Note that we use the same MAL names for both variants. The +interpreter will be able to distinguish the variants based on the +arguments given. + +This statement will normally be executed once when the database is +created, after which it is part of the SQL catalog. This is +accomplished by having the statement in a file in the +``$libdir/monetdb/createdb`` directory. Since files in that directory +are executed in order, the convention is to add a two digit number at +the front of the file name to force the order. So we have a file +``81_regexp.sql`` where we put this statement. + +At the SQL side we don't have to do anything more. + +MAL +... + +The MAL interface consists of three functions whose names are based on +the names specified in the SQL interface. The interface looks like +this. First the variant without the ``flags`` argument:: + + module regexp; + + command rematch(val:str, pat:str) :bit + address regexpmatch + comment "Return true when the value 'val' matches the regular expression 'pat'"; + + command rematchselect(val:bat[:str], s:bat[:oid], pat:str, anti:bit):bat[:oid] + address regexpmatchselect + comment "Return the list of matches in 'val' that match the regular expression 'pat'"; + + command rematchjoin(val:bat[:str], pat:bat[:str], sl:bat[:oid], sr:bat[:oid], nil_matches:bit, estimate:lng)(lr:bat[:oid],rr:bat[:oid]) + address regexpmatchjoin + comment "Return the matching pairs from the 'val' and 'pat' + columns"; + +The variant with the ``flags`` argument looks like this:: + + module regexp; + + command rematch(val:str, pat:str, flags:str) :bit + address regexpmatchf + comment "Return true when the value 'val' matches the regular expression 'pat'"; + + command rematchselect(val:bat[:str], s:bat[:oid], pat:str, flags:str, anti:bit):bat[:oid] + address regexpmatchfselect + comment "Return the list of matches in 'val' that match the regular expression 'pat'"; + + command rematchjoin(val:bat[:str], pat:bat[:str], flags:str, sl:bat[:oid], sr:bat[:oid], nil_matches:bit, estimate:lng)(lr:bat[:oid],rr:bat[:oid]) + address regexpmatchfjoin + comment "Return the matching pairs from the 'val' and 'pat' + columns"; + +We put these MAL commands in the file +``$libdir/monetdb5/regexp.mal``. In addition we create a file +``$libdir/monetdb5/autoload/81_regexp.mal`` that just contains:: + + include regexp; + +The files in the ``autoload`` directory are executed in order every +time the server is started so that by putting the ``81_regexp.mal`` +file there, we make sure that the system knows about these functions. + +C Implementation +................ + +First we'll describe the core of the matching process. This has +nothing to do with how MonetDB works and everything with how the PCRE +library works. However, for a complete example, we need this part. + +We have a regular expression in the variable ``pat``, and a set of +option letters in the variable ``flags``. We first need to convert +the option letters to an options value that is understood by the +library, then we need to compile the pattern to an internal +representation. Finally we can use this internal representation to +check whether the string value in ``val`` matches the pattern. + +:: + /* first the three input variables */ + const char *pat = "the regular expression"; + const char *flags = "i"; /* the flags, here "i", meaning case insensitive */ + const char *val = "the value being matched"; + + int options = PCRE_UTF8; /* MonetDB use the UTF-8 encoding exclusively */ + /* the following two variables are for error handling */ + const char *err; /* here we will collect the error message */ + int pos; /* here we collect the position of the error and the result of the match */ + + while (*flags) { + switch (*flags) { + case 'i': options |= PCRE_CASELESS; break; + case 'x': options |= PCRE_EXTENDED; break; + /* more cases are possible: there are more potential options */ + default: /* unrecognized option; here we should generate an error */ + } + flags++; + } + re = pcre_compile(pat, options, &err, &pos, NULL); + if (re == NULL) { + /* an error occurred during compilation, the message is in err, + * the position of the error in pat is in pos */ + return; /* return an error */ + } + pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, 0, NULL, 0); + if (pos >= 0) { + /* there was a match */ + } else if (pos == PCRE_ERROR_NOMATCH) { + /* there was no match and no other error */ + } else { + /* there was some error as specified in the value of pos */ + return; /* return an error */ + } + +Match +````` + +With this information, we can implement the first function easily. +The first function is a simple scalar version, i.e. a version that +works on a single value. We also give the version with flags +argument. Since they are very similar, they share all code:: + + char * + regexpmatch(bit *ret, const char **val, const char **pat) + { + return do_match(ret, *val, *pat, ""); + } + + char * + regexpmatchf(bit *ret, const char **val, const char **pat, const char **flags) + { + return do_match(ret, *val, *pat, *flags); + } + +The function ``do_match`` does all the work and is in essence the +above code (in real code we need to add error handling):: + + static char * + do_match(bit *ret, const char *val, const char *pat, const char *flags) + { + int options; /* calculate value from flags */ + const char *err = NULL; + int errpos = 0; + pcre *re; + + if (strcmp(val, str_nil) == 0 || + strcmp(pat, str_nil) == 0 || + strcmp(flags, str_nil) == 0) { + /* special case for NIL inputs: NILs don't match anything */ + *ret = 0; + return MAL_SUCCEED; + } + + re = pcre_compile(pat, options, &err, &pos, NULL); + pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, 0, NULL, 0); + *ret = pos >= 0; + return MAL_SUCCEED; + } + +Select +`````` + +The C interface of the two select functions (with and without flags) +is as follows:: + + char *regexpmatchselect(bat *ret, const bat *bid, const bat *sid, + const char **pat, const bit *anti); + char *regexpmatchfselect(bat *ret, const bat *bid, const bat *sid, + const char **pat, const char **flags, const bit *anti); + +The select function is essentially a bulk version of the match +function. There is a single pattern that is matched against a +complete column of values. There are a few complicating issues, +though. + +Select returns a list of OID values of matching rows. This list +*must* be sorted and all values *must* be distinct. This must also be +indicated in the BAT properties. Because of the way we go through the +values, we know that the result we produce is already sorted and +distinct. + +The select function has a parameter ``const bit *anti``. If ``*anti`` +is set, the match is reversed, except that ``nil`` never matches. + +The select function also has a parameter ``const bat *sid``. This +value specifies a candidate list. The candidate list is optional. +Not having a candidate list is indicated by ``*sid`` being ``0`` or +``bat_nil``. To be sure, we also check whether ``sid`` itself is +equal to the ``NULL`` pointer. If there is no candidate list, all +values of the main input BAT ``b`` are considered for matching. If +there is a candidate list, it is a sorted list of OID values of values +from ``b`` that are to be matched. + +In our implementation we use the same code for a *dense* candidate +list and for no candidate list. We just iterate over all values of +``b`` between a start and end index. If there is a non dense +candidate list, we iterate over it and use an indirection to get to +the value in ``b``. + +When creating the result BAT, we allocate enough space in case all +input values match. This means that inside the matching loop we don't +need to check whether there is enough space. This is, of course, a +trade off between speed and simplicity (whether or not we check) and +memory consumption. An alternative would be to allocate much less and +use the function ``BUNappend`` to add values. When using +``BUNappend`` we also don't need to set any properties afterwards, +since they are maintained by ``BUNappend``. + +Speaking of BAT properties, since we're not using ``BUNappend`` we +need to maintain the properties ourselves. Properties must never +lie. That is to say, if a property is set, whatever the property +indicates must be true. An unset property indicates we don't know +whether or not the property holds. + +The two C functions referenced above are so similar that they share +all code:: + + char * + regexpmatchselect(bat *ret, const bat *bid, const bat *sid, + const char **pat, const bit *anti) + { + return do_select(ret, *bid, sid ? *sid : 0, *pat, "", *anti); + } + + + char * + regexpmatchfselect(bat *ret, const bat *bid, const bat *sid, + const char **pat, const char **flags, const bit *anti) + { + return do_select(ret, *bid, sid ? *sid : 0, *pat, *flags, *anti); + } + +The function ``do_select`` does all the work:: + + static char * + do_select(bat *ret, bat bid, bat sid, const char *pat, + const char *flags, bit anti) + { + ... + } + +First we check whether any of the string input arguments is NIL. If +they are, there are no matches and we're done quickly:: + + if (GDK_STRNIL(pat) || GDK_STRNIL(flags)) { + /* no matches when the pattern or the flags is NIL + * we return an empty BAT of the correct type */ + if ((bn = BATdense(0, 0, 0)) == NULL) + throw(MAL, "regexp.rematchselect", GDK_EXCEPTION); + *ret = bn->batCacheid; + BBPkeepref(*ret); + return MAL_SUCCEED; + } + +After this we convert the ``flags`` string to the ``options`` value +that the PCRE library wants:: + + options = parseflags(flags); + +Then we load the two input BATs. The parameters ``bid`` and ``sid`` +are BAT IDs, but we need BAT descriptors. To get from one to the +other we use the function ``BATdescriptor``. Once we have a BAT +descriptor, we need to free it again using ``BBPunfix``:: + + s = NULL; + b = BATdescriptor(bid); + if (sid != 0 && sid != bat_nil) + s = BATdescriptor(sid); + +We create the output BAT. The third arguments of ``COLnew`` is the +desired capacity of the BAT. If the call succeeds, it is guaranteed +that there is enough space for this many rows, so if we ask for enough +capacity to store the maximum potential result, we don't need to check +during insertion. We also set up a pointer to the start of the data +area of the BAT:: + + bn = COLnew(0, TYPE_oid, s ? BATcount(s) : BATcount(b), TRANSIENT); + outp = (oid *) Tloc(bn, 0); + +Since we're going to use the search pattern many times (well, +depending on the inputs, of course), we invest extra time to study the +pattern:: + + re = pcre_compile(pat, options, &err, &pos, NULL); + sd = pcre_study(re, 0, &err); + +We then set up some auxiliary variables to help us iterate over the +input:: + + CANDINIT(b, s, start, end, cnt, cand, candend); + bi = bat_iterator(b); + +The macro ``CANDINIT``, defined in the file ``gdk_cand.h``, looks at +the first two arguments and initializes the remaining arguments. The +function ``bat_iterator`` returns a ``BATiter`` value that is needed +for the ``BUNtail``, ``BUNtvar``, ``BUNtloc`` and ``BUNtpos`` macros. +The macro ``BUNtail`` is the generic version, the other three can be +used if we know more about the BAT that is used. In our case, we know +that the BAT contains string values which are variabled sized values. +This means that we can use ``BUNtvar``. Using ``BUNtvar`` instead of +``BUNtail`` should be slightly faster since it omits a few tests. + +Now we get to the core of the algorithm. There are two cases: with +and without candidate list. In either case, we iterate and in each +iteration we check first whether the value is nil, and if not whether +the value matches the regular expression. If there is a match we add +the ID of the value to the output. Here, match takes the ``anti`` +variable into account. The version with candidate list is as +follows:: + + while (cand < candend) { + const char *val = BUNtvar(bi, *cand - b->hseqbase); + if (!GDK_STRNIL(val)) { + pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); + if (pos >= 0) { + if (!anti) + *outp++ = *cand; + } else if (pos == PCRE_ERROR_NOMATCH) { + if (anti) + *outp++ = *cand; + } + } + cand++; + } + +Now we can release all resources:: + + if (s) BBPunfix(s->batCacheid); + BBPunfix(b->batCacheid); + pcre_free_study(sd); + pcre_free(re); + +Finally, we need to fix up the result BAT and return it. As is, the +BAT is still empty, so we set the size:: + + BATsetcount(bn, (BUN) (outp - (oid *) Tloc(bn, 0))); + +We also need to set the properties. If we had used ``BUNappend`` that +would have been done for us, but using ``BUNappend`` is a lot more +overhead. The properties we need to set are: + +``tsorted`` + The column is sorted in ascending order. + +``trevsorted`` + The column is sorted in descending order. + +``tkey`` + All values are distinct + +``tdense`` + Only valid for ``TYPE_oid`` columns: each value in the column is + exactly one larger than the previous value. + +``tnil`` + There is at least one NIL value in the column. + +``tnonil`` + There are no NIL values in the column. + +``tnosorted`` + If ``tsorted == 0``, the index in the column that proofs not sorted. + +``tnorevsorted`` + If ``trevsorted == 0``, the index in the column that proofs not + reverse sorted. + +``tnokey`` + If ``tkey == 0``, a pair of indexes that together proof not all + values are distinct. + +Because of the way we created the output we know a number of these +properties:: + + bn->tsorted = 1; + bn->tkey = 1; + bn->trevsorted = BATcount(bn) <= 1; + bn->tnil = 0; + bn->tnonil = 1; + bn->tnosorted = 0; + if (!bn->trevsorted) bn->tnorevsorted = 1; + +We also know that the column is dense if there is at most a single row +in it, but we can easily check whether the column is dense even if +there are more rows: we just need to look at the difference between +the first and last rows and compare that with the number of rows:: + + if (BATcount(bn) > 1) { + outp = (oid *) Tloc(bn, 0); /* pointer to start */ + bn->tdense = outp[BATcount(bn) - 1] - outp[0] == BATcount(bn) - 1; + } + +If the column is dense, we need to set ``tseqbase`` to the first value +(or ``0`` if there are no values):: + + bn->tseqbase = BATcount(bn) > 0 ? * (oid *) Tloc(bn, 0) : 0; + +Now we can return:: + + *ret = bn->batCacheid; + BBPkeepref(*ret); + return MAL_SUCCEED; + +Join +```` + +The C interface of the two join functions (with and without flags) is +as follows:: + + char *regexpmatchjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, + const bat *sl, const bat *sr, + const bit *nil_matches, const lng *estimate); + char *regexpmatchfjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, + const char **flags, const bat *sl, const bat *sr, + const bit *nil_matches, const lng *estimate); + +In MonetDB, a join returns two aligned (same length, same +``hseqbase``) BATs where each corresponding set of values indicates +the OIDs of the left and right inputs that matched with each other. +The left and right inputs are here given by ``lid`` and ``rid`` in +combination with their respective, optional, candidate lists ``sl`` +and ``sr``. The outputs are returned through the ``lres`` and +``rres`` pointers. The additional parameters are: ``flags``, the +extra argument for the filter function; ``nil_matches``, if set, NILs +are considered "normal" values that can match; ``estimate``, an +estimate of how large the result is (often this is unknown which is +indicated by the value ``lng_nil``). We ignore the value of +``nil_matches`` since it is only relevant for an equality join. + +The outputs of the join do not have to be in any particular order. In +the general case, neither output is a candidate list (there may be +duplicates if a value matches multiple times), so there is also no +real point in demanding that one or the other output is sorted. +Although it helps to set the ``tsorted`` property if we know that the +output is sorted. + +There is no real alternative to implementing the join using a nested +loop implementation. For reasons of efficiency, in our implementation +of this join function, we will iterate over the pattern (right input) +in the outer loop and over the to-be-matched strings (left input) in +the inner loop. In this way, we only need to compile (and study) each +pattern once and we can use it multiple times. + +Since the inner loop is executed many more times than the outer loop, +we try to be a bit more efficient in the inner loop. That's why we +check for ``lcand`` only once and check for ``rcand`` in every +iteration.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/regexp/regexp.c Mon Jan 22 17:56:06 2018 +0100 @@ -0,0 +1,553 @@ +/* monetdb_config.h must be included as the first include file */ +#include <monetdb_config.h> + +/* mal_exception.h actually contains everything we need */ +#include <mal_exception.h> + +/* for the CANDINIT macro */ +#include <gdk_cand.h> + +/* system include files */ +#include <string.h> + +/* we use the PCRE library to do regular expression matching */ +#include <pcre.h> + +/* __declspec() must be used on Windows, but not on other systems */ +#ifndef _MSC_VER +/* not Windows */ +#define __declspec(x) /* nothing */ +#endif + +extern __declspec(dllexport) char *regexpmatch(bit *ret, const char **val, const char **pat); +extern __declspec(dllexport) char *regexpmatchf(bit *ret, const char **val, const char **pat, const char **flags); + +static int +parseflags(const char *flags) +{ + int options = PCRE_UTF8; /* MonetDB uses UTF-8 exclusively */ + + if (flags) { + while (*flags) { + switch (*flags) { + case 'i': /* case insensitive */ + options |= PCRE_CASELESS; + break; + case 'x': /* extended regular expressions */ + options |= PCRE_EXTENDED; + break; + default: + return -1; /* indicate there was an error */ + } + flags++; + } + } + return options; +} + +static char * +do_match(bit *ret, const char *val, const char *pat, const char *flags) +{ + const char *err = NULL; + int options; + int pos = 0; + pcre *re; + + /* if any of the input values are nil, the result is no match */ + if (GDK_STRNIL(val) || + GDK_STRNIL(pat) || + GDK_STRNIL(flags)) { + /* special case for NIL inputs: NILs don't match anything */ + *ret = 0; + return MAL_SUCCEED; + } + + options = parseflags(flags); + if (options == -1) + throw(MAL, "regexp.rematch", "bad flag character"); + + re = pcre_compile(pat, options, &err, &pos, NULL); + if (re == NULL) { + throw(MAL, "regexp.rematch", + "compilation of regular expression (%s) failed at %d with %s", + pat, pos, err); + } + pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, 0, NULL, 0); + pcre_free(re); + if (pos < PCRE_ERROR_NOMATCH) { + throw(MAL, "regexp.rematch", + "matching of regular expression (%s) failed with %d", + pat, pos); + } + *ret = pos >= 0; + return MAL_SUCCEED; +} + +char * +regexpmatch(bit *ret, const char **val, const char **pat) +{ + return do_match(ret, *val, *pat, ""); +} + +char * +regexpmatchf(bit *ret, const char **val, const char **pat, const char **flags) +{ + return do_match(ret, *val, *pat, *flags); +} + +static char * +do_select(bat *ret, bat bid, bat sid, const char *pat, const char *flags, bit anti) +{ + BAT *b, *s = NULL; /* input BAT and optional candidate list */ + + BUN start, end; /* start and end index */ + BUN cnt; /* difference between start and end (unused) */ + const oid *cand, *candend; /* start and end of candidate list */ + BATiter bi; /* helper to loop through values */ + + BAT *bn; /* result BAT */ + oid *outp; /* pointer through which we add to result */ + + const char *err = NULL; /* error message from PCRE library */ + int pos = 0; /* error position from PCRE library */ + int options; /* PCRE options */ + pcre *re; /* compiled regular expression */ + pcre_extra *sd; /* studied regular expression */ + + /* if any of the input values are nil, the result is no match */ + if (GDK_STRNIL(pat) || GDK_STRNIL(flags)) { + /* no matches when the pattern or the flags is NIL + * we return an empty BAT of the correct type */ + if ((bn = BATdense(0, 0, 0)) == NULL) + throw(MAL, "regexp.rematchselect", GDK_EXCEPTION); + *ret = bn->batCacheid; + BBPkeepref(*ret); + return MAL_SUCCEED; + } + options = parseflags(flags); + if (options == -1) + throw(MAL, "regexp.rematchselect", "bad flag character"); + + /* from the BAT IDs we need to get the BAT descriptors, making + * sure that the data of the BATs are loaded into memory */ + if ((b = BATdescriptor(bid)) == NULL) { + throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING); + } + if (sid != 0 && sid != bat_nil && + (s = BATdescriptor(sid)) == NULL) { + BBPunfix(b->batCacheid); + throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING); + } + + /* allocate a result BAT; the capacity we ask for is the + * maximum potential result size (i.e. the size of the + * candidate list if there is one, else the size of the input + * BAT b) */ + bn = COLnew(0, TYPE_oid, s ? BATcount(s) : BATcount(b), TRANSIENT); + if (bn == NULL) { + BBPunfix(b->batCacheid); + if (s) + BBPunfix(s->batCacheid); + throw(MAL, "regexp.rematchselect", GDK_EXCEPTION); + } + + /* Position outp at the start of the result array. + * We know the array is large enough even if every value were + * to match, so we don't need to check for that. */ + outp = (oid *) Tloc(bn, 0); + + /* compile the regular expression */ + re = pcre_compile(pat, options, &err, &pos, NULL); + if (re == NULL) { + BBPunfix(b->batCacheid); + if (s) + BBPunfix(s->batCacheid); + BBPreclaim(bn); + throw(MAL, "regexp.rematchselect", + "compilation of regular expression (%s) failed at %d with %s", + pat, pos, err); + } + /* invest in study of the r.e. */ + sd = pcre_study(re, 0, &err); + if (err != NULL) { + pcre_free(re); + BBPunfix(b->batCacheid); + if (s) + BBPunfix(s->batCacheid); + BBPreclaim(bn); + throw(MAL, "regexp.rematchselect", + "study of regular expression (%s) failed with %s", + pat, err); + } + + /* initialize the start, end, cnt, cand, and candend variables + * from b and s*/ + CANDINIT(b, s, start, end, cnt, cand, candend); + + /* now, start and end are the limits in b that we need to look + * at, and if set, cand and candend are the beginning and end + * of the list of OIDs of b that we need to consider */ + + bi = bat_iterator(b); + + if (cand) { + /* there is a non-dense candidate list, we run through + * it and only check values in b that are referenced + * in this candidate list */ + while (cand < candend) { + /* the candidate list has a list of OIDs which + * are relative to b->hseqbase, we need to + * convert that to an index relative to the + * start of the array in the (tail) heap */ + const char *val = BUNtvar(bi, *cand - b->hseqbase); + /* nil values never match */ + if (!GDK_STRNIL(val)) { + pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); + if (pos >= 0) { + /* regular expression matched */ + if (!anti) + *outp++ = *cand; + } else if (pos == PCRE_ERROR_NOMATCH) { + /* regular expression didn't match */ + if (anti) + *outp++ = *cand; + } else { + /* error during processing */ + BBPunfix(b->batCacheid); + BBPunfix(s->batCacheid); + BBPreclaim(bn); + pcre_free_study(sd); + pcre_free(re); + throw(MAL, "regexp.rematchselect", + "matching of regular expression (%s) failed with %d", + pat, pos); + } + } + cand++; + } + /* we done with s */ + BBPunfix(s->batCacheid); + } else { + /* we're done with s */ + if (s) { + /* there is a dense candidate list, we don't + * need it anymore since we extracted start + * and end */ + BBPunfix(s->batCacheid); + } + while (start < end) { + const char *val = BUNtvar(bi, start); + /* nil values never match */ + if (!GDK_STRNIL(val)) { + pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); + /* note that we need to convert the + * index relative to the start of the + * array to a value relative to + * b->hseqbase (opposite from what + * happens above) */ + if (pos >= 0) { + /* regular expression matched */ + if (!anti) + *outp++ = b->hseqbase + start; + } else if (pos == PCRE_ERROR_NOMATCH) { + /* regular expression didn't match */ + if (anti) + *outp++ = b->hseqbase + start; + } else { + /* error during processing */ + BBPunfix(b->batCacheid); + BBPreclaim(bn); + pcre_free_study(sd); + pcre_free(re); + throw(MAL, "regexp.rematchselect", + "matching of regular expression (%s) failed with %d", + pat, pos); + } + } + start++; + } + } + + /* we're done with b and re */ + BBPunfix(b->batCacheid); + pcre_free_study(sd); + pcre_free(re); + + /* set properties and size of result BAT */ + BATsetcount(bn, (BUN) (outp - (oid *) Tloc(bn, 0))); /* size is pointer difference */ + /* the result BAT of a select operation MUST be sorted, and + * all values MUST be distinct (i.e. it is a candidate list); + * due to the way we created the result, we know this is the + * case */ + bn->tsorted = 1; + bn->tnosorted = 0; + bn->tkey = 1; + if (BATcount(bn) > 1) { + /* if more than 1 result, it is not reverse sorted */ + bn->trevsorted = 0; + bn->tnorevsorted = 1; /* index 1 is larger than index 0 */ + /* the BAT is dense if the type is TYPE_oid, the + * values are sorted in ascending order, they are all + * distinct, and they form a consecutive sequence (no + * missing values); we only need to check the last + * condition, which we do by checking the difference + * between the first and last values */ + outp = (oid *) Tloc(bn, 0); /* pointer to start */ + bn->tdense = outp[BATcount(bn) - 1] - outp[0] == BATcount(bn) - 1; + } else { + /* if empty or a single result, it is reverse sorted + * and dense */ + bn->trevsorted = 1; + bn->tnorevsorted = 0; + bn->tdense = 1; + } + if (bn->tdense) { + /* if dense, we must also set tseqbase to the first value */ + if (BATcount(bn) == 0) + bn->tseqbase = 0; /* no first value */ + else + bn->tseqbase = * (oid *) Tloc(bn, 0); + } else { + bn->tseqbase = oid_nil; + } + /* there are no NIL values in the result */ + bn->tnil = 0; + bn->tnonil = 1; + + *ret = bn->batCacheid; + BBPkeepref(*ret); + return MAL_SUCCEED; +} + +char * +regexpmatchselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const bit *anti) +{ + return do_select(ret, *bid, sid ? *sid : 0, *pat, "", *anti); +} + + +char * +regexpmatchfselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const char **flags, const bit *anti) +{ + return do_select(ret, *bid, sid ? *sid : 0, *pat, *flags, *anti); +} + +static char * +do_join(bat *lres, bat *rres, bat lid, bat rid, const char *flags, + bat slid, bat srid, bit nil_matches, lng estimate) +{ + BAT *l, *r, *sl = NULL, *sr = NULL; /* input BATs */ + BAT *bn1, *bn2; /* output BATs */ + + BUN lstart, lend; /* start and end index */ + BUN lcnt; /* difference between start and end (unused) */ + const oid *lcand, *lcandend; /* start and end of candidate list */ + BATiter li; /* helper to loop through values */ + + BUN rstart, rend; /* start and end index */ + BUN rcnt; /* difference between start and end (unused) */ + const oid *rcand, *rcandend; /* start and end of candidate list */ + BATiter ri; /* helper to loop through values */ + oid ro; /* right OID being matched */ + + const char *pat; /* the regular expression being matched */ + const char *err; /* error message from PCRE library */ + int pos = 0; /* error position from PCRE library */ + int options; /* PCRE options */ + pcre *re; /* compiled regular expression */ + pcre_extra *sd; /* studied regular expression */ + + (void) nil_matches; /* only relevant for equi-join */ + + if (GDK_STRNIL(flags)) { + /* no matches when the flags is NIL + * we return two empty BATs */ + bn1 = BATdense(0, 0, 0); + bn2 = BATdense(0, 0, 0); + if (bn1 == NULL || bn2 == NULL) { + BBPreclaim(bn1); + BBPreclaim(bn2); + throw(MAL, "regexp.rematchjoin", GDK_EXCEPTION); + } + return MAL_SUCCEED; + } + options = parseflags(flags); + if (options == -1) + throw(MAL, "regexp.rematchjoin", "bad flag character"); + + l = BATdescriptor(lid); + r = BATdescriptor(rid); + if (slid != 0 && slid != bat_nil) + sl = BATdescriptor(slid); + if (srid != 0 && srid != bat_nil) + sr = BATdescriptor(srid); + if (l == NULL || r == NULL || + (slid != 0 && slid != bat_nil && sl == NULL) || + (srid != 0 && srid != bat_nil && sr == NULL)) { + /* one of the calls to BATdescriptor failed */ + if (l) + BBPunfix(l->batCacheid); + if (r) + BBPunfix(r->batCacheid); + if (sl) + BBPunfix(sl->batCacheid); + if (sr) + BBPunfix(sr->batCacheid); + throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING); + } + + /* if there is no valid estimate, use the size of the left + * input as size estimate */ + if (estimate == lng_nil || estimate == 0) + estimate = sl ? BATcount(sl) : BATcount(l); + + /* create the output BATs */ + bn1 = COLnew(0, TYPE_oid, estimate, TRANSIENT); + bn2 = COLnew(0, TYPE_oid, estimate, TRANSIENT); + if (bn1 == NULL || bn2 == NULL) { + /* one of the calls to COLnew failed + * note, BBPreclaim check whether its argument is NULL */ + BBPreclaim(bn1); + BBPreclaim(bn2); + BBPunfix(l->batCacheid); + BBPunfix(r->batCacheid); + if (sl) + BBPunfix(sl->batCacheid); + if (sr) + BBPunfix(sr->batCacheid); + throw(MAL, "regexp.rematchselect", GDK_EXCEPTION); + } + + /* extract boundaries and candidate lists from inputs */ + CANDINIT(l, sl, lstart, lend, lcnt, lcand, lcandend); + CANDINIT(r, sr, rstart, rend, rcnt, rcand, rcandend); + + li = bat_iterator(l); + ri = bat_iterator(r); + + /* if there is a candidate list (sl or sr is not NULL), but it + * is dense (lcand/rcand is NULL), we don't need the BAT + * descriptor anymore, the start and end values suffice */ + if (sl && lcand == NULL) { + BBPunfix(sl->batCacheid); + sl = NULL; + } + if (sr && rcand == NULL) { + BBPunfix(sr->batCacheid); + sr = NULL; + } + + for (;;) { + if (rcand) { + if (rcand == rcandend) + break; + ro = *rcand++; + pat = BUNtvar(ri, ro - r->hseqbase); + } else { + if (rstart == rend) + break; + ro = rstart + r->hseqbase; + pat = BUNtvar(ri, rstart); + } + /* nil regular expressions don't match (despite + * nil_matches) */ + if (GDK_STRNIL(pat)) + continue; + re = pcre_compile(pat, options, &err, &pos, NULL); + sd = NULL; + if (re == NULL) + goto bailout; + sd = pcre_study(re, 0, &err); + if (err != NULL) + goto bailout; + /* inner loop: try to be more efficient than outer + * loop, so test for lcand once instead of each + * iteration */ + if (lcand) { + for (const oid *cand = lcand; cand < lcandend; cand++) { + const char *val = BUNtvar(li, *cand - l->hseqbase); + if (GDK_STRNIL(val)) + continue; + pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); + if (pos >= 0) { + /* regular expression matched */ + if (BUNappend(bn1, cand, FALSE) != GDK_SUCCEED || + BUNappend(bn2, &ro, FALSE) != GDK_SUCCEED) + goto bailout; + } else if (pos != PCRE_ERROR_NOMATCH) { + /* error during processing */ + err = "matching of regular expression failed"; + goto bailout; + } + } + } else { + for (BUN start = lstart; start < lend; start++) { + const char *val = BUNtvar(li, start); + if (GDK_STRNIL(val)) + continue; + pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); + if (pos >= 0) { + /* regular expression matched */ + oid lo = start + l->hseqbase; + if (BUNappend(bn1, &lo, FALSE) != GDK_SUCCEED || + BUNappend(bn2, &ro, FALSE) != GDK_SUCCEED) + goto bailout; + } else if (pos != PCRE_ERROR_NOMATCH) { + /* error during processing */ + err = "matching of regular expression failed"; + goto bailout; + } + } + } + pcre_free_study(sd); + pcre_free(re); + } + + BBPunfix(l->batCacheid); + BBPunfix(r->batCacheid); + if (sl) + BBPunfix(sl->batCacheid); + if (sr) + BBPunfix(sr->batCacheid); + *lres = bn1->batCacheid; + *rres = bn2->batCacheid; + BBPkeepref(*lres); + BBPkeepref(*rres); + return MAL_SUCCEED; + + bailout: + BBPreclaim(bn1); + BBPreclaim(bn2); + BBPunfix(l->batCacheid); + BBPunfix(r->batCacheid); + if (sl) + BBPunfix(sl->batCacheid); + if (sr) + BBPunfix(sr->batCacheid); + if (sd) + pcre_free_study(sd); + if (re) + pcre_free(re); + if (err) + throw(MAL, "pcre.rematchjoin", + "error with regular expression: %s", err); + throw(MAL, "pcre.rematchjoin", GDK_EXCEPTION); +} + +char * +regexpmatchjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, + const bat *sl, const bat *sr, const bit *nil_matches, + const lng *estimate) +{ + return do_join(lres, rres, *lid, *rid, "", *sl, *sr, + *nil_matches, *estimate); +} + +char * +regexpmatchfjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, + const char **flags, const bat *sl, const bat *sr, + const bit *nil_matches, const lng *estimate) +{ + return do_join(lres, rres, *lid, *rid, *flags, *sl, *sr, + *nil_matches, *estimate); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/regexp/regexp.mal Mon Jan 22 17:56:06 2018 +0100 @@ -0,0 +1,30 @@ +module regexp; + +# variant without flags argument + +command rematch(val:str, pat:str) :bit +address regexpmatch +comment "Return true when the value 'val' matches the regular expression 'pat'"; + +command rematchselect(val:bat[:str], cand:bat[:oid], pat:str, anti:bit):bat[:oid] +address regexpmatchselect +comment "Return the list of matches in 'val' that match the regular expression 'pat'"; + +command rematchjoin(val:bat[:str], pat:bat[:str], sl:bat[:oid], sr:bat[:oid], nil_matches:bit, estimate:lng)(lr:bat[:oid],rr:bat[:oid]) +address regexpmatchjoin +comment "Return the matching pairs from the 'val' and 'pat' columns"; + +# variant with flags argument + +command rematch(val:str, pat:str, flags:str) :bit +address regexpmatchf +comment "Return true when the value 'val' matches the regular expression 'pat'"; + +command rematchselect(val:bat[:str], s:bat[:oid], pat:str, flags:str, anti:bit):bat[:oid] +address regexpmatchfselect +comment "Return the list of matches in 'val' that match the regular expression 'pat'"; + +command rematchjoin(val:bat[:str], pat:bat[:str], flags:str, sl:bat[:oid], sr:bat[:oid], nil_matches:bit, estimate:lng)(lr:bat[:oid],rr:bat[:oid]) +address regexpmatchfjoin +comment "Return the matching pairs from the 'val' and 'pat' +columns";