Mercurial > hg > MonetDB-extend
changeset 40:e70b12c15507
Updated for Oct2020 version.
author | Sjoerd Mullender <sjoerd@acm.org> |
---|---|
date | Tue, 08 Jun 2021 14:55:38 +0200 (2021-06-08) |
parents | 4633ab41de55 |
children | da896864dbbd |
files | .editorconfig regexp/81_regexp.mal regexp/81_regexp.sql regexp/Makefile regexp/README.rst regexp/regexp.c regexp/regexp.mal reverse/80_reverse.mal reverse/80_reverse.sql reverse/Makefile reverse/README.rst reverse/reverse.c reverse/reverse.mal |
diffstat | 13 files changed, 244 insertions(+), 275 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.editorconfig Tue Jun 08 14:55:38 2021 +0200 @@ -0,0 +1,19 @@ +root = true + +# Unix-style newlines with a newline ending every file +[*] +end_of_line = lf +insert_final_newline = true + +[*.{c,h}] +indent_style = tab +tab_width = 8 +indent_size = tab +trim_trailing_whitespace = true +charset = utf-8 +max_line_length = 72 + +[{CMakeLists.txt,*.cmake}] +indent_style = space +indent_size = 2 +trim_trailing_whitespace = true
--- a/regexp/81_regexp.mal Tue Jun 08 11:58:34 2021 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. -# -# Copyright 2013-2018 MonetDB B.V. - -include regexp;
--- a/regexp/81_regexp.sql Tue Jun 08 11:58:34 2021 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ --- This Source Code Form is subject to the terms of the Mozilla Public --- License, v. 2.0. If a copy of the MPL was not distributed with this --- file, You can obtain one at http://mozilla.org/MPL/2.0/. --- --- Copyright 2013-2018 MonetDB B.V. - -CREATE FILTER FUNCTION rematch(val STRING, pat STRING) - EXTERNAL NAME regexp.rematch; -CREATE FILTER FUNCTION rematch(val STRING, pat STRING, flags STRING) - EXTERNAL NAME regexp.rematch;
--- a/regexp/Makefile Tue Jun 08 11:58:34 2021 +0200 +++ b/regexp/Makefile Tue Jun 08 14:55:38 2021 +0200 @@ -29,10 +29,8 @@ rm -f README.html README.pdf *.o *.so install: lib_regexp.so - cp regexp.mal lib_regexp.so $(DESTDIR)$(LIBDIR)/monetdb5 - cp 81_regexp.sql $(DESTDIR)$(LIBDIR)/monetdb5/createdb - cp 81_regexp.mal $(DESTDIR)$(LIBDIR)/monetdb5/autoload + cp lib_regexp.so $(DESTDIR)$(LIBDIR)/monetdb5 -tar: MonetDB-regexp-1.1.tar.bz2 -MonetDB-regexp-1.1.tar.bz2: README.rst Makefile 81_regexp.mal regexp.mal 81_regexp.sql regexp.c - tar -cjf MonetDB-regexp-1.1.tar.bz2 --transform='s|^|MonetDB-regexp-1.1/|' README.rst Makefile 81_regexp.mal regexp.mal 81_regexp.sql regexp.c +tar: MonetDB-regexp-1.2.tar.bz2 +MonetDB-regexp-1.2.tar.bz2: README.rst Makefile regexp.c + tar -cjf MonetDB-regexp-1.2.tar.bz2 --transform='s|^|MonetDB-regexp-1.2/|' README.rst Makefile regexp.c
--- a/regexp/README.rst Tue Jun 08 11:58:34 2021 +0200 +++ b/regexp/README.rst Tue Jun 08 14:55:38 2021 +0200 @@ -110,12 +110,14 @@ arguments given. This statement will normally be executed once when the database is -created, after which it is part of the SQL catalog. This is -accomplished by having the statement in a file in the -``$libdir/monetdb/createdb`` directory. Since files in that directory -are executed in order, the convention is to add a two digit number at -the front of the file name to force the order. So we have a file -``81_regexp.sql`` where we put this statement. +created, after which it is part of the SQL catalog. To accomplish +this we need to store the SQL query in a C string (also see the +*reverse* tutorial):: + + static char regexp_sql[] = "CREATE FILTER FUNCTION rematch(val STRING, pat STRING) " + "EXTERNAL NAME regexp.rematch; " + "CREATE FILTER FUNCTION rematch(val STRING, pat STRING, flags STRING)" + " EXTERNAL NAME regexp.rematch;"; At the SQL side we don't have to do anything more. @@ -179,15 +181,36 @@ address regexpmatchfbulk comment "Return a BAT with true for match and false for no match"; -We put these MAL commands in the file -``$libdir/monetdb5/regexp.mal``. In addition we create a file -``$libdir/monetdb5/autoload/81_regexp.mal`` that just contains:: +We encode these MAL commands in a C array (again, see the *reverse* +tutorial):: - include regexp; - -The files in the ``autoload`` directory are executed in order every -time the server is started so that by putting the ``81_regexp.mal`` -file there, we make sure that the system knows about these functions. 
+ static mel_func regexp_init_funcs[] = { + command("regexp", "rematch", regexpmatch, false, + "Return true when the value 'val' matches the regular expression 'pat'", + args(1,3, arg("",bit),arg("val",str),arg("pat",str))), + command("regexp", "rematchselect", regexpmatchselect, false, + "Return the list of matches in 'val' that match the regular expression 'pat'", + args(1,5, batarg("",oid),batarg("val",str),batarg("cand",oid),arg("pat",str),arg("anti",bit))), + command("regexp", "rematchjoin", regexpmatchjoin, false, + "Return the matching pairs from the 'val' and 'pat' columns", + args(2,8, batarg("lr",oid),batarg("rr",oid),batarg("val",str),batarg("pat",str),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng))), + command("batregexp", "rematch", regexpmatchbulk, false, + "Return a BAT with true for match and false for no match", + args(1,3, batarg("",bit),batarg("val",str),arg("pat",str))), + command("regexp", "rematch", regexpmatchf, false, + "Return true when the value 'val' matches the regular expression 'pat'", + args(1,4, arg("",bit),arg("val",str),arg("pat",str),arg("flags",str))), + command("regexp", "rematchselect", regexpmatchfselect, false, + "Return the list of matches in 'val' that match the regular expression 'pat'", + args(1,6, batarg("",oid),batarg("val",str),batarg("s",oid),arg("pat",str),arg("flags",str),arg("anti",bit))), + command("regexp", "rematchjoin", regexpmatchfjoin, false, + "Return the matching pairs from the 'val' and 'pat'\ncolumns", + args(2,9, batarg("lr",oid),batarg("rr",oid),batarg("val",str),batarg("pat",str),arg("flags",str),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng))), + command("batregexp", "rematch", regexpmatchfbulk, false, + "Return a BAT with true for match and false for no match", + args(1,4, batarg("",bit),batarg("val",str),arg("pat",str),arg("flags",str))), + { .imp=NULL } /* sentinel */ + }; C Implementation ................ 
@@ -222,13 +245,13 @@ works on a single value. We also give the version with flags argument. Since they are very similar, they share all code:: - char * + static char * regexpmatch(bit *ret, const char **val, const char **pat) { return do_match(ret, *val, *pat, ""); } - char * + static char * regexpmatchf(bit *ret, const char **val, const char **pat, const char **flags) { return do_match(ret, *val, *pat, *flags); @@ -245,7 +268,7 @@ int errpos = 0; pcre *re; - if (GDK_STRNIL(val) || GDK_STRNIL(pat) || GDK_STRNIL(flags)) { + if (strNil(val) || strNil(pat) || strNil(flags)) { /* special case for NIL inputs: NILs don't match anything */ *ret = 0; return MAL_SUCCEED; @@ -266,9 +289,9 @@ The C interface of the two select functions (with and without flags) is as follows:: - char *regexpmatchselect(bat *ret, const bat *bid, const bat *sid, + static char *regexpmatchselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const bit *anti); - char *regexpmatchfselect(bat *ret, const bat *bid, const bat *sid, + static char *regexpmatchfselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const char **flags, const bit *anti); The select function is essentially a bulk version of the match @@ -294,11 +317,21 @@ for matching. If there is a candidate list, it is a sorted list of OID values of values from ``*bid`` that are to be matched. -In our implementation we use the same code for a *dense* candidate -list and for no candidate list. We just iterate over all values of -``*bid`` between a start and end index. If there is a non dense -candidate list, we iterate over it and use an indirection to get to -the value in ``*bid``. +There are a set of macros and functions that make using candidate +lists very easy. First an iterator structure is initialized, and then +this structure is used to iterate through the candidate list. 
The +relevant code looks like this:: + + struct canditer ci; + canditer_init(&ci, b, s); /* s may be NULL for no candidate list */ + for (BUN i = 0; i < ci.ncand; i++) { + oid o = canditer_next(&ci); + /* o is now the next candidate to be considered */ + /* subtract b->hseqbase to get the BUN index of the value */ + } + +Note that the value ``ci.ncand`` is initialized with the number of +candidates. When creating the result BAT, we allocate enough space in case all input values match. This means that inside the matching loop we don't @@ -318,7 +351,7 @@ The two C functions referenced above are so similar that they share all code:: - char * + static char * regexpmatchselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const bit *anti) { @@ -326,7 +359,7 @@ } - char * + static char * regexpmatchfselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const char **flags, const bit *anti) { @@ -345,7 +378,7 @@ First we check whether any of the string input arguments is NIL. If they are, there are no matches and we're done quickly:: - if (GDK_STRNIL(pat) || GDK_STRNIL(flags)) { + if (strNil(pat) || strNil(flags)) { /* no matches when the pattern or the flags is NIL * we return an empty BAT of the correct type */ if ((bn = BATdense(0, 0, 0)) == NULL) @@ -377,7 +410,7 @@ during insertion. We also set up a pointer to the start of the data area of the BAT:: - bn = COLnew(0, TYPE_oid, s ? 
BATcount(s) : BATcount(b), TRANSIENT); + bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT); outp = (oid *) Tloc(bn, 0); Since we're going to use the search pattern many times (well, @@ -387,43 +420,40 @@ re = pcre_compile(pat, options, &err, &pos, NULL); sd = pcre_study(re, 0, &err); -We then set up some auxiliary variables to help us iterate over the +We then set up an auxiliary variable to help us iterate over the input:: - CANDINIT(b, s, start, end, cnt, cand, candend); bi = bat_iterator(b); -The macro ``CANDINIT``, defined in the file ``gdk_cand.h``, looks at -the first two arguments and initializes the remaining arguments. The -function ``bat_iterator`` returns a ``BATiter`` value that is needed -for the ``BUNtail``, ``BUNtvar``, ``BUNtloc`` and ``BUNtpos`` macros. -The macro ``BUNtail`` is the generic version, the other three can be -used if we know more about the BAT that is used. In our case, we know -that the BAT contains string values which are variabled sized values. -This means that we can use ``BUNtvar``. Using ``BUNtvar`` instead of -``BUNtail`` should be slightly faster since it omits a few tests. +The function ``bat_iterator`` returns a ``BATiter`` value that is +needed for the ``BUNtail``, ``BUNtvar``, ``BUNtloc`` and ``BUNtpos`` +macros. The macro ``BUNtail`` is the generic version, the other three +can be used if we know more about the BAT that is used. In our case, +we know that the BAT contains string values which are variable-sized +values. This means that we can use ``BUNtvar``. Using ``BUNtvar`` +instead of ``BUNtail`` should be slightly faster since it omits a few +tests. -Now we get to the core of the algorithm. There are two cases: with -and without candidate list. In either case, we iterate and in each -iteration we check first whether the value is nil, and if not whether -the value matches the regular expression. If there is a match we add -the ID of the value to the output. Here, match takes the ``anti`` -variable into account. 
The version with candidate list is as +Now we get to the core of the algorithm. We iterate over the +candidates and in each iteration we check first whether the value is +nil, and if not whether the value matches the regular expression. If +there is a match we add the ID of the value to the output. Here, +match takes the ``anti`` variable into account. The code is as follows:: - while (cand < candend) { - const char *val = BUNtvar(bi, *cand - b->hseqbase); - if (!GDK_STRNIL(val)) { + for (BUN i = 0; i < ci.ncand; i++) { + oid o = canditer_next(&ci); + const char *val = BUNtvar(bi, o - b->hseqbase); + if (!strNil(val)) { pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); if (pos >= 0) { if (!anti) - *outp++ = *cand; + *outp++ = o; } else if (pos == PCRE_ERROR_NOMATCH) { if (anti) - *outp++ = *cand; + *outp++ = o; } } - cand++; } Now we can release all resources:: @@ -549,8 +579,3 @@ in the outer loop and over the to-be-matched strings (left input) in the inner loop. In this way, we only need to compile (and study) each pattern once and we can use it multiple times. - -Since the inner loop is executed many more times than the outer loop, -we try to be a bit more efficient in the inner loop. That's why we -check for ``lcand`` only once and check for ``rcand`` in every -iteration.
--- a/regexp/regexp.c Tue Jun 08 11:58:34 2021 +0200 +++ b/regexp/regexp.c Tue Jun 08 14:55:38 2021 +0200 @@ -20,25 +20,6 @@ /* we use the PCRE library to do regular expression matching */ #include <pcre.h> -/* __declspec() must be used on Windows, but not on other systems */ -#ifndef _MSC_VER -/* not Windows */ -#define __declspec(x) /* nothing */ -#endif - -/* these eight functions are the only externally visible functions - * since they are the only ones that are called from the MAL layer; on - * Windows they must be exported, on other systems, declaring them as - * extern is enough */ -extern __declspec(dllexport) char *regexpmatch(bit *ret, const char **val, const char **pat); -extern __declspec(dllexport) char *regexpmatchf(bit *ret, const char **val, const char **pat, const char **flags); -extern __declspec(dllexport) char *regexpmatchselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const bit *anti); -extern __declspec(dllexport) char *regexpmatchfselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const char **flags, const bit *anti); -extern __declspec(dllexport) char *regexpmatchjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, const bat *sl, const bat *sr, const bit *nil_matches, const lng *estimate); -extern __declspec(dllexport) char *regexpmatchfjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, const char **flags, const bat *sl, const bat *sr, const bit *nil_matches, const lng *estimate); -extern __declspec(dllexport) char *regexpmatchbulk(bat *ret, const bat *bid, const char **pat); -extern __declspec(dllexport) char *regexpmatchfbulk(bat *ret, const bat *bid, const char **pat, const char **flags); - static int parseflags(const char *flags) { @@ -101,13 +82,13 @@ return MAL_SUCCEED; } -char * +static char * regexpmatch(bit *ret, const char **val, const char **pat) { return do_match(ret, *val, *pat, ""); } -char * +static char * regexpmatchf(bit *ret, const char **val, const char **pat, const char 
**flags) { return do_match(ret, *val, *pat, *flags); @@ -257,14 +238,14 @@ return MAL_SUCCEED; } -char * +static char * regexpmatchbulk(bat *ret, const bat *bid, const char **pat) { return do_matchbulk(ret, *bid, *pat, ""); } -char * +static char * regexpmatchfbulk(bat *ret, const bat *bid, const char **pat, const char **flags) { return do_matchbulk(ret, *bid, *pat, *flags); @@ -457,14 +438,14 @@ return MAL_SUCCEED; } -char * +static char * regexpmatchselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const bit *anti) { return do_select(ret, *bid, sid ? *sid : 0, *pat, "", *anti); } -char * +static char * regexpmatchfselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const char **flags, const bit *anti) { return do_select(ret, *bid, sid ? *sid : 0, *pat, *flags, *anti); @@ -661,7 +642,7 @@ throw(MAL, "pcre.rematchjoin", GDK_EXCEPTION); } -char * +static char * regexpmatchjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, const bat *sl, const bat *sr, const bit *nil_matches, const lng *estimate) @@ -670,7 +651,7 @@ *nil_matches, *estimate); } -char * +static char * regexpmatchfjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, const char **flags, const bat *sl, const bat *sr, const bit *nil_matches, const lng *estimate) @@ -681,7 +662,10 @@ #include "mel.h" -static char regexp_sql[] = "CREATE FILTER FUNCTION rematch(val STRING, pat STRING) EXTERNAL NAME regexp.rematch; CREATE FILTER FUNCTION rematch(val STRING, pat STRING, flags STRING) EXTERNAL NAME regexp.rematch;"; +static char regexp_sql[] = "CREATE FILTER FUNCTION rematch(val STRING, pat STRING)" + " EXTERNAL NAME regexp.rematch; " + "CREATE FILTER FUNCTION rematch(val STRING, pat STRING, flags STRING)" + " EXTERNAL NAME regexp.rematch;"; static mel_func regexp_init_funcs[] = { command("regexp", "rematch", regexpmatch, false, @@ -689,10 +673,13 @@ args(1,3, arg("",bit),arg("val",str),arg("pat",str))), command("regexp", "rematchselect", regexpmatchselect, false, 
"Return the list of matches in 'val' that match the regular expression 'pat'", - args(1,5, batarg("",oid),batarg("val",str),batarg("cand",oid),arg("pat",str),arg("anti",bit))), + args(1,5, batarg("",oid),batarg("val",str),batarg("s",oid),arg("pat",str),arg("anti",bit))), command("regexp", "rematchjoin", regexpmatchjoin, false, "Return the matching pairs from the 'val' and 'pat' columns", args(2,8, batarg("lr",oid),batarg("rr",oid),batarg("val",str),batarg("pat",str),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng))), + command("batregexp", "rematch", regexpmatchbulk, false, + "Return a BAT with true for match and false for no match", + args(1,3, batarg("",bit),batarg("val",str),arg("pat",str))), command("regexp", "rematch", regexpmatchf, false, "Return true when the value 'val' matches the regular expression 'pat'", args(1,4, arg("",bit),arg("val",str),arg("pat",str),arg("flags",str))), @@ -702,9 +689,6 @@ command("regexp", "rematchjoin", regexpmatchfjoin, false, "Return the matching pairs from the 'val' and 'pat'\ncolumns", args(2,9, batarg("lr",oid),batarg("rr",oid),batarg("val",str),batarg("pat",str),arg("flags",str),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng))), - command("batregexp", "rematch", regexpmatchbulk, false, - "Return a BAT with true for match and false for no match", - args(1,3, batarg("",bit),batarg("val",str),arg("pat",str))), command("batregexp", "rematch", regexpmatchfbulk, false, "Return a BAT with true for match and false for no match", args(1,4, batarg("",bit),batarg("val",str),arg("pat",str),arg("flags",str))),
--- a/regexp/regexp.mal Tue Jun 08 11:58:34 2021 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. -# -# Copyright 2013-2018 MonetDB B.V. - -module regexp; - -# variant without flags argument - -command rematch(val:str, pat:str) :bit -address regexpmatch -comment "Return true when the value 'val' matches the regular expression 'pat'"; - -command rematchselect(val:bat[:str], cand:bat[:oid], pat:str, anti:bit):bat[:oid] -address regexpmatchselect -comment "Return the list of matches in 'val' that match the regular expression 'pat'"; - -command rematchjoin(val:bat[:str], pat:bat[:str], sl:bat[:oid], sr:bat[:oid], nil_matches:bit, estimate:lng)(lr:bat[:oid],rr:bat[:oid]) -address regexpmatchjoin -comment "Return the matching pairs from the 'val' and 'pat' columns"; - -# variant with flags argument - -command rematch(val:str, pat:str, flags:str) :bit -address regexpmatchf -comment "Return true when the value 'val' matches the regular expression 'pat'"; - -command rematchselect(val:bat[:str], s:bat[:oid], pat:str, flags:str, anti:bit):bat[:oid] -address regexpmatchfselect -comment "Return the list of matches in 'val' that match the regular expression 'pat'"; - -command rematchjoin(val:bat[:str], pat:bat[:str], flags:str, sl:bat[:oid], sr:bat[:oid], nil_matches:bit, estimate:lng)(lr:bat[:oid],rr:bat[:oid]) -address regexpmatchfjoin -comment "Return the matching pairs from the 'val' and 'pat' -columns"; - -module batregexp; - -command rematch(val:bat[:str], pat:str) :bat[:bit] -address regexpmatchbulk -comment "Return a BAT with true for match and false for no match"; - -command rematch(val:bat[:str], pat:str, flags:str) :bat[:bit] -address regexpmatchfbulk -comment "Return a BAT with true for match and false for no match";
--- a/reverse/80_reverse.mal Tue Jun 08 11:58:34 2021 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. -# -# Copyright 2013-2018 MonetDB B.V. - -include reverse;
--- a/reverse/80_reverse.sql Tue Jun 08 11:58:34 2021 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ --- This Source Code Form is subject to the terms of the Mozilla Public --- License, v. 2.0. If a copy of the MPL was not distributed with this --- file, You can obtain one at http://mozilla.org/MPL/2.0/. --- --- Copyright 2013-2018 MonetDB B.V. - -CREATE FUNCTION reverse(src STRING) RETURNS STRING - EXTERNAL NAME reverse.reverse;
--- a/reverse/Makefile Tue Jun 08 11:58:34 2021 +0200 +++ b/reverse/Makefile Tue Jun 08 14:55:38 2021 +0200 @@ -29,10 +29,8 @@ rm -f README.html README.pdf *.o *.so install: lib_reverse.so - cp reverse.mal lib_reverse.so $(DESTDIR)$(LIBDIR)/monetdb5 - cp 80_reverse.sql $(DESTDIR)$(LIBDIR)/monetdb5/createdb - cp 80_reverse.mal $(DESTDIR)$(LIBDIR)/monetdb5/autoload + cp lib_reverse.so $(DESTDIR)$(LIBDIR)/monetdb5 -tar: MonetDB-reverse-1.1.tar.bz2 -MonetDB-reverse-1.1.tar.bz2: README.rst Makefile 80_reverse.mal reverse.mal 80_reverse.sql reverse.c - tar -cjf MonetDB-reverse-1.1.tar.bz2 --transform='s|^|MonetDB-reverse-1.1/|' README.rst Makefile 80_reverse.mal reverse.mal 80_reverse.sql reverse.c +tar: MonetDB-reverse-1.2.tar.bz2 +MonetDB-reverse-1.2.tar.bz2: README.rst Makefile reverse.c + tar -cjf MonetDB-reverse-1.2.tar.bz2 --transform='s|^|MonetDB-reverse-1.2/|' README.rst Makefile reverse.c
--- a/reverse/README.rst Tue Jun 08 11:58:34 2021 +0200 +++ b/reverse/README.rst Tue Jun 08 14:55:38 2021 +0200 @@ -2,7 +2,7 @@ .. License, v. 2.0. If a copy of the MPL was not distributed with this .. file, You can obtain one at http://mozilla.org/MPL/2.0/. .. -.. Copyright 2013-2018 MonetDB B.V. +.. Copyright 2013-2021 MonetDB B.V. .. This document is written in reStructuredText (see http://docutils.sourceforge.net/ for more information). @@ -56,12 +56,13 @@ ``CHARACTER LARGE OBJECT`` or ``CLOB``. This statement will normally be executed once when the database is -created, after which it is part of the SQL catalog. This is -accomplished by having the statement in a file in the -``$libdir/monetdb/createdb`` directory. Since files in that directory -are executed in order, the convention is to add a two digit number at -the front of the file name to force the order. So we have a file -``80_reverse.sql`` where we put this statement. +created, after which it is part of the SQL catalog. How this is +accomplished exactly we will leave until later in this tutorial. For +now let it suffice to note that the SQL query is encoded as a C string +and stored in the variable ``reverse_sql``:: + + static char reverse_sql[] = "CREATE FUNCTION reverse(src STRING)" + " RETURNS STRING EXTERNAL NAME reverse.reverse;"; At the SQL side we don't have to do anything more. @@ -78,13 +79,6 @@ address UDFreverse comment "Reverse a string"; -First we need to tell the MAL interpreter that we're now in the module -``reverse``, then we create the function ``reverse`` that takes a -``str`` argument and produces a ``str`` result. The MAL ``str`` type -is used to implement all character types in SQL (i.e., ``CHAR``, -``VARCHAR``, ``CLOB`` and all their variants). The name of the -argument (``ra1``) is completely unimportant. 
- The SQL engine uses the convention that a *bulk* variant of a scalar operation (i.e., a variant that works on a complete column and produces a column as opposed to a function that works on a single @@ -98,34 +92,54 @@ address UDFBATreverse comment "Reverse a column of strings"; +This MAL code also needs to be encoded in the C source. This is done as +follows:: + + static mel_func reverse_init_funcs[] = { + command("reverse", "reverse", UDFreverse, false, + "Reverse a string", + args(1,2, arg("",str),arg("ra1",str))), + command("batreverse", "reverse", UDFBATreverse, false, + "Reverse a BAT of strings", + args(1,2, batarg("",str),batarg("b",str))), + { .imp=NULL } /* sentinel */ + }; + +A C array with elements of type ``mel_func`` is created, and each MAL +command is one element of this array. The array ends with a sentinel, a +value that is "empty" and can thus be recognized as the end of the +array. + +Each element in the array is an instance of the macro ``command`` which +has a bunch of arguments. In order, they are: MAL module name (string), +MAL function name (string), C function (pointer to function), a Boolean +flag to indicate whether this is an "unsafe" operation (one with side +effects), a comment (string--not currently used but must be present), a +description of the function arguments. The function arguments are +encoded using the ``args`` macro with the following arguments. The +number of return values (MAL functions can return 1 or more values), the +total number of arguments (i.e. the sum of return values and input +arguments), and then for each return argument and each input argument a +description of the argument itself. Each argument is an instance of a +macro. There are various forms, but here we use two. ``arg`` describes +a scalar argument and has two parameters, the MAL name of the argument +and the MAL type. 
``batarg`` describes a BAT argument and also has two +parameters, the MAL name of the argument and the MAL type of the +elements of the BAT. + Note that implementing a bulk version is optional. If it does not exist, the scalar version will be called multiple times. However, calling the scalar version multiple (sometimes very many) times incurs significant overhead, hence it is usually a good idea to implement the bulk version. -We put these MAL commands in the file -``$libdir/monetdb5/reverse.mal``. In addition we create a file -``$libdir/monetdb5/autoload/80_reverse.mal`` that just contains:: - - include reverse; - -We need the extra file because the ``include`` statement actually -includes both the MAL file ``reverse.mal`` and the dynamically -loadable C module that we will install in the file ``lib_reverse.so`` -(or whatever extension is needed on the system you're on). - -The files in the ``autoload`` directory are executed in order every -time the server is started so that by putting the ``80_reverse.mal`` -file there, we make sure that the system knows about these functions. - Now we come to the actual implementation of the feature. The MAL interfaces of the scalar and bulk versions of the ``reverse`` function translates to the following C interfaces:: - char *UDFreverse(char **retval, const char **arg); - char *UDFBATreverse(bat *retval, const bat *arg); + static char *UDFreverse(char **retval, const char **arg); + static char *UDFBATreverse(bat *retval, const bat *arg); The return value of the C functions is normally ``MAL_SUCCEED`` which translates to ``NULL``. If an error occurs, the return should be a @@ -142,7 +156,7 @@ remaining arguments are values that are used by the format string. 
A minimal example is:: - char *UDFreverse(char **retval, const char **arg) + static char *UDFreverse(char **retval, const char **arg) { (void) retval; (void) arg; /* we're not using these */ throw(MAL, "reverse.reverse", "Not yet implemented"); @@ -158,8 +172,8 @@ The MonetDB code usually uses the C type ``str`` which is defined to be ``char *``, so you could define the functions also as:: - str UDFreverse(str *retval, const str *arg); - str UDFBATreverse(bat *retval, const bat *arg); + static str UDFreverse(str *retval, const str *arg); + static str UDFBATreverse(bat *retval, const bat *arg); Note that the definitions are not entirely equivalent. The target of the ``const`` keyword is different for the first function. @@ -170,18 +184,9 @@ These functions must be located in a dynamically loadable module (``.so`` file on Linux, ``.dll`` on Windows), and this module must have the name ``lib_reverse.so`` (or ``lib_reverse.dll``). The -functions must be visible by the process loading the module, so they -must be exported (this is especially true on Windows). So we need to -declare these functions using the phrase ``__declspec(dllexport)`` on -Windows. So the full declaration becomes:: - - /* __declspec() must be used on Windows, but not on other systems */ - #ifndef _MSC_VER - /* not Windows */ - #define __declspec(x) /* nothing */ - #endif - extern __declspec(dllexport) char *UDFreverse(char **retval, const char **arg); - extern __declspec(dllexport) char *UDFBATreverse(bat *retval, const bat *arg); +functions are only directly referenced from the ``reverse_init_funcs`` +array that we have defined above, so the functions are declared as +``static`` functions. Scalar Version ~~~~~~~~~~~~~~ @@ -275,7 +280,7 @@ with here), ``BUNappend`` can still fail due to not enough memory, even though we supposedly allocated enough. 
The strings have to be stored somewhere, and ``COLnew`` has no way of knowing how large the -total are for the strings must be, so ``BUNappend`` may have to grow +total area for the strings must be, so ``BUNappend`` may have to grow the memory area for the strings, and that can fail. Iterating through the source BAT is done using a standard mechanism:: @@ -301,15 +306,15 @@ We then use this string in the same way as in the scalar function. The reversed string in ``dst`` is appended to the result BAT:: - BUNappend(bn, dst, 0); + BUNappend(bn, dst, false); -The third argument to ``BUNappend`` must always be ``0``. +The third argument to ``BUNappend`` must always be ``false``. -Note that the return value of ``BUNappend`` was changed starting with -the Jul2015 feature release. Before, ``BUNappend`` returned its first -argument on success or ``NULL`` on failure. Starting with the Jul2015 -release it returns ``GDK_SUCCEED`` or ``GDK_FAIL`` for success or -failure. +``BUNappend`` returns ``GDK_SUCCEED`` for success or ``GDK_FAIL`` for +failure. ``BUNappend`` is marked in the include file as a function of +which the result *must* be checked. It is a good convention to always +check whether the result is (not) equal to ``GDK_SUCCEED`` so that if in +the future different errors are returned, the code keeps working. BAT Properties -------------- @@ -359,12 +364,6 @@ the same time. When they are, it implies that all values are equal to each other. -Next to the ``tkey`` property there is also a ``tunique`` property. -The ``tunique`` property, when set, indicates that all values in the -BAT *must* be distinct (as in the UNIQUE constraint in SQL). We're -not really concerned with this, since it is not used by the SQL layer. -When ``tunique`` is set, then so must ``tkey``. - When the ``tsorted`` property is unset, the ``tnosorted`` property is a position in the BAT where the previous value is not less than or equal to the position itself. 
If the ``tnosorted`` value is 0, we @@ -392,20 +391,69 @@ as best it can. That is why in the example we didn't need to do anything with the property flags. +Informing the Server +-------------------- + +So far we have created the necessary C code that can be called by the +interpreter when the appropriate SQL query is executed. However, we +still need to tell the server that this code actually exists. We have +already hinted at this, but here we will finish that part. + +Once the ``.so`` or ``.dll`` file has been created and installed, the +server needs to be told about it. This is done by calling the server +with an extra argument:: + + mserver5 --loadmodule=reverse ... + +where ``...`` represents any other arguments not covered by this +tutorial. + +When the server gets this ``--loadmodule`` argument, it loads the +library. And here we use a trick that is available in dynamically +loaded libraries. We tell the system to automatically execute some code +when the library is loaded:: + + #include "mal_import.h" + #include "sql_import.h" + #ifdef _MSC_VER + #undef read + #pragma section(".CRT$XCU",read) + #endif + LIB_STARTUP_FUNC(init_reverse) + { + mal_module("reverse", NULL, reverse_init_funcs); + sql_register("reverse", reverse_sql); + } + +The ``LIB_STARTUP_FUNC`` macro is defined in one of the include files. +It has an argument which is a name that should be unique. The +convention is ``init_`` followed by the name of the module. This macro +is the start of a function definition, so it is followed by the body of +the function that is executed when the library is loaded. + +The function calls two functions. The first, ``mal_module``, registers +the MAL functions that we have defined. The arguments are the name of +the module, an array of elements of type ``mel_atom`` (not used here), +and an array of elements of type ``mel_func`` which contains our MAL +functions. 
+ +The second function, ``sql_register``, registers the SQL query that needs +to be executed to enter the SQL function into the catalog. + Makefile -------- To bring all of this together, we have a ``Makefile``. It uses the -``pkgconf`` command to find the location of the MonetDB installation -and the arguments needed to compile the module. (If you don't have +``pkgconf`` command to find the location of the MonetDB installation and +the arguments needed to compile the module. (If you don't have ``pkgconf``, you may be able to replace it with ``pkg-config``.) This Makefile works on Fedora Linux if you have the package -``MonetDB5-server-devel`` with all its dependencies installed -(available starting in the Jan2014 feature release), and on -Debian/Ubuntu if you have the packages ``libmonetdb-dev`` and -``monetdb5-server-dev`` with all their dependencies installed -(available starting in the Oct2014-SP3 bugfix release). The file may -need to be changed for other systems. +``MonetDB-SQL-server5-devel`` with all its dependencies installed +(available starting in the Oct2020-SP2 bugfix release), and on +Debian/Ubuntu if you have the package ``monetdb5-sql-dev`` with all its +dependencies installed (available starting in the Oct2020-SP2 bugfix +release). The file may need to be changed for other systems. Note that +even in the Oct2020-SP5 release there are some include files missing. A Note About Names ------------------
--- a/reverse/reverse.c Tue Jun 08 11:58:34 2021 +0200 +++ b/reverse/reverse.c Tue Jun 08 14:55:38 2021 +0200 @@ -68,16 +68,7 @@ assert(len == 0); } -/* __declspec() must be used on Windows, but not on other systems */ -#ifndef _MSC_VER -/* not Windows */ -#define __declspec(x) /* nothing */ -#endif - -extern __declspec(dllexport) char *UDFreverse(char **retval, const char **arg); -extern __declspec(dllexport) char *UDFBATreverse(bat *retval, const bat *arg); - -char * +static char * UDFreverse(char **retval, const char **arg) { size_t len; @@ -93,7 +84,7 @@ } -char * +static char * UDFBATreverse(bat *retval, const bat *arg) { BAT *b, *bn; @@ -175,7 +166,8 @@ #include "mel.h" -static char reverse_sql[] = "CREATE FUNCTION reverse(src STRING) RETURNS STRING EXTERNAL NAME reverse.reverse;"; +static char reverse_sql[] = "CREATE FUNCTION reverse(src STRING)" + " RETURNS STRING EXTERNAL NAME reverse.reverse;"; static mel_func reverse_init_funcs[] = { command("reverse", "reverse", UDFreverse, false,
--- a/reverse/reverse.mal Tue Jun 08 11:58:34 2021 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,17 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. -# -# Copyright 2013-2018 MonetDB B.V. - -module reverse; - -command reverse(ra1:str):str -address UDFreverse -comment "Reverse a string"; - -module batreverse; - -command reverse(b:bat[:str]):bat[:str] -address UDFBATreverse -comment "Reverse a BAT of strings";