Mercurial > hg > MonetDB-extend
changeset 0:a1080ed7fe4d
Created an example UDF for a SQL reverse() function.
author | Sjoerd Mullender <sjoerd@acm.org> |
---|---|
date | Fri, 05 Jul 2013 17:14:08 +0200 (2013-07-05) |
parents | |
children | ccdd5feff344 |
files | reverse/80_reverse.mal reverse/80_reverse.sql reverse/Makefile reverse/README.rst reverse/reverse.c reverse/reverse.mal |
diffstat | 6 files changed, 580 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/reverse/80_reverse.mal Fri Jul 05 17:14:08 2013 +0200 @@ -0,0 +1,18 @@ +# The contents of this file are subject to the MonetDB Public License +# Version 1.1 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# http://www.monetdb.org/Legal/MonetDBLicense +# +# Software distributed under the License is distributed on an "AS IS" +# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +# License for the specific language governing rights and limitations +# under the License. +# +# The Original Code is the MonetDB Database System. +# +# The Initial Developer of the Original Code is CWI. +# Portions created by CWI are Copyright (C) 1997-July 2008 CWI. +# Copyright August 2008-2013 MonetDB B.V. +# All Rights Reserved. + +include reverse;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/reverse/80_reverse.sql Fri Jul 05 17:14:08 2013 +0200 @@ -0,0 +1,19 @@ +-- The contents of this file are subject to the MonetDB Public License +-- Version 1.1 (the "License"); you may not use this file except in +-- compliance with the License. You may obtain a copy of the License at +-- http://www.monetdb.org/Legal/MonetDBLicense +-- +-- Software distributed under the License is distributed on an "AS IS" +-- basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +-- License for the specific language governing rights and limitations +-- under the License. +-- +-- The Original Code is the MonetDB Database System. +-- +-- The Initial Developer of the Original Code is CWI. +-- Portions created by CWI are Copyright (C) 1997-July 2008 CWI. +-- Copyright August 2008-2013 MonetDB B.V. +-- All Rights Reserved. + +CREATE FUNCTION reverse(src STRING) RETURNS STRING + EXTERNAL NAME reverse.reverse;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/reverse/Makefile Fri Jul 05 17:14:08 2013 +0200 @@ -0,0 +1,47 @@ +# The contents of this file are subject to the MonetDB Public License +# Version 1.1 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# http://www.monetdb.org/Legal/MonetDBLicense +# +# Software distributed under the License is distributed on an "AS IS" +# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +# License for the specific language governing rights and limitations +# under the License. +# +# The Original Code is the MonetDB Database System. +# +# The Initial Developer of the Original Code is CWI. +# Portions created by CWI are Copyright (C) 1997-July 2008 CWI. +# Copyright August 2008-2013 MonetDB B.V. +# All Rights Reserved. + +LIBDIR = `pkg-config --variable=libdir monetdb5` + +CC = cc + +CFLAGS = `pkg-config --cflags monetdb5` +LDFLAGS = `pkg-config --libs monetdb5` + +all: lib_reverse.so + +lib_reverse.so: reverse.o + $(CC) -fPIC -DPIC -o lib_reverse.so -shared reverse.o $(LDFLAGS) -Wl,-soname -Wl,lib_reverse.so + +reverse.o: reverse.c + $(CC) -fPIC -DPIC $(CFLAGS) -c reverse.c + +doc: README.html README.pdf + +README.html: README.rst + rst2html -d README.rst > README.html + +README.pdf: README.rst + rst2pdf README.rst + +clean: + rm -f README.html README.pdf *.o *.so + +install: lib_reverse.so + cp reverse.mal lib_reverse.so $(LIBDIR)/monetdb5 + cp 80_reverse.sql $(LIBDIR)/monetdb5/createdb + cp 80_reverse.mal $(LIBDIR)/monetdb5/autoload
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/reverse/README.rst Fri Jul 05 17:14:08 2013 +0200 @@ -0,0 +1,288 @@ +.. The contents of this file are subject to the MonetDB Public License +.. Version 1.1 (the "License"); you may not use this file except in +.. compliance with the License. You may obtain a copy of the License at +.. http://www.monetdb.org/Legal/MonetDBLicense +.. +.. Software distributed under the License is distributed on an "AS IS" +.. basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +.. License for the specific language governing rights and limitations +.. under the License. +.. +.. The Original Code is the MonetDB Database System. +.. +.. The Initial Developer of the Original Code is CWI. +.. Portions created by CWI are Copyright (C) 1997-July 2008 CWI. +.. Copyright August 2008-2013 MonetDB B.V. +.. All Rights Reserved. + +.. This document is written in reStructuredText (see + http://docutils.sourceforge.net/ for more information). + Use ``rst2html.py`` to convert this file to HTML. + +=========================================== +Implementing a Simple User-Defined Function +=========================================== + +:Author: Sjoerd Mullender +:Organization: CWI, MonetDB B.V. +:Abstract: In this short tutorial we describe how to create a simple + UDF for MonetDB/SQL. + +Introduction +------------ + +In this directory we show how to make a simple user-defined function +(UDF) that can be used in MonetDB/SQL. The function is implemented in +C. In order to make the function available to SQL, we need to provide +the correct C interface, a MAL interface, and a SQL interface. We +will discuss them all, starting from SQL and going down towards the +actual implementation. + +We want to create a function that allows us to write something like +this:: + + SELECT reverse(strcol) FROM table; + +where ``table`` is an SQL table with a column called ``strcol`` which +is of type ``VARCHAR`` (or any other string type). + +Implementation +-------------- + +We will first create an interface to do a simple one-value-at-a-time +(*scalar*) operation:: + + SELECT reverse('string'); + +The SQL catalog will need to be extended with a definition of the +``reverse`` function as follows:: + + CREATE FUNCTION reverse(src STRING) RETURNS STRING + EXTERNAL NAME reverse.reverse; + +The statement tells the SQL system that there is a function called +``reverse`` which takes a ``STRING`` argument and produces a +``STRING`` result. The function is implemented using the MAL +interface ``reverse.reverse``. + +This statement will normally be executed once when the database is +created, after which it is part of the SQL catalog. This is +accomplished by having the statement in a file in the +``$libdir/monetdb/createdb`` directory. Since files in that directory +are executed in order, the convention is to add a two digit number at +the front of the file name to force the order. So we have a file +``80_reverse.sql`` where we put this statement. + +Now that we have the SQL interface, we need to create the MAL +interface. + +The MAL interface of the function looks like this:: + + module reverse; + + command reverse(ra1:str):str + address UDFreverse + comment "Reverse a string"; + +First we need to tell the MAL interpreter that we're now in the module +``reverse``, then we create the function ``reverse`` that takes a +``str`` argument and produces a ``str`` result. The MAL ``str`` type +is used to implement all character types in SQL (i.e., ``CHAR``, +``VARCHAR``, ``CLOB`` and all their variants). The name of the +argument (``ra1``) is completely unimportant. + +The SQL engine uses the convention that a *bulk* variant of a scalar +operation (i.e., a variant that works on a complete column and +produces a column as opposed to a function that works on a single +value and produces a single value) has the same name but is located in +a module with the string ``bat`` prepended. So, the bulk version of +the ``reverse.reverse`` function can also be created:: + + module batreverse; + + command reverse(b:bat[:oid,:str]):bat[:oid,:str] + address UDFBATreverse + comment "Reverse a BAT of strings"; + +At the SQL side we don't have to do anything more. + +We put these MAL commands in the file +``$libdir/monetdb5/reverse.mal``. In addition we create a file +``$libdir/monetdb5/autoload/80_reverse.mal`` that just contains:: + + include reverse; + +The files in the ``autoload`` directory are executed in order every +time the server is started. + +Note that instead of using ``command``, we could have used +``pattern``. The ``pattern`` interface is more complicated to work +with at the C level, but also more powerful. We will not use the +``pattern`` interface here. + +Now we come to the actual implementation of the feature. + +The MAL interfaces of the scalar and bulk versions of the ``reverse`` +function translates to the following C interfaces:: + + char *UDFreverse(char **retval, const char **arg); + char *UDFBATreverse(bat *retval, const bat *arg); + +The return value of the C functions is normally ``MAL_SUCCEED`` which +translates to ``NULL``. If an error occurs, the return should be a +freshly allocated string that contains the error message. The string +must be allocated with the function ``GDKmalloc`` or one of its +variants (``GDKzalloc``, ``GDKstrdup``) since it will be freed with +``GDKfree`` when the interpreter is done with the message. Most of +the time this can be done by calling the macro ``throw`` which is +defined as ``return createException``, so calling ``throw`` with +arguments will actually call ``createException`` with those same +arguments and return the result. + +MAL commands can return any number of values. These values are +returned by the C function in the first arguments of the C function, +hence the two ``retval`` arguments. The arguments of the MAL +interface follow the return values in the argument list of the C +function (the ``arg`` arguments). We have added the ``const`` keyword +to indicate that the arguments will not be altered. + +The MonetDB code usually uses the C type ``str`` which is defined to +be ``char *``, so you could define the functions also as:: + + str UDFreverse(str *retval, const str *arg); + str UDFBATreverse(bat *retval, const bat *arg); + +The type ``bat`` is defined as ``int``, so you may see ``int`` in many +places in the MonetDB source code in equivalent positions. + +These functions must be located in a dynamically loadable module +(``.so`` file on Linux, ``.dll`` on Windows), and this module must +have the name ``lib_reverse.so`` (or ``lib_reverse.dll``). The +functions must be visible by the process loading the module, so they +must be exported (this is especially true on Windows). So we need to +declare these functions using the phrase ``__declspec(dllexport)`` on +Windows. So the full declaration becomes:: + + /* __declspec() must be used on Windows, but not on other systems */ + #ifndef _MSC_VER + /* not Windows */ + #define __declspec(x) /* nothing */ + #endif + extern __declspec(dllexport) char *UDFreverse(char **retval, const char **arg); + extern __declspec(dllexport) char *UDFBATreverse(bat *retval, const bat *arg); + +We will now first focus on the implementation of the scalar function +and return to the bulk version later. + +The input of the function ``UDFreverse`` is a (NULL-terminated) +string. The function is called with a pointer to the string pointer, +so ``*arg`` is the actual string. + +The result of the operation is also a (NULL-terminated) string. Since +the caller does not know what the size of the result will be, it +provides a pointer to where the result is to be put. The callee is +responsible for allocating the necessary space. This means that we +need to do something like this:: + + *retval = GDKmalloc(size); + // copy data into *retval + +In the case of this function, calculating the needed space is easy, +although we need to do error checking as well:: + + *retval = GDKmalloc(strlen(*arg) + 1); + if (*retval == NULL) + throw(MAL, "reverse.reverse", MAL_MALLOC_FAIL); + // reverse the string in *arg into *retval + return MAL_SUCCEED; + +In the actual algorithm we have also taken into account that strings +in MonetDB are always stored in the UTF-8 encoding. In addition, our +implementation checks for the special value ``str_nil`` which is the C +representation of the SQL ``NULL`` value for strings. + +The bulk version gets as input a pointer to a BAT identifier (a value +of type ``bat``). It also returns a BAT identifier of a newly created +BAT through the output pointer. + +It is important to realize that BATs are reference counted with two +reference counters. There is a *logical* reference count (``lref``) +and a *physical* reference count (``ref``). C code is only allowed to +look at the actual data when the *physical* reference count is greater +than zero and the BAT is loaded into (virtual) memory. A newly +created BAT has a physical reference count of one and a logical +reference count of zero. Before returning a new BAT, the physical +reference count must be converted to a logical reference count. + +We start with loading the BAT into memory. We do that by calling +``BATdescriptor`` which increments the physical reference count and +loads the BAT into memory (if it wasn't there already):: + + BAT *b; + b = BATdescriptor(*arg); + if (b == NULL) + throw(MAL, "batreverse.reverse", RUNTIME_OBJECT_MISSING); + +When we're done with this BAT, we will need to decrement the physical +reference count again. We do that by calling ``BBPreleaseref``:: + + BBPreleaseref(b->batCacheid); + +Note that ``b->batCacheid`` is equal to ``*arg``. + +We need to create the result BAT ourselves. We know the type, and we +know that the size of the BAT will be the same as the input BAT. +Hence we can use this code:: + + bn = BATnew(TYPE_void, TYPE_str, BATcount(b)); + +The arguments of ``BATnew`` are the types of the *head* and *tail* +columns, and the initial size of the to-be-allocated BAT. ``BATnew`` +guarantees that there is space for at least the specified number of +elements, or it returns ``NULL``. Since we call ``BUNappend`` to add +entries to the BAT, we're actually not concerned about the size of the +new BAT (``BUNappend`` takes care of resizing if necessary), but from +an efficiency point of view, it's better to create the BAT with the +required size (growing a BAT can be expensive). + +We then set the sequence base for the head column of the new BAT:: + + BATseqbase(bn, b->hseqbase); + +Iterating through the source BAT is done using a standard mechanism:: + + BATiter bi; + BUN p, q; + bi = bat_iterator(b); + BATloop(b, p, q) { + ... + } + +``BATloop`` is a macro that translates to a C ``for`` loop, using the +first argument to set the bounds, the second argument to iterate +through the BAT, and the third argument as the loop limit. The second +argument can be used inside the body as an argument to +e.g. ``BUNtail``. + +The body of the loop first retrieves the current value from the tail +column:: + + src = (const char *) BUNtail(bi, p); + +We then use this string in the same way as in the scalar function. +The reversed string in ``dst`` is appended to the result BAT:: + + BUNappend(bn, dst, 0); + +The third argument to ``BUNappend`` is called ``force``. Only use +non-zero there if you know what you're doing (and if you need to read +this document, you don't). + +Makefile +-------- + +To bring all of this together, we have a ``Makefile``. It uses the +``pkg-config`` command to find the location of the MonetDB +installation and the arguments needed to compile the module. This +Makefile works on Fedora Linux. The file may need to be changed for +other systems.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/reverse/reverse.c Fri Jul 05 17:14:08 2013 +0200 @@ -0,0 +1,180 @@ +/* + * The contents of this file are subject to the MonetDB Public License + * Version 1.1 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * http://www.monetdb.org/Legal/MonetDBLicense + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the + * License for the specific language governing rights and limitations + * under the License. + * + * The Original Code is the MonetDB Database System. + * + * The Initial Developer of the Original Code is CWI. + * Portions created by CWI are Copyright (C) 1997-July 2008 CWI. + * Copyright August 2008-2013 MonetDB B.V. + * All Rights Reserved. + */ + +/* monetdb_config.h must be included as the first include file */ +#include <monetdb_config.h> +/* mal_exception.h actually contains everything we need */ +#include <mal_exception.h> + +/* system include files */ +#include <string.h> + +/* dst is a buffer of length larger than len, src is a UTF-8-encoded + * string of length exactly len bytes */ +static void +do_reverse(char *dst, const char *src, size_t len) +{ + dst[len] = 0; + if (strcmp(src, str_nil) == 0) { + /* special case for nil */ + strcpy(dst, str_nil); + assert(len == strlen(str_nil)); + return; + } + while (*src) { + if ((*src & 0xF8) == 0xF0) { + /* 4 byte UTF-8 sequence */ + assert(len >= 4); + dst[len - 4] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 3] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 2] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 1] = *src++; + len -= 4; + } else if ((*src & 0xF0) == 0xE0) { + /* 3 byte UTF-8 sequence */ + assert(len >= 3); + dst[len - 3] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 2] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 1] = *src++; + len -= 3; + } else if ((*src & 0xE0) == 0xC0) { + /* 2 byte UTF-8 sequence */ + assert(len >= 2); + dst[len - 2] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 1] = *src++; + len -= 2; + } else { + assert(len >= 1); + assert((*src & 0x80) == 0); + dst[--len] = *src++; + } + } + assert(len == 0); +} + +/* __declspec() must be used on Windows, but not on other systems */ +#ifndef _MSC_VER +/* not Windows */ +#define __declspec(x) /* nothing */ +#endif + +extern __declspec(dllexport) char *UDFreverse(char **retval, const char **arg); +extern __declspec(dllexport) char *UDFBATreverse(bat *retval, const bat *arg); + +char * +UDFreverse(char **retval, const char **arg) +{ + size_t len; + + len = strlen(*arg); + *retval = GDKmalloc(len + 1); + if (*retval == NULL) + throw(MAL, "reverse.reverse", MAL_MALLOC_FAIL); + + do_reverse(*retval, *arg, len); + + return MAL_SUCCEED; + +} + +char * +UDFBATreverse(bat *retval, const bat *arg) +{ + BAT *b, *bn; + BATiter bi; + BUN p, q; + const char *src; + size_t len; + char *dst; + size_t dstlen; + + /* allocate temporary space for reversed strings; we grow this + * if we need more */ + dstlen = 1024; + dst = GDKmalloc(dstlen); + if (dst == NULL) + throw(MAL, "batreverse.reverse", MAL_MALLOC_FAIL); + + b = BATdescriptor(*arg); + if (b == NULL) + throw(MAL, "batreverse.reverse", RUNTIME_OBJECT_MISSING); + + /* we should only get called for string BATs */ + assert(b->ttype == TYPE_str); + + /* allocate result BAT */ + bn = BATnew(TYPE_void, TYPE_str, BATcount(b)); + if (bn == NULL) { + BBPreleaseref(b->batCacheid); + throw(MAL, "batreverse.reverse", MAL_MALLOC_FAIL); + } + + /* copy seqbase from old to new */ + BATseqbase(bn, b->hseqbase); + + /* loop through BAT b; p is index of the entry we're working + * on, q is used internally by BATloop to do the iterating */ + bi = bat_iterator(b); + BATloop(b, p, q) { + src = (const char *) BUNtail(bi, p); + len = strlen(src); + /* make sure dst is large enough */ + if (len >= dstlen) { + char *ndst; + + dstlen = len + 1024; + ndst = GDKrealloc(dst, dstlen); + if (ndst == NULL) { + /* if GDKrealloc fails, dst is still + * allocated */ + goto bailout; + } + dst = ndst; + } + do_reverse(dst, src, len); + if (BUNappend(bn, dst, 0) == NULL) { + /* BUNappend can fail since it may have to + * grow memory areas--especially true for + * string BATs */ + goto bailout; + } + } + GDKfree(dst); + + BBPreleaseref(b->batCacheid); + + *retval = bn->batCacheid; + BBPkeepref(bn->batCacheid); + + return MAL_SUCCEED; + + bailout: + /* we only get here in the case of an allocation error; clean + * up the mess we've created and throw an exception */ + GDKfree(dst); + BBPreleaseref(b->batCacheid); + BBPreleaseref(bn->batCacheid); + throw(MAL, "batreverse.reverse", MAL_MALLOC_FAIL); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/reverse/reverse.mal Fri Jul 05 17:14:08 2013 +0200 @@ -0,0 +1,28 @@ +# The contents of this file are subject to the MonetDB Public License +# Version 1.1 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# http://www.monetdb.org/Legal/MonetDBLicense +# +# Software distributed under the License is distributed on an "AS IS" +# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +# License for the specific language governing rights and limitations +# under the License. +# +# The Original Code is the MonetDB Database System. +# +# The Initial Developer of the Original Code is CWI. +# Portions created by CWI are Copyright (C) 1997-July 2008 CWI. +# Copyright August 2008-2013 MonetDB B.V. +# All Rights Reserved. + +module reverse; + +command reverse(ra1:str):str +address UDFreverse +comment "Reverse a string"; + +module batreverse; + +command reverse(b:bat[:oid,:str]):bat[:oid,:str] +address UDFBATreverse +comment "Reverse a BAT of strings";