changeset 0:a1080ed7fe4d

Created an example UDF for a SQL reverse() function.
author Sjoerd Mullender <sjoerd@acm.org>
date Fri, 05 Jul 2013 17:14:08 +0200 (2013-07-05)
parents
children ccdd5feff344
files reverse/80_reverse.mal reverse/80_reverse.sql reverse/Makefile reverse/README.rst reverse/reverse.c reverse/reverse.mal
diffstat 6 files changed, 580 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/reverse/80_reverse.mal	Fri Jul 05 17:14:08 2013 +0200
@@ -0,0 +1,18 @@
+# The contents of this file are subject to the MonetDB Public License
+# Version 1.1 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+# http://www.monetdb.org/Legal/MonetDBLicense
+#
+# Software distributed under the License is distributed on an "AS IS"
+# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+# License for the specific language governing rights and limitations
+# under the License.
+#
+# The Original Code is the MonetDB Database System.
+#
+# The Initial Developer of the Original Code is CWI.
+# Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+# Copyright August 2008-2013 MonetDB B.V.
+# All Rights Reserved.
+
+include reverse;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/reverse/80_reverse.sql	Fri Jul 05 17:14:08 2013 +0200
@@ -0,0 +1,19 @@
+-- The contents of this file are subject to the MonetDB Public License
+-- Version 1.1 (the "License"); you may not use this file except in
+-- compliance with the License. You may obtain a copy of the License at
+-- http://www.monetdb.org/Legal/MonetDBLicense
+--
+-- Software distributed under the License is distributed on an "AS IS"
+-- basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+-- License for the specific language governing rights and limitations
+-- under the License.
+--
+-- The Original Code is the MonetDB Database System.
+--
+-- The Initial Developer of the Original Code is CWI.
+-- Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+-- Copyright August 2008-2013 MonetDB B.V.
+-- All Rights Reserved.
+
+CREATE FUNCTION reverse(src STRING) RETURNS STRING
+       EXTERNAL NAME reverse.reverse;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/reverse/Makefile	Fri Jul 05 17:14:08 2013 +0200
@@ -0,0 +1,47 @@
+# The contents of this file are subject to the MonetDB Public License
+# Version 1.1 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+# http://www.monetdb.org/Legal/MonetDBLicense
+#
+# Software distributed under the License is distributed on an "AS IS"
+# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+# License for the specific language governing rights and limitations
+# under the License.
+#
+# The Original Code is the MonetDB Database System.
+#
+# The Initial Developer of the Original Code is CWI.
+# Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+# Copyright August 2008-2013 MonetDB B.V.
+# All Rights Reserved.
+
+LIBDIR = `pkg-config --variable=libdir monetdb5`
+
+CC = cc
+
+CFLAGS = `pkg-config --cflags monetdb5`
+LDFLAGS = `pkg-config --libs monetdb5`
+
+all: lib_reverse.so
+
+lib_reverse.so: reverse.o
+	$(CC) -fPIC -DPIC -o lib_reverse.so -shared reverse.o $(LDFLAGS) -Wl,-soname -Wl,lib_reverse.so
+
+reverse.o: reverse.c
+	$(CC) -fPIC -DPIC $(CFLAGS) -c reverse.c
+
+doc: README.html README.pdf
+
+README.html: README.rst
+	rst2html -d README.rst > README.html
+
+README.pdf: README.rst
+	rst2pdf README.rst
+
+clean:
+	rm -f README.html README.pdf *.o *.so
+
+install: lib_reverse.so
+	cp reverse.mal lib_reverse.so $(LIBDIR)/monetdb5
+	cp 80_reverse.sql $(LIBDIR)/monetdb5/createdb
+	cp 80_reverse.mal $(LIBDIR)/monetdb5/autoload
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/reverse/README.rst	Fri Jul 05 17:14:08 2013 +0200
@@ -0,0 +1,288 @@
+.. The contents of this file are subject to the MonetDB Public License
+.. Version 1.1 (the "License"); you may not use this file except in
+.. compliance with the License. You may obtain a copy of the License at
+.. http://www.monetdb.org/Legal/MonetDBLicense
+..
+.. Software distributed under the License is distributed on an "AS IS"
+.. basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+.. License for the specific language governing rights and limitations
+.. under the License.
+..
+.. The Original Code is the MonetDB Database System.
+..
+.. The Initial Developer of the Original Code is CWI.
+.. Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+.. Copyright August 2008-2013 MonetDB B.V.
+.. All Rights Reserved.
+
+.. This document is written in reStructuredText (see
+   http://docutils.sourceforge.net/ for more information).
+   Use ``rst2html.py`` to convert this file to HTML.
+
+===========================================
+Implementing a Simple User-Defined Function
+===========================================
+
+:Author: Sjoerd Mullender
+:Organization: CWI, MonetDB B.V.
+:Abstract: In this short tutorial we describe how to create a simple
+   UDF for MonetDB/SQL.
+
+Introduction
+------------
+
+In this directory we show how to make a simple user-defined function
+(UDF) that can be used in MonetDB/SQL.  The function is implemented in
+C.  In order to make the function available to SQL, we need to provide
+the correct C interface, a MAL interface, and a SQL interface.  We
+will discuss them all, starting from SQL and going down towards the
+actual implementation.
+
+We want to create a function that allows us to write something like
+this::
+
+ SELECT reverse(strcol) FROM table;
+
+where ``table`` is an SQL table with a column called ``strcol`` which
+is of type ``VARCHAR`` (or any other string type).
+
+Implementation
+--------------
+
+We will first create an interface to do a simple one-value-at-a-time
+(*scalar*) operation::
+
+ SELECT reverse('string');
+
+The SQL catalog will need to be extended with a definition of the
+``reverse`` function as follows::
+
+ CREATE FUNCTION reverse(src STRING) RETURNS STRING
+        EXTERNAL NAME reverse.reverse;
+
+The statement tells the SQL system that there is a function called
+``reverse`` which takes a ``STRING`` argument and produces a
+``STRING`` result.  The function is implemented using the MAL
+interface ``reverse.reverse``.
+
+This statement will normally be executed once when the database is
+created, after which it is part of the SQL catalog.  This is
+accomplished by having the statement in a file in the
+``$libdir/monetdb/createdb`` directory.  Since files in that directory
+are executed in order, the convention is to add a two digit number at
+the front of the file name to force the order.  So we have a file
+``80_reverse.sql`` where we put this statement.
+
+Now that we have the SQL interface, we need to create the MAL
+interface.
+
+The MAL interface of the function looks like this::
+
+ module reverse;
+
+ command reverse(ra1:str):str
+ address UDFreverse
+ comment "Reverse a string";
+
+First we need to tell the MAL interpreter that we're now in the module
+``reverse``, then we create the function ``reverse`` that takes a
+``str`` argument and produces a ``str`` result.  The MAL ``str`` type
+is used to implement all character types in SQL (i.e., ``CHAR``,
+``VARCHAR``, ``CLOB`` and all their variants).  The name of the
+argument (``ra1``) is completely unimportant.
+
+The SQL engine uses the convention that a *bulk* variant of a scalar
+operation (i.e., a variant that works on a complete column and
+produces a column as opposed to a function that works on a single
+value and produces a single value) has the same name but is located in
+a module with the string ``bat`` prepended.  So, the bulk version of
+the ``reverse.reverse`` function can also be created::
+
+ module batreverse;
+
+ command reverse(b:bat[:oid,:str]):bat[:oid,:str]
+ address UDFBATreverse
+ comment "Reverse a BAT of strings";
+
+At the SQL side we don't have to do anything more.
+
+We put these MAL commands in the file
+``$libdir/monetdb5/reverse.mal``.  In addition we create a file
+``$libdir/monetdb5/autoload/80_reverse.mal`` that just contains::
+
+ include reverse;
+
+The files in the ``autoload`` directory are executed in order every
+time the server is started.
+
+Note that instead of using ``command``, we could have used
+``pattern``.  The ``pattern`` interface is more complicated to work
+with at the C level, but also more powerful.  We will not use the
+``pattern`` interface here.
+
+Now we come to the actual implementation of the feature.
+
+The MAL interfaces of the scalar and bulk versions of the ``reverse``
+function translates to the following C interfaces::
+
+ char *UDFreverse(char **retval, const char **arg);
+ char *UDFBATreverse(bat *retval, const bat *arg);
+
+The return value of the C functions is normally ``MAL_SUCCEED`` which
+translates to ``NULL``.  If an error occurs, the return should be a
+freshly allocated string that contains the error message.  The string
+must be allocated with the function ``GDKmalloc`` or one of its
+variants (``GDKzalloc``, ``GDKstrdup``) since it will be freed with
+``GDKfree`` when the interpreter is done with the message.  Most of
+the time this can be done by calling the macro ``throw`` which is
+defined as ``return createException``, so calling ``throw`` with
+arguments will actually call ``createException`` with those same
+arguments and return the result.
+
+MAL commands can return any number of values.  These values are
+returned by the C function in the first arguments of the C function,
+hence the two ``retval`` arguments.  The arguments of the MAL
+interface follow the return values in the argument list of the C
+function (the ``arg`` arguments).  We have added the ``const`` keyword
+to indicate that the arguments will not be altered.
+
+The MonetDB code usually uses the C type ``str`` which is defined to
+be ``char *``, so you could define the functions also as::
+
+ str UDFreverse(str *retval, const str *arg);
+ str UDFBATreverse(bat *retval, const bat *arg);
+
+The type ``bat`` is defined as ``int``, so you may see ``int`` in many
+places in the MonetDB source code in equivalent positions.
+
+These functions must be located in a dynamically loadable module
+(``.so`` file on Linux, ``.dll`` on Windows), and this module must
+have the name ``lib_reverse.so`` (or ``lib_reverse.dll``).  The
+functions must be visible by the process loading the module, so they
+must be exported (this is especially true on Windows).  So we need to
+declare these functions using the phrase ``__declspec(dllexport)`` on
+Windows.  So the full declaration becomes::
+
+ /* __declspec() must be used on Windows, but not on other systems */
+ #ifndef _MSC_VER
+ /* not Windows */
+ #define __declspec(x)	/* nothing */
+ #endif
+ extern __declspec(dllexport) char *UDFreverse(char **retval, const char **arg);
+ extern __declspec(dllexport) char *UDFBATreverse(bat *retval, const bat *arg);
+
+We will now first focus on the implementation of the scalar function
+and return to the bulk version later.
+
+The input of the function ``UDFreverse`` is a (NULL-terminated)
+string.  The function is called with a pointer to the string pointer,
+so ``*arg`` is the actual string.
+
+The result of the operation is also a (NULL-terminated) string.  Since
+the caller does not know what the size of the result will be, it
+provides a pointer to where the result is to be put.  The callee is
+responsible for allocating the necessary space.  This means that we
+need to do something like this::
+
+ *retval = GDKmalloc(size);
+ // copy data into *retval
+
+In the case of this function, calculating the needed space is easy,
+although we need to do error checking as well::
+
+ *retval = GDKmalloc(strlen(*arg) + 1);
+ if (*retval == NULL)
+     throw(MAL, "reverse.reverse", MAL_MALLOC_FAIL);
+ // reverse the string in *arg into *retval
+ return MAL_SUCCEED;
+
+In the actual algorithm we have also taken into account that strings
+in MonetDB are always stored in the UTF-8 encoding.  In addition, our
+implementation checks for the special value ``str_nil`` which is the C
+representation of the SQL ``NULL`` value for strings.
+
+The bulk version gets as input a pointer to a BAT identifier (a value
+of type ``bat``).  It also returns a BAT identifier of a newly created
+BAT through the output pointer.
+
+It is important to realize that BATs are reference counted with two
+reference counters.  There is a *logical* reference count (``lref``)
+and a *physical* reference count (``ref``).  C code is only allowed to
+look at the actual data when the *physical* reference count is greater
+than zero and the BAT is loaded into (virtual) memory.  A newly
+created BAT has a physical reference count of one and a logical
+reference count of zero.  Before returning a new BAT, the physical
+reference count must be converted to a logical reference count.
+
+We start with loading the BAT into memory.  We do that by calling
+``BATdescriptor`` which increments the physical reference count and
+loads the BAT into memory (if it wasn't there already)::
+
+ BAT *b;
+ b = BATdescriptor(*arg);
+ if (b == NULL)
+     throw(MAL, "batreverse.reverse", RUNTIME_OBJECT_MISSING);
+
+When we're done with this BAT, we will need to decrement the physical
+reference count again.  We do that by calling ``BBPreleaseref``::
+
+ BBPreleaseref(b->batCacheid);
+
+Note that ``b->batCacheid`` is equal to ``*arg``.
+
+We need to create the result BAT ourselves.  We know the type, and we
+know that the size of the BAT will be the same as the input BAT.
+Hence we can use this code::
+
+ bn = BATnew(TYPE_void, TYPE_str, BATcount(b));
+
+The arguments of ``BATnew`` are the types of the *head* and *tail*
+columns, and the initial size of the to-be-allocated BAT.  ``BATnew``
+guarantees that there is space for at least the specified number of
+elements, or it returns ``NULL``.  Since we call ``BUNappend`` to add
+entries to the BAT, we're actually not concerned about the size of the
+new BAT (``BUNappend`` takes care of resizing if necessary), but from
+an efficiency point of view, it's better to create the BAT with the
+required size (growing a BAT can be expensive).
+
+We then set the sequence base for the head column of the new BAT::
+
+ BATseqbase(bn, b->hseqbase);
+
+Iterating through the source BAT is done using a standard mechanism::
+
+ BATiter bi;
+ BUN p, q;
+ bi = bat_iterator(b);
+ BATloop(b, p, q) {
+     ...
+ }
+
+``BATloop`` is a macro that translates to a C ``for`` loop, using the
+first argument to set the bounds, the second argument to iterate
+through the BAT, and the third argument as the loop limit.  The second
+argument can be used inside the body as an argument to
+e.g. ``BUNtail``.
+
+The body of the loop first retrieves the current value from the tail
+column::
+
+ src = (const char *) BUNtail(bi, p);
+
+We then use this string in the same way as in the scalar function.
+The reversed string in ``dst`` is appended to the result BAT::
+
+ BUNappend(bn, dst, 0);
+
+The third argument to ``BUNappend`` is called ``force``.  Only use
+non-zero there if you know what you're doing (and if you need to read
+this document, you don't).
+
+Makefile
+--------
+
+To bring all of this together, we have a ``Makefile``.  It uses the
+``pkg-config`` command to find the location of the MonetDB
+installation and the arguments needed to compile the module.  This
+Makefile works on Fedora Linux.  The file may need to be changed for
+other systems.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/reverse/reverse.c	Fri Jul 05 17:14:08 2013 +0200
@@ -0,0 +1,180 @@
+/*
+ * The contents of this file are subject to the MonetDB Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.monetdb.org/Legal/MonetDBLicense
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * The Original Code is the MonetDB Database System.
+ *
+ * The Initial Developer of the Original Code is CWI.
+ * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+ * Copyright August 2008-2013 MonetDB B.V.
+ * All Rights Reserved.
+ */
+
+/* monetdb_config.h must be included as the first include file */
+#include <monetdb_config.h>
+/* mal_exception.h actually contains everything we need */
+#include <mal_exception.h>
+
+/* system include files */
+#include <string.h>
+
+/* dst is a buffer of length larger than len, src is a UTF-8-encoded
+ * string of length exactly len bytes */
+static void
+do_reverse(char *dst, const char *src, size_t len)
+{
+	dst[len] = 0;
+	if (strcmp(src, str_nil) == 0) {
+		/* special case for nil */
+		strcpy(dst, str_nil);
+		assert(len == strlen(str_nil));
+		return;
+	}
+	while (*src) {
+		if ((*src & 0xF8) == 0xF0) {
+			/* 4 byte UTF-8 sequence */
+			assert(len >= 4);
+			dst[len - 4] = *src++;
+			assert((*src & 0xC0) == 0x80);
+			dst[len - 3] = *src++;
+			assert((*src & 0xC0) == 0x80);
+			dst[len - 2] = *src++;
+			assert((*src & 0xC0) == 0x80);
+			dst[len - 1] = *src++;
+			len -= 4;
+		} else if ((*src & 0xF0) == 0xE0) {
+			/* 3 byte UTF-8 sequence */
+			assert(len >= 3);
+			dst[len - 3] = *src++;
+			assert((*src & 0xC0) == 0x80);
+			dst[len - 2] = *src++;
+			assert((*src & 0xC0) == 0x80);
+			dst[len - 1] = *src++;
+			len -= 3;
+		} else if ((*src & 0xE0) == 0xC0) {
+			/* 2 byte UTF-8 sequence */
+			assert(len >= 2);
+			dst[len - 2] = *src++;
+			assert((*src & 0xC0) == 0x80);
+			dst[len - 1] = *src++;
+			len -= 2;
+		} else {
+			assert(len >= 1);
+			assert((*src & 0x80) == 0);
+			dst[--len] = *src++;
+		}
+	}
+	assert(len == 0);
+}
+
+/* __declspec() must be used on Windows, but not on other systems */
+#ifndef _MSC_VER
+/* not Windows */
+#define __declspec(x)	/* nothing */
+#endif
+
+extern __declspec(dllexport) char *UDFreverse(char **retval, const char **arg);
+extern __declspec(dllexport) char *UDFBATreverse(bat *retval, const bat *arg);
+
+char *
+UDFreverse(char **retval, const char **arg)
+{
+	size_t len;
+
+	len = strlen(*arg);
+	*retval = GDKmalloc(len + 1);
+	if (*retval == NULL)
+		throw(MAL, "reverse.reverse", MAL_MALLOC_FAIL);
+
+	do_reverse(*retval, *arg, len);
+
+	return MAL_SUCCEED;
+
+}
+
+char *
+UDFBATreverse(bat *retval, const bat *arg)
+{
+	BAT *b, *bn;
+	BATiter bi;
+	BUN p, q;
+	const char *src;
+	size_t len;
+	char *dst;
+	size_t dstlen;
+
+	/* allocate temporary space for reversed strings; we grow this
+	 * if we need more */
+	dstlen = 1024;
+	dst = GDKmalloc(dstlen);
+	if (dst == NULL)
+		throw(MAL, "batreverse.reverse", MAL_MALLOC_FAIL);
+
+	b = BATdescriptor(*arg);
+	if (b == NULL)
+		throw(MAL, "batreverse.reverse", RUNTIME_OBJECT_MISSING);
+
+	/* we should only get called for string BATs */
+	assert(b->ttype == TYPE_str);
+
+	/* allocate result BAT */
+	bn = BATnew(TYPE_void, TYPE_str, BATcount(b));
+	if (bn == NULL) {
+		BBPreleaseref(b->batCacheid);
+		throw(MAL, "batreverse.reverse", MAL_MALLOC_FAIL);
+	}
+
+	/* copy seqbase from old to new */
+	BATseqbase(bn, b->hseqbase);
+
+	/* loop through BAT b; p is index of the entry we're working
+	 * on, q is used internally by BATloop to do the iterating */
+	bi = bat_iterator(b);
+	BATloop(b, p, q) {
+		src = (const char *) BUNtail(bi, p);
+		len = strlen(src);
+		/* make sure dst is large enough */
+		if (len >= dstlen) {
+			char *ndst;
+
+			dstlen = len + 1024;
+			ndst = GDKrealloc(dst, dstlen);
+			if (ndst == NULL) {
+				/* if GDKrealloc fails, dst is still
+				 * allocated */
+				goto bailout;
+			}
+			dst = ndst;
+		}
+		do_reverse(dst, src, len);
+		if (BUNappend(bn, dst, 0) == NULL) {
+			/* BUNappend can fail since it may have to
+			 * grow memory areas--especially true for
+			 * string BATs */
+			goto bailout;
+		}
+	}
+	GDKfree(dst);
+
+	BBPreleaseref(b->batCacheid);
+
+	*retval = bn->batCacheid;
+	BBPkeepref(bn->batCacheid);
+
+	return MAL_SUCCEED;
+
+  bailout:
+	/* we only get here in the case of an allocation error; clean
+	 * up the mess we've created and throw an exception */
+	GDKfree(dst);
+	BBPreleaseref(b->batCacheid);
+	BBPreleaseref(bn->batCacheid);
+	throw(MAL, "batreverse.reverse", MAL_MALLOC_FAIL);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/reverse/reverse.mal	Fri Jul 05 17:14:08 2013 +0200
@@ -0,0 +1,28 @@
+# The contents of this file are subject to the MonetDB Public License
+# Version 1.1 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+# http://www.monetdb.org/Legal/MonetDBLicense
+#
+# Software distributed under the License is distributed on an "AS IS"
+# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+# License for the specific language governing rights and limitations
+# under the License.
+#
+# The Original Code is the MonetDB Database System.
+#
+# The Initial Developer of the Original Code is CWI.
+# Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+# Copyright August 2008-2013 MonetDB B.V.
+# All Rights Reserved.
+
+module reverse;
+
+command reverse(ra1:str):str
+address UDFreverse
+comment "Reverse a string";
+
+module batreverse;
+
+command reverse(b:bat[:oid,:str]):bat[:oid,:str]
+address UDFBATreverse
+comment "Reverse a BAT of strings";