changeset 43:1cfe9d8a07e8

Implemented an aggregate tutorial.
author Sjoerd Mullender <sjoerd@acm.org>
date Thu, 10 Jun 2021 16:22:08 +0200 (2021-06-10)
parents b67deab098f9
children 44031912920c
files README.rst gmean/Makefile gmean/README.rst gmean/gmean.c
diffstat 4 files changed, 765 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/README.rst	Thu Jun 10 16:21:46 2021 +0200
+++ b/README.rst	Thu Jun 10 16:22:08 2021 +0200
@@ -76,3 +76,10 @@
 
 This query filters the table t and selects all rows where the value of
 column ``c`` is equal to ``0``.
+
+gmean
+.....
+
+The tutorial in the ``gmean`` subdirectory contains an example of an
+AGGREGATE FUNCTION.  These functions are used to calculate aggregates
+over columns of values.  Aggregates can be simple or grouped.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gmean/Makefile	Thu Jun 10 16:22:08 2021 +0200
@@ -0,0 +1,36 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0.  If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# Copyright 2013-2021 MonetDB B.V.
+
+LIBDIR = $(shell pkgconf --variable=libdir monetdb5)
+
+CFLAGS += $(shell pkgconf --cflags monetdb5)
+LDFLAGS += $(shell pkgconf --libs monetdb5)
+
+all: lib_gmean.so
+
+lib_gmean.so: gmean.o
+	$(CC) -fPIC -DPIC -o lib_gmean.so -shared gmean.o $(LDFLAGS) -Wl,-soname -Wl,lib_gmean.so
+
+gmean.o: gmean.c
+	$(CC) -fPIC -DPIC $(CFLAGS) -c gmean.c
+
+doc: README.html README.pdf
+
+README.html: README.rst
+	rst2html -d README.rst > README.html
+
+README.pdf: README.rst
+	rst2pdf README.rst
+
+clean:
+	rm -f README.html README.pdf *.o *.so
+
+install: lib_gmean.so
+	cp lib_gmean.so $(DESTDIR)$(LIBDIR)/monetdb5
+
+tar: MonetDB-gmean-1.2.tar.bz2
+MonetDB-gmean-1.2.tar.bz2: README.rst Makefile gmean.c
+	tar -cjf MonetDB-gmean-1.2.tar.bz2 --transform='s|^|MonetDB-gmean-1.2/|' README.rst Makefile gmean.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gmean/README.rst	Thu Jun 10 16:22:08 2021 +0200
@@ -0,0 +1,435 @@
+.. This Source Code Form is subject to the terms of the Mozilla Public
+.. License, v. 2.0.  If a copy of the MPL was not distributed with this
+.. file, You can obtain one at http://mozilla.org/MPL/2.0/.
+..
+.. Copyright 2021 MonetDB B.V.
+
+.. This document is written in reStructuredText (see
+   http://docutils.sourceforge.net/ for more information).
+   Use ``rst2html.py`` to convert this file to HTML.
+
+==================================
+Implementing an Aggregate Function
+==================================
+
+:Author: Sjoerd Mullender
+:Organization: MonetDB Solutions B.V., MonetDB B.V.
+:Abstract: In this short tutorial we describe how to create an
+	   aggregate function for MonetDB/SQL.
+
+Introduction
+------------
+
+We want to create a function to calculate the geometric mean of a
+collection of integer values.  The geometric mean of a set of *n*
+values is the *n*\ th root of the product of those values.  See e.g. the
+`Wikipedia article`__.
+
+__ https://en.wikipedia.org/wiki/Geometric_mean
+
+We will make a slight change to the algorithm: if any of the values is
+zero, we return zero (this follows logically from the definition), and
+if there is an odd number of negative values, the product is negative,
+but since an even root of a negative number would result in an
+imaginary number, we will change that to simply a negative result.
+
+We will name the function ``gmean``, so that in SQL we can write:
+
+.. code-block:: sql
+
+  SELECT gmean(i) FROM t;
+
+if we have a table ``t`` with a column ``i`` of type ``INTEGER``.
+
+In SQL we can also have grouped aggregates, so an aggregate value for
+each member of a group.  So we can also write:
+
+.. code-block:: sql
+
+  SELECT g, gmean(i) FROM t GROUP BY g;
+
+if we have this same table with an additional column ``g`` that
+specifies the groups to which the values belong.
+
+Note that SQL aggregates have a number of quirks.  A value of ``NULL``
+does not count toward the aggregate at all.  It is as if the value
+simply does not exist.  If there are no values to be aggregated (i.e.,
+all are ``NULL`` or there simply aren't any values), the result of the
+aggregate is ``NULL``.  If an error occurs during aggregation, such as
+overflow or division by zero, an error should be raised.
+
+Implementation
+--------------
+
+Three things need to be done.  We need to add a function definition to
+the SQL catalog, we need to describe the MAL interface for the MAL
+interpreter, and we need to create the C implementation of the
+function.  We will describe the three steps here in this order.
+
+SQL
+...
+
+The SQL interface is simple:
+
+.. code-block:: sql
+
+  CREATE AGGREGATE FUNCTION gmean(val INTEGER) RETURNS DOUBLE
+         EXTERNAL NAME gmean.gmean;
+
+Note that the word ``FUNCTION`` here is optional, you could also
+write:
+
+.. code-block:: sql
+
+  CREATE AGGREGATE gmean(val INTEGER) RETURNS DOUBLE
+         EXTERNAL NAME gmean.gmean;
+
+When we create an ``AGGREGATE FUNCTION`` in SQL, the SQL system
+expects four MAL functions whose names are based on the external name
+given.  In this case, the MAL functions are called ``gmean.gmean`` and
+``gmean.subgmean``, each with two overloaded definitions.
+
+The two functions ``gmean.gmean`` are for the simple aggregate and a
+simple grouped aggregate.  The two functions ``gmean.subgmean`` are
+for grouped aggregates, but with some extra arguments.
+
+The ``CREATE`` statement will normally be executed once when the
+database is created, after which it is part of the SQL catalog.  To
+accomplish this we need to store the SQL query in a C string (also see
+the *reverse* tutorial):
+
+.. code-block:: c
+
+  static char gmean_sql[] = "CREATE AGGREGATE FUNCTION gmean(val INTEGER)"
+	  " returns DOUBLE external name gmean.gmean;";
+
+At the SQL side we don't need to do anything more.
+
+MAL
+...
+
+As mentioned, the MAL interface consists of four functions whose names
+are based on the names specified in the SQL interface.  The interface
+looks like this:
+
+.. code-block::
+
+  module gmean;
+
+  command gmean(val:bat[:int]) :dbl
+  address gmean_int
+  comment "geometric mean";
+
+  command gmean(val:bat[:int], g:bat[:oid], e:bat[:any_1]) :bat[:dbl]
+  address groupedgmean_int
+  comment "geometric mean";
+
+  command subgmean(val:bat[:int], g:bat[:oid], e:bat[:any_1],
+                   skip_nils:bit, abort_on_error:bit) :bat[:dbl]
+  address subgroupedgmean_int
+  comment "geometric mean";
+  
+  command subgmean(val:bat[:int], g:bat[:oid], e:bat[:any_1], s:bat[:oid],
+                   skip_nils:bit, abort_on_error:bit) :bat[:dbl]
+  address subgroupedgmeancand_int
+  comment "geometric mean";
+
+We encode these MAL commands in a C array (again, see the *reverse*
+tutorial):
+
+.. code-block:: c
+
+  static mel_func gmean_init_funcs[] = {
+	  command("gmean", "gmean", gmean_int, false,
+		  "geometric mean",
+		  args(1,2, arg("",dbl),batarg("b",int))),
+	  command("gmean", "gmean", groupedgmean_int, false,
+		  "geometric mean",
+		  args(1,4, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1))),
+	  command("gmean", "subgmean", subgroupedgmean_int, false,
+		  "geometric mean",
+		  args(1,6, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1),arg("skip_nils",bit),arg("abort_on_error",bit))),
+	  command("gmean", "subgmean", subgroupedgmeancand_int, false,
+		  "geometric mean",
+		  args(1,7, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1),batarg("s",oid),arg("skip_nils",bit),arg("abort_on_error",bit))),
+	  { .imp=NULL }		/* sentinel */
+  };
+
+C Implementation
+................
+
+In the C implementation we distinguish two cases.  The first, simpler
+case is an aggregate of a set of values that returns a single value.
+This is the implementation of the first of the four MAL interfaces.
+The second case is for the three other MAL interfaces which are all
+for grouped aggregates.  Those three variants can all be implemented
+with a single C function, but since the parameter are different, we
+need an indirection.
+
+First the simple case.
+
+Geometric Mean
+``````````````
+
+First a few words about the implementation of the algorithm to
+calculate the geometric mean.  As mentioned, we should multiply all
+values together and the take the *n*\ th root of the result.  If we
+were to do this, the intermediate result (the product) would become
+very large and likely overflow.  So instead what we do is take the
+logarithm of each of the values, add those up, and divide the sum of
+the logarithms by the number of values.  Then as a final step we take
+the exponent of this value to give us the geometric mean.
+
+There are a few caveats in this algorithm.  The logarithm of zero of
+of negative numbers does not exist, so we need to take special care.
+If we encounter a zero in the list of values we can simply shortcut
+the calculation: the end result will be zero as well, so we can return
+that immediately.  For negative values we should be returning an
+imaginary number, but since MonetDB does not support that, we could
+choose between returning an error and returning some other value.  We
+have chosen to return minus the exponent if the product were to become
+negative.  This means we have to count the number of negative values
+and if the total number of negative values is odd, we negate the
+result of the exponent.
+
+Simple Aggregate
+````````````````
+
+The C interface for the first MAL function is:
+
+.. code-block:: c
+
+  static char *gmean_int(dbl *res, const bat *bid);
+
+As before, ``*bid`` is the BAT identifier (an integer) and this needs
+to be converted to a BAT descriptor, loading the data at the same
+time.  It the call fails, we return a standard error:
+
+.. code-block:: c
+
+  if ((b = BATdescriptor(*bid)) == NULL)
+	  throw(MAL, "gmean.gmean", RUNTIME_OBJECT_MISSING);
+
+The data of the BAT is in essence a C array, so we get a pointer to
+the first value of this array:
+
+.. code-block:: c
+
+  const int *vals = (const int *) Tloc(b, 0);
+
+The ``Tloc`` macro takes a BAT descriptor and an index as argument and
+returns a pointer to the value at the specified index.  We specify the
+index ``0``, so we get a pointer to the first value.  Since the data
+is laid out as a C array, this pointer can be used as such.  The
+``Tloc`` macro is defined to return a ``char *`` pointer, so we need
+to cast it to the correct type.
+
+We will accumulate the sum of the logarithms in a variable ``val``
+which we initialize to ``0``:
+
+.. code-block:: c
+
+  double val = 0;
+
+Then we iterate over all values in the BAT:
+
+.. code-block:: c
+
+  for (BUN i = 0, n = BATcount(b); i < n; i++) {
+          ...
+  }
+
+We don't want to call BATcount(b) for each iteration, so we stick that
+in a temporary variable ``n``.
+
+Inside the loop, ``vals[i]`` is the *i*\ th value.  There are a number
+of possibilities.
+
+If the value is ``NULL``, at the C level this is called ``nil``, we
+need to skip the value.  For all built-in types there is a macro to
+check whether a value is equal to ``nil``.  For ``int`` values, the
+macro is ``is_int_nil``.  For integer types, this doesn't matter so
+much, but for floating point types (``dbl`` and ``flt``) it is
+important to use the macro, since the implementation of ``nil`` for
+these types is a "Not a Number" which cannot be compared using simple
+(in)equality.
+
+If the value is zero, we can immediately return.  The end result is
+zero.  We simply free the resources that we have used by calling
+``BBPunfix``, fill in the result pointer, and return.
+
+Then if the value is negative, we have to count it and add the log of
+the negated value (which is then positive).
+
+And finally, for a positive value, we simply add the log of the value.
+
+We also need to know how many values are involved so that we can
+divide the sum properly.
+
+After the loop, we first free the resources we don't need anymore:
+
+.. code-block:: c
+
+  BBPunfix(b->batCacheid);
+
+Then we distinguish no values (``nval == 0``) where we have to return
+``nil`` from having values where we need to calculate the exponent of
+the sum after dividing it by the number of values.  And finally, if
+there were an odd number of negative values, we negate the geometric
+mean.
+
+And then we return success.
+
+Grouped Aggregates
+``````````````````
+
+The grouped aggregates can all be implemented using a single
+function.  However, the C interfaces of the three functions differ, so
+we need two stub functions to call the third, most complete version.
+
+The stub functions look like this:
+
+.. code-block:: c
+
+  static char *
+  subgroupedgmean_int(bat *retval, const bat *bid, const bat *gid,
+		      const bat *eid,
+		      const bit *skip_nils, const bit *abort_on_error)
+  {
+	  return subgroupedgmeancand_int(retval, bid, gid, eid, NULL,
+					 skip_nils, abort_on_error);
+  }
+
+  static char *
+  groupedgmean_int(bat *retval, const bat *bid, const bat *gid, const bat *eid)
+  {
+	  return subgroupedgmeancand_int(retval, bid, gid, eid, NULL,
+					 &(bit){1}, &(bit){1});
+  }
+
+These are the C implementations of second and third MAL interface
+above.  The fourth MAL interface is the most complete and is the one
+the other two use:
+
+.. code-block:: c
+
+  static char *
+  subgroupedgmeancand_int(bat *retval, const bat *bid, const bat *gid,
+			  const bat *eid, const bat *sid,
+			  const bit *skip_nils, const bit *abort_on_error)
+  {
+          ...
+  }
+
+The parameters, in order, are, a location to return the BAT identifier
+of the result BAT in, the BAT identifier of the BAT to be aggregated,
+the BAT identifier of the BAT that specifies the groups, the BAT
+identifier of a BAT that has a value for each of the groups (the
+groupby implementation produces the groups and two other BATs with
+so-called extents and a histogram, either of these can be used here),
+the BAT identifier of a candidate list that specifies which rows of
+the first and second input BAT are to be used, a Boolean value that
+specifies whether or not to skip NULL (nil) values, and a Boolean
+value that indicates whether errors should produce a nil result or
+raise an error.
+
+The ``skip_nils`` value is generally ``true``, and we will actually
+ignore it.  It could be ``false`` for e.g. a ``COUNT(*)`` aggregate
+where ``NULL`` values have to be counted as well.  The
+``abort_on_error`` value is also generally ``true`` since that is the
+normal way SQL expects errors to be handled.  We will ignore this
+value as well.
+
+When an aggregate function is called, the first input parameter should
+refer to a BAT, but the other three input BATs are optional.  It they
+are not present, the function argument should either be ``NULL``, or
+should point to a value that represents ``nil`` for BAT identifiers.
+If there is no group BAT, we should treat the all values as belonging
+to a single group.  If there is no "extents" BAT, we need to figure
+out in some other way how many groups there are.  If there is no
+candidate list, all rows participate, otherwise only rows indicated by
+the candidate list.
+
+We start by loading all input BATs and checking that they could be
+loaded.  This needs to be done, taking into account that three of the
+input BATs are optional.  Also, if loading of any of the BATs fails,
+we clean up and return an error:
+
+.. code-block:: c
+
+  BAT *g = NULL;
+  BAT *e = NULL;
+  BAT *s = NULL;
+  if ((b = BATdescriptor(*bid)) == NULL ||
+      (gid && !is_bat_nil(*gid) && (g = BATdescriptor(*gid)) == NULL) ||
+      (eid && !is_bat_nil(*eid) && (e = BATdescriptor(*eid)) == NULL) ||
+      (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL)) {
+	  if (b)
+		  BBPunfix(b->batCacheid);
+	  if (g)
+		  BBPunfix(g->batCacheid);
+	  if (e)
+		  BBPunfix(e->batCacheid);
+	  if (s)
+		  BBPunfix(s->batCacheid);
+	  throw(MAL, "gmean.gmean", RUNTIME_OBJECT_MISSING);
+  }
+
+The next step is to use the input BATs to initialize some variables
+that we need for the groups:
+
+.. code-block:: c
+
+  oid min, max;		/* min and max group id */
+  BUN ngrp, ncand;	/* number of groups, number of candidates */
+  struct canditer ci;	/* candidate list iterator */
+  const char *err;	/* error message */
+  err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp, &ci, &ncand);
+
+The function ``BATgroupaggrinit`` returns a static string with an
+error message if it finds an error in the input.  If it returns
+``NULL``, there was no error and the values for ``min``, ``max``,
+``ngrp``, ``ci``, and ``ncand`` are filled in.
+
+Note that if there is no candidate list ``s`` (i.e. the value is
+``NULL``), the candidate iterator is initialized to iterate through
+all values.
+
+The next part of the code has a lot to do with the calculation of the
+geometric mean using the logarithms of the values.  The important bits
+for us are how to iterate through the candidates:
+
+.. code-block:: c
+
+  const int *vals = (const int *) Tloc(b, 0);
+  const oid *gids = NULL;
+  if (g && !BATtdense(g))
+	  gids= (const oid *) Tloc(g, 0);
+  for (BUN i = 0; i < ncand; i++) {
+	  oid o = canditer_next(&ci);
+	  BUN p = o - b->hseqbase;
+	  int v = vals[p];
+	  oid grp = gids ? gids[p] : g ? (oid) p : 0; /* group id */
+	  if (grp >= min && grp <= max) { /* extra check */
+	  	...
+	  }
+  }
+
+There are three possibilities for the group BAT.  There may not be one
+(``g`` is ``NULL``), then all values are in a single group.  The group
+BAT may be dense (``BATtdense(g)`` is ``true``), then there is no C
+array that we can obtain using the ``Tloc`` macro described earlier,
+but then we know that all values are in their own group.  The third
+possibility is the most common one where there is a group BAT and we
+can retrieve the data using the ``Tloc`` macro.  In this last case,
+the ``gids`` pointer will be non-NULL and it can be used to find the
+group ID for each value.
+
+For the algorithm we use two temporary bit masks to have two bits for
+each group.  One bit in one mask is used to indicate whether we have
+seen any values in the group.  The bit in the other mask is used to
+maintain whether we have seen an even or an odd number of negative
+values in the group.
+
+At the end of the function, we free all resources, set the properties
+of the result BAT and return it.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gmean/gmean.c	Thu Jun 10 16:22:08 2021 +0200
@@ -0,0 +1,287 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0.  If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2021 MonetDB B.V.
+ */
+
+/* monetdb_config.h must be included as the first include file */
+#include <monetdb_config.h>
+
+/* mal_exception.h actually contains almost everything we need */
+#include <mal_exception.h>
+
+/* for the candidate iterator */
+#include <gdk_cand.h>
+
+/* for log() and exp() */
+#include <math.h>
+
+static char *
+gmean_int(dbl *res, const bat *bid)
+{
+	BAT *b;
+	double val = 0;
+	BUN nneg = 0;
+	BUN nval = 0;
+
+	if ((b = BATdescriptor(*bid)) == NULL)
+		throw(MAL, "gmean.gmean", RUNTIME_OBJECT_MISSING);
+
+	const int *vals = (const int *) Tloc(b, 0);
+	for (BUN i = 0, n = BATcount(b); i < n; i++) {
+		if (is_int_nil(vals[i]))
+			continue; /* nils are ignored */
+		if (vals[i] == 0) {
+			/* any value zero is easy: result is zero */
+			BBPunfix(b->batCacheid);
+			*res = 0;
+			return MAL_SUCCEED;
+		}
+		if (vals[i] < 0) {
+			nneg++;	/* one more negative value */
+			val += log((double) -vals[i]);
+		} else {
+			val += log((double) vals[i]);
+		}
+		nval++;		/* count non-nil values */
+	}
+	BBPunfix(b->batCacheid);
+	if (nval == 0) {
+		/* if there are no non-nil values, the result is nil */
+		*res = dbl_nil;
+	} else {
+		*res = exp(val / nval);
+		/* if the number of negative values in the input is odd,
+		 * we return (incorrectly, but on purpose) a negative
+		 * value (it should really be an imaginary number) */
+		if (nneg & (BUN) 1)
+			*res = - *res;
+	}
+	return MAL_SUCCEED;
+}
+
+static char *
+subgroupedgmeancand_int(bat *retval, const bat *bid, const bat *gid,
+			const bat *eid, const bat *sid,
+			const bit *skip_nils, const bit *abort_on_error)
+{
+	BAT *b, *bn;		/* these two are always assigned */
+	BAT *g = NULL;		/* these three are optional and may not ... */
+	BAT *e = NULL;		/* ... be assigned to below, ... */
+	BAT *s = NULL;		/* ... so we initialize them here */
+
+	/* we ignore these two inputs */
+	(void) skip_nils;
+	(void) abort_on_error;
+
+	/* the bat we're supposed to be working on (bid) is not
+	 * optional, but the others are, so we test whether the bat id
+	 * is not nil, and if it isn't, whether we can find the BAT
+	 * descriptor */
+	if ((b = BATdescriptor(*bid)) == NULL ||
+	    (gid && !is_bat_nil(*gid) && (g = BATdescriptor(*gid)) == NULL) ||
+	    (eid && !is_bat_nil(*eid) && (e = BATdescriptor(*eid)) == NULL) ||
+	    (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL)) {
+		if (b)
+			BBPunfix(b->batCacheid);
+		if (g)
+			BBPunfix(g->batCacheid);
+		if (e)
+			BBPunfix(e->batCacheid);
+		if (s)
+			BBPunfix(s->batCacheid);
+		throw(MAL, "gmean.gmean", RUNTIME_OBJECT_MISSING);
+	}
+
+	oid min, max;		/* min and max group id */
+	BUN ngrp, ncand;	/* number of groups, number of candidates */
+	struct canditer ci;	/* candidate list iterator */
+	const char *err;	/* error message */
+	err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp, &ci, &ncand);
+	/* note that ncand == ci.ncand */
+	if (err != NULL) {
+		BBPunfix(b->batCacheid);
+		if (g)
+			BBPunfix(g->batCacheid);
+		if (e)
+			BBPunfix(e->batCacheid);
+		if (s)
+			BBPunfix(s->batCacheid);
+		throw(MAL, "gmean.gmean", "%s\n", err);
+	}
+
+	/* create a result BAT and initialize it with all NILs */
+	bn = BATconstant(min, TYPE_dbl, &(dbl){0}, ngrp, TRANSIENT);
+	if (bn == NULL) {
+		BBPunfix(b->batCacheid);
+		if (g)
+			BBPunfix(g->batCacheid);
+		if (e)
+			BBPunfix(e->batCacheid);
+		if (s)
+			BBPunfix(s->batCacheid);
+		throw(MAL, "gmean.gmean", "%s\n", GDK_EXCEPTION);
+	}
+
+	/* In general we need to distinguish two cases per group: there
+	 * are no non-nil values, then the result should be nil; there
+	 * are non-nil values, then the result should be the aggregate.
+	 * If there is an error (e.g. overflow), the abort_on_error flag
+	 * determines what should happen: if set, there should not be
+	 * any result (error return), if not set, the group value should
+	 * be set to nil.  We shouldn't get any overflow errors, so this
+	 * doesn't apply here. */
+	const int *vals = (const int *) Tloc(b, 0);
+	const oid *gids = NULL;
+	if (g && !BATtdense(g))
+		gids= (const oid *) Tloc(g, 0);
+	dbl *sums = (dbl *) Tloc(bn, 0); /* results */
+	/* For this aggregate, we need to maintain three states per
+	 * group: no non-nil values encountered yet; we encountered a
+	 * zero value; we encountered values, but no zeros.  We maintain whether we have seen any values with a bit in the temporary array 'seen'.  When we encoun */
+	uint32_t *seen = GDKzalloc(((ngrp + 31) / 32) * sizeof(uint32_t));
+	/* We need to maintain whether we have seen an odd or an even
+	 * number of negative values in each group, so we use a bitmap
+	 * for that (true -> odd number of negatives) */
+	uint32_t *odds = GDKzalloc(((ngrp + 31) / 32) * sizeof(uint32_t));
+	if (seen == NULL || odds == NULL) {
+		BBPunfix(b->batCacheid);
+		if (g)
+			BBPunfix(g->batCacheid);
+		if (e)
+			BBPunfix(e->batCacheid);
+		if (s)
+			BBPunfix(s->batCacheid);
+		/* one of these is NULL, but GDKfree can deal with that */
+		GDKfree(seen);
+		GDKfree(odds);
+		throw(MAL, "gmean.gmean", MAL_MALLOC_FAIL);
+	}
+	for (BUN i = 0; i < ncand; i++) {
+		oid o = canditer_next(&ci); /* id of candidate */
+		BUN p = o - b->hseqbase;    /* BUN (index) of candidate */
+		int v = vals[p];	    /* the actual value */
+		oid grp = gids ? gids[p] : g ? (oid) p : 0; /* group id */
+		if (grp >= min && grp <= max) { /* extra check */
+			if (is_int_nil(v))
+				continue; /* skip nils (no matter skip_nils) */
+			else if (v == 0)
+				sums[grp] = dbl_nil;
+			oid grp32 = grp >> 5;
+			oid mod32 = grp & 31;
+			if (!(seen[grp32] & (1U << mod32))) {
+				seen[grp32] |= 1U << mod32;
+			} else if (is_dbl_nil(sums[grp])) {
+				/* if sums[grp] is nil we've encountered
+				 * a zero */
+				continue;
+			}
+			if (v < 0) {
+				/* flip the bit for this group */
+				odds[grp32] ^= 1U << mod32;
+				sums[grp] += log((double) -v);
+			} else
+				sums[grp] += log((double) v);
+		}
+	}
+	/* we may change these two below */
+	bn->tnil = false;
+	bn->tnonil = true;
+	for (oid grp = 0; grp < ngrp; grp++) {
+		if (seen[grp >> 5] & (1U << (grp & 31))) {
+			if (is_dbl_nil(sums[grp])) {
+				/* we encountered a zero, so the result
+				 * is zero */
+				sums[grp] = 0;
+			} else {
+				/* convert the sum of the logs into the
+				 * geometric mean */
+				sums[grp] = exp(sums[grp]);
+				/* if we saw an odd number of negatives
+				 * in the group, return a negative
+				 * value */
+				if (odds[grp >> 5] & (1U << (grp & 31)))
+					sums[grp] = -sums[grp];
+			}
+		} else {
+			/* no values found in the group, so result is nil */
+			sums[grp] = dbl_nil;
+			bn->tnil = true;
+			bn->tnonil = false;
+		}
+	}
+
+	/* free resources */
+	GDKfree(seen);
+	GDKfree(odds);
+	BBPunfix(b->batCacheid);
+	if (g)
+		BBPunfix(g->batCacheid);
+	if (e)
+		BBPunfix(e->batCacheid);
+	if (s)
+		BBPunfix(s->batCacheid);
+
+	/* set the properties and count */
+	BATsetcount(bn, ngrp);
+	/* we don't know whether the result is sorted or whether there
+	 * are any duplicate values, so we must set these properties to
+	 * false */
+	bn->tsorted = false;
+	bn->trevsorted = false;
+	bn->tkey = false;
+
+	*retval = bn->batCacheid;
+	BBPkeepref(bn->batCacheid);
+	return MAL_SUCCEED;
+}
+
+static char *
+subgroupedgmean_int(bat *retval, const bat *bid, const bat *gid,
+		    const bat *eid,
+		    const bit *skip_nils, const bit *abort_on_error)
+{
+	return subgroupedgmeancand_int(retval, bid, gid, eid, NULL,
+				       skip_nils, abort_on_error);
+}
+
+static char *
+groupedgmean_int(bat *retval, const bat *bid, const bat *gid, const bat *eid)
+{
+	return subgroupedgmeancand_int(retval, bid, gid, eid, NULL,
+				       &(bit){1}, &(bit){1});
+}
+
+#include "mel.h"
+
+static char gmean_sql[] = "CREATE AGGREGATE FUNCTION gmean(val INTEGER)"
+	" returns DOUBLE external name gmean.gmean;";
+
+static mel_func gmean_init_funcs[] = {
+	command("gmean", "gmean", gmean_int, false,
+		"geometric mean",
+		args(1,2, arg("",dbl),batarg("b",int))),
+	command("gmean", "gmean", groupedgmean_int, false,
+		"geometric mean",
+		args(1,4, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1))),
+	command("gmean", "subgmean", subgroupedgmean_int, false,
+		"geometric mean",
+		args(1,6, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1),arg("skip_nils",bit),arg("abort_on_error",bit))),
+	command("gmean", "subgmean", subgroupedgmeancand_int, false,
+		"geometric mean",
+		args(1,7, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1),batarg("s",oid),arg("skip_nils",bit),arg("abort_on_error",bit))),
+	{ .imp=NULL }		/* sentinel */
+};
+
+#include "mal_import.h"
+#include "sql_import.h"
+#ifdef _MSC_VER
+#undef read
+#pragma section(".CRT$XCU",read)
+#endif
+LIB_STARTUP_FUNC(init_gmean)
+{
+	mal_module("gmean", NULL, gmean_init_funcs);
+	sql_register("gmean", gmean_sql);
+}