Mercurial > hg > MonetDB-extend
changeset 43:1cfe9d8a07e8
Implemented an aggregate tutorial.
author | Sjoerd Mullender <sjoerd@acm.org> |
---|---|
date | Thu, 10 Jun 2021 16:22:08 +0200 (2021-06-10) |
parents | b67deab098f9 |
children | 44031912920c |
files | README.rst gmean/Makefile gmean/README.rst gmean/gmean.c |
diffstat | 4 files changed, 765 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/README.rst Thu Jun 10 16:21:46 2021 +0200 +++ b/README.rst Thu Jun 10 16:22:08 2021 +0200 @@ -76,3 +76,10 @@ This query filters the table t and selects all rows where the value of column ``c`` is equal to ``0``. + +gmean +..... + +The tutorial in the ``gmean`` subdirectory contains an example of an +AGGREGATE FUNCTION. These functions are used to calculate aggregates +over columns of values. Aggregates can be simple or grouped.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gmean/Makefile Thu Jun 10 16:22:08 2021 +0200 @@ -0,0 +1,36 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# Copyright 2013-2021 MonetDB B.V. + +LIBDIR = $(shell pkgconf --variable=libdir monetdb5) + +CFLAGS += $(shell pkgconf --cflags monetdb5) +LDFLAGS += $(shell pkgconf --libs monetdb5) + +all: lib_gmean.so + +lib_gmean.so: gmean.o + $(CC) -fPIC -DPIC -o lib_gmean.so -shared gmean.o $(LDFLAGS) -Wl,-soname -Wl,lib_gmean.so + +gmean.o: gmean.c + $(CC) -fPIC -DPIC $(CFLAGS) -c gmean.c + +doc: README.html README.pdf + +README.html: README.rst + rst2html -d README.rst > README.html + +README.pdf: README.rst + rst2pdf README.rst + +clean: + rm -f README.html README.pdf *.o *.so + +install: lib_gmean.so + cp lib_gmean.so $(DESTDIR)$(LIBDIR)/monetdb5 + +tar: MonetDB-gmean-1.2.tar.bz2 +MonetDB-gmean-1.2.tar.bz2: README.rst Makefile gmean.c + tar -cjf MonetDB-gmean-1.2.tar.bz2 --transform='s|^|MonetDB-gmean-1.2/|' README.rst Makefile gmean.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gmean/README.rst Thu Jun 10 16:22:08 2021 +0200 @@ -0,0 +1,435 @@ +.. This Source Code Form is subject to the terms of the Mozilla Public +.. License, v. 2.0. If a copy of the MPL was not distributed with this +.. file, You can obtain one at http://mozilla.org/MPL/2.0/. +.. +.. Copyright 2021 MonetDB B.V. + +.. This document is written in reStructuredText (see + http://docutils.sourceforge.net/ for more information). + Use ``rst2html.py`` to convert this file to HTML. + +================================== +Implementing an Aggregate Function +================================== + +:Author: Sjoerd Mullender +:Organization: MonetDB Solutions B.V., MonetDB B.V. +:Abstract: In this short tutorial we describe how to create an + aggregate function for MonetDB/SQL. + +Introduction +------------ + +We want to create a function to calculate the geometric mean of a +collection of integer values. The geometric mean of a set of *n* +values is the *n*\ th root of the product of those values. See e.g. the +`Wikipedia article`__. + +__ https://en.wikipedia.org/wiki/Geometric_mean + +We will make a slight change to the algorithm: if any of the values is +zero, we return zero (this follows logically from the definition), and +if there is an odd number of negative values, the product is negative, +but since an even root of a negative number would result in an +imaginary number, we will change that to simply a negative result. + +We will name the function ``gmean``, so that in SQL we can write: + +.. code-block:: sql + + SELECT gmean(i) FROM t; + +if we have a table ``t`` with a column ``i`` of type ``INTEGER``. + +In SQL we can also have grouped aggregates, so an aggregate value for +each member of a group. So we can also write: + +.. code-block:: sql + + SELECT g, gmean(i) FROM t GROUP BY g; + +if we have this same table with an additional column ``g`` that +specifies the groups to which the values belong. + +Note that SQL aggregates have a number of quirks. A value of ``NULL`` +does not count toward the aggregate at all. It is as if the value +simply does not exist. If there are no values to be aggregated (i.e., +all are ``NULL`` or there simply aren't any values), the result of the +aggregate is ``NULL``. If an error occurs during aggregation, such as +overflow or division by zero, an error should be raised. + +Implementation +-------------- + +Three things need to be done. We need to add a function definition to +the SQL catalog, we need to describe the MAL interface for the MAL +interpreter, and we need to create the C implementation of the +function. We will describe the three steps here in this order. + +SQL +... + +The SQL interface is simple: + +.. code-block:: sql + + CREATE AGGREGATE FUNCTION gmean(val INTEGER) RETURNS DOUBLE + EXTERNAL NAME gmean.gmean; + +Note that the word ``FUNCTION`` here is optional, you could also +write: + +.. code-block:: sql + + CREATE AGGREGATE gmean(val INTEGER) RETURNS DOUBLE + EXTERNAL NAME gmean.gmean; + +When we create an ``AGGREGATE FUNCTION`` in SQL, the SQL system +expects four MAL functions whose names are based on the external name +given. In this case, the MAL functions are called ``gmean.gmean`` and +``gmean.subgmean``, each with two overloaded definitions. + +The two functions ``gmean.gmean`` are for the simple aggregate and a +simple grouped aggregate. The two functions ``gmean.subgmean`` are +for grouped aggregates, but with some extra arguments. + +The ``CREATE`` statement will normally be executed once when the +database is created, after which it is part of the SQL catalog. To +accomplish this we need to store the SQL query in a C string (also see +the *reverse* tutorial): + +.. code-block:: c + + static char gmean_sql[] = "CREATE AGGREGATE FUNCTION gmean(val INTEGER)" + " returns DOUBLE external name gmean.gmean;"; + +At the SQL side we don't need to do anything more. + +MAL +... + +As mentioned, the MAL interface consists of four functions whose names +are based on the names specified in the SQL interface. The interface +looks like this: + +.. code-block:: + + module gmean; + + command gmean(val:bat[:int]) :dbl + address gmean_int + comment "geometric mean"; + + command gmean(val:bat[:int], g:bat[:oid], e:bat[:any_1]) :bat[:dbl] + address groupedgmean_int + comment "geometric mean"; + + command subgmean(val:bat[:int], g:bat[:oid], e:bat[:any_1], + skip_nils:bit, abort_on_error:bit) :bat[:dbl] + address subgroupedgmean_int + comment "geometric mean"; + + command subgmean(val:bat[:int], g:bat[:oid], e:bat[:any_1], s:bat[:oid], + skip_nils:bit, abort_on_error:bit) :bat[:dbl] + address subgroupedgmeancand_int + comment "geometric mean"; + +We encode these MAL commands in a C array (again, see the *reverse* +tutorial): + +.. code-block:: c + + static mel_func gmean_init_funcs[] = { + command("gmean", "gmean", gmean_int, false, + "geometric mean", + args(1,2, arg("",dbl),batarg("b",int))), + command("gmean", "gmean", groupedgmean_int, false, + "geometric mean", + args(1,4, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1))), + command("gmean", "subgmean", subgroupedgmean_int, false, + "geometric mean", + args(1,6, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1),arg("skip_nils",bit),arg("abort_on_error",bit))), + command("gmean", "subgmean", subgroupedgmeancand_int, false, + "geometric mean", + args(1,7, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1),batarg("s",oid),arg("skip_nils",bit),arg("abort_on_error",bit))), + { .imp=NULL } /* sentinel */ + }; + +C Implementation +................ + +In the C implementation we distinguish two cases. The first, simpler +case is an aggregate of a set of values that returns a single value. +This is the implementation of the first of the four MAL interfaces. +The second case is for the three other MAL interfaces which are all +for grouped aggregates. Those three variants can all be implemented +with a single C function, but since the parameter are different, we +need an indirection. + +First the simple case. + +Geometric Mean +`````````````` + +First a few words about the implementation of the algorithm to +calculate the geometric mean. As mentioned, we should multiply all +values together and the take the *n*\ th root of the result. If we +were to do this, the intermediate result (the product) would become +very large and likely overflow. So instead what we do is take the +logarithm of each of the values, add those up, and divide the sum of +the logarithms by the number of values. Then as a final step we take +the exponent of this value to give us the geometric mean. + +There are a few caveats in this algorithm. The logarithm of zero of +of negative numbers does not exist, so we need to take special care. +If we encounter a zero in the list of values we can simply shortcut +the calculation: the end result will be zero as well, so we can return +that immediately. For negative values we should be returning an +imaginary number, but since MonetDB does not support that, we could +choose between returning an error and returning some other value. We +have chosen to return minus the exponent if the product were to become +negative. This means we have to count the number of negative values +and if the total number of negative values is odd, we negate the +result of the exponent. + +Simple Aggregate +```````````````` + +The C interface for the first MAL function is: + +.. code-block:: c + + static char *gmean_int(dbl *res, const bat *bid); + +As before, ``*bid`` is the BAT identifier (an integer) and this needs +to be converted to a BAT descriptor, loading the data at the same +time. It the call fails, we return a standard error: + +.. code-block:: c + + if ((b = BATdescriptor(*bid)) == NULL) + throw(MAL, "gmean.gmean", RUNTIME_OBJECT_MISSING); + +The data of the BAT is in essence a C array, so we get a pointer to +the first value of this array: + +.. code-block:: c + + const int *vals = (const int *) Tloc(b, 0); + +The ``Tloc`` macro takes a BAT descriptor and an index as argument and +returns a pointer to the value at the specified index. We specify the +index ``0``, so we get a pointer to the first value. Since the data +is laid out as a C array, this pointer can be used as such. The +``Tloc`` macro is defined to return a ``char *`` pointer, so we need +to cast it to the correct type. + +We will accumulate the sum of the logarithms in a variable ``val`` +which we initialize to ``0``: + +.. code-block:: c + + double val = 0; + +Then we iterate over all values in the BAT: + +.. code-block:: c + + for (BUN i = 0, n = BATcount(b); i < n; i++) { + ... + } + +We don't want to call BATcount(b) for each iteration, so we stick that +in a temporary variable ``n``. + +Inside the loop, ``vals[i]`` is the *i*\ th value. There are a number +of possibilities. + +If the value is ``NULL``, at the C level this is called ``nil``, we +need to skip the value. For all built-in types there is a macro to +check whether a value is equal to ``nil``. For ``int`` values, the +macro is ``is_int_nil``. For integer types, this doesn't matter so +much, but for floating point types (``dbl`` and ``flt``) it is +important to use the macro, since the implementation of ``nil`` for +these types is a "Not a Number" which cannot be compared using simple +(in)equality. + +If the value is zero, we can immediately return. The end result is +zero. We simply free the resources that we have used by calling +``BBPunfix``, fill in the result pointer, and return. + +Then if the value is negative, we have to count it and add the log of +the negated value (which is then positive). + +And finally, for a positive value, we simply add the log of the value. + +We also need to know how many values are involved so that we can +divide the sum properly. + +After the loop, we first free the resources we don't need anymore: + +.. code-block:: c + + BBPunfix(b->batCacheid); + +Then we distinguish no values (``nval == 0``) where we have to return +``nil`` from having values where we need to calculate the exponent of +the sum after dividing it by the number of values. And finally, if +there were an odd number of negative values, we negate the geometric +mean. + +And then we return success. + +Grouped Aggregates +`````````````````` + +The grouped aggregates can all be implemented using a single +function. However, the C interfaces of the three functions differ, so +we need two stub functions to call the third, most complete version. + +The stub functions look like this: + +.. code-block:: c + + static char * + subgroupedgmean_int(bat *retval, const bat *bid, const bat *gid, + const bat *eid, + const bit *skip_nils, const bit *abort_on_error) + { + return subgroupedgmeancand_int(retval, bid, gid, eid, NULL, + skip_nils, abort_on_error); + } + + static char * + groupedgmean_int(bat *retval, const bat *bid, const bat *gid, const bat *eid) + { + return subgroupedgmeancand_int(retval, bid, gid, eid, NULL, + &(bit){1}, &(bit){1}); + } + +These are the C implementations of second and third MAL interface +above. The fourth MAL interface is the most complete and is the one +the other two use: + +.. code-block:: c + + static char * + subgroupedgmeancand_int(bat *retval, const bat *bid, const bat *gid, + const bat *eid, const bat *sid, + const bit *skip_nils, const bit *abort_on_error) + { + ... + } + +The parameters, in order, are, a location to return the BAT identifier +of the result BAT in, the BAT identifier of the BAT to be aggregated, +the BAT identifier of the BAT that specifies the groups, the BAT +identifier of a BAT that has a value for each of the groups (the +groupby implementation produces the groups and two other BATs with +so-called extents and a histogram, either of these can be used here), +the BAT identifier of a candidate list that specifies which rows of +the first and second input BAT are to be used, a Boolean value that +specifies whether or not to skip NULL (nil) values, and a Boolean +value that indicates whether errors should produce a nil result or +raise an error. + +The ``skip_nils`` value is generally ``true``, and we will actually +ignore it. It could be ``false`` for e.g. a ``COUNT(*)`` aggregate +where ``NULL`` values have to be counted as well. The +``abort_on_error`` value is also generally ``true`` since that is the +normal way SQL expects errors to be handled. We will ignore this +value as well. + +When an aggregate function is called, the first input parameter should +refer to a BAT, but the other three input BATs are optional. It they +are not present, the function argument should either be ``NULL``, or +should point to a value that represents ``nil`` for BAT identifiers. +If there is no group BAT, we should treat the all values as belonging +to a single group. If there is no "extents" BAT, we need to figure +out in some other way how many groups there are. If there is no +candidate list, all rows participate, otherwise only rows indicated by +the candidate list. + +We start by loading all input BATs and checking that they could be +loaded. This needs to be done, taking into account that three of the +input BATs are optional. Also, if loading of any of the BATs fails, +we clean up and return an error: + +.. code-block:: c + + BAT *g = NULL; + BAT *e = NULL; + BAT *s = NULL; + if ((b = BATdescriptor(*bid)) == NULL || + (gid && !is_bat_nil(*gid) && (g = BATdescriptor(*gid)) == NULL) || + (eid && !is_bat_nil(*eid) && (e = BATdescriptor(*eid)) == NULL) || + (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL)) { + if (b) + BBPunfix(b->batCacheid); + if (g) + BBPunfix(g->batCacheid); + if (e) + BBPunfix(e->batCacheid); + if (s) + BBPunfix(s->batCacheid); + throw(MAL, "gmean.gmean", RUNTIME_OBJECT_MISSING); + } + +The next step is to use the input BATs to initialize some variables +that we need for the groups: + +.. code-block:: c + + oid min, max; /* min and max group id */ + BUN ngrp, ncand; /* number of groups, number of candidates */ + struct canditer ci; /* candidate list iterator */ + const char *err; /* error message */ + err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp, &ci, &ncand); + +The function ``BATgroupaggrinit`` returns a static string with an +error message if it finds an error in the input. If it returns +``NULL``, there was no error and the values for ``min``, ``max``, +``ngrp``, ``ci``, and ``ncand`` are filled in. + +Note that if there is no candidate list ``s`` (i.e. the value is +``NULL``), the candidate iterator is initialized to iterate through +all values. + +The next part of the code has a lot to do with the calculation of the +geometric mean using the logarithms of the values. The important bits +for us are how to iterate through the candidates: + +.. code-block:: c + + const int *vals = (const int *) Tloc(b, 0); + const oid *gids = NULL; + if (g && !BATtdense(g)) + gids= (const oid *) Tloc(g, 0); + for (BUN i = 0; i < ncand; i++) { + oid o = canditer_next(&ci); + BUN p = o - b->hseqbase; + int v = vals[p]; + oid grp = gids ? gids[p] : g ? (oid) p : 0; /* group id */ + if (grp >= min && grp <= max) { /* extra check */ + ... + } + } + +There are three possibilities for the group BAT. There may not be one +(``g`` is ``NULL``), then all values are in a single group. The group +BAT may be dense (``BATtdense(g)`` is ``true``), then there is no C +array that we can obtain using the ``Tloc`` macro described earlier, +but then we know that all values are in their own group. The third +possibility is the most common one where there is a group BAT and we +can retrieve the data using the ``Tloc`` macro. In this last case, +the ``gids`` pointer will be non-NULL and it can be used to find the +group ID for each value. + +For the algorithm we use two temporary bit masks to have two bits for +each group. One bit in one mask is used to indicate whether we have +seen any values in the group. The bit in the other mask is used to +maintain whether we have seen an even or an odd number of negative +values in the group. + +At the end of the function, we free all resources, set the properties +of the result BAT and return it.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gmean/gmean.c Thu Jun 10 16:22:08 2021 +0200 @@ -0,0 +1,287 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright 2021 MonetDB B.V. + */ + +/* monetdb_config.h must be included as the first include file */ +#include <monetdb_config.h> + +/* mal_exception.h actually contains almost everything we need */ +#include <mal_exception.h> + +/* for the candidate iterator */ +#include <gdk_cand.h> + +/* for log() and exp() */ +#include <math.h> + +static char * +gmean_int(dbl *res, const bat *bid) +{ + BAT *b; + double val = 0; + BUN nneg = 0; + BUN nval = 0; + + if ((b = BATdescriptor(*bid)) == NULL) + throw(MAL, "gmean.gmean", RUNTIME_OBJECT_MISSING); + + const int *vals = (const int *) Tloc(b, 0); + for (BUN i = 0, n = BATcount(b); i < n; i++) { + if (is_int_nil(vals[i])) + continue; /* nils are ignored */ + if (vals[i] == 0) { + /* any value zero is easy: result is zero */ + BBPunfix(b->batCacheid); + *res = 0; + return MAL_SUCCEED; + } + if (vals[i] < 0) { + nneg++; /* one more negative value */ + val += log((double) -vals[i]); + } else { + val += log((double) vals[i]); + } + nval++; /* count non-nil values */ + } + BBPunfix(b->batCacheid); + if (nval == 0) { + /* if there are no non-nil values, the result is nil */ + *res = dbl_nil; + } else { + *res = exp(val / nval); + /* if the number of negative values in the input is odd, + * we return (incorrectly, but on purpose) a negative + * value (it should really be an imaginary number) */ + if (nneg & (BUN) 1) + *res = - *res; + } + return MAL_SUCCEED; +} + +static char * +subgroupedgmeancand_int(bat *retval, const bat *bid, const bat *gid, + const bat *eid, const bat *sid, + const bit *skip_nils, const bit *abort_on_error) +{ + BAT *b, *bn; /* these two are always assigned */ + BAT *g = NULL; /* these three are optional and may not ... */ + BAT *e = NULL; /* ... be assigned to below, ... */ + BAT *s = NULL; /* ... so we initialize them here */ + + /* we ignore these two inputs */ + (void) skip_nils; + (void) abort_on_error; + + /* the bat we're supposed to be working on (bid) is not + * optional, but the others are, so we test whether the bat id + * is not nil, and if it isn't, whether we can find the BAT + * descriptor */ + if ((b = BATdescriptor(*bid)) == NULL || + (gid && !is_bat_nil(*gid) && (g = BATdescriptor(*gid)) == NULL) || + (eid && !is_bat_nil(*eid) && (e = BATdescriptor(*eid)) == NULL) || + (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL)) { + if (b) + BBPunfix(b->batCacheid); + if (g) + BBPunfix(g->batCacheid); + if (e) + BBPunfix(e->batCacheid); + if (s) + BBPunfix(s->batCacheid); + throw(MAL, "gmean.gmean", RUNTIME_OBJECT_MISSING); + } + + oid min, max; /* min and max group id */ + BUN ngrp, ncand; /* number of groups, number of candidates */ + struct canditer ci; /* candidate list iterator */ + const char *err; /* error message */ + err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp, &ci, &ncand); + /* note that ncand == ci.ncand */ + if (err != NULL) { + BBPunfix(b->batCacheid); + if (g) + BBPunfix(g->batCacheid); + if (e) + BBPunfix(e->batCacheid); + if (s) + BBPunfix(s->batCacheid); + throw(MAL, "gmean.gmean", "%s\n", err); + } + + /* create a result BAT and initialize it with all NILs */ + bn = BATconstant(min, TYPE_dbl, &(dbl){0}, ngrp, TRANSIENT); + if (bn == NULL) { + BBPunfix(b->batCacheid); + if (g) + BBPunfix(g->batCacheid); + if (e) + BBPunfix(e->batCacheid); + if (s) + BBPunfix(s->batCacheid); + throw(MAL, "gmean.gmean", "%s\n", GDK_EXCEPTION); + } + + /* In general we need to distinguish two cases per group: there + * are no non-nil values, then the result should be nil; there + * are non-nil values, then the result should be the aggregate. + * If there is an error (e.g. overflow), the abort_on_error flag + * determines what should happen: if set, there should not be + * any result (error return), if not set, the group value should + * be set to nil. We shouldn't get any overflow errors, so this + * doesn't apply here. */ + const int *vals = (const int *) Tloc(b, 0); + const oid *gids = NULL; + if (g && !BATtdense(g)) + gids= (const oid *) Tloc(g, 0); + dbl *sums = (dbl *) Tloc(bn, 0); /* results */ + /* For this aggregate, we need to maintain three states per + * group: no non-nil values encountered yet; we encountered a + * zero value; we encountered values, but no zeros. We maintain whether we have seen any values with a bit in the temporary array 'seen'. When we encoun */ + uint32_t *seen = GDKzalloc(((ngrp + 31) / 32) * sizeof(uint32_t)); + /* We need to maintain whether we have seen an odd or an even + * number of negative values in each group, so we use a bitmap + * for that (true -> odd number of negatives) */ + uint32_t *odds = GDKzalloc(((ngrp + 31) / 32) * sizeof(uint32_t)); + if (seen == NULL || odds == NULL) { + BBPunfix(b->batCacheid); + if (g) + BBPunfix(g->batCacheid); + if (e) + BBPunfix(e->batCacheid); + if (s) + BBPunfix(s->batCacheid); + /* one of these is NULL, but GDKfree can deal with that */ + GDKfree(seen); + GDKfree(odds); + throw(MAL, "gmean.gmean", MAL_MALLOC_FAIL); + } + for (BUN i = 0; i < ncand; i++) { + oid o = canditer_next(&ci); /* id of candidate */ + BUN p = o - b->hseqbase; /* BUN (index) of candidate */ + int v = vals[p]; /* the actual value */ + oid grp = gids ? gids[p] : g ? (oid) p : 0; /* group id */ + if (grp >= min && grp <= max) { /* extra check */ + if (is_int_nil(v)) + continue; /* skip nils (no matter skip_nils) */ + else if (v == 0) + sums[grp] = dbl_nil; + oid grp32 = grp >> 5; + oid mod32 = grp & 31; + if (!(seen[grp32] & (1U << mod32))) { + seen[grp32] |= 1U << mod32; + } else if (is_dbl_nil(sums[grp])) { + /* if sums[grp] is nil we've encountered + * a zero */ + continue; + } + if (v < 0) { + /* flip the bit for this group */ + odds[grp32] ^= 1U << mod32; + sums[grp] += log((double) -v); + } else + sums[grp] += log((double) v); + } + } + /* we may change these two below */ + bn->tnil = false; + bn->tnonil = true; + for (oid grp = 0; grp < ngrp; grp++) { + if (seen[grp >> 5] & (1U << (grp & 31))) { + if (is_dbl_nil(sums[grp])) { + /* we encountered a zero, so the result + * is zero */ + sums[grp] = 0; + } else { + /* convert the sum of the logs into the + * geometric mean */ + sums[grp] = exp(sums[grp]); + /* if we saw an odd number of negatives + * in the group, return a negative + * value */ + if (odds[grp >> 5] & (1U << (grp & 31))) + sums[grp] = -sums[grp]; + } + } else { + /* no values found in the group, so result is nil */ + sums[grp] = dbl_nil; + bn->tnil = true; + bn->tnonil = false; + } + } + + /* free resources */ + GDKfree(seen); + GDKfree(odds); + BBPunfix(b->batCacheid); + if (g) + BBPunfix(g->batCacheid); + if (e) + BBPunfix(e->batCacheid); + if (s) + BBPunfix(s->batCacheid); + + /* set the properties and count */ + BATsetcount(bn, ngrp); + /* we don't know whether the result is sorted or whether there + * are any duplicate values, so we must set these properties to + * false */ + bn->tsorted = false; + bn->trevsorted = false; + bn->tkey = false; + + *retval = bn->batCacheid; + BBPkeepref(bn->batCacheid); + return MAL_SUCCEED; +} + +static char * +subgroupedgmean_int(bat *retval, const bat *bid, const bat *gid, + const bat *eid, + const bit *skip_nils, const bit *abort_on_error) +{ + return subgroupedgmeancand_int(retval, bid, gid, eid, NULL, + skip_nils, abort_on_error); +} + +static char * +groupedgmean_int(bat *retval, const bat *bid, const bat *gid, const bat *eid) +{ + return subgroupedgmeancand_int(retval, bid, gid, eid, NULL, + &(bit){1}, &(bit){1}); +} + +#include "mel.h" + +static char gmean_sql[] = "CREATE AGGREGATE FUNCTION gmean(val INTEGER)" + " returns DOUBLE external name gmean.gmean;"; + +static mel_func gmean_init_funcs[] = { + command("gmean", "gmean", gmean_int, false, + "geometric mean", + args(1,2, arg("",dbl),batarg("b",int))), + command("gmean", "gmean", groupedgmean_int, false, + "geometric mean", + args(1,4, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1))), + command("gmean", "subgmean", subgroupedgmean_int, false, + "geometric mean", + args(1,6, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1),arg("skip_nils",bit),arg("abort_on_error",bit))), + command("gmean", "subgmean", subgroupedgmeancand_int, false, + "geometric mean", + args(1,7, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1),batarg("s",oid),arg("skip_nils",bit),arg("abort_on_error",bit))), + { .imp=NULL } /* sentinel */ +}; + +#include "mal_import.h" +#include "sql_import.h" +#ifdef _MSC_VER +#undef read +#pragma section(".CRT$XCU",read) +#endif +LIB_STARTUP_FUNC(init_gmean) +{ + mal_module("gmean", NULL, gmean_init_funcs); + sql_register("gmean", gmean_sql); +}