view gmean/gmean.c @ 46:3b9611f1b048

Some updates.
author Sjoerd Mullender <sjoerd@acm.org>
date Fri, 11 Jun 2021 10:02:09 +0200 (2021-06-11)
parents 1cfe9d8a07e8
children c8140b1fabf5
line wrap: on
line source
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0.  If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Copyright 2021 MonetDB B.V.
 */

/* monetdb_config.h must be included as the first include file */
#include <monetdb_config.h>

/* mal_exception.h actually contains almost everything we need */
#include <mal_exception.h>

/* for the candidate iterator */
#include <gdk_cand.h>

/* for log() and exp() */
#include <math.h>

static char *
gmean_int(dbl *res, const bat *bid)
{
	BAT *b;
	double val = 0;
	BUN nneg = 0;
	BUN nval = 0;

	if ((b = BATdescriptor(*bid)) == NULL)
		throw(MAL, "gmean.gmean", RUNTIME_OBJECT_MISSING);

	const int *vals = (const int *) Tloc(b, 0);
	for (BUN i = 0, n = BATcount(b); i < n; i++) {
		if (is_int_nil(vals[i]))
			continue; /* nils are ignored */
		if (vals[i] == 0) {
			/* any value zero is easy: result is zero */
			BBPunfix(b->batCacheid);
			*res = 0;
			return MAL_SUCCEED;
		}
		if (vals[i] < 0) {
			nneg++;	/* one more negative value */
			val += log((double) -vals[i]);
		} else {
			val += log((double) vals[i]);
		}
		nval++;		/* count non-nil values */
	}
	BBPunfix(b->batCacheid);
	if (nval == 0) {
		/* if there are no non-nil values, the result is nil */
		*res = dbl_nil;
	} else {
		*res = exp(val / nval);
		/* if the number of negative values in the input is odd,
		 * we return (incorrectly, but on purpose) a negative
		 * value (it should really be an imaginary number) */
		if (nneg & (BUN) 1)
			*res = - *res;
	}
	return MAL_SUCCEED;
}

static char *
subgroupedgmeancand_int(bat *retval, const bat *bid, const bat *gid,
			const bat *eid, const bat *sid,
			const bit *skip_nils, const bit *abort_on_error)
{
	BAT *b, *bn;		/* these two are always assigned */
	BAT *g = NULL;		/* these three are optional and may not ... */
	BAT *e = NULL;		/* ... be assigned to below, ... */
	BAT *s = NULL;		/* ... so we initialize them here */

	/* we ignore these two inputs */
	(void) skip_nils;
	(void) abort_on_error;

	/* the bat we're supposed to be working on (bid) is not
	 * optional, but the others are, so we test whether the bat id
	 * is not nil, and if it isn't, whether we can find the BAT
	 * descriptor */
	if ((b = BATdescriptor(*bid)) == NULL ||
	    (gid && !is_bat_nil(*gid) && (g = BATdescriptor(*gid)) == NULL) ||
	    (eid && !is_bat_nil(*eid) && (e = BATdescriptor(*eid)) == NULL) ||
	    (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL)) {
		if (b)
			BBPunfix(b->batCacheid);
		if (g)
			BBPunfix(g->batCacheid);
		if (e)
			BBPunfix(e->batCacheid);
		if (s)
			BBPunfix(s->batCacheid);
		throw(MAL, "gmean.gmean", RUNTIME_OBJECT_MISSING);
	}

	oid min, max;		/* min and max group id */
	BUN ngrp, ncand;	/* number of groups, number of candidates */
	struct canditer ci;	/* candidate list iterator */
	const char *err;	/* error message */
	err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp, &ci, &ncand);
	/* note that ncand == ci.ncand */
	if (err != NULL) {
		BBPunfix(b->batCacheid);
		if (g)
			BBPunfix(g->batCacheid);
		if (e)
			BBPunfix(e->batCacheid);
		if (s)
			BBPunfix(s->batCacheid);
		throw(MAL, "gmean.gmean", "%s\n", err);
	}

	/* create a result BAT and initialize it with all zeros */
	bn = BATconstant(min, TYPE_dbl, &(dbl){0}, ngrp, TRANSIENT);
	if (bn == NULL) {
		BBPunfix(b->batCacheid);
		if (g)
			BBPunfix(g->batCacheid);
		if (e)
			BBPunfix(e->batCacheid);
		if (s)
			BBPunfix(s->batCacheid);
		throw(MAL, "gmean.gmean", "%s\n", GDK_EXCEPTION);
	}

	/* In general we need to distinguish two cases per group: there
	 * are no non-nil values, then the result should be nil; there
	 * are non-nil values, then the result should be the aggregate.
	 * If there is an error (e.g. overflow), the abort_on_error flag
	 * determines what should happen: if set, there should not be
	 * any result (error return), if not set, the group value should
	 * be set to nil.  We shouldn't get any overflow errors, so this
	 * doesn't apply here. */
	const int *vals = (const int *) Tloc(b, 0);
	const oid *gids = NULL;
	if (g && !BATtdense(g))
		gids= (const oid *) Tloc(g, 0);
	dbl *sums = (dbl *) Tloc(bn, 0); /* results */
	/* For this aggregate, we need to maintain three states per
	 * group: no non-nil values encountered yet; we encountered a
	 * zero value; we encountered values, but no zeros.  We maintain whether we have seen any values with a bit in the temporary array 'seen'.  When we encoun */
	uint32_t *seen = GDKzalloc(((ngrp + 31) / 32) * sizeof(uint32_t));
	/* We need to maintain whether we have seen an odd or an even
	 * number of negative values in each group, so we use a bitmap
	 * for that (true -> odd number of negatives) */
	uint32_t *odds = GDKzalloc(((ngrp + 31) / 32) * sizeof(uint32_t));
	if (seen == NULL || odds == NULL) {
		BBPunfix(b->batCacheid);
		if (g)
			BBPunfix(g->batCacheid);
		if (e)
			BBPunfix(e->batCacheid);
		if (s)
			BBPunfix(s->batCacheid);
		/* one of these is NULL, but GDKfree can deal with that */
		GDKfree(seen);
		GDKfree(odds);
		throw(MAL, "gmean.gmean", MAL_MALLOC_FAIL);
	}
	for (BUN i = 0; i < ncand; i++) {
		oid o = canditer_next(&ci); /* id of candidate */
		BUN p = o - b->hseqbase;    /* BUN (index) of candidate */
		int v = vals[p];	    /* the actual value */
		oid grp = gids ? gids[p] : g ? (oid) p : 0; /* group id */
		if (grp >= min && grp <= max) { /* extra check */
			if (is_int_nil(v))
				continue; /* skip nils (no matter skip_nils) */
			else if (v == 0)
				sums[grp] = dbl_nil;
			oid grp32 = grp >> 5;
			oid mod32 = grp & 31;
			if (!(seen[grp32] & (1U << mod32))) {
				seen[grp32] |= 1U << mod32;
			} else if (is_dbl_nil(sums[grp])) {
				/* if sums[grp] is nil we've encountered
				 * a zero */
				continue;
			}
			if (v < 0) {
				/* flip the bit for this group */
				odds[grp32] ^= 1U << mod32;
				sums[grp] += log((double) -v);
			} else
				sums[grp] += log((double) v);
		}
	}
	/* we may change these two below */
	bn->tnil = false;
	bn->tnonil = true;
	for (oid grp = 0; grp < ngrp; grp++) {
		if (seen[grp >> 5] & (1U << (grp & 31))) {
			if (is_dbl_nil(sums[grp])) {
				/* we encountered a zero, so the result
				 * is zero */
				sums[grp] = 0;
			} else {
				/* convert the sum of the logs into the
				 * geometric mean */
				sums[grp] = exp(sums[grp]);
				/* if we saw an odd number of negatives
				 * in the group, return a negative
				 * value */
				if (odds[grp >> 5] & (1U << (grp & 31)))
					sums[grp] = -sums[grp];
			}
		} else {
			/* no values found in the group, so result is nil */
			sums[grp] = dbl_nil;
			bn->tnil = true;
			bn->tnonil = false;
		}
	}

	/* free resources */
	GDKfree(seen);
	GDKfree(odds);
	BBPunfix(b->batCacheid);
	if (g)
		BBPunfix(g->batCacheid);
	if (e)
		BBPunfix(e->batCacheid);
	if (s)
		BBPunfix(s->batCacheid);

	/* set the properties and count */
	BATsetcount(bn, ngrp);
	/* we don't know whether the result is sorted or whether there
	 * are any duplicate values, so we must set these properties to
	 * false */
	bn->tsorted = false;
	bn->trevsorted = false;
	bn->tkey = false;

	*retval = bn->batCacheid;
	BBPkeepref(bn->batCacheid);
	return MAL_SUCCEED;
}

static char *
subgroupedgmean_int(bat *retval, const bat *bid, const bat *gid,
		    const bat *eid,
		    const bit *skip_nils, const bit *abort_on_error)
{
	return subgroupedgmeancand_int(retval, bid, gid, eid, NULL,
				       skip_nils, abort_on_error);
}

static char *
groupedgmean_int(bat *retval, const bat *bid, const bat *gid, const bat *eid)
{
	return subgroupedgmeancand_int(retval, bid, gid, eid, NULL,
				       &(bit){1}, &(bit){1});
}

#include "mel.h"

static char gmean_sql[] = "CREATE AGGREGATE FUNCTION gmean(val INTEGER)"
	" returns DOUBLE external name gmean.gmean;";

static mel_func gmean_init_funcs[] = {
	command("gmean", "gmean", gmean_int, false,
		"geometric mean",
		args(1,2, arg("",dbl),batarg("b",int))),
	command("gmean", "gmean", groupedgmean_int, false,
		"geometric mean",
		args(1,4, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1))),
	command("gmean", "subgmean", subgroupedgmean_int, false,
		"geometric mean",
		args(1,6, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1),arg("skip_nils",bit),arg("abort_on_error",bit))),
	command("gmean", "subgmean", subgroupedgmeancand_int, false,
		"geometric mean",
		args(1,7, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1),batarg("s",oid),arg("skip_nils",bit),arg("abort_on_error",bit))),
	{ .imp=NULL }		/* sentinel */
};

#include "mal_import.h"
#include "sql_import.h"
#ifdef _MSC_VER
#undef read
#pragma section(".CRT$XCU",read)
#endif
LIB_STARTUP_FUNC(init_gmean)
{
	mal_module("gmean", NULL, gmean_init_funcs);
	sql_register("gmean", gmean_sql);
}