view gmean/gmean.c @ 61:b42fe5c9bd34 default tip

update to new versioned monetdb directories
author Niels Nes <niels@cwi.nl>
date Mon, 23 Sep 2024 13:34:22 +0200 (10 months ago)
parents 02895996506d
children
line wrap: on
line source
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0.  If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Copyright 2021-2022 MonetDB B.V.
 */

/* monetdb_config.h must be included as the first include file */
#include <monetdb_config.h>

/* mal_exception.h actually contains almost everything we need */
#include <mal_exception.h>

/* for the candidate iterator */
#include <gdk_cand.h>

/* for log() and exp() */
#include <math.h>

static char *
gmean_int(dbl *res, const bat *bid)
{
	BAT *b;
	double val = 0;
	BUN nneg = 0;
	BUN nval = 0;

	if ((b = BATdescriptor(*bid)) == NULL)
		throw(MAL, "gmean.gmean", RUNTIME_OBJECT_MISSING);

	const int *vals = (const int *) Tloc(b, 0);
	for (BUN i = 0, n = BATcount(b); i < n; i++) {
		if (is_int_nil(vals[i]))
			continue; /* nils are ignored */
		if (vals[i] == 0) {
			/* any value zero is easy: result is zero */
			BBPunfix(b->batCacheid);
			*res = 0;
			return MAL_SUCCEED;
		}
		if (vals[i] < 0) {
			nneg++;	/* one more negative value */
			val += log((double) -vals[i]);
		} else {
			val += log((double) vals[i]);
		}
		nval++;		/* count non-nil values */
	}
	BBPunfix(b->batCacheid);
	if (nval == 0) {
		/* if there are no non-nil values, the result is nil */
		*res = dbl_nil;
	} else {
		*res = exp(val / nval);
		/* if the number of negative values in the input is odd,
		 * we return (incorrectly, but on purpose) a negative
		 * value (it should really be an imaginary number) */
		if (nneg & (BUN) 1)
			*res = - *res;
	}
	return MAL_SUCCEED;
}

static char *
subgroupedgmeancand_int(bat *retval, const bat *bid, const bat *gid,
			const bat *eid, const bat *sid,
			const bit *skip_nils)
{
	BAT *b, *bn;		/* these two are always assigned */
	BAT *g = NULL;		/* these three are optional and may not ... */
	BAT *e = NULL;		/* ... be assigned to below, ... */
	BAT *s = NULL;		/* ... so we initialize them here */

	/* we ignore these two inputs */
	(void) skip_nils;

	/* the bat we're supposed to be working on (bid) is not
	 * optional, but the others are, so we test whether the bat id
	 * is not nil, and if it isn't, whether we can find the BAT
	 * descriptor */
	if ((b = BATdescriptor(*bid)) == NULL ||
	    (gid && !is_bat_nil(*gid) && (g = BATdescriptor(*gid)) == NULL) ||
	    (eid && !is_bat_nil(*eid) && (e = BATdescriptor(*eid)) == NULL) ||
	    (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL)) {
		if (b)
			BBPunfix(b->batCacheid);
		if (g)
			BBPunfix(g->batCacheid);
		if (e)
			BBPunfix(e->batCacheid);
		if (s)
			BBPunfix(s->batCacheid);
		throw(MAL, "gmean.gmean", RUNTIME_OBJECT_MISSING);
	}

	oid min, max;		/* min and max group id */
	BUN ngrp, ncand;	/* number of groups, number of candidates */
	struct canditer ci;	/* candidate list iterator */
	const char *err;	/* error message */
	err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp, &ci, &ncand);
	/* note that ncand == ci.ncand */
	if (err != NULL) {
		BBPunfix(b->batCacheid);
		if (g)
			BBPunfix(g->batCacheid);
		if (e)
			BBPunfix(e->batCacheid);
		if (s)
			BBPunfix(s->batCacheid);
		throw(MAL, "gmean.gmean", "%s\n", err);
	}

	/* create a result BAT and initialize it with all zeros */
	bn = BATconstant(min, TYPE_dbl, &(dbl){0}, ngrp, TRANSIENT);
	if (bn == NULL) {
		BBPunfix(b->batCacheid);
		if (g)
			BBPunfix(g->batCacheid);
		if (e)
			BBPunfix(e->batCacheid);
		if (s)
			BBPunfix(s->batCacheid);
		throw(MAL, "gmean.gmean", "%s\n", GDK_EXCEPTION);
	}

	/* We are going to change the contents of the newly created BAT.
	 * We are allowed to do that because we are the "owner" and no
	 * other thread can have a reference to this BAT yet.  First we
	 * set the properties.  We don't know whether the result is
	 * sorted or whether there are any duplicate values, so we must
	 * set these properties to false. */
	bn->tsorted = false;
	bn->trevsorted = false;
	bn->tkey = false;
	/* we may change these two below */
	bn->tnil = false;
	bn->tnonil = true;

	/* In general we need to distinguish two cases per group: there
	 * are no non-nil values, then the result should be nil; there
	 * are non-nil values, then the result should be the aggregate.
	 * If there is an error (e.g. overflow), there should not be any
	 * result (error return).  We shouldn't get any overflow errors,
	 * so this doesn't apply here. */
	const int *vals = (const int *) Tloc(b, 0);
	const oid *gids = NULL;
	if (g && !BATtdense(g))
		gids= (const oid *) Tloc(g, 0);
	dbl *sums = (dbl *) Tloc(bn, 0); /* results */
	/* For this aggregate, we need to maintain the number of values
	 * we have added up per group.  We can also keep track of
	 * whether we have seen any zero values.  GDKzalloc initializes
	 * the allocated memory with zero bits. */
	BUN *cnts = GDKzalloc(ngrp * sizeof(BUN));
	/* We need to maintain whether we have seen an odd or an even
	 * number of negative values in each group, so we use a bitmap
	 * for that (true -> odd number of negatives) */
	uint32_t *odds = GDKzalloc(((ngrp + 31) / 32) * sizeof(uint32_t));
	if (cnts == NULL || odds == NULL) {
		BBPunfix(b->batCacheid);
		if (g)
			BBPunfix(g->batCacheid);
		if (e)
			BBPunfix(e->batCacheid);
		if (s)
			BBPunfix(s->batCacheid);
		/* one of these is NULL, but GDKfree can deal with that */
		GDKfree(cnts);
		GDKfree(odds);
		throw(MAL, "gmean.gmean", MAL_MALLOC_FAIL);
	}
	for (BUN i = 0; i < ncand; i++) {
		oid o = canditer_next(&ci); /* id of candidate */
		BUN p = o - b->hseqbase;    /* BUN (index) of candidate */
		int v = vals[p];	    /* the actual value */
		oid grp = gids ? gids[p] : g ? min + (oid) p : 0; /* group id */
		if (grp >= min && grp <= max) { /* extra check */
			grp -= min;		/* zero based access */
			if (is_int_nil(v))
				continue; /* skip nils (no matter skip_nils) */
			else if (v == 0) {
				cnts[grp] = BUN_NONE;
				continue;
			} else if (cnts[grp] == BUN_NONE) {
				/* we already encountered a zero value
				 * in this group */
				continue;
			}
			/* non zero and non NULL value */
			cnts[grp]++;
			if (v < 0) {
				/* flip the bit for this group */
				sums[grp] += log((double) -v);
				/* flip the bit for this group */
				odds[grp >> 5] ^= 1U << (grp & 31);
			} else
				sums[grp] += log((double) v);
		}
	}

	for (oid grp = 0; grp < ngrp; grp++) {
		if (cnts[grp] == 0) {
			/* no values found in the group, so result is nil */
			sums[grp] = dbl_nil;
			bn->tnil = true;
			bn->tnonil = false;
		} else if (cnts[grp] == BUN_NONE) {
			/* we encountered a zero, so the result is
			 * zero */
			sums[grp] = 0;
		} else {
			/* convert the sum of the logs into the
			 * geometric mean, note there should be no
			 * overflow here: the geometric mean is at most
			 * equal to the largest value */
			sums[grp] = exp(sums[grp] / cnts[grp]);
			/* if we saw an odd number of negatives
			 * in the group, return a negative
			 * value */
			if (odds[grp >> 5] & (1U << (grp & 31)))
				sums[grp] = -sums[grp];
		}
	}

	/* free resources */
	GDKfree(cnts);
	GDKfree(odds);
	BBPunfix(b->batCacheid);
	if (g)
		BBPunfix(g->batCacheid);
	if (e)
		BBPunfix(e->batCacheid);
	if (s)
		BBPunfix(s->batCacheid);

	*retval = bn->batCacheid;
	BBPkeepref(bn);
	return MAL_SUCCEED;
}

static char *
subgroupedgmean_int(bat *retval, const bat *bid, const bat *gid,
		    const bat *eid, const bit *skip_nils)
{
	return subgroupedgmeancand_int(retval, bid, gid, eid, NULL, skip_nils);
}

static char *
groupedgmean_int(bat *retval, const bat *bid, const bat *gid, const bat *eid)
{
	return subgroupedgmeancand_int(retval, bid, gid, eid, NULL, &(bit){1});
}

#include "mel.h"

static char gmean_sql[] = "CREATE AGGREGATE FUNCTION gmean(val INTEGER)"
	" returns DOUBLE external name gmean.gmean;";

static mel_func gmean_init_funcs[] = {
	command("gmean", "gmean", gmean_int, false,
		"geometric mean",
		args(1,2, arg("",dbl),batarg("b",int))),
	command("gmean", "gmean", groupedgmean_int, false,
		"geometric mean",
		args(1,4, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1))),
	command("gmean", "subgmean", subgroupedgmean_int, false,
		"geometric mean",
		args(1,5, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1),arg("skip_nils",bit))),
	command("gmean", "subgmean", subgroupedgmeancand_int, false,
		"geometric mean",
		args(1,6, batarg("",dbl),batarg("b",int),batarg("g",oid),batargany("e",1),batarg("s",oid),arg("skip_nils",bit))),
	{ .imp=NULL }		/* sentinel */
};

#include "mal_import.h"
#include "sql_import.h"
#ifdef _MSC_VER
#undef read
#pragma section(".CRT$XCU",read)
#endif
LIB_STARTUP_FUNC(init_gmean)
{
	mal_module("gmean", NULL, gmean_init_funcs);
	sql_register("gmean", gmean_sql);
}