Mercurial > hg > MonetDB-extend
changeset 48:099ce41179e9
Fixes to calculation of geometric mean.
author | Sjoerd Mullender <sjoerd@acm.org> |
---|---|
date | Mon, 14 Jun 2021 09:44:28 +0200 (2021-06-14) |
parents | c8140b1fabf5 |
children | 0e5e08bd133b |
files | gmean/README.rst gmean/gmean.c |
diffstat | 2 files changed, 60 insertions(+), 55 deletions(-) [+] |
line wrap: on
line diff
--- a/gmean/README.rst Fri Jun 11 10:19:12 2021 +0200 +++ b/gmean/README.rst Mon Jun 14 09:44:28 2021 +0200 @@ -341,14 +341,14 @@ value as well. When an aggregate function is called, the first input parameter should -refer to a BAT, but the other three input BATs are optional. It they +refer to a BAT, but the other three input BATs are optional. If they are not present, the function argument should either be ``NULL``, or should point to a value that represents ``nil`` for BAT identifiers. -If there is no group BAT, we should treat the all values as belonging -to a single group. If there is no "extents" BAT, we need to figure -out in some other way how many groups there are. If there is no -candidate list, all rows participate, otherwise only rows indicated by -the candidate list. +If there is no group BAT, we should treat all values as belonging to a +single group. If there is no "extents" BAT, we need to figure out in +some other way how many groups there are. If there is no candidate +list, all rows participate, otherwise only rows indicated by the +candidate list. This is done in the function ``BATgroupaggrinit``. We start by loading all input BATs and checking that they could be loaded. This needs to be done, taking into account that three of the @@ -448,10 +448,12 @@ and if it is ``NULL``, we check the value of ``g`` to distinguish between the other two cases. -For the algorithm we use two temporary bit masks to have two extra -bits for each group. One bit in one mask is used to indicate whether -we have seen any values in the group. The bit in the other mask is -used to maintain whether we have seen an even or an odd number of +For the algorithm we use a temporary array to count the number of +values in each group. Using this we can easily see whether there were +any values at all (the count is zero or not), and we can maintain +whether we have seen a zero value by setting the count to an +impossible count (``BUN_NONE``). We also need a bit mask with one bit +for each group to maintain whether we have seen an odd number of negative values in the group. At the end of the function, we free all resources, set the properties
--- a/gmean/gmean.c Fri Jun 11 10:19:12 2021 +0200 +++ b/gmean/gmean.c Mon Jun 14 09:44:28 2021 +0200 @@ -124,6 +124,19 @@ throw(MAL, "gmean.gmean", "%s\n", GDK_EXCEPTION); } + /* We are going to change the contents of the newly created BAT. + * We are allowed to do that because we are the "owner" and no + * other thread can have a reference to this BAT yet. First we + * set the properties. We don't know whether the result is + * sorted or whether there are any duplicate values, so we must + * set these properties to false. */ + bn->tsorted = false; + bn->trevsorted = false; + bn->tkey = false; + /* we may change these two below */ + bn->tnil = false; + bn->tnonil = true; + /* In general we need to distinguish two cases per group: there * are no non-nil values, then the result should be nil; there * are non-nil values, then the result should be the aggregate. @@ -137,15 +150,16 @@ if (g && !BATtdense(g)) gids= (const oid *) Tloc(g, 0); dbl *sums = (dbl *) Tloc(bn, 0); /* results */ - /* For this aggregate, we need to maintain three states per - * group: no non-nil values encountered yet; we encountered a - * zero value; we encountered values, but no zeros. We maintain whether we have seen any values with a bit in the temporary array 'seen'. When we encoun */ - uint32_t *seen = GDKzalloc(((ngrp + 31) / 32) * sizeof(uint32_t)); + /* For this aggregate, we need to maintain the number of values + * we have added up per group. We can also keep track of + * whether we have seen any zero values. GDKzalloc initializes + * the allocated memory with zero bits. */ + BUN *cnts = GDKzalloc(ngrp * sizeof(BUN)); /* We need to maintain whether we have seen an odd or an even * number of negative values in each group, so we use a bitmap * for that (true -> odd number of negatives) */ uint32_t *odds = GDKzalloc(((ngrp + 31) / 32) * sizeof(uint32_t)); - if (seen == NULL || odds == NULL) { + if (cnts == NULL || odds == NULL) { BBPunfix(b->batCacheid); if (g) BBPunfix(g->batCacheid); @@ -154,7 +168,7 @@ if (s) BBPunfix(s->batCacheid); /* one of these is NULL, but GDKfree can deal with that */ - GDKfree(seen); + GDKfree(cnts); GDKfree(odds); throw(MAL, "gmean.gmean", MAL_MALLOC_FAIL); } @@ -167,54 +181,52 @@ grp -= min; /* zero based access */ if (is_int_nil(v)) continue; /* skip nils (no matter skip_nils) */ - else if (v == 0) - sums[grp] = dbl_nil; - oid grp32 = grp >> 5; - oid mod32 = grp & 31; - if (!(seen[grp32] & (1U << mod32))) { - seen[grp32] |= 1U << mod32; - } else if (is_dbl_nil(sums[grp])) { - /* if sums[grp] is nil we've encountered - * a zero */ + else if (v == 0) { + cnts[grp] = BUN_NONE; + continue; + } else if (cnts[grp] == BUN_NONE) { + /* we already encountered a zero value + * in this group */ continue; } + /* non zero and non NULL value */ + cnts[grp]++; if (v < 0) { /* flip the bit for this group */ - odds[grp32] ^= 1U << mod32; sums[grp] += log((double) -v); + /* flip the bit for this group */ + odds[grp >> 5] ^= 1U << (grp & 31); } else sums[grp] += log((double) v); } } - /* we may change these two below */ - bn->tnil = false; - bn->tnonil = true; + for (oid grp = 0; grp < ngrp; grp++) { - if (seen[grp >> 5] & (1U << (grp & 31))) { - if (is_dbl_nil(sums[grp])) { - /* we encountered a zero, so the result - * is zero */ - sums[grp] = 0; - } else { - /* convert the sum of the logs into the - * geometric mean */ - sums[grp] = exp(sums[grp]); - /* if we saw an odd number of negatives - * in the group, return a negative - * value */ - if (odds[grp >> 5] & (1U << (grp & 31))) - sums[grp] = -sums[grp]; - } - } else { + if (cnts[grp] == 0) { /* no values found in the group, so result is nil */ sums[grp] = dbl_nil; bn->tnil = true; bn->tnonil = false; + } else if (cnts[grp] == BUN_NONE) { + /* we encountered a zero, so the result is + * zero */ + sums[grp] = 0; + } else { + /* convert the sum of the logs into the + * geometric mean, note there should be no + * overflow here: the geometric mean is at most + * equal to the largest value */ + sums[grp] = exp(sums[grp] / cnts[grp]); + /* if we saw an odd number of negatives + * in the group, return a negative + * value */ + if (odds[grp >> 5] & (1U << (grp & 31))) + sums[grp] = -sums[grp]; } } /* free resources */ - GDKfree(seen); + GDKfree(cnts); GDKfree(odds); BBPunfix(b->batCacheid); if (g) @@ -224,15 +236,6 @@ if (s) BBPunfix(s->batCacheid); - /* set the properties and count */ - BATsetcount(bn, ngrp); - /* we don't know whether the result is sorted or whether there - * are any duplicate values, so we must set these properties to - * false */ - bn->tsorted = false; - bn->trevsorted = false; - bn->tkey = false; - *retval = bn->batCacheid; BBPkeepref(bn->batCacheid); return MAL_SUCCEED;