LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - groupby.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 82 104 78.8 %
Date: 2024-12-20 21:24:02 Functions: 5 5 100.0 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : /*
      14             :  * (c) Martin Kersten
      15             :  * Multicolumn group-by support
      16             :  * The group-by support module is meant to simplify code analysis and
      17             :  * speedup the kernel on multi-attribute grouping routines.
      18             :  *
      19             :  * The target is to support SQL-like multicolumngroup_by operations, which are lists of
      20             :  * attributes and a group aggregate function.
      21             :  * Each group can be represented with an oid into the n-ary table.
      22             :  * Consider the query "select count(*), max(A) from R group by A, B,C." whose code
      23             :  * snippet in MAL would become something like:
      24             :  *
      25             :  * @verbatim
      26             :  * _1:bat[:int]  := sql.bind("sys","r","a",0);
      27             :  * _2:bat[:str]  := sql.bind("sys","r","b",0);
      28             :  * _3:bat[:date]  := sql.bind("sys","r","c",0);
      29             :  * ...
      30             :  * _9 := algebra.select(_1,0,100);
      31             :  * ..
      32             :  * (grp_4:bat[:lng], gid:bat[:oid]) := groupby.count(_9,_2);
      33             :  * (grp_5:bat[:lng], gid:bat[:oid]) := groupby.max(_9,_2,_3);
      34             :  * @end verbatim
      35             :  *
      36             :  * The id() function merely becomes the old-fashioned oid-based group identification list.
      37             :  * This way related values can be obtained from the attribute columns. It can be the input
      38             :  * for the count() function, which saves some re-computation.
      39             :  *
      40             :  * Aside the group ids, we also provide options to return the value based aggregate table
      41             :  * to ease development of parallel plans.
      42             :  *
      43             :  * The implementation is optimized for a limited number of groups. The default is
      44             :  * to fall back on the old code sequences.
      45             :  *
      46             :  */
      47             : #include "monetdb_config.h"
      48             : #include "mal.h"
      49             : #include "mal_interpreter.h"
      50             : #include "group.h"
      51             : #include "mal_exception.h"
      52             : 
      53             : /*
      54             :  * The implementation is based on a two-phase process. In phase 1, we estimate
      55             :  * the number of groups to deal with using column independence.
      56             :  * The grouping is performed in parallel over slices of the tables.
      57             :  * The final pieces are glued together.
      58             :  */
      59             : typedef struct {
      60             :         bat *bid;                                       /* input bats */
      61             :         BAT *candidate;                         /* list */
      62             :         BAT **cols;
      63             :         BUN *unique;                            /* number of different values */
      64             :         int last;
      65             :         BUN size;
      66             : } AGGRtask;
      67             : 
      68             : static AGGRtask *
      69          10 : GROUPcollect(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
      70             : {
      71          10 :         AGGRtask *a;
      72          10 :         int i;
      73          10 :         BAT *b, *bs, *bh = NULL;
      74             : 
      75          10 :         (void) mb;
      76          10 :         (void) cntxt;
      77          10 :         a = (AGGRtask *) GDKmalloc(sizeof(*a));
      78          10 :         if (a == NULL)
      79             :                 return NULL;
      80          20 :         *a = (AGGRtask) {
      81          10 :                 .bid = GDKzalloc(pci->argc * sizeof(bat)),
      82          10 :                 .cols = GDKzalloc(pci->argc * sizeof(BAT *)),
      83          10 :                 .unique = GDKzalloc(pci->argc * sizeof(BUN)),
      84             :         };
      85          10 :         if (a->cols == NULL || a->bid == NULL || a->unique == NULL) {
      86           0 :                 GDKfree(a->cols);
      87           0 :                 GDKfree(a->bid);
      88           0 :                 GDKfree(a->unique);
      89           0 :                 GDKfree(a);
      90           0 :                 return NULL;
      91             :         }
      92          31 :         for (i = pci->retc; i < pci->argc; i++, a->last++) {
      93          21 :                 a->bid[a->last] = *getArgReference_bat(stk, pci, i);
      94          21 :                 b = a->cols[a->last] = BATdescriptor(a->bid[a->last]);
      95          21 :                 if (a->cols[a->last] == NULL) {
      96           0 :                         for (a->last--; a->last >= 0; a->last--)
      97           0 :                                 BBPunfix(a->cols[a->last]->batCacheid);
      98           0 :                         GDKfree(a->cols);
      99           0 :                         GDKfree(a->bid);
     100           0 :                         GDKfree(a->unique);
     101           0 :                         GDKfree(a);
     102           0 :                         return NULL;
     103             :                 }
     104          21 :                 bs = BATsample(b, 1000);
     105          21 :                 if (bs) {
     106          21 :                         bh = BATunique(b, bs);
     107          21 :                         if (bh) {
     108          21 :                                 a->unique[a->last] = BATcount(bh);
     109          21 :                                 BBPunfix(bh->batCacheid);
     110             :                         }
     111          21 :                         BBPunfix(bs->batCacheid);
     112             :                 }
     113          21 :                 if (b->tsorted)
     114          11 :                         a->unique[a->last] = 1000;        /* sorting helps grouping */
     115          21 :                 a->size = BATcount(b);
     116             :         }
     117             : 
     118             :         return a;
     119             : }
     120             : 
     121             : static void
     122          10 : GROUPcollectSort(AGGRtask *a, int start, int finish)
     123             : {
     124          10 :         int i, j, k;
     125          10 :         BAT *b;
     126          10 :         BUN sample;
     127             : 
     128             :         /* sort the columns by decreasing unique */
     129          31 :         for (i = start; i < finish; i++)
     130          36 :                 for (j = i + 1; j < finish; j++)
     131          15 :                         if (a->unique[i] < a->unique[j]) {
     132           0 :                                 k = a->bid[i];
     133           0 :                                 a->bid[i] = a->bid[j];
     134           0 :                                 a->bid[j] = k;
     135             : 
     136           0 :                                 b = a->cols[i];
     137           0 :                                 a->cols[i] = a->cols[j];
     138           0 :                                 a->cols[j] = b;
     139             : 
     140           0 :                                 sample = a->unique[i];
     141           0 :                                 a->unique[i] = a->unique[j];
     142           0 :                                 a->unique[j] = sample;
     143             :                         }
     144          10 : }
     145             : 
     146             : static void
     147          10 : GROUPdelete(AGGRtask *a)
     148             : {
     149          31 :         for (a->last--; a->last >= 0; a->last--) {
     150          21 :                 BBPunfix(a->cols[a->last]->batCacheid);
     151             :         }
     152          10 :         GDKfree(a->bid);
     153          10 :         GDKfree(a->cols);
     154          10 :         GDKfree(a->unique);
     155          10 :         GDKfree(a);
     156          10 : }
     157             : 
     158             : /*
     159             :  * The groups optimizer takes a grouping sequence and attempts to
     160             :  * minimize the intermediate result.  The choice depends on a good
     161             :  * estimate of intermediate results using properties.
     162             :  */
     163             : 
     164             : static str
     165          10 : GROUPmulticolumngroup(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
     166             : {
     167          10 :         bat *grp = getArgReference_bat(stk, pci, 0);
     168          10 :         bat *ext = getArgReference_bat(stk, pci, 1);
     169          10 :         bat *hist = getArgReference_bat(stk, pci, 2);
     170          10 :         int i, j;
     171          10 :         bat oldgrp, oldext, oldhist;
     172          10 :         str msg = MAL_SUCCEED;
     173          10 :         BAT *b;
     174          10 :         BUN count = 0;
     175          10 :         AGGRtask *aggr;
     176             : 
     177          10 :         aggr = GROUPcollect(cntxt, mb, stk, pci);
     178          10 :         if (aggr == NULL)
     179           0 :                 throw(MAL, "group.multicolumn", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     180          10 :         GROUPcollectSort(aggr, 0, aggr->last);
     181             : 
     182             :         /* (grp,ext,hist) := group.group(..) */
     183             :         /* use the old pattern to perform the incremental grouping */
     184          10 :         *grp = 0;
     185          10 :         *ext = 0;
     186          10 :         *hist = 0;
     187          10 :         msg = GRPgroup1(grp, ext, hist, &aggr->bid[0]);
     188          10 :         i = 1;
     189          10 :         if (msg == MAL_SUCCEED && aggr->last > 1)
     190          11 :                 do {
     191             :                         /* early break when there are as many groups as entries */
     192          11 :                         b = BBPquickdesc(*hist);
     193          11 :                         if (b) {
     194          11 :                                 j = BATcount(b) == count;
     195          11 :                                 if (j)
     196             :                                         break;
     197             :                         }
     198             : 
     199             :                         /* (grp,ext,hist) := group.subgroup(arg,grp,ext,hist) */
     200          11 :                         oldgrp = *grp;
     201          11 :                         oldext = *ext;
     202          11 :                         oldhist = *hist;
     203          11 :                         *grp = 0;
     204          11 :                         *ext = 0;
     205          11 :                         *hist = 0;
     206          11 :                         msg = GRPsubgroup5(grp, ext, hist, &aggr->bid[i], NULL, &oldgrp,
     207             :                                                            &oldext, &oldhist);
     208          11 :                         BBPrelease(oldgrp);
     209          11 :                         BBPrelease(oldext);
     210          11 :                         BBPrelease(oldhist);
     211          11 :                 } while (msg == MAL_SUCCEED && ++i < aggr->last);
     212          10 :         GROUPdelete(aggr);
     213          10 :         return msg;
     214             : }
     215             : 
     216             : #include "mel.h"
     217             : mel_func groupby_init_funcs[] = {
     218             :  pattern("group", "multicolumn", GROUPmulticolumngroup, false, "Derivation of a group index over multiple columns.", args(3,4, batarg("ref",oid),batarg("grp",oid),batargany("hist",0),batvarargany("b",0))),
     219             :  { .imp=NULL }
     220             : };
     221             : #include "mal_import.h"
     222             : #ifdef _MSC_VER
     223             : #undef read
     224             : #pragma section(".CRT$XCU",read)
     225             : #endif
     226         345 : LIB_STARTUP_FUNC(init_groupby_mal)
     227         345 : { mal_module("groupby", NULL, groupby_init_funcs); }

Generated by: LCOV version 1.14