LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - groupby.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 81 106 76.4 %
Date: 2024-04-25 20:03:45 Functions: 5 5 100.0 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : /*
      14             :  * (c) Martin Kersten
      15             :  * Multicolumn group-by support
      16             :  * The group-by support module is meant to simplify code analysis and
      17             :  * speedup the kernel on multi-attribute grouping routines.
      18             :  *
      19             :  * The target is to support SQL-like multicolumngroup_by operations, which are lists of
      20             :  * attributes and a group aggregate function.
      21             :  * Each group can be represented with an oid into the n-ary table.
      22             :  * Consider the query "select count(*), max(A) from R group by A, B,C." whose code
      23             :  * snippet in MAL would become something like:
      24             :  *
      25             :  * @verbatim
      26             :  * _1:bat[:int]  := sql.bind("sys","r","a",0);
      27             :  * _2:bat[:str]  := sql.bind("sys","r","b",0);
      28             :  * _3:bat[:date]  := sql.bind("sys","r","c",0);
      29             :  * ...
      30             :  * _9 := algebra.select(_1,0,100);
      31             :  * ..
      32             :  * (grp_4:bat[:lng], gid:bat[:oid]) := groupby.count(_9,_2);
      33             :  * (grp_5:bat[:lng], gid:bat[:oid]) := groupby.max(_9,_2,_3);
      34             :  * @end verbatim
      35             :  *
      36             :  * The id() function merely becomes the old-fashioned oid-based group identification list.
      37             :  * This way related values can be obtained from the attribute columns. It can be the input
      38             :  * for the count() function, which saves some re-computation.
      39             :  *
      40             :  * Aside the group ids, we also provide options to return the value based aggregate table
      41             :  * to ease development of parallel plans.
      42             :  *
      43             :  * The implementation is optimized for a limited number of groups. The default is
      44             :  * to fall back on the old code sequences.
      45             :  *
      46             :  */
      47             : #include "monetdb_config.h"
      48             : #include "mal.h"
      49             : #include "mal_interpreter.h"
      50             : #include "group.h"
      51             : #include "mal_exception.h"
      52             : 
      53             : /*
      54             :  * The implementation is based on a two-phase process. In phase 1, we estimate
      55             :  * the number of groups to deal with using column independence.
      56             :  * The grouping is performed in parallel over slices of the tables.
      57             :  * The final pieces are glued together.
      58             :  */
      59             : typedef struct {
      60             :         bat *bid;                                       /* input bats */
      61             :         BAT *candidate;                         /* list */
      62             :         BAT **cols;
      63             :         BUN *unique;                            /* number of different values */
      64             :         int last;
      65             :         BUN size;
      66             : } AGGRtask;
      67             : 
      68             : static AGGRtask *
      69          10 : GROUPcollect(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
      70             : {
      71          10 :         AGGRtask *a;
      72          10 :         int i;
      73          10 :         BAT *b, *bs, *bh = NULL;
      74             : 
      75          10 :         (void) mb;
      76          10 :         (void) cntxt;
      77          10 :         a = (AGGRtask *) GDKzalloc(sizeof(*a));
      78          10 :         if (a == NULL)
      79             :                 return NULL;
      80          10 :         a->bid = (bat *) GDKzalloc(pci->argc * sizeof(bat));
      81          10 :         a->cols = (BAT **) GDKzalloc(pci->argc * sizeof(BAT *));
      82          10 :         a->unique = (BUN *) GDKzalloc(pci->argc * sizeof(BUN));
      83          10 :         if (a->cols == NULL || a->bid == NULL || a->unique == NULL) {
      84           0 :                 if (a->cols)
      85           0 :                         GDKfree(a->cols);
      86           0 :                 if (a->bid)
      87           0 :                         GDKfree(a->bid);
      88           0 :                 if (a->unique)
      89           0 :                         GDKfree(a->unique);
      90           0 :                 GDKfree(a);
      91           0 :                 return NULL;
      92             :         }
      93          31 :         for (i = pci->retc; i < pci->argc; i++, a->last++) {
      94          21 :                 a->bid[a->last] = *getArgReference_bat(stk, pci, i);
      95          21 :                 b = a->cols[a->last] = BATdescriptor(a->bid[a->last]);
      96          21 :                 if (a->cols[a->last] == NULL) {
      97           0 :                         for (a->last--; a->last >= 0; a->last--)
      98           0 :                                 BBPunfix(a->cols[a->last]->batCacheid);
      99           0 :                         GDKfree(a->cols);
     100           0 :                         GDKfree(a->bid);
     101           0 :                         GDKfree(a->unique);
     102           0 :                         GDKfree(a);
     103           0 :                         return NULL;
     104             :                 }
     105          21 :                 bs = BATsample(b, 1000);
     106          21 :                 if (bs) {
     107          21 :                         bh = BATunique(b, bs);
     108          21 :                         if (bh) {
     109          21 :                                 a->unique[a->last] = BATcount(bh);
     110          21 :                                 BBPunfix(bh->batCacheid);
     111             :                         }
     112          21 :                         BBPunfix(bs->batCacheid);
     113             :                 }
     114          21 :                 if (b->tsorted)
     115          11 :                         a->unique[a->last] = 1000;        /* sorting helps grouping */
     116          21 :                 a->size = BATcount(b);
     117             :         }
     118             : 
     119             :         return a;
     120             : }
     121             : 
     122             : static void
     123          10 : GROUPcollectSort(AGGRtask *a, int start, int finish)
     124             : {
     125          10 :         int i, j, k;
     126          10 :         BAT *b;
     127          10 :         BUN sample;
     128             : 
     129             :         /* sort the columns by decreasing unique */
     130          31 :         for (i = start; i < finish; i++)
     131          36 :                 for (j = i + 1; j < finish; j++)
     132          15 :                         if (a->unique[i] < a->unique[j]) {
     133           0 :                                 k = a->bid[i];
     134           0 :                                 a->bid[i] = a->bid[j];
     135           0 :                                 a->bid[j] = k;
     136             : 
     137           0 :                                 b = a->cols[i];
     138           0 :                                 a->cols[i] = a->cols[j];
     139           0 :                                 a->cols[j] = b;
     140             : 
     141           0 :                                 sample = a->unique[i];
     142           0 :                                 a->unique[i] = a->unique[j];
     143           0 :                                 a->unique[j] = sample;
     144             :                         }
     145          10 : }
     146             : 
     147             : static void
     148          10 : GROUPdelete(AGGRtask *a)
     149             : {
     150          31 :         for (a->last--; a->last >= 0; a->last--) {
     151          21 :                 BBPunfix(a->cols[a->last]->batCacheid);
     152             :         }
     153          10 :         GDKfree(a->bid);
     154          10 :         GDKfree(a->cols);
     155          10 :         GDKfree(a->unique);
     156          10 :         GDKfree(a);
     157          10 : }
     158             : 
     159             : /*
     160             :  * The groups optimizer takes a grouping sequence and attempts to
     161             :  * minimize the intermediate result.  The choice depends on a good
     162             :  * estimate of intermediate results using properties.
     163             :  */
     164             : 
     165             : static str
     166          10 : GROUPmulticolumngroup(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
     167             : {
     168          10 :         bat *grp = getArgReference_bat(stk, pci, 0);
     169          10 :         bat *ext = getArgReference_bat(stk, pci, 1);
     170          10 :         bat *hist = getArgReference_bat(stk, pci, 2);
     171          10 :         int i, j;
     172          10 :         bat oldgrp, oldext, oldhist;
     173          10 :         str msg = MAL_SUCCEED;
     174          10 :         BAT *b;
     175          10 :         BUN count = 0;
     176          10 :         AGGRtask *aggr;
     177             : 
     178          10 :         aggr = GROUPcollect(cntxt, mb, stk, pci);
     179          10 :         if (aggr == NULL)
     180           0 :                 throw(MAL, "group.multicolumn", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     181          10 :         GROUPcollectSort(aggr, 0, aggr->last);
     182             : 
     183             :         /* (grp,ext,hist) := group.group(..) */
     184             :         /* use the old pattern to perform the incremental grouping */
     185          10 :         *grp = 0;
     186          10 :         *ext = 0;
     187          10 :         *hist = 0;
     188          10 :         msg = GRPgroup1(grp, ext, hist, &aggr->bid[0]);
     189          10 :         i = 1;
     190          10 :         if (msg == MAL_SUCCEED && aggr->last > 1)
     191          11 :                 do {
     192             :                         /* early break when there are as many groups as entries */
     193          11 :                         b = BBPquickdesc(*hist);
     194          11 :                         if (b) {
     195          11 :                                 j = BATcount(b) == count;
     196          11 :                                 if (j)
     197             :                                         break;
     198             :                         }
     199             : 
     200             :                         /* (grp,ext,hist) := group.subgroup(arg,grp,ext,hist) */
     201          11 :                         oldgrp = *grp;
     202          11 :                         oldext = *ext;
     203          11 :                         oldhist = *hist;
     204          11 :                         *grp = 0;
     205          11 :                         *ext = 0;
     206          11 :                         *hist = 0;
     207          11 :                         msg = GRPsubgroup5(grp, ext, hist, &aggr->bid[i], NULL, &oldgrp,
     208             :                                                            &oldext, &oldhist);
     209          11 :                         BBPrelease(oldgrp);
     210          11 :                         BBPrelease(oldext);
     211          11 :                         BBPrelease(oldhist);
     212          11 :                 } while (msg == MAL_SUCCEED && ++i < aggr->last);
     213          10 :         GROUPdelete(aggr);
     214          10 :         return msg;
     215             : }
     216             : 
     217             : #include "mel.h"
     218             : mel_func groupby_init_funcs[] = {
     219             :  pattern("group", "multicolumn", GROUPmulticolumngroup, false, "Derivation of a group index over multiple columns.", args(3,4, batarg("ref",oid),batarg("grp",oid),batargany("hist",0),batvarargany("b",0))),
     220             :  { .imp=NULL }
     221             : };
     222             : #include "mal_import.h"
     223             : #ifdef _MSC_VER
     224             : #undef read
     225             : #pragma section(".CRT$XCU",read)
     226             : #endif
     227         329 : LIB_STARTUP_FUNC(init_groupby_mal)
     228         329 : { mal_module("groupby", NULL, groupby_init_funcs); }

Generated by: LCOV version 1.14