Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * (c) Martin Kersten
15 : * Multicolumn group-by support
16 : * The group-by support module is meant to simplify code analysis and
17 : * speedup the kernel on multi-attribute grouping routines.
18 : *
19 : * The target is to support SQL-like multicolumngroup_by operations, which are lists of
20 : * attributes and a group aggregate function.
21 : * Each group can be represented with an oid into the n-ary table.
22 : * Consider the query "select count(*), max(A) from R group by A, B,C." whose code
23 : * snippet in MAL would become something like:
24 : *
25 : * @verbatim
26 : * _1:bat[:int] := sql.bind("sys","r","a",0);
27 : * _2:bat[:str] := sql.bind("sys","r","b",0);
28 : * _3:bat[:date] := sql.bind("sys","r","c",0);
29 : * ...
30 : * _9 := algebra.select(_1,0,100);
31 : * ..
32 : * (grp_4:bat[:lng], gid:bat[:oid]) := groupby.count(_9,_2);
33 : * (grp_5:bat[:lng], gid:bat[:oid]) := groupby.max(_9,_2,_3);
34 : * @end verbatim
35 : *
36 : * The id() function merely becomes the old-fashioned oid-based group identification list.
37 : * This way related values can be obtained from the attribute columns. It can be the input
38 : * for the count() function, which saves some re-computation.
39 : *
40 : * Aside the group ids, we also provide options to return the value based aggregate table
41 : * to ease development of parallel plans.
42 : *
43 : * The implementation is optimized for a limited number of groups. The default is
44 : * to fall back on the old code sequences.
45 : *
46 : */
47 : #include "monetdb_config.h"
48 : #include "mal.h"
49 : #include "mal_interpreter.h"
50 : #include "group.h"
51 : #include "mal_exception.h"
52 :
53 : /*
54 : * The implementation is based on a two-phase process. In phase 1, we estimate
55 : * the number of groups to deal with using column independence.
56 : * The grouping is performed in parallel over slices of the tables.
57 : * The final pieces are glued together.
58 : */
59 : typedef struct {
60 : bat *bid; /* input bats */
61 : BAT *candidate; /* list */
62 : BAT **cols;
63 : BUN *unique; /* number of different values */
64 : int last;
65 : BUN size;
66 : } AGGRtask;
67 :
68 : static AGGRtask *
69 10 : GROUPcollect(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
70 : {
71 10 : AGGRtask *a;
72 10 : int i;
73 10 : BAT *b, *bs, *bh = NULL;
74 :
75 10 : (void) mb;
76 10 : (void) cntxt;
77 10 : a = (AGGRtask *) GDKzalloc(sizeof(*a));
78 10 : if (a == NULL)
79 : return NULL;
80 10 : a->bid = (bat *) GDKzalloc(pci->argc * sizeof(bat));
81 10 : a->cols = (BAT **) GDKzalloc(pci->argc * sizeof(BAT *));
82 10 : a->unique = (BUN *) GDKzalloc(pci->argc * sizeof(BUN));
83 10 : if (a->cols == NULL || a->bid == NULL || a->unique == NULL) {
84 0 : if (a->cols)
85 0 : GDKfree(a->cols);
86 0 : if (a->bid)
87 0 : GDKfree(a->bid);
88 0 : if (a->unique)
89 0 : GDKfree(a->unique);
90 0 : GDKfree(a);
91 0 : return NULL;
92 : }
93 31 : for (i = pci->retc; i < pci->argc; i++, a->last++) {
94 21 : a->bid[a->last] = *getArgReference_bat(stk, pci, i);
95 21 : b = a->cols[a->last] = BATdescriptor(a->bid[a->last]);
96 21 : if (a->cols[a->last] == NULL) {
97 0 : for (a->last--; a->last >= 0; a->last--)
98 0 : BBPunfix(a->cols[a->last]->batCacheid);
99 0 : GDKfree(a->cols);
100 0 : GDKfree(a->bid);
101 0 : GDKfree(a->unique);
102 0 : GDKfree(a);
103 0 : return NULL;
104 : }
105 21 : bs = BATsample(b, 1000);
106 21 : if (bs) {
107 21 : bh = BATunique(b, bs);
108 21 : if (bh) {
109 21 : a->unique[a->last] = BATcount(bh);
110 21 : BBPunfix(bh->batCacheid);
111 : }
112 21 : BBPunfix(bs->batCacheid);
113 : }
114 21 : if (b->tsorted)
115 11 : a->unique[a->last] = 1000; /* sorting helps grouping */
116 21 : a->size = BATcount(b);
117 : }
118 :
119 : return a;
120 : }
121 :
122 : static void
123 10 : GROUPcollectSort(AGGRtask *a, int start, int finish)
124 : {
125 10 : int i, j, k;
126 10 : BAT *b;
127 10 : BUN sample;
128 :
129 : /* sort the columns by decreasing unique */
130 31 : for (i = start; i < finish; i++)
131 36 : for (j = i + 1; j < finish; j++)
132 15 : if (a->unique[i] < a->unique[j]) {
133 0 : k = a->bid[i];
134 0 : a->bid[i] = a->bid[j];
135 0 : a->bid[j] = k;
136 :
137 0 : b = a->cols[i];
138 0 : a->cols[i] = a->cols[j];
139 0 : a->cols[j] = b;
140 :
141 0 : sample = a->unique[i];
142 0 : a->unique[i] = a->unique[j];
143 0 : a->unique[j] = sample;
144 : }
145 10 : }
146 :
147 : static void
148 10 : GROUPdelete(AGGRtask *a)
149 : {
150 31 : for (a->last--; a->last >= 0; a->last--) {
151 21 : BBPunfix(a->cols[a->last]->batCacheid);
152 : }
153 10 : GDKfree(a->bid);
154 10 : GDKfree(a->cols);
155 10 : GDKfree(a->unique);
156 10 : GDKfree(a);
157 10 : }
158 :
159 : /*
160 : * The groups optimizer takes a grouping sequence and attempts to
161 : * minimize the intermediate result. The choice depends on a good
162 : * estimate of intermediate results using properties.
163 : */
164 :
165 : static str
166 10 : GROUPmulticolumngroup(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
167 : {
168 10 : bat *grp = getArgReference_bat(stk, pci, 0);
169 10 : bat *ext = getArgReference_bat(stk, pci, 1);
170 10 : bat *hist = getArgReference_bat(stk, pci, 2);
171 10 : int i, j;
172 10 : bat oldgrp, oldext, oldhist;
173 10 : str msg = MAL_SUCCEED;
174 10 : BAT *b;
175 10 : BUN count = 0;
176 10 : AGGRtask *aggr;
177 :
178 10 : aggr = GROUPcollect(cntxt, mb, stk, pci);
179 10 : if (aggr == NULL)
180 0 : throw(MAL, "group.multicolumn", SQLSTATE(HY013) MAL_MALLOC_FAIL);
181 10 : GROUPcollectSort(aggr, 0, aggr->last);
182 :
183 : /* (grp,ext,hist) := group.group(..) */
184 : /* use the old pattern to perform the incremental grouping */
185 10 : *grp = 0;
186 10 : *ext = 0;
187 10 : *hist = 0;
188 10 : msg = GRPgroup1(grp, ext, hist, &aggr->bid[0]);
189 10 : i = 1;
190 10 : if (msg == MAL_SUCCEED && aggr->last > 1)
191 11 : do {
192 : /* early break when there are as many groups as entries */
193 11 : b = BBPquickdesc(*hist);
194 11 : if (b) {
195 11 : j = BATcount(b) == count;
196 11 : if (j)
197 : break;
198 : }
199 :
200 : /* (grp,ext,hist) := group.subgroup(arg,grp,ext,hist) */
201 11 : oldgrp = *grp;
202 11 : oldext = *ext;
203 11 : oldhist = *hist;
204 11 : *grp = 0;
205 11 : *ext = 0;
206 11 : *hist = 0;
207 11 : msg = GRPsubgroup5(grp, ext, hist, &aggr->bid[i], NULL, &oldgrp,
208 : &oldext, &oldhist);
209 11 : BBPrelease(oldgrp);
210 11 : BBPrelease(oldext);
211 11 : BBPrelease(oldhist);
212 11 : } while (msg == MAL_SUCCEED && ++i < aggr->last);
213 10 : GROUPdelete(aggr);
214 10 : return msg;
215 : }
216 :
217 : #include "mel.h"
218 : mel_func groupby_init_funcs[] = {
219 : pattern("group", "multicolumn", GROUPmulticolumngroup, false, "Derivation of a group index over multiple columns.", args(3,4, batarg("ref",oid),batarg("grp",oid),batargany("hist",0),batvarargany("b",0))),
220 : { .imp=NULL }
221 : };
222 : #include "mal_import.h"
223 : #ifdef _MSC_VER
224 : #undef read
225 : #pragma section(".CRT$XCU",read)
226 : #endif
227 320 : LIB_STARTUP_FUNC(init_groupby_mal)
228 320 : { mal_module("groupby", NULL, groupby_init_funcs); }
|