Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * (c) Martin Kersten
15 : * Multicolumn group-by support
16 : * The group-by support module is meant to simplify code analysis and
17 : * speedup the kernel on multi-attribute grouping routines.
18 : *
19 : * The target is to support SQL-like multicolumngroup_by operations, which are lists of
20 : * attributes and a group aggregate function.
21 : * Each group can be represented with an oid into the n-ary table.
22 : * Consider the query "select count(*), max(A) from R group by A, B,C." whose code
23 : * snippet in MAL would become something like:
24 : *
25 : * @verbatim
26 : * _1:bat[:int] := sql.bind("sys","r","a",0);
27 : * _2:bat[:str] := sql.bind("sys","r","b",0);
28 : * _3:bat[:date] := sql.bind("sys","r","c",0);
29 : * ...
30 : * _9 := algebra.select(_1,0,100);
31 : * ..
32 : * (grp_4:bat[:lng], gid:bat[:oid]) := groupby.count(_9,_2);
33 : * (grp_5:bat[:lng], gid:bat[:oid]) := groupby.max(_9,_2,_3);
34 : * @end verbatim
35 : *
36 : * The id() function merely becomes the old-fashioned oid-based group identification list.
37 : * This way related values can be obtained from the attribute columns. It can be the input
38 : * for the count() function, which saves some re-computation.
39 : *
40 : * Aside the group ids, we also provide options to return the value based aggregate table
41 : * to ease development of parallel plans.
42 : *
43 : * The implementation is optimized for a limited number of groups. The default is
44 : * to fall back on the old code sequences.
45 : *
46 : */
47 : #include "monetdb_config.h"
48 : #include "mal.h"
49 : #include "mal_interpreter.h"
50 : #include "group.h"
51 : #include "mal_exception.h"
52 :
53 : /*
54 : * The implementation is based on a two-phase process. In phase 1, we estimate
55 : * the number of groups to deal with using column independence.
56 : * The grouping is performed in parallel over slices of the tables.
57 : * The final pieces are glued together.
58 : */
59 : typedef struct {
60 : bat *bid; /* input bats */
61 : BAT *candidate; /* list */
62 : BAT **cols;
63 : BUN *unique; /* number of different values */
64 : int last;
65 : BUN size;
66 : } AGGRtask;
67 :
68 : static AGGRtask *
69 10 : GROUPcollect(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
70 : {
71 10 : AGGRtask *a;
72 10 : int i;
73 10 : BAT *b, *bs, *bh = NULL;
74 :
75 10 : (void) mb;
76 10 : (void) cntxt;
77 10 : a = (AGGRtask *) GDKmalloc(sizeof(*a));
78 10 : if (a == NULL)
79 : return NULL;
80 20 : *a = (AGGRtask) {
81 10 : .bid = GDKzalloc(pci->argc * sizeof(bat)),
82 10 : .cols = GDKzalloc(pci->argc * sizeof(BAT *)),
83 10 : .unique = GDKzalloc(pci->argc * sizeof(BUN)),
84 : };
85 10 : if (a->cols == NULL || a->bid == NULL || a->unique == NULL) {
86 0 : GDKfree(a->cols);
87 0 : GDKfree(a->bid);
88 0 : GDKfree(a->unique);
89 0 : GDKfree(a);
90 0 : return NULL;
91 : }
92 31 : for (i = pci->retc; i < pci->argc; i++, a->last++) {
93 21 : a->bid[a->last] = *getArgReference_bat(stk, pci, i);
94 21 : b = a->cols[a->last] = BATdescriptor(a->bid[a->last]);
95 21 : if (a->cols[a->last] == NULL) {
96 0 : for (a->last--; a->last >= 0; a->last--)
97 0 : BBPunfix(a->cols[a->last]->batCacheid);
98 0 : GDKfree(a->cols);
99 0 : GDKfree(a->bid);
100 0 : GDKfree(a->unique);
101 0 : GDKfree(a);
102 0 : return NULL;
103 : }
104 21 : bs = BATsample(b, 1000);
105 21 : if (bs) {
106 21 : bh = BATunique(b, bs);
107 21 : if (bh) {
108 21 : a->unique[a->last] = BATcount(bh);
109 21 : BBPunfix(bh->batCacheid);
110 : }
111 21 : BBPunfix(bs->batCacheid);
112 : }
113 21 : if (b->tsorted)
114 11 : a->unique[a->last] = 1000; /* sorting helps grouping */
115 21 : a->size = BATcount(b);
116 : }
117 :
118 : return a;
119 : }
120 :
121 : static void
122 10 : GROUPcollectSort(AGGRtask *a, int start, int finish)
123 : {
124 10 : int i, j, k;
125 10 : BAT *b;
126 10 : BUN sample;
127 :
128 : /* sort the columns by decreasing unique */
129 31 : for (i = start; i < finish; i++)
130 36 : for (j = i + 1; j < finish; j++)
131 15 : if (a->unique[i] < a->unique[j]) {
132 0 : k = a->bid[i];
133 0 : a->bid[i] = a->bid[j];
134 0 : a->bid[j] = k;
135 :
136 0 : b = a->cols[i];
137 0 : a->cols[i] = a->cols[j];
138 0 : a->cols[j] = b;
139 :
140 0 : sample = a->unique[i];
141 0 : a->unique[i] = a->unique[j];
142 0 : a->unique[j] = sample;
143 : }
144 10 : }
145 :
146 : static void
147 10 : GROUPdelete(AGGRtask *a)
148 : {
149 31 : for (a->last--; a->last >= 0; a->last--) {
150 21 : BBPunfix(a->cols[a->last]->batCacheid);
151 : }
152 10 : GDKfree(a->bid);
153 10 : GDKfree(a->cols);
154 10 : GDKfree(a->unique);
155 10 : GDKfree(a);
156 10 : }
157 :
158 : /*
159 : * The groups optimizer takes a grouping sequence and attempts to
160 : * minimize the intermediate result. The choice depends on a good
161 : * estimate of intermediate results using properties.
162 : */
163 :
164 : static str
165 10 : GROUPmulticolumngroup(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
166 : {
167 10 : bat *grp = getArgReference_bat(stk, pci, 0);
168 10 : bat *ext = getArgReference_bat(stk, pci, 1);
169 10 : bat *hist = getArgReference_bat(stk, pci, 2);
170 10 : int i, j;
171 10 : bat oldgrp, oldext, oldhist;
172 10 : str msg = MAL_SUCCEED;
173 10 : BAT *b;
174 10 : BUN count = 0;
175 10 : AGGRtask *aggr;
176 :
177 10 : aggr = GROUPcollect(cntxt, mb, stk, pci);
178 10 : if (aggr == NULL)
179 0 : throw(MAL, "group.multicolumn", SQLSTATE(HY013) MAL_MALLOC_FAIL);
180 10 : GROUPcollectSort(aggr, 0, aggr->last);
181 :
182 : /* (grp,ext,hist) := group.group(..) */
183 : /* use the old pattern to perform the incremental grouping */
184 10 : *grp = 0;
185 10 : *ext = 0;
186 10 : *hist = 0;
187 10 : msg = GRPgroup1(grp, ext, hist, &aggr->bid[0]);
188 10 : i = 1;
189 10 : if (msg == MAL_SUCCEED && aggr->last > 1)
190 11 : do {
191 : /* early break when there are as many groups as entries */
192 11 : b = BBPquickdesc(*hist);
193 11 : if (b) {
194 11 : j = BATcount(b) == count;
195 11 : if (j)
196 : break;
197 : }
198 :
199 : /* (grp,ext,hist) := group.subgroup(arg,grp,ext,hist) */
200 11 : oldgrp = *grp;
201 11 : oldext = *ext;
202 11 : oldhist = *hist;
203 11 : *grp = 0;
204 11 : *ext = 0;
205 11 : *hist = 0;
206 11 : msg = GRPsubgroup5(grp, ext, hist, &aggr->bid[i], NULL, &oldgrp,
207 : &oldext, &oldhist);
208 11 : BBPrelease(oldgrp);
209 11 : BBPrelease(oldext);
210 11 : BBPrelease(oldhist);
211 11 : } while (msg == MAL_SUCCEED && ++i < aggr->last);
212 10 : GROUPdelete(aggr);
213 10 : return msg;
214 : }
215 :
216 : #include "mel.h"
217 : mel_func groupby_init_funcs[] = {
218 : pattern("group", "multicolumn", GROUPmulticolumngroup, false, "Derivation of a group index over multiple columns.", args(3,4, batarg("ref",oid),batarg("grp",oid),batargany("hist",0),batvarargany("b",0))),
219 : { .imp=NULL }
220 : };
221 : #include "mal_import.h"
222 : #ifdef _MSC_VER
223 : #undef read
224 : #pragma section(".CRT$XCU",read)
225 : #endif
226 345 : LIB_STARTUP_FUNC(init_groupby_mal)
227 345 : { mal_module("groupby", NULL, groupby_init_funcs); }
|