Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "rel_optimizer_private.h"
15 : #include "rel_statistics.h"
16 : #include "rel_basetable.h"
17 : #include "rel_rewriter.h"
18 :
19 : static sql_exp *
20 3469448 : comparison_find_column(sql_exp *input, sql_exp *e)
21 : {
22 3469448 : switch (input->type) {
23 99298 : case e_convert: {
24 99298 : list *types = (list *)input->r;
25 99298 : sql_class from = ((sql_subtype*)types->h->data)->type->eclass, to = ((sql_subtype*)types->h->next->data)->type->eclass;
26 99298 : if (from == to)
27 191769 : return comparison_find_column(input->l, e) ? input : NULL;
28 : return NULL;
29 : }
30 3054496 : case e_column:
31 3054496 : return exp_match(e, input) ? input : NULL;
32 : default:
33 : return NULL;
34 : }
35 : }
36 :
37 : /* multi lo <= col <= hi, maybe still be false even if lo or hi are NULL, possibly similar for filter on multiple
38 : * columns */
39 : #define comp_single_column(c) (!c->f)
40 :
41 : static sql_exp *
42 8756591 : rel_propagate_column_ref_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
43 : {
44 8839144 : assert(e->type == e_column);
45 8839144 : if (rel) {
46 8839064 : switch(rel->op) {
47 4094819 : case op_left:
48 : case op_right:
49 : case op_full:
50 : case op_join:
51 : case op_select:
52 : case op_anti:
53 : case op_semi: {
54 4094819 : bool found_without_semantics = false, found_left = false, found_right = false, still_unique = false;
55 :
56 4094819 : if ((is_innerjoin(rel->op) || is_select(rel->op)) && list_length(rel->exps) == 1 && exp_is_false(rel->exps->h->data))
57 : return NULL; /* nothing will pass, skip */
58 :
59 : /* propagate from the bottom first */
60 4079196 : if (rel_propagate_column_ref_statistics(sql, rel->l, e))
61 : found_left = true;
62 2592625 : if (!found_left && is_join(rel->op) && rel_propagate_column_ref_statistics(sql, rel->r, e))
63 4079195 : found_right = true;
64 :
65 4079195 : if (!found_left && !found_right)
66 : return NULL;
67 1782513 : if (!list_empty(rel->exps) && rel->op != op_anti) { /* if there's an or, the MIN and MAX get difficult to propagate */
68 3500835 : for (node *n = rel->exps->h ; n ; n = n->next) {
69 1813826 : sql_exp *comp = n->data, *le = comp->l, *lne = NULL, *re = comp->r, *rne = NULL, *fe = comp->f, *fne = NULL;
70 :
71 1813826 : if (comp->type == e_cmp) {
72 1812822 : if (is_theta_exp(comp->flag) && ((lne = comparison_find_column(le, e)) || (rne = comparison_find_column(re, e)) || (fe && (fne = comparison_find_column(fe, e))))) {
73 124338 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
74 124338 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
75 :
76 : /* not semantics found or if explicitly filtering not null values from the column */
77 124338 : found_without_semantics |= (!is_semantics(comp) && comp_single_column(comp)) || (comp->flag == cmp_equal && lne && is_anti(comp) && exp_is_null(re));
78 124338 : still_unique |= comp->flag == cmp_equal && is_unique(le) && is_unique(re); /* unique if only equi-joins on unique columns are there */
79 124338 : if (is_full(rel->op) || (is_left(rel->op) && found_left) || (is_right(rel->op) && found_right)) /* on outer joins, min and max cannot be propagated on some cases */
80 19700 : continue;
81 : /* if (end2 >= start1 && start2 <= end1) then the 2 intervals are intersected */
82 104638 : if (fe && lval_min && lval_max) { /* range case, the middle expression must intersect the other two */
83 4622 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
84 4622 : int int1 = rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0,
85 4622 : int2 = fval_min && fval_max && atom_cmp(fval_max, lval_min) >= 0 && atom_cmp(fval_min, lval_max) <= 0,
86 4622 : symmetric = is_symmetric(comp);
87 :
88 4622 : if (is_anti(comp) || (!symmetric && fval_min && rval_max && atom_cmp(fval_min, rval_max) < 0)) /* for asymmetric case the re range must be after the fe range */
89 1817 : continue;
90 2805 : if (lne && int1 && int2) {
91 104 : if (symmetric) {
92 0 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
93 0 : atom *nmin = statistics_atom_min(sql, rval_min, fval_min), *nmax = statistics_atom_max(sql, rval_max, fval_max);
94 : /* max is min from le and (max from re and fe max) */
95 0 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, nmax, p2->value.pval) : nmax);
96 : /* min is max from le and (min from re and fe min) */
97 0 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, nmin, p1->value.pval) : nmin);
98 : } else {
99 104 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
100 : /* max is min from le and fe max */
101 104 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, fval_max, p2->value.pval) : fval_max);
102 : /* min is max from le and re min */
103 104 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, rval_min, p1->value.pval) : rval_min);
104 : }
105 2701 : } else if (rne) {
106 590 : if (symmetric && int1 && int2) { /* min is max from le and (min from re and fe min) */
107 0 : prop *p = find_prop(e->p, PROP_MIN);
108 0 : atom *nmin = p ? statistics_atom_min(sql, p->value.pval, fval_min) : fval_min;
109 0 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, nmin, lval_min));
110 590 : } else if (int1) { /* min is max from le and re min */
111 100 : prop *p = find_prop(e->p, PROP_MIN);
112 100 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
113 : }
114 2111 : } else if (fne) {
115 549 : if (symmetric && int1 && int2) { /* max is min from le and (max from re and fe max) */
116 0 : prop *p = find_prop(e->p, PROP_MAX);
117 0 : atom *nmax = p ? statistics_atom_max(sql, p->value.pval, rval_max) : rval_max;
118 0 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, nmax, lval_max) : nmax);
119 549 : } else if (int2) { /* max is min from le and fe max */
120 266 : prop *p = find_prop(e->p, PROP_MAX);
121 266 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
122 : }
123 : }
124 100016 : } else if (lval_min && lval_max && rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0) {
125 : /* both min and max must be set and the intervals must overlap */
126 42020 : switch (comp->flag) {
127 27998 : case cmp_equal: /* for equality reduce */
128 27998 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_max(sql, lval_max, rval_max) : statistics_atom_min(sql, lval_max, rval_max));
129 27998 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_min(sql, lval_min, rval_min) : statistics_atom_max(sql, lval_min, rval_min));
130 27998 : break;
131 4706 : case cmp_notequal: /* for inequality expand */
132 4706 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_min(sql, lval_max, rval_max) : statistics_atom_max(sql, lval_max, rval_max));
133 4706 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_max(sql, lval_min, rval_min) : statistics_atom_min(sql, lval_min, rval_min));
134 4706 : break;
135 5310 : case cmp_gt:
136 : case cmp_gte:
137 9682 : if (!is_anti(comp) && lne) { /* min is max from both min */
138 4372 : prop *p = find_prop(e->p, PROP_MIN);
139 4372 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, rval_min, p->value.pval) : rval_min);
140 938 : } else if (!is_anti(comp)) { /* max is min from both max */
141 938 : prop *p = find_prop(e->p, PROP_MAX);
142 938 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
143 : }
144 : break;
145 4006 : case cmp_lt:
146 : case cmp_lte:
147 7205 : if (!is_anti(comp) && lne) { /* max is min from both max */
148 3199 : prop *p = find_prop(e->p, PROP_MAX);
149 3199 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, rval_max, p->value.pval) : rval_max);
150 807 : } else if (!is_anti(comp)) { /* min is max from both min */
151 807 : prop *p = find_prop(e->p, PROP_MIN);
152 807 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
153 : }
154 : break;
155 : default: /* Maybe later I can do cmp_in and cmp_notin */
156 : break;
157 : }
158 : }
159 : }
160 : }
161 : }
162 : }
163 1782513 : if (is_full(rel->op) || (is_left(rel->op) && found_right) || (is_right(rel->op) && found_left))
164 35455 : set_has_nil(e);
165 1782513 : if (!is_outerjoin(rel->op) && found_without_semantics) /* at an outer join, null values pass */
166 93806 : set_has_no_nil(e);
167 1782513 : if (is_join(rel->op) && is_unique(e) && !still_unique)
168 118686 : set_not_unique(e);
169 1782513 : return e;
170 : }
171 4642325 : case op_table:
172 : case op_basetable:
173 : case op_union:
174 : case op_except:
175 : case op_inter:
176 : case op_munion:
177 : case op_project:
178 : case op_groupby: {
179 4642325 : if (is_recursive(rel))
180 : return NULL;
181 4642137 : sql_exp *found;
182 4642137 : atom *fval;
183 4642137 : prop *est;
184 4642137 : if ((found = rel_find_exp(rel, e))) {
185 2175655 : if (rel->op != op_table) { /* At the moment don't propagate statistics for table relations */
186 2131166 : if ((fval = find_prop_and_get(found->p, PROP_MAX)))
187 1121487 : set_minmax_property(sql, e, PROP_MAX, fval);
188 2131167 : if ((fval = find_prop_and_get(found->p, PROP_MIN)))
189 1128567 : set_minmax_property(sql, e, PROP_MIN, fval);
190 2131167 : if (!has_nil(found))
191 1375228 : set_has_no_nil(e);
192 2131167 : if (is_unique(found) || (need_distinct(rel) && list_length(rel->exps) == 1) ||
193 1717159 : (is_groupby(rel->op) && (list_empty(rel->r) || (list_length(rel->r) == 1 && exps_find_exp(rel->r, e)))))
194 425250 : set_unique(e);
195 : /* propagate unique estimation for known cases */
196 2131167 : if (is_groupby(rel->op) && list_empty(rel->r) && !find_prop(e->p, PROP_NUNIQUES)) { /* global aggregate case */
197 7580 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
198 7580 : p->value.dval = 1;
199 2123587 : } else if (((is_basetable(rel->op) || is_except(rel->op) || is_inter(rel->op) || is_simple_project(rel->op) ||
200 71515 : (is_groupby(rel->op) && exps_find_exp(rel->r, e))) &&
201 2064738 : (est = find_prop(found->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES))) {
202 193936 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
203 193936 : p->value.dval = est->value.dval;
204 : }
205 : }
206 2175658 : return e;
207 : }
208 : return NULL;
209 : }
210 82553 : case op_topn:
211 : case op_sample:
212 82553 : return rel_propagate_column_ref_statistics(sql, rel->l, e);
213 : default:
214 : break;
215 : }
216 : }
217 : return NULL;
218 : }
219 :
220 : static atom *
221 4596336 : atom_from_valptr( allocator *sa, sql_subtype *tpe, ValRecord *v)
222 : {
223 4596336 : atom *a = SA_NEW(sa, atom);
224 :
225 4596311 : assert(!VALisnil(v));
226 4596341 : *a = (atom) {.tpe = *tpe,};
227 4596341 : SA_VALcopy(sa, &a->data, v);
228 4596435 : return a;
229 : }
230 :
231 : void
232 4245428 : sql_column_get_statistics(mvc *sql, sql_column *c, sql_exp *e)
233 : {
234 4245428 : bool nonil = false, unique = false;
235 4245428 : double unique_est = 0.0;
236 4245428 : ValRecord min, max;
237 4245428 : int ok = mvc_col_stats(sql, c, &nonil, &unique, &unique_est, &min, &max);
238 :
239 4246537 : if (has_nil(e) && nonil)
240 2800808 : set_has_no_nil(e);
241 4246537 : if (!is_unique(e) && unique)
242 1120237 : set_unique(e);
243 4246537 : if (unique_est != 0.0) {
244 2989359 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
245 2989190 : p->value.dval = unique_est;
246 : }
247 4246368 : unsigned int digits = 0;
248 4246368 : sql_subtype *et = exp_subtype(e);
249 4246556 : if (et->type->eclass == EC_DEC || et->type->eclass == EC_NUM)
250 2760073 : digits = et->digits;
251 4246556 : if ((ok & 2) == 2) {
252 2295363 : if (!VALisnil(&max)) {
253 2295335 : prop *p = e->p = prop_create(sql->sa, PROP_MAX, e->p);
254 2295289 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &max);
255 2295292 : if (digits) {
256 1706330 : unsigned int nd = atom_digits(p->value.pval);
257 1706289 : if (nd < digits)
258 : digits = nd;
259 1706289 : if (!digits)
260 : digits = 1;
261 : }
262 : }
263 2295178 : VALclear(&max);
264 : }
265 4246421 : if ((ok & 1) == 1) {
266 2301418 : if (!VALisnil(&min)) {
267 2301411 : prop *p = e->p = prop_create(sql->sa, PROP_MIN, e->p);
268 2301459 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &min);
269 2301572 : if (digits) {
270 1713595 : unsigned int nd = atom_digits(p->value.pval);
271 1713572 : if (nd > digits) /* possibly set to low by max value */
272 : digits = nd;
273 : if (!digits)
274 : digits = 1;
275 : }
276 : }
277 2301539 : VALclear(&min);
278 : }
279 4246569 : if (digits)
280 2760014 : et->digits = digits;
281 4246569 : if (et->type->eclass == EC_DEC && et->digits <= et->scale)
282 216 : et->digits = et->scale + 1;
283 4246569 : }
284 :
285 : static void
286 946404 : rel_basetable_column_get_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
287 : {
288 946404 : if (e->p)
289 : return;
290 304428 : sql_column *c = NULL;
291 :
292 304428 : if ((c = rel_base_find_column(rel, e->nid))) {
293 206818 : sql_column_get_statistics(sql, c, e);
294 : }
295 : }
296 :
297 : static bool
298 8875 : rel_setop_get_statistics(mvc *sql, sql_rel *rel, list *lexps, list *rexps, sql_exp *e, int i)
299 : {
300 8875 : sql_exp *le = list_fetch(lexps, i), *re = list_fetch(rexps, i);
301 8875 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
302 8875 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
303 8875 : prop *est;
304 :
305 : /* for the intersection, if both expressions don't overlap, it can be pruned */
306 8875 : if (is_inter(rel->op) && !has_nil(le) && !has_nil(re) &&
307 511 : ((rval_max && lval_min && atom_cmp(rval_max, lval_min) < 0) || (rval_min && lval_max && atom_cmp(rval_min, lval_max) > 0)))
308 26 : return true;
309 :
310 8849 : if (lval_max && rval_max) {
311 2557 : if (is_union(rel->op))
312 3 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
313 2554 : else if (is_inter(rel->op))
314 399 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_min(sql, lval_max, rval_max)); /* for intersect the new max will be the min of the two */
315 : else /* except */
316 2155 : set_minmax_property(sql, e, PROP_MAX, lval_max);
317 : }
318 8849 : if (lval_min && rval_min) {
319 2557 : if (is_union(rel->op))
320 3 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
321 2554 : else if (is_inter(rel->op))
322 399 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, lval_min, rval_min)); /* for intersect the new min will be the max of the two */
323 : else /* except */
324 2155 : set_minmax_property(sql, e, PROP_MIN, lval_min);
325 : }
326 :
327 8849 : if (is_union(rel->op) || is_munion(rel->op)) {
328 5986 : if (!has_nil(le) && !has_nil(re))
329 879 : set_has_no_nil(e);
330 5986 : if (need_distinct(rel) && list_length(rel->exps) == 1)
331 0 : set_unique(e);
332 2863 : } else if (is_inter(rel->op)) {
333 564 : if (!has_nil(le) || !has_nil(re))
334 509 : set_has_no_nil(e);
335 564 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
336 345 : set_unique(e);
337 : } else {
338 2299 : assert(is_except(rel->op));
339 2299 : if (!has_nil(le))
340 2236 : set_has_no_nil(e);
341 2299 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
342 2012 : set_unique(e);
343 : }
344 : /* propagate unique estimation for known cases */
345 8849 : if (!is_union(rel->op) && (est = find_prop(le->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES)) {
346 207 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
347 207 : p->value.dval = est->value.dval;
348 : }
349 : return false;
350 : }
351 :
352 :
353 : static void
354 62385 : rel_munion_get_statistics(mvc *sql, sql_rel *rel, list *rels, sql_exp *e, int i)
355 : {
356 62385 : if (is_recursive(rel))
357 : return ;
358 62385 : assert(is_munion(rel->op));
359 :
360 62385 : sql_rel *l = rels->h->data;
361 62385 : sql_exp *le = list_fetch(l->exps, i);
362 62385 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX);
363 62385 : bool has_nonil = !has_nil(le);
364 :
365 181454 : for(node *n = rels->h->next; n; n = n->next) {
366 119069 : sql_rel *r = n->data;
367 119069 : sql_exp *re = list_fetch(r->exps, i);
368 119069 : atom *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
369 :
370 119069 : if (lval_max && rval_max) {
371 43928 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
372 43928 : lval_max = find_prop_and_get(e->p, PROP_MAX);
373 : }
374 119069 : if (lval_min && rval_min) {
375 43379 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
376 43379 : lval_min = find_prop_and_get(e->p, PROP_MIN);
377 : }
378 119069 : has_nonil &= !has_nil(re);
379 :
380 : }
381 :
382 62385 : if (has_nonil)
383 42484 : set_has_no_nil(e);
384 :
385 62385 : if (need_distinct(rel) && list_length(rel->exps) == 1)
386 1596 : set_unique(e);
387 : }
388 :
389 :
390 : static sql_exp *
391 3493745 : rel_propagate_statistics(visitor *v, sql_rel *rel, sql_exp *e, int depth)
392 : {
393 3493745 : mvc *sql = v->sql;
394 3493745 : atom *lval;
395 :
396 3493745 : (void) depth;
397 3493745 : switch(e->type) {
398 2181609 : case e_column:
399 2181609 : switch (rel->op) { /* set relations don't call rel_propagate_statistics */
400 282795 : case op_join:
401 : case op_left:
402 : case op_right:
403 : case op_full:
404 : case op_semi:
405 : case op_anti: {
406 282795 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e);
407 282795 : if (!found)
408 141839 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
409 : break;
410 : }
411 1898814 : case op_select:
412 : case op_project:
413 : case op_groupby: {
414 1898814 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e); /* labels may be found on the same projection, ugh */
415 1898818 : if (!found && is_simple_project(rel->op))
416 127684 : (void) rel_propagate_column_ref_statistics(sql, rel, e);
417 : break;
418 : }
419 0 : case op_insert:
420 : case op_update:
421 : case op_delete:
422 0 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
423 0 : break;
424 : default:
425 : break;
426 : }
427 : break;
428 101099 : case e_convert: {
429 101099 : sql_subtype *to = exp_totype(e), *from = exp_fromtype(e);
430 101099 : sql_exp *l = e->l;
431 101099 : sql_class fr = from->type->eclass, too = to->type->eclass;
432 101099 : prop *est;
433 :
434 101099 : if (fr == too) {
435 91987 : if ((lval = find_prop_and_get(l->p, PROP_MAX))) {
436 60377 : atom *res = atom_copy(sql->sa, lval);
437 60377 : if ((res = atom_cast(sql->sa, res, to)))
438 60354 : set_minmax_property(sql, e, PROP_MAX, res);
439 : }
440 91987 : if ((lval = find_prop_and_get(l->p, PROP_MIN))) {
441 61016 : atom *res = atom_copy(sql->sa, lval);
442 61016 : if ((res = atom_cast(sql->sa, res, to)))
443 60993 : set_minmax_property(sql, e, PROP_MIN, res);
444 : }
445 : }
446 : /* propagating nuniques through conversions is tricky. I am adding just the general cases */
447 101099 : if ((est = find_prop(l->p, PROP_NUNIQUES)) &&
448 62888 : (((fr == EC_TIME || fr == EC_TIME_TZ || fr == EC_DATE) && (too == EC_TIMESTAMP || too == EC_TIMESTAMP_TZ)) ||
449 62863 : ((fr == EC_NUM || fr == EC_BIT) && too == EC_NUM) || (fr == EC_DEC && too == EC_DEC) || (fr != EC_EXTERNAL && too == EC_STRING))) {
450 59435 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
451 59435 : p->value.dval = est->value.dval;
452 : }
453 101099 : if (!has_nil(l))
454 57234 : set_has_no_nil(e);
455 : break;
456 : }
457 342808 : case e_aggr:
458 : case e_func: {
459 342808 : BUN lv;
460 342808 : sql_subfunc *f = e->f;
461 :
462 342808 : if (!f->func->s) {
463 316026 : int key = hash_key(f->func->base.name); /* Using hash lookup */
464 316026 : sql_hash_e *he = sql_functions_lookup->buckets[key&(sql_functions_lookup->size-1)];
465 316026 : lookup_function look = NULL;
466 :
467 690633 : for (; he && !look; he = he->chain) {
468 374607 : struct function_properties* fp = (struct function_properties*) he->value;
469 :
470 374607 : if (!strcmp(f->func->base.name, fp->name))
471 108072 : look = fp->func;
472 : }
473 316026 : if (look)
474 108072 : look(sql, e);
475 : }
476 : /* for global aggregates with no semantics, if the left relation has values, then the output won't be NULL */
477 342808 : if (!is_semantics(e) && e->l && !have_nil(e->l) &&
478 90524 : (e->type != e_aggr || (is_groupby(rel->op) && list_length(rel->r)) || ((lv = get_rel_count(rel->l)) != BUN_NONE && lv > 0)))
479 90193 : set_has_no_nil(e);
480 : /* set properties for global aggregates */
481 342808 : if (e->type == e_aggr && is_groupby(rel->op) && list_empty(rel->r)) {
482 7912 : if (!find_prop(e->p, PROP_NUNIQUES)) {
483 7912 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
484 7912 : p->value.dval = 1;
485 : }
486 7912 : set_unique(e);
487 : }
488 : break;
489 : }
490 578271 : case e_atom:
491 578271 : if (e->l) {
492 560418 : atom *a = (atom*) e->l;
493 560418 : if (!a->isnull) {
494 496104 : set_minmax_property(sql, e, PROP_MAX, a);
495 496104 : set_minmax_property(sql, e, PROP_MIN, a);
496 : }
497 560418 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
498 560418 : p->value.dval = 1;
499 17853 : } else if (e->f) {
500 4277 : list *vals = (list *) e->f;
501 4277 : sql_exp *first = vals->h ? vals->h->data : NULL;
502 4277 : atom *max = NULL, *min = NULL; /* all child values must have a valid min/max */
503 4277 : int has_nil = 0;
504 :
505 4277 : if (first) {
506 4277 : max = ((lval = find_prop_and_get(first->p, PROP_MAX))) ? lval : NULL;
507 4277 : min = ((lval = find_prop_and_get(first->p, PROP_MIN))) ? lval : NULL;
508 4277 : has_nil |= has_nil(first);
509 : }
510 :
511 18354 : for (node *n = vals->h ? vals->h->next : NULL ; n ; n = n->next) {
512 9800 : sql_exp *ee = n->data;
513 :
514 9800 : if (min && max) {
515 9307 : if ((lval = find_prop_and_get(ee->p, PROP_MAX))) {
516 9261 : max = atom_cmp(lval, max) > 0 ? lval : max;
517 : } else {
518 : max = NULL;
519 : }
520 9307 : if ((lval = find_prop_and_get(ee->p, PROP_MIN))) {
521 9261 : min = atom_cmp(min, lval) > 0 ? lval : min;
522 : } else {
523 : min = NULL;
524 : }
525 : }
526 9800 : has_nil |= has_nil(ee);
527 : }
528 :
529 4277 : if (!has_nil)
530 3906 : set_has_no_nil(e);
531 4277 : if (min && max) {
532 3864 : set_minmax_property(sql, e, PROP_MAX, max);
533 3864 : set_minmax_property(sql, e, PROP_MIN, min);
534 : }
535 4277 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
536 4277 : p->value.dval = (dbl) list_length(vals);
537 : } else {
538 13576 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
539 13576 : p->value.dval = 1;
540 : }
541 : break;
542 289958 : case e_cmp:
543 : /* TODO? propagating min/max/unique of booleans is not very worth it */
544 289958 : if (e->flag == cmp_or || e->flag == cmp_filter) {
545 17847 : if (!have_nil(e->l) && !have_nil(e->r))
546 13204 : set_has_no_nil(e);
547 272111 : } else if (e->flag == cmp_in || e->flag == cmp_notin) {
548 21723 : sql_exp *le = e->l;
549 21723 : if (!has_nil(le) && !have_nil(e->r))
550 18606 : set_has_no_nil(e);
551 : } else {
552 250388 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
553 250388 : if (!has_nil(le) && !has_nil(re) && (!e->f || !has_nil(fe)))
554 179840 : set_has_no_nil(e);
555 : }
556 : break;
557 : case e_psm:
558 : break;
559 : }
560 :
561 : #ifndef NDEBUG
562 : {
563 : /* min and max cannot be NULL and min must be <= than max, if it happens the inner relation must be empty! */
564 3493749 : atom *min = find_prop_and_get(e->p, PROP_MIN), *max = find_prop_and_get(e->p, PROP_MAX);
565 :
566 3493744 : (void) min;
567 3493744 : (void) max;
568 3493744 : assert(!min || !min->isnull);
569 3493744 : assert(!max || !max->isnull);
570 3493744 : assert(!min || !max || (min && max)/* atom_cmp(min, max) <= 0*/);
571 : }
572 : #endif
573 3493744 : return e;
574 : }
575 :
576 : static list * /* Remove predicates always false from min/max values */
577 141218 : rel_prune_predicates(visitor *v, sql_rel *rel)
578 : {
579 141218 : if (rel->l) {
580 141218 : sql_rel *l = rel->l;
581 141218 : if (is_single(l))
582 3517 : return rel->exps;
583 : }
584 137701 : if (!list_empty(rel->attr))
585 2991 : return rel->exps;
586 286688 : for (node *n = rel->exps->h ; n ; n = n->next) {
587 151978 : sql_exp *e = n->data;
588 :
589 151978 : if (e->type == e_cmp && is_theta_exp(e->flag)) {
590 125059 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
591 125059 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
592 125059 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
593 125059 : bool always_false = false, always_true = false;
594 :
595 125059 : if (fe && !is_symmetric(e)) {
596 3058 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
597 3058 : comp_type lower = range2lcompare(e->flag), higher = range2rcompare(e->flag);
598 3660 : int not_int1 = rval_min && lval_max && /* the middle and left intervals don't overlap */
599 1663 : (!is_anti(e) && (lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0)),
600 4072 : not_int2 = lval_min && fval_max && /* the middle and right intervals don't overlap */
601 2485 : (!is_anti(e) && (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0)),
602 3058 : not_int3 = rval_min && fval_max && /* the left interval is after the right one */
603 1287 : (!is_anti(e) && (atom_cmp(rval_min, fval_max) > 0));
604 :
605 3058 : always_false |= not_int1 || not_int2 || not_int3;
606 : /* for anti the middle must be before the left or after the right or the right after the left, for the other the middle must be always between the left and right intervals */
607 4113 : always_true |= !has_nil(le) && !has_nil(re) && !has_nil(fe) &&
608 3957 : lval_min && lval_max && rval_min && rval_max && fval_min && fval_max &&
609 575 : (is_anti(e) ? ((lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0) || (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0) || atom_cmp(rval_min, fval_max) > 0) :
610 488 : ((lower == cmp_gte ? atom_cmp(lval_min, rval_max) >= 0 : atom_cmp(lval_min, rval_max) > 0) && (higher == cmp_lte ? atom_cmp(fval_min, lval_max) >= 0 : atom_cmp(fval_min, lval_max) > 0)));
611 : } else if (!fe) {
612 121983 : if (!is_semantics(e)) /* trivial not null cmp null case */
613 232552 : always_false |= !is_anti(e) && ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re)));
614 121983 : switch (e->flag) {
615 106451 : case cmp_equal:
616 106451 : if (lval_min && lval_max && rval_min && rval_max && (!is_semantics(e) || !has_nil(le) || !has_nil(re)))
617 135952 : always_false |= (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
618 106451 : if (is_semantics(e)) { /* prune *= NULL cases */
619 5678 : always_false |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
620 11356 : always_true |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
621 : }
622 : break;
623 7338 : case cmp_notequal:
624 7338 : if (lval_min && lval_max && rval_min && rval_max)
625 11434 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
626 7338 : if (is_semantics(e)) {
627 29 : always_true |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
628 58 : always_false |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
629 : }
630 : break;
631 5489 : case cmp_gt:
632 5489 : if (lval_max && rval_min)
633 1833 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
634 5489 : if (lval_min && rval_max)
635 3666 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
636 : break;
637 121 : case cmp_gte:
638 121 : if (lval_max && rval_min)
639 51 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
640 121 : if (lval_min && rval_max)
641 102 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
642 : break;
643 2499 : case cmp_lt:
644 2499 : if (lval_min && rval_max)
645 1383 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
646 2499 : if (lval_max && rval_min)
647 2814 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
648 : break;
649 85 : case cmp_lte:
650 85 : if (lval_min && rval_max)
651 11 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
652 85 : if (lval_max && rval_min)
653 22 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
654 : break;
655 : default: /* Maybe later I can do cmp_in and cmp_notin but don't forget to remove is_theta_exp check up there */
656 : break;
657 : }
658 : }
659 125059 : assert(!always_false || !always_true);
660 125059 : if (always_false || always_true) {
661 3687 : sql_exp *ne = exp_atom_bool(v->sql->sa, always_true /* if always_true set then true else false */);
662 3687 : if (exp_name(e))
663 0 : exp_prop_alias(v->sql->sa, ne, e);
664 3687 : n->data = ne;
665 3687 : v->changes++;
666 : }
667 : }
668 : }
669 134710 : return rel->exps;
670 : }
671 :
672 : static sql_rel *
673 14 : set_setop_side(visitor *v, sql_rel *rel, sql_rel *side)
674 : {
675 14 : if (side == rel->r)
676 0 : rel->r = NULL;
677 : else
678 14 : rel->l = NULL;
679 14 : if (!is_simple_project(side->op) || rel_is_ref(side) || !list_empty(side->r)) {
680 0 : side = rel_project(v->sql->sa, side, rel_projections(v->sql, side, NULL, 0, 1));
681 0 : set_count_prop(v->sql->sa, side, get_rel_count(side->l));
682 0 : side->exps = exps_exp_visitor_bottomup(v, side, side->exps, 0, &rel_propagate_statistics, false);
683 : }
684 35 : for (node *n = rel->exps->h, *m = side->exps->h ; n && m ; n = n->next, m = m->next) {
685 21 : sql_exp *e1 = n->data, *e2 = m->data;
686 21 : exp_prop_alias(v->sql->sa, e2, e1);
687 : }
688 14 : list_hash_clear(side->exps);
689 :
690 14 : if (need_distinct(rel))
691 0 : set_distinct(side);
692 14 : side->p = prop_copy(v->sql->sa, rel->p);
693 14 : rel_destroy(rel);
694 14 : return side;
695 : }
696 :
697 : static BUN
698 22157 : trivial_project_exp_card(sql_exp *e)
699 : {
700 22520 : if (e->type == e_convert)
701 363 : return trivial_project_exp_card(e->l);
702 22157 : return e->type == e_atom && e->f ? (BUN) list_length(e->f) : 1;
703 : }
704 :
705 : static BUN
706 26136 : rel_calc_nuniques(mvc *sql, sql_rel *l, list *exps)
707 : {
708 26136 : BUN lv = get_rel_count(l);
709 :
710 26136 : if (lv == 0)
711 : return 0;
712 23421 : if (!list_empty(exps)) {
713 23421 : BUN nuniques = 0;
714 : /* compute the highest number of unique values */
715 58497 : for (node *n = exps->h ; n && nuniques != BUN_NONE ; n = n->next) {
716 35076 : sql_exp *e = n->data;
717 35076 : sql_rel *bt = NULL;
718 35076 : prop *p = NULL;
719 35076 : BUN euniques = BUN_NONE;
720 35076 : atom *min, *max, *sub = NULL;
721 35076 : sql_subtype *tp = exp_subtype(e);
722 35076 : sql_class ec = tp ? tp->type->eclass : EC_STRING; /* if 'e' has no type (eg parameter), use a non-number type to fail condition */
723 :
724 35076 : if ((p = find_prop(e->p, PROP_NUNIQUES))) {
725 25369 : euniques = (BUN) p->value.dval;
726 9707 : } else if (e->type == e_column && e->nid && rel_find_exp_and_corresponding_rel(l, e, false, &bt, NULL) && bt && (p = find_prop(bt->p, PROP_COUNT))) {
727 7633 : euniques = (BUN) p->value.lval;
728 : }
729 : /* use min to max range to compute number of possible values in the domain for number types */
730 35076 : if ((EC_TEMP(ec)||ec==EC_NUM||ec==EC_MONTH||ec==EC_POS) &&
731 24247 : (min = find_prop_and_get(e->p, PROP_MIN)) && (max = find_prop_and_get(e->p, PROP_MAX))) {
732 : /* the range includes min and max, so the atom_inc call is needed */
733 : /* if 'euniques' has number of distinct values, compute min between both */
734 19299 : if ((sub = atom_sub(sql->sa, max, min)) && (sub = atom_inc(sql->sa, sub)) && (sub = atom_cast(sql->sa, sub, sql_bind_localtype("oid"))))
735 0 : euniques = MIN(euniques, (BUN) sub->data.val.oval);
736 : }
737 35076 : if (euniques != BUN_NONE)
738 33002 : nuniques = MAX(nuniques, euniques); /* the highest cardinality sets the estimation */
739 : else
740 : nuniques = BUN_NONE;
741 : }
742 23421 : if (nuniques != BUN_NONE)
743 : return nuniques;
744 : }
745 : return lv;
746 : }
747 :
748 : static sql_rel *
749 2088299 : rel_get_statistics_(visitor *v, sql_rel *rel)
750 : {
751 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
752 2088299 : uint8_t has_special_modify = *(uint8_t*) v->data;
753 2088299 : bool can_be_pruned = !has_special_modify && v->storage_based_opt;
754 :
755 : /* Don't look at the same relation twice */
756 2088299 : if (are_statistics_gathered(rel->used))
757 : return rel;
758 1346698 : rel->used |= statistics_gathered;
759 :
760 1346698 : switch (rel->op) {
761 316906 : case op_basetable: {
762 316906 : sql_table *t = (sql_table *) rel->l;
763 316906 : sqlstore *store = v->sql->session->tr->store;
764 :
765 316906 : if (!list_empty(rel->exps)) {
766 1263449 : for (node *n = rel->exps->h ; n ; n = n->next)
767 946351 : rel_basetable_column_get_statistics(v->sql, rel, n->data);
768 : }
769 : /* Set table row count. TODO? look for remote tables. Don't look at storage for declared tables, because it won't be cleaned */
770 317100 : if (isTable(t) && t->s && !isDeclaredTable(t)) /* count active rows only */
771 263161 : set_count_prop(v->sql->sa, rel, (BUN)store->storage_api.count_col(v->sql->session->tr, ol_first_node(t->columns)->data, 10));
772 : break;
773 : }
774 2793 : case op_union:
775 : case op_inter:
776 : case op_except: {
777 2793 : bool empty_cross = false;
778 2793 : int i = 0;
779 2793 : sql_rel *l = rel->l, *pl = rel->l, *r = rel->r, *pr = rel->r;
780 :
781 2793 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
782 0 : pl = pl->l;
783 2793 : while (is_sample(pr->op) || is_topn(pr->op))
784 0 : pr = pr->l;
785 : /* if it's not a projection, then project and propagate statistics */
786 2793 : if (!is_project(pl->op) && !is_base(pl->op)) {
787 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
788 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
789 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
790 : }
791 2793 : if (!is_project(pr->op) && !is_base(pr->op)) {
792 0 : pr = rel_project(v->sql->sa, pr, rel_projections(v->sql, pr, NULL, 0, 1));
793 0 : set_count_prop(v->sql->sa, pr, get_rel_count(pr->l));
794 0 : pr->exps = exps_exp_visitor_bottomup(v, pr, pr->exps, 0, &rel_propagate_statistics, false);
795 : }
796 :
797 11668 : for (node *n = rel->exps->h ; n ; n = n->next) {
798 8875 : empty_cross |= rel_setop_get_statistics(v->sql, rel, pl->exps, pr->exps, n->data, i);
799 8875 : i++;
800 : }
801 :
802 : /* propagate row count */
803 2793 : if (is_union(rel->op)) {
804 277 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
805 277 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
806 :
807 277 : if (lv == 0 && rv == 0) { /* both sides empty */
808 2 : if (can_be_pruned)
809 : empty_cross = true;
810 : else
811 2 : set_count_prop(v->sql->sa, rel, 0);
812 275 : } else if (can_be_pruned && lv == 0 && !rel_is_ref(rel)) { /* left side empty */
813 0 : rel = set_setop_side(v, rel, r);
814 0 : empty_cross = false; /* don't rewrite again */
815 275 : } else if (can_be_pruned && rv == 0 && !rel_is_ref(rel)) { /* right side empty */
816 0 : rel = set_setop_side(v, rel, l);
817 0 : empty_cross = false; /* don't rewrite again */
818 275 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
819 7 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
820 : }
821 2516 : } else if (is_inter(rel->op) || is_except(rel->op)) {
822 2516 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
823 2516 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
824 :
825 2516 : if (lv == 0) { /* left side empty */
826 62 : if (can_be_pruned)
827 : empty_cross = true;
828 : else
829 5 : set_count_prop(v->sql->sa, rel, 0);
830 2454 : } else if (rv == 0) { /* right side empty */
831 17 : if (is_inter(rel->op)) {
832 3 : if (can_be_pruned)
833 : empty_cross = true;
834 : else
835 0 : set_count_prop(v->sql->sa, rel, 0);
836 14 : } else if (can_be_pruned && !rel_is_ref(rel)) {
837 14 : rel = set_setop_side(v, rel, l);
838 14 : empty_cross = false; /* don't rewrite again */
839 : } else {
840 0 : set_count_prop(v->sql->sa, rel, lv);
841 : }
842 : } else {
843 2437 : set_count_prop(v->sql->sa, rel, lv);
844 : }
845 : }
846 2793 : if (can_be_pruned && empty_cross) {
847 80 : rel_destroy(rel->l);
848 80 : rel->l = NULL;
849 80 : rel_destroy(rel->r);
850 80 : rel->r = NULL;
851 244 : for (node *n = rel->exps->h ; n ; n = n->next) {
852 164 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
853 164 : exp_prop_alias(v->sql->sa, a, e);
854 164 : n->data = a;
855 : }
856 80 : list_hash_clear(rel->exps);
857 80 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
858 80 : set_count_prop(v->sql->sa, l, 1);
859 80 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
860 80 : set_count_prop(v->sql->sa, l, 0);
861 80 : rel->op = op_project;
862 80 : rel->l = l;
863 80 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
864 80 : set_count_prop(v->sql->sa, rel, 0);
865 80 : set_nodistinct(rel); /* set relations may have distinct flag set */
866 80 : v->changes++;
867 : }
868 : break;
869 : }
870 13395 : case op_munion: {
871 13395 : list *l = rel->l, *nrels = sa_list(v->sql->sa);
872 13395 : BUN cnt = 0;
873 13395 : bool needs_pruning = false;
874 :
875 13395 : if (is_recursive(rel))
876 : break;
877 50930 : for (node *n = l->h; n; n = n->next) {
878 37605 : sql_rel *r = n->data, *pl = r;
879 :
880 37605 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
881 0 : pl = pl->l;
882 : /* if it's not a projection, then project and propagate statistics */
883 37605 : if (!is_project(pl->op) && !is_base(pl->op)) {
884 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
885 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
886 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
887 : }
888 37605 : nrels = append(nrels, pl);
889 : /* we need new munion statistics */
890 : /* propagate row count */
891 37605 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
892 : /* if PROP_COUNT does not exist we assume at least a row (see get_rel_count def) */
893 37605 : if (rv == BUN_NONE) {
894 1208 : cnt++;
895 1208 : continue;
896 : }
897 36397 : if (!rv && can_be_pruned)
898 6695 : needs_pruning = true;
899 : /* overflow check */
900 36397 : if (rv > (BUN_MAX - cnt))
901 37605 : rv = BUN_MAX;
902 : else
903 36397 : cnt += rv;
904 : }
905 13325 : int i = 0;
906 75710 : for (node *n = rel->exps->h ; n ; n = n->next, i++)
907 62385 : rel_munion_get_statistics(v->sql, rel, nrels, n->data, i);
908 :
909 13325 : if (needs_pruning && !rel_is_ref(rel)) {
910 4534 : v->changes++;
911 4534 : list *nl = sa_list(l->sa);
912 :
913 16675 : for (node *n = nrels->h; n; n = n->next) {
914 12141 : sql_rel *r = n->data;
915 12141 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
916 :
917 12141 : if (!rv) { /* keep last for now */
918 6225 : rel_destroy(r);
919 6225 : continue;
920 : }
921 5916 : nl = append(nl, r);
922 : }
923 4534 : rel->l = nl;
924 4534 : if (list_length(nl) == 1) {
925 4208 : sql_rel *l = rel->l = nl->h->data; /* ugh */
926 4208 : rel->r = NULL;
927 4208 : rel->op = op_project;
928 :
929 20691 : for (node *n = rel->exps->h, *m = l->exps->h ; n && m ; n = n->next, m = m->next) {
930 16483 : sql_exp *pe = n->data, *ie = m->data;
931 16483 : sql_exp *ne = exp_ref(v->sql, ie);
932 16483 : exp_prop_alias(v->sql->sa, ne, pe);
933 16483 : n->data = ne;
934 : }
935 4208 : list_hash_clear(rel->exps);
936 326 : } else if (list_empty(nl)) {
937 : /* empty select (project [ nils ] ) */
938 454 : for (node *n = rel->exps->h ; n ; n = n->next) {
939 354 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
940 354 : exp_prop_alias(v->sql->sa, a, e);
941 354 : n->data = a;
942 : }
943 100 : list_hash_clear(rel->exps);
944 100 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
945 100 : set_count_prop(v->sql->sa, l, 1);
946 100 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
947 100 : set_count_prop(v->sql->sa, l, 0);
948 100 : rel->op = op_project;
949 100 : rel->r = NULL;
950 100 : rel->l = l;
951 100 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
952 100 : set_count_prop(v->sql->sa, rel, 0);
953 100 : set_nodistinct(rel); /* set relations may have distinct flag set */
954 : }
955 : } else {
956 8791 : set_count_prop(v->sql->sa, rel, cnt);
957 : }
958 : break;
959 : }
960 538862 : case op_join:
961 : case op_left:
962 : case op_right:
963 : case op_full:
964 : case op_semi:
965 : case op_anti:
966 : case op_select:
967 : case op_project:
968 : case op_groupby: {
969 538862 : if (is_groupby(rel->op) && !list_empty(rel->r))
970 16891 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
971 538862 : rel->exps = exps_exp_visitor_bottomup(v, rel, rel->exps, 0, &rel_propagate_statistics, false);
972 538846 : if ((is_simple_project(rel->op) || is_groupby(rel->op)) && !list_empty(rel->r))
973 39355 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
974 : /* The following optimizations can only be applied after propagating the statistics to rel->exps */
975 538849 : if (can_be_pruned && (is_join(rel->op) || is_select(rel->op)) && !list_empty(rel->exps)) {
976 141218 : int changes = v->changes;
977 141218 : rel->exps = rel_prune_predicates(v, rel);
978 141218 : if (v->changes > changes)
979 3654 : rel = rewrite_simplify(v, 0, v->value_based_opt, rel);
980 : }
981 :
982 : /* propagate row count */
983 538849 : sql_rel *l = rel->l, *r = rel->r;
984 538849 : switch (rel->op) {
985 136378 : case op_join:
986 : case op_left:
987 : case op_right:
988 : case op_full: {
989 136378 : BUN lv = get_rel_count(l), rv = get_rel_count(r), uniques_estimate = BUN_MAX, join_idx_estimate = BUN_MAX;
990 :
991 136378 : if (!list_empty(rel->exps) && !is_single(rel)) {
992 249052 : for (node *n = rel->exps->h ; n ; n = n->next) {
993 126994 : sql_exp *e = n->data, *el = e->l, *er = e->r;
994 :
995 126994 : if (find_prop(e->p, PROP_JOINIDX)) {
996 667 : join_idx_estimate = lv>rv?lv:rv;
997 126327 : } else if (e->type == e_cmp && e->flag == cmp_equal) {
998 : /* if one of the sides is unique, the cardinality will be that exact number, but look for nulls */
999 122443 : if (!is_semantics(e) || !has_nil(el) || !has_nil(er)) {
1000 122267 : BUN lu = 0, ru = 0;
1001 122267 : prop *p = NULL;
1002 122267 : if ((p = find_prop(el->p, PROP_NUNIQUES)))
1003 90815 : lu = (BUN) p->value.dval;
1004 122267 : if ((p = find_prop(er->p, PROP_NUNIQUES)))
1005 105340 : ru = (BUN) p->value.dval;
1006 122267 : if (is_unique(el) || lu > lv) {
1007 74600 : BUN ncount = (is_right(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : lv;
1008 74600 : uniques_estimate = MIN(uniques_estimate, ncount);
1009 47667 : } else if (is_unique(er) || ru > rv) {
1010 29815 : BUN ncount = (is_left(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : rv;
1011 29815 : uniques_estimate = MIN(uniques_estimate, ncount);
1012 : }
1013 : }
1014 : }
1015 : }
1016 : }
1017 136378 : if (is_single(rel)) {
1018 4926 : set_count_prop(v->sql->sa, rel, lv);
1019 131452 : } else if (join_idx_estimate != BUN_MAX) {
1020 665 : set_count_prop(v->sql->sa, rel, join_idx_estimate);
1021 130787 : } else if (uniques_estimate != BUN_MAX) {
1022 97902 : set_count_prop(v->sql->sa, rel, uniques_estimate);
1023 32885 : } else if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1024 : /* corner cases for outer joins */
1025 126 : if (is_left(rel->op)) {
1026 114 : set_count_prop(v->sql->sa, rel, lv);
1027 12 : } else if (is_right(rel->op)) {
1028 3 : set_count_prop(v->sql->sa, rel, rv);
1029 9 : } else if (is_full(rel->op) && lv != BUN_NONE && rv != BUN_NONE) {
1030 9 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
1031 0 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1032 0 : set_count_prop(v->sql->sa, rel, 0);
1033 : }
1034 32759 : } else if (lv == 0) {
1035 1172 : set_count_prop(v->sql->sa, rel, (is_right(rel->op) || is_full(rel->op)) ? rv : 0);
1036 32159 : } else if (rv == 0) {
1037 1570 : set_count_prop(v->sql->sa, rel, (is_left(rel->op) || is_full(rel->op)) ? lv : 0);
1038 31083 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1039 18436 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX / lv)) ? BUN_MAX : (lv * rv)); /* overflow check */
1040 : }
1041 : break;
1042 : }
1043 2916 : case op_anti:
1044 2916 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1045 2916 : break;
1046 110142 : case op_semi:
1047 : case op_select:
1048 : /* TODO calculate cardinalities using selectivities */
1049 110142 : if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1050 2746 : set_count_prop(v->sql->sa, rel, 0);
1051 : } else {
1052 211529 : if (!list_empty(rel->exps) && !is_single(rel)) {
1053 104132 : BUN cnt = get_rel_count(l), u = 1;
1054 172700 : for (node *n = rel->exps->h ; n ; n = n->next) {
1055 114240 : sql_exp *e = n->data, *el = e->l, *er = e->r;
1056 :
1057 : /* simple expressions first */
1058 114240 : if (e->type == e_cmp && e->flag == cmp_equal && exp_is_atom(er)) {
1059 : /* use selectivity */
1060 58992 : prop *p;
1061 58992 : if ((p = find_prop(el->p, PROP_NUNIQUES))) {
1062 45672 : u = (BUN) p->value.dval;
1063 45672 : break;
1064 : }
1065 : }
1066 : }
1067 : /* u is an *estimate*, so don't set count_prop to 0 unless cnt is 0 */
1068 104132 : set_count_prop(v->sql->sa, rel, cnt == 0 ? 0 : u == 0 || u > cnt ? 1 : cnt/u);
1069 : } else {
1070 3264 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1071 : }
1072 : }
1073 : break;
1074 264647 : case op_project:
1075 264647 : if (l) {
1076 254397 : if (need_distinct(rel)) {
1077 0 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->exps));
1078 : } else {
1079 254397 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1080 : }
1081 : } else {
1082 10250 : BUN card = 1;
1083 :
1084 10250 : if (!list_empty(rel->exps)) {
1085 31064 : for (node *n = rel->exps->h ; n ; n = n->next) {
1086 20814 : sql_exp *e = n->data;
1087 20814 : card = MAX(card, trivial_project_exp_card(e));
1088 : }
1089 : }
1090 10250 : set_count_prop(v->sql->sa, rel, card);
1091 : }
1092 : break;
1093 24352 : case op_groupby:
1094 24352 : if (list_empty(rel->r)) {
1095 7460 : set_count_prop(v->sql->sa, rel, 1);
1096 : } else {
1097 16892 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->r));
1098 : }
1099 : break;
1100 : default:
1101 : break;
1102 : }
1103 : break;
1104 : }
1105 16676 : case op_topn: {
1106 16676 : BUN lv = get_rel_count(rel->l);
1107 :
1108 16676 : if (lv != BUN_NONE) {
1109 16654 : sql_exp *le = rel->exps->h->data, *oe = list_length(rel->exps) > 1 ? rel->exps->h->next->data : NULL;
1110 84 : if (oe && oe->l && exp_is_not_null(oe)) { /* no parameters */
1111 84 : BUN offset = (BUN) ((atom*)oe->l)->data.val.lval;
1112 84 : lv = offset >= lv ? 0 : lv - offset;
1113 : }
1114 16655 : if (le->l && exp_is_not_null(le)) {
1115 16620 : BUN limit = (BUN) ((atom*)le->l)->data.val.lval;
1116 16620 : lv = MIN(lv, limit);
1117 : }
1118 16654 : set_count_prop(v->sql->sa, rel, lv);
1119 : }
1120 : break;
1121 : }
1122 22 : case op_sample: {
1123 22 : BUN lv = get_rel_count(rel->l);
1124 :
1125 22 : if (lv != BUN_NONE) {
1126 4 : sql_exp *se = rel->exps->h->data;
1127 4 : sql_subtype *tp = exp_subtype(se);
1128 :
1129 4 : if (se->l && tp->type->eclass == EC_NUM) { /* sample is a number of rows */
1130 4 : BUN sample = (BUN) ((atom*)se->l)->data.val.lval;
1131 4 : lv = MIN(lv, sample);
1132 0 : } else if (se->l) { /* sample is a percentage of rows */
1133 0 : dbl percent = ((atom*)se->l)->data.val.dval;
1134 0 : assert(tp->type->eclass == EC_FLT);
1135 0 : lv = (BUN) ceil((dbl)lv * percent);
1136 : }
1137 4 : set_count_prop(v->sql->sa, rel, lv);
1138 : }
1139 : break;
1140 : }
1141 6015 : case op_table: {
1142 6015 : sql_exp *op = rel->r;
1143 6015 : if (rel->flag != TRIGGER_WRAPPER && op) {
1144 5703 : sql_subfunc *f = op->f;
1145 5703 : if (f->func->lang == FUNC_LANG_SQL) {
1146 97 : set_count_prop(v->sql->sa, rel, 1000 /* just some fallback value */);
1147 5606 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "storage") == 0) {
1148 831 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of catalog */);
1149 4775 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "db_users") == 0) {
1150 0 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of users */);
1151 4775 : } else if (f->func->lang == FUNC_LANG_MAL && strncmp(f->func->base.name, "querylog", 8) == 0) {
1152 1089 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of querylog */);
1153 3686 : } else if (f->func->lang == FUNC_LANG_MAL &&
1154 3574 : (strcmp(f->func->base.name, "queue") == 0 ||
1155 3309 : strcmp(f->func->base.name, "optimizers") == 0 ||
1156 2996 : strcmp(f->func->base.name, "env") == 0 ||
1157 2703 : strcmp(f->func->base.name, "keywords") == 0 ||
1158 2703 : strcmp(f->func->base.name, "statistics") == 0 ||
1159 2034 : strcmp(f->func->base.name, "rejects") == 0 ||
1160 1779 : strcmp(f->func->base.name, "schemastorage") == 0 ||
1161 1779 : strncmp(f->func->base.name, "storage", 7) == 0 ||
1162 1779 : strcmp(f->func->base.name, "sessions") == 0) ) {
1163 2086 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of queue */);
1164 : }
1165 : /* else {
1166 : printf("%%func needs stats : %s\n", f->func->base.name);
1167 : } */
1168 : }
1169 : break;
1170 : }
1171 : /*These relations are less important for now
1172 : TODO later we can tune it */
1173 : case op_insert:
1174 : case op_update:
1175 : case op_delete:
1176 : case op_truncate:
1177 : case op_ddl:
1178 : default:
1179 : break;
1180 : }
1181 :
1182 : return rel;
1183 : }
1184 :
1185 : static sql_rel *
1186 544502 : rel_get_statistics(visitor *v, global_props *gp, sql_rel *rel)
1187 : {
1188 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
1189 544502 : uint8_t has_special_modify = (uint8_t) gp->has_special_modify;
1190 544502 : v->data = &has_special_modify;
1191 544502 : rel = rel_visitor_bottomup(v, rel, &rel_get_statistics_);
1192 544982 : v->data = gp;
1193 544982 : return rel;
1194 : }
1195 :
1196 : run_optimizer
1197 720817 : bind_get_statistics(visitor *v, global_props *gp)
1198 : {
1199 720817 : (void) v;
1200 720817 : return (gp->opt_level == 1 && !gp->cnt[op_insert]) ? rel_get_statistics : NULL;
1201 : }
1202 :
1203 :
1204 : static bool
1205 95559 : point_select_on_unique_column(sql_rel *rel)
1206 : {
1207 95559 : if (is_select(rel->op) && !list_empty(rel->exps)) {
1208 131620 : for (node *n = rel->exps->h; n ; n = n->next) {
1209 75836 : sql_exp *e = n->data, *el = e->l, *er = e->r, *found = NULL;
1210 :
1211 75836 : if (is_compare(e->type) && e->flag == cmp_equal) {
1212 33632 : if (is_numeric_upcast(el))
1213 0 : el = el->l;
1214 33632 : if (is_numeric_upcast(er))
1215 0 : er = er->l;
1216 33632 : if (is_alias(el->type) && exp_is_atom(er) && (found = rel_find_exp(rel->l, el)) &&
1217 752 : is_unique(found) && (!is_semantics(e) || !has_nil(found) || !has_nil(er)))
1218 : return true;
1219 32880 : if (is_alias(er->type) && exp_is_atom(el) && (found = rel_find_exp(rel->l, er)) &&
1220 0 : is_unique(found) && (!is_semantics(e) || !has_nil(el) || !has_nil(found)))
1221 : return true;
1222 : }
1223 : }
1224 : }
1225 : return false;
1226 : }
1227 :
1228 : /*
1229 : * A point select on an unique column reduces the number of rows to 1. If the same select is under a
1230 : * join, the opposite side's select can be pushed above the join.
1231 : */
1232 : static inline sql_rel *
1233 1258781 : rel_push_select_up(visitor *v, sql_rel *rel)
1234 : {
1235 1258781 : if ((is_innerjoin(rel->op) || is_left(rel->op) || is_right(rel->op) || is_semi(rel->op)) && !is_single(rel)) {
1236 256094 : sql_rel *l = rel->l, *r = rel->r;
1237 256094 : bool can_pushup_left = is_select(l->op) && !rel_is_ref(l) && !is_single(l) && (is_innerjoin(rel->op) || is_left(rel->op) || is_semi(rel->op)),
1238 256094 : can_pushup_right = is_select(r->op) && !rel_is_ref(r) && !is_single(r) && (is_innerjoin(rel->op) || is_right(rel->op));
1239 :
1240 256094 : if (can_pushup_left || can_pushup_right) {
1241 67415 : if (can_pushup_left)
1242 45777 : can_pushup_left = point_select_on_unique_column(r);
1243 67415 : if (can_pushup_right)
1244 49782 : can_pushup_right = point_select_on_unique_column(l);
1245 :
1246 : /* if both selects retrieve one row each, it's not worth it to push both up */
1247 67415 : if (can_pushup_left && !can_pushup_right) {
1248 686 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1249 686 : nrel->l = l->l;
1250 686 : rel = rel_inplace_select(rel, nrel, l->exps);
1251 686 : assert(is_select(rel->op));
1252 686 : v->changes++;
1253 66729 : } else if (!can_pushup_left && can_pushup_right) {
1254 14 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1255 14 : nrel->r = r->l;
1256 14 : rel = rel_inplace_select(rel, nrel, r->exps);
1257 14 : assert(is_select(rel->op));
1258 14 : v->changes++;
1259 : }
1260 : }
1261 : }
1262 1258781 : return rel;
1263 : }
1264 :
1265 : static int
1266 98681 : sql_class_base_score(visitor *v, sql_column *c, sql_subtype *t, bool equality_based)
1267 : {
1268 98681 : int de;
1269 :
1270 98681 : if (!t)
1271 : return 0;
1272 98681 : switch (ATOMstorage(t->type->localtype)) {
1273 : case TYPE_bte:
1274 : return 150 - 8;
1275 1634 : case TYPE_sht:
1276 1634 : return 150 - 16;
1277 39113 : case TYPE_int:
1278 39113 : return 150 - 32;
1279 519 : case TYPE_void:
1280 : case TYPE_lng:
1281 519 : return 150 - 64;
1282 : case TYPE_uuid:
1283 : #ifdef HAVE_HGE
1284 : case TYPE_hge:
1285 : #endif
1286 : return 150 - 128;
1287 2 : case TYPE_flt:
1288 2 : return 75 - 24;
1289 : case TYPE_dbl:
1290 : return 75 - 53;
1291 33888 : default:
1292 33888 : if (equality_based && c && v->storage_based_opt && (de = mvc_is_duplicate_eliminated(v->sql, c)))
1293 1666 : return 150 - de * 8;
1294 : /* strings and blobs not duplicate eliminated don't get any points here */
1295 : return 0;
1296 : }
1297 : }
1298 :
1299 : static int
1300 35922 : score_se_base(visitor *v, sql_rel *rel, sql_exp *e)
1301 : {
1302 35922 : int res = 0;
1303 35922 : sql_subtype *t = exp_subtype(e);
1304 35922 : sql_column *c = NULL;
1305 :
1306 35922 : if (e->type == e_convert) /* keep unsafes at the end (TODO improve) */
1307 : return -1000;
1308 : /* can we find out if the underlying table is sorted */
1309 35321 : if ((c = exp_find_column(rel, e, -2)) && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1310 35321 : res += 600;
1311 :
1312 : /* prefer the shorter var types over the longer ones */
1313 35321 : res += sql_class_base_score(v, c, t, is_equality_or_inequality_exp(e->flag)); /* smaller the type, better */
1314 35321 : return res;
1315 : }
1316 :
1317 : static int
1318 60407 : score_se(visitor *v, sql_rel *rel, sql_exp *e)
1319 : {
1320 60407 : int score = 0;
1321 60407 : if (e->type == e_cmp && !is_complex_exp(e->flag)) {
1322 35922 : sql_exp *l = e->l;
1323 :
1324 35922 : while (l->type == e_cmp) { /* go through nested comparisons */
1325 2 : sql_exp *ll;
1326 :
1327 2 : if (l->flag == cmp_filter || l->flag == cmp_or)
1328 0 : ll = ((list*)l->l)->h->data;
1329 : else
1330 2 : ll = l->l;
1331 2 : if (ll->type != e_cmp)
1332 : break;
1333 : l = ll;
1334 : }
1335 35922 : score += score_se_base(v, rel, l);
1336 : }
1337 60407 : score += exp_keyvalue(e);
1338 60407 : return score;
1339 : }
1340 :
1341 : static inline sql_rel *
1342 1258782 : rel_select_order(visitor *v, sql_rel *rel)
1343 : {
1344 1258782 : int *scores = NULL;
1345 1258782 : sql_exp **exps = NULL;
1346 :
1347 1258782 : if (is_select(rel->op) && list_length(rel->exps) > 1) {
1348 28102 : node *n;
1349 28102 : int i, nexps = list_length(rel->exps);
1350 28102 : scores = SA_NEW_ARRAY(v->sql->ta, int, nexps);
1351 28102 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, nexps);
1352 :
1353 88509 : for (i = 0, n = rel->exps->h; n; i++, n = n->next) {
1354 60437 : exps[i] = n->data;
1355 60437 : if (find_prop(exps[i]->p, PROP_HASHCOL))
1356 : return rel;
1357 60407 : scores[i] = score_se(v, rel, n->data);
1358 : }
1359 28072 : GDKqsort(scores, exps, NULL, nexps, sizeof(int), sizeof(void *), TYPE_int, true, true);
1360 :
1361 88479 : for (i = 0, n = rel->exps->h; n; i++, n = n->next)
1362 60407 : n->data = exps[i];
1363 : }
1364 :
1365 : return rel;
1366 : }
1367 :
1368 : /* Compute the efficiency of using this expression early in a group by list */
1369 : static int
1370 63360 : score_gbe(visitor *v, sql_rel *rel, sql_exp *e)
1371 : {
1372 63360 : int res = 0;
1373 63360 : sql_subtype *t = exp_subtype(e);
1374 63360 : sql_column *c = exp_find_column(rel, e, -2);
1375 :
1376 63360 : if (e->card == CARD_ATOM) /* constants are trivial to group */
1377 38 : res += 1000;
1378 : /* can we find out if the underlying table is sorted */
1379 63360 : if (is_unique(e) || find_prop(e->p, PROP_HASHCOL) || (c && v->storage_based_opt && mvc_is_unique(v->sql, c))) /* distinct columns */
1380 3584 : res += 700;
1381 42134 : if (c && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1382 3700 : res += 500;
1383 63360 : if (find_prop(e->p, PROP_HASHIDX)) /* has hash index */
1384 0 : res += 200;
1385 :
1386 : /* prefer the shorter var types over the longer ones */
1387 63360 : res += sql_class_base_score(v, c, t, true); /* smaller the type, better */
1388 63360 : return res;
1389 : }
1390 :
1391 : /* reorder group by expressions */
1392 : static inline sql_rel *
1393 1258781 : rel_groupby_order(visitor *v, sql_rel *rel)
1394 : {
1395 1258781 : int *scores = NULL;
1396 1258781 : sql_exp **exps = NULL;
1397 :
1398 1258781 : if (is_groupby(rel->op) && list_length(rel->r) > 1) {
1399 26727 : node *n;
1400 26727 : list *gbe = rel->r;
1401 26727 : int i, ngbe = list_length(gbe);
1402 26727 : scores = SA_NEW_ARRAY(v->sql->ta, int, ngbe);
1403 26727 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, ngbe);
1404 :
1405 : /* first sorting step, give priority for integers and sorted columns */
1406 90087 : for (i = 0, n = gbe->h; n; i++, n = n->next) {
1407 63360 : exps[i] = n->data;
1408 63360 : scores[i] = score_gbe(v, rel, exps[i]);
1409 : }
1410 26727 : GDKqsort(scores, exps, NULL, ngbe, sizeof(int), sizeof(void *), TYPE_int, true, true);
1411 :
1412 : /* second sorting step, give priority to strings with lower number of digits */
1413 54217 : for (i = ngbe - 1; i && !scores[i]; i--); /* find expressions with no score from the first round */
1414 26727 : if (scores[i])
1415 25725 : i++;
1416 26727 : if (ngbe - i > 1) {
1417 12055 : for (int j = i; j < ngbe; j++) {
1418 9064 : sql_subtype *t = exp_subtype(exps[j]);
1419 9064 : scores[j] = t ? t->digits : 0;
1420 : }
1421 : /* the less number of digits the better, order ascending */
1422 2991 : GDKqsort(scores + i, exps + i, NULL, ngbe - i, sizeof(int), sizeof(void *), TYPE_int, false, true);
1423 : }
1424 :
1425 90087 : for (i = 0, n = gbe->h; n; i++, n = n->next)
1426 63360 : n->data = exps[i];
1427 : }
1428 :
1429 1258781 : return rel;
1430 : }
1431 :
1432 : /* This optimization loop contains optimizations that can potentially use statistics */
1433 : static sql_rel *
1434 1258781 : rel_final_optimization_loop_(visitor *v, sql_rel *rel)
1435 : {
1436 : /* Run rel_push_select_up only once at the end to avoid an infinite optimization loop */
1437 1258781 : rel = rel_push_select_up(v, rel);
1438 1258781 : rel = rel_select_order(v, rel);
1439 :
1440 : /* TODO? Maybe later add rel_simplify_count, rel_join2semijoin, rel_simplify_fk_joins,
1441 : rel_distinct_project2groupby, rel_simplify_predicates, rel_simplify_math,
1442 : rel_distinct_aggregate_on_unique_values */
1443 :
1444 1258781 : rel = rel_groupby_order(v, rel);
1445 1258782 : return rel;
1446 : }
1447 :
1448 : static sql_rel *
1449 67711 : rel_final_optimization_loop(visitor *v, global_props *gp, sql_rel *rel)
1450 : {
1451 67711 : (void) gp;
1452 67711 : return rel_visitor_bottomup(v, rel, &rel_final_optimization_loop_);
1453 : }
1454 :
1455 : run_optimizer
1456 721425 : bind_final_optimization_loop(visitor *v, global_props *gp)
1457 : {
1458 721425 : int flag = v->sql->sql_optimizer;
1459 : /* At the moment, this optimizer has dependency on 3 flags */
1460 552820 : return gp->opt_level == 1 && (gp->cnt[op_groupby] || gp->cnt[op_select]) &&
1461 789138 : (flag & push_select_up) && (flag & optimize_select_and_joins_topdown) && (flag & optimize_projections) ? rel_final_optimization_loop : NULL;
1462 : }
|