Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "rel_optimizer_private.h"
15 : #include "rel_statistics.h"
16 : #include "rel_basetable.h"
17 : #include "rel_rewriter.h"
18 :
19 : static sql_exp *
20 3463029 : comparison_find_column(sql_exp *input, sql_exp *e)
21 : {
22 3463029 : switch (input->type) {
23 98425 : case e_convert: {
24 98425 : list *types = (list *)input->r;
25 98425 : sql_class from = ((sql_subtype*)types->h->data)->type->eclass, to = ((sql_subtype*)types->h->next->data)->type->eclass;
26 98425 : if (from == to)
27 189782 : return comparison_find_column(input->l, e) ? input : NULL;
28 : return NULL;
29 : }
30 3052896 : case e_column:
31 3052896 : return exp_match(e, input) ? input : NULL;
32 : default:
33 : return NULL;
34 : }
35 : }
36 :
37 : /* multi lo <= col <= hi, maybe still be false even if lo or hi are NULL, possibly similar for filter on multiple
38 : * columns */
39 : #define comp_single_column(c) (!c->f)
40 :
41 : static sql_exp *
42 8744632 : rel_propagate_column_ref_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
43 : {
44 8827927 : assert(e->type == e_column);
45 8827927 : if (rel) {
46 8827848 : switch(rel->op) {
47 4088024 : case op_left:
48 : case op_right:
49 : case op_full:
50 : case op_join:
51 : case op_select:
52 : case op_anti:
53 : case op_semi: {
54 4088024 : bool found_without_semantics = false, found_left = false, found_right = false, still_unique = false;
55 :
56 4088024 : if ((is_innerjoin(rel->op) || is_select(rel->op)) && list_length(rel->exps) == 1 && exp_is_false(rel->exps->h->data))
57 : return NULL; /* nothing will pass, skip */
58 :
59 : /* propagate from the bottom first */
60 4072471 : if (rel_propagate_column_ref_statistics(sql, rel->l, e))
61 : found_left = true;
62 2590758 : if (!found_left && is_join(rel->op) && rel_propagate_column_ref_statistics(sql, rel->r, e))
63 4072471 : found_right = true;
64 :
65 4072471 : if (!found_left && !found_right)
66 : return NULL;
67 1777034 : if (!list_empty(rel->exps) && rel->op != op_anti) { /* if there's an or, the MIN and MAX get difficult to propagate */
68 3495270 : for (node *n = rel->exps->h ; n ; n = n->next) {
69 1809534 : sql_exp *comp = n->data, *le = comp->l, *lne = NULL, *re = comp->r, *rne = NULL, *fe = comp->f, *fne = NULL;
70 :
71 1809534 : if (comp->type == e_cmp) {
72 1808532 : if (is_theta_exp(comp->flag) && ((lne = comparison_find_column(le, e)) || (rne = comparison_find_column(re, e)) || (fe && (fne = comparison_find_column(fe, e))))) {
73 125124 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
74 125124 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
75 :
76 : /* not semantics found or if explicitly filtering not null values from the column */
77 125124 : found_without_semantics |= (!is_semantics(comp) && comp_single_column(comp)) || (comp->flag == cmp_equal && lne && is_anti(comp) && exp_is_null(re));
78 125124 : still_unique |= comp->flag == cmp_equal && is_unique(le) && is_unique(re); /* unique if only equi-joins on unique columns are there */
79 125124 : if (is_full(rel->op) || (is_left(rel->op) && found_left) || (is_right(rel->op) && found_right)) /* on outer joins, min and max cannot be propagated on some cases */
80 19791 : continue;
81 : /* if (end2 >= start1 && start2 <= end1) then the 2 intervals are intersected */
82 105333 : if (fe && lval_min && lval_max) { /* range case, the middle expression must intersect the other two */
83 4622 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
84 4622 : int int1 = rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0,
85 4622 : int2 = fval_min && fval_max && atom_cmp(fval_max, lval_min) >= 0 && atom_cmp(fval_min, lval_max) <= 0,
86 4622 : symmetric = is_symmetric(comp);
87 :
88 4622 : if (is_anti(comp) || (!symmetric && fval_min && rval_max && atom_cmp(fval_min, rval_max) < 0)) /* for asymmetric case the re range must be after the fe range */
89 1817 : continue;
90 2805 : if (lne && int1 && int2) {
91 104 : if (symmetric) {
92 0 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
93 0 : atom *nmin = statistics_atom_min(sql, rval_min, fval_min), *nmax = statistics_atom_max(sql, rval_max, fval_max);
94 : /* max is min from le and (max from re and fe max) */
95 0 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, nmax, p2->value.pval) : nmax);
96 : /* min is max from le and (min from re and fe min) */
97 0 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, nmin, p1->value.pval) : nmin);
98 : } else {
99 104 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
100 : /* max is min from le and fe max */
101 104 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, fval_max, p2->value.pval) : fval_max);
102 : /* min is max from le and re min */
103 104 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, rval_min, p1->value.pval) : rval_min);
104 : }
105 2701 : } else if (rne) {
106 590 : if (symmetric && int1 && int2) { /* min is max from le and (min from re and fe min) */
107 0 : prop *p = find_prop(e->p, PROP_MIN);
108 0 : atom *nmin = p ? statistics_atom_min(sql, p->value.pval, fval_min) : fval_min;
109 0 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, nmin, lval_min));
110 590 : } else if (int1) { /* min is max from le and re min */
111 100 : prop *p = find_prop(e->p, PROP_MIN);
112 100 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
113 : }
114 2111 : } else if (fne) {
115 549 : if (symmetric && int1 && int2) { /* max is min from le and (max from re and fe max) */
116 0 : prop *p = find_prop(e->p, PROP_MAX);
117 0 : atom *nmax = p ? statistics_atom_max(sql, p->value.pval, rval_max) : rval_max;
118 0 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, nmax, lval_max) : nmax);
119 549 : } else if (int2) { /* max is min from le and fe max */
120 266 : prop *p = find_prop(e->p, PROP_MAX);
121 266 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
122 : }
123 : }
124 100711 : } else if (lval_min && lval_max && rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0) {
125 : /* both min and max must be set and the intervals must overlap */
126 43314 : switch (comp->flag) {
127 28978 : case cmp_equal: /* for equality reduce */
128 28978 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_max(sql, lval_max, rval_max) : statistics_atom_min(sql, lval_max, rval_max));
129 28978 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_min(sql, lval_min, rval_min) : statistics_atom_max(sql, lval_min, rval_min));
130 28978 : break;
131 4679 : case cmp_notequal: /* for inequality expand */
132 4679 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_min(sql, lval_max, rval_max) : statistics_atom_max(sql, lval_max, rval_max));
133 4679 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_max(sql, lval_min, rval_min) : statistics_atom_min(sql, lval_min, rval_min));
134 4679 : break;
135 5651 : case cmp_gt:
136 : case cmp_gte:
137 10364 : if (!is_anti(comp) && lne) { /* min is max from both min */
138 4713 : prop *p = find_prop(e->p, PROP_MIN);
139 4713 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, rval_min, p->value.pval) : rval_min);
140 938 : } else if (!is_anti(comp)) { /* max is min from both max */
141 938 : prop *p = find_prop(e->p, PROP_MAX);
142 938 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
143 : }
144 : break;
145 4006 : case cmp_lt:
146 : case cmp_lte:
147 7205 : if (!is_anti(comp) && lne) { /* max is min from both max */
148 3199 : prop *p = find_prop(e->p, PROP_MAX);
149 3199 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, rval_max, p->value.pval) : rval_max);
150 807 : } else if (!is_anti(comp)) { /* min is max from both min */
151 807 : prop *p = find_prop(e->p, PROP_MIN);
152 807 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
153 : }
154 : break;
155 : default: /* Maybe later I can do cmp_in and cmp_notin */
156 : break;
157 : }
158 : }
159 : }
160 : }
161 : }
162 : }
163 1777034 : if (is_full(rel->op) || (is_left(rel->op) && found_right) || (is_right(rel->op) && found_left))
164 38014 : set_has_nil(e);
165 1777034 : if (!is_outerjoin(rel->op) && found_without_semantics) /* at an outer join, null values pass */
166 94277 : set_has_no_nil(e);
167 1777034 : if (is_join(rel->op) && is_unique(e) && !still_unique)
168 118919 : set_not_unique(e);
169 1777034 : return e;
170 : }
171 4637182 : case op_table:
172 : case op_basetable:
173 : case op_union:
174 : case op_except:
175 : case op_inter:
176 : case op_munion:
177 : case op_project:
178 : case op_groupby: {
179 4637182 : sql_exp *found;
180 4637182 : atom *fval;
181 4637182 : prop *est;
182 4637182 : if ((found = rel_find_exp(rel, e))) {
183 2175402 : if (rel->op != op_table) { /* At the moment don't propagate statistics for table relations */
184 2131078 : if ((fval = find_prop_and_get(found->p, PROP_MAX)))
185 1133316 : set_minmax_property(sql, e, PROP_MAX, fval);
186 2131075 : if ((fval = find_prop_and_get(found->p, PROP_MIN)))
187 1140623 : set_minmax_property(sql, e, PROP_MIN, fval);
188 2131072 : if (!has_nil(found))
189 1377483 : set_has_no_nil(e);
190 2131072 : if (is_unique(found) || (need_distinct(rel) && list_length(rel->exps) == 1) ||
191 1720643 : (is_groupby(rel->op) && (list_empty(rel->r) || (list_length(rel->r) == 1 && exps_find_exp(rel->r, e)))))
192 421303 : set_unique(e);
193 : /* propagate unique estimation for known cases */
194 2131071 : if (is_groupby(rel->op) && list_empty(rel->r) && !find_prop(e->p, PROP_NUNIQUES)) { /* global aggregate case */
195 7538 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
196 7538 : p->value.dval = 1;
197 2123533 : } else if (((is_basetable(rel->op) || is_except(rel->op) || is_inter(rel->op) || is_simple_project(rel->op) ||
198 66647 : (is_groupby(rel->op) && exps_find_exp(rel->r, e))) &&
199 2066028 : (est = find_prop(found->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES))) {
200 187990 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
201 187990 : p->value.dval = est->value.dval;
202 : }
203 : }
204 2175397 : return e;
205 : }
206 : return NULL;
207 : }
208 83295 : case op_topn:
209 : case op_sample:
210 83295 : return rel_propagate_column_ref_statistics(sql, rel->l, e);
211 : default:
212 : break;
213 : }
214 : }
215 : return NULL;
216 : }
217 :
218 : static atom *
219 4452544 : atom_from_valptr( allocator *sa, sql_subtype *tpe, ValRecord *v)
220 : {
221 4452544 : atom *a = SA_NEW(sa, atom);
222 :
223 4452494 : assert(!VALisnil(v));
224 4452386 : *a = (atom) {.tpe = *tpe,};
225 4452386 : SA_VALcopy(sa, &a->data, v);
226 4452311 : return a;
227 : }
228 :
229 : void
230 4060308 : sql_column_get_statistics(mvc *sql, sql_column *c, sql_exp *e)
231 : {
232 4060308 : bool nonil = false, unique = false;
233 4060308 : double unique_est = 0.0;
234 4060308 : ValRecord min, max;
235 4060308 : int ok = mvc_col_stats(sql, c, &nonil, &unique, &unique_est, &min, &max);
236 :
237 4060443 : if (has_nil(e) && nonil)
238 2653276 : set_has_no_nil(e);
239 4060443 : if (!is_unique(e) && unique)
240 1100513 : set_unique(e);
241 4060443 : if (unique_est != 0.0) {
242 2823991 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
243 2823984 : p->value.dval = unique_est;
244 : }
245 4060436 : unsigned int digits = 0;
246 4060436 : sql_subtype *et = exp_subtype(e);
247 4060444 : if (et->type->eclass == EC_DEC || et->type->eclass == EC_NUM)
248 2641293 : digits = et->digits;
249 4060444 : if ((ok & 2) == 2) {
250 2223211 : if (!VALisnil(&max)) {
251 2223201 : prop *p = e->p = prop_create(sql->sa, PROP_MAX, e->p);
252 2223178 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &max);
253 2223125 : if (digits) {
254 1656015 : unsigned int nd = atom_digits(p->value.pval);
255 1656050 : if (nd < digits)
256 : digits = nd;
257 1656050 : if (!digits)
258 : digits = 1;
259 : }
260 : }
261 2223161 : VALclear(&max);
262 : }
263 4060384 : if ((ok & 1) == 1) {
264 2229561 : if (!VALisnil(&min)) {
265 2229552 : prop *p = e->p = prop_create(sql->sa, PROP_MIN, e->p);
266 2229547 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &min);
267 2229511 : if (digits) {
268 1663484 : unsigned int nd = atom_digits(p->value.pval);
269 1663479 : if (nd > digits) /* possibly set to low by max value */
270 : digits = nd;
271 : if (!digits)
272 : digits = 1;
273 : }
274 : }
275 2229502 : VALclear(&min);
276 : }
277 4060318 : if (digits)
278 2641170 : et->digits = digits;
279 4060318 : if (et->type->eclass == EC_DEC && et->digits <= et->scale)
280 216 : et->digits = et->scale + 1;
281 4060318 : }
282 :
283 : static void
284 935973 : rel_basetable_column_get_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
285 : {
286 935973 : if (e->p)
287 : return;
288 302287 : sql_column *c = NULL;
289 :
290 302287 : if ((c = rel_base_find_column(rel, e->nid))) {
291 204214 : sql_column_get_statistics(sql, c, e);
292 : }
293 : }
294 :
295 : static bool
296 8872 : rel_setop_get_statistics(mvc *sql, sql_rel *rel, list *lexps, list *rexps, sql_exp *e, int i)
297 : {
298 8872 : sql_exp *le = list_fetch(lexps, i), *re = list_fetch(rexps, i);
299 8872 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
300 8872 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
301 8872 : prop *est;
302 :
303 : /* for the intersection, if both expressions don't overlap, it can be pruned */
304 8872 : if (is_inter(rel->op) && !has_nil(le) && !has_nil(re) &&
305 511 : ((rval_max && lval_min && atom_cmp(rval_max, lval_min) < 0) || (rval_min && lval_max && atom_cmp(rval_min, lval_max) > 0)))
306 26 : return true;
307 :
308 8846 : if (lval_max && rval_max) {
309 2557 : if (is_union(rel->op))
310 3 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
311 2554 : else if (is_inter(rel->op))
312 399 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_min(sql, lval_max, rval_max)); /* for intersect the new max will be the min of the two */
313 : else /* except */
314 2155 : set_minmax_property(sql, e, PROP_MAX, lval_max);
315 : }
316 8846 : if (lval_min && rval_min) {
317 2557 : if (is_union(rel->op))
318 3 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
319 2554 : else if (is_inter(rel->op))
320 399 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, lval_min, rval_min)); /* for intersect the new min will be the max of the two */
321 : else /* except */
322 2155 : set_minmax_property(sql, e, PROP_MIN, lval_min);
323 : }
324 :
325 8846 : if (is_union(rel->op) || is_munion(rel->op)) {
326 5986 : if (!has_nil(le) && !has_nil(re))
327 879 : set_has_no_nil(e);
328 5986 : if (need_distinct(rel) && list_length(rel->exps) == 1)
329 0 : set_unique(e);
330 2860 : } else if (is_inter(rel->op)) {
331 564 : if (!has_nil(le) || !has_nil(re))
332 509 : set_has_no_nil(e);
333 564 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
334 345 : set_unique(e);
335 : } else {
336 2296 : assert(is_except(rel->op));
337 2296 : if (!has_nil(le))
338 2233 : set_has_no_nil(e);
339 2296 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
340 2009 : set_unique(e);
341 : }
342 : /* propagate unique estimation for known cases */
343 8846 : if (!is_union(rel->op) && (est = find_prop(le->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES)) {
344 205 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
345 205 : p->value.dval = est->value.dval;
346 : }
347 : return false;
348 : }
349 :
350 :
351 : static void
352 60688 : rel_munion_get_statistics(mvc *sql, sql_rel *rel, list *rels, sql_exp *e, int i)
353 : {
354 60688 : assert(is_munion(rel->op));
355 :
356 60688 : sql_rel *l = rels->h->data;
357 60688 : sql_exp *le = list_fetch(l->exps, i);
358 60688 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX);
359 60688 : bool has_nonil = !has_nil(le);
360 :
361 175904 : for(node *n = rels->h->next; n; n = n->next) {
362 115216 : sql_rel *r = n->data;
363 115216 : sql_exp *re = list_fetch(r->exps, i);
364 115216 : atom *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
365 :
366 115216 : if (lval_max && rval_max) {
367 42563 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
368 42563 : lval_max = find_prop_and_get(e->p, PROP_MAX);
369 : }
370 115216 : if (lval_min && rval_min) {
371 42064 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
372 42064 : lval_min = find_prop_and_get(e->p, PROP_MIN);
373 : }
374 115216 : has_nonil &= !has_nil(re);
375 :
376 : }
377 :
378 60688 : if (has_nonil)
379 41542 : set_has_no_nil(e);
380 :
381 60688 : if (need_distinct(rel) && list_length(rel->exps) == 1)
382 1594 : set_unique(e);
383 60688 : }
384 :
385 :
386 : static sql_exp *
387 3478817 : rel_propagate_statistics(visitor *v, sql_rel *rel, sql_exp *e, int depth)
388 : {
389 3478817 : mvc *sql = v->sql;
390 3478817 : atom *lval;
391 :
392 3478817 : (void) depth;
393 3478817 : switch(e->type) {
394 2181222 : case e_column:
395 2181222 : switch (rel->op) { /* set relations don't call rel_propagate_statistics */
396 278817 : case op_join:
397 : case op_left:
398 : case op_right:
399 : case op_full:
400 : case op_semi:
401 : case op_anti: {
402 278817 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e);
403 278817 : if (!found)
404 139853 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
405 : break;
406 : }
407 1902405 : case op_select:
408 : case op_project:
409 : case op_groupby: {
410 1902405 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e); /* labels may be found on the same projection, ugh */
411 1902399 : if (!found && is_simple_project(rel->op))
412 125477 : (void) rel_propagate_column_ref_statistics(sql, rel, e);
413 : break;
414 : }
415 0 : case op_insert:
416 : case op_update:
417 : case op_delete:
418 0 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
419 0 : break;
420 : default:
421 : break;
422 : }
423 : break;
424 98466 : case e_convert: {
425 98466 : sql_subtype *to = exp_totype(e), *from = exp_fromtype(e);
426 98466 : sql_exp *l = e->l;
427 98466 : sql_class fr = from->type->eclass, too = to->type->eclass;
428 98466 : prop *est;
429 :
430 98466 : if (fr == too) {
431 89388 : if ((lval = find_prop_and_get(l->p, PROP_MAX))) {
432 58830 : atom *res = atom_copy(sql->sa, lval);
433 58830 : if ((res = atom_cast(sql->sa, res, to)))
434 58807 : set_minmax_property(sql, e, PROP_MAX, res);
435 : }
436 89388 : if ((lval = find_prop_and_get(l->p, PROP_MIN))) {
437 59448 : atom *res = atom_copy(sql->sa, lval);
438 59448 : if ((res = atom_cast(sql->sa, res, to)))
439 59425 : set_minmax_property(sql, e, PROP_MIN, res);
440 : }
441 : }
442 : /* propagating nuniques through conversions is tricky. I am adding just the general cases */
443 98466 : if ((est = find_prop(l->p, PROP_NUNIQUES)) &&
444 61096 : (((fr == EC_TIME || fr == EC_TIME_TZ || fr == EC_DATE) && (too == EC_TIMESTAMP || too == EC_TIMESTAMP_TZ)) ||
445 61071 : ((fr == EC_NUM || fr == EC_BIT) && too == EC_NUM) || (fr == EC_DEC && too == EC_DEC) || (fr != EC_EXTERNAL && too == EC_STRING))) {
446 57642 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
447 57642 : p->value.dval = est->value.dval;
448 : }
449 98466 : if (!has_nil(l))
450 55522 : set_has_no_nil(e);
451 : break;
452 : }
453 341643 : case e_aggr:
454 : case e_func: {
455 341643 : BUN lv;
456 341643 : sql_subfunc *f = e->f;
457 :
458 341643 : if (!f->func->s) {
459 315170 : int key = hash_key(f->func->base.name); /* Using hash lookup */
460 315170 : sql_hash_e *he = sql_functions_lookup->buckets[key&(sql_functions_lookup->size-1)];
461 315170 : lookup_function look = NULL;
462 :
463 688413 : for (; he && !look; he = he->chain) {
464 373243 : struct function_properties* fp = (struct function_properties*) he->value;
465 :
466 373243 : if (!strcmp(f->func->base.name, fp->name))
467 107717 : look = fp->func;
468 : }
469 315170 : if (look)
470 107717 : look(sql, e);
471 : }
472 : /* for global aggregates with no semantics, if the left relation has values, then the output won't be NULL */
473 341643 : if (!is_semantics(e) && e->l && !have_nil(e->l) &&
474 89137 : (e->type != e_aggr || (is_groupby(rel->op) && list_length(rel->r)) || ((lv = get_rel_count(rel->l)) != BUN_NONE && lv > 0)))
475 88814 : set_has_no_nil(e);
476 : /* set properties for global aggregates */
477 341643 : if (e->type == e_aggr && is_groupby(rel->op) && list_empty(rel->r)) {
478 7871 : if (!find_prop(e->p, PROP_NUNIQUES)) {
479 7871 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
480 7871 : p->value.dval = 1;
481 : }
482 7871 : set_unique(e);
483 : }
484 : break;
485 : }
486 570316 : case e_atom:
487 570316 : if (e->l) {
488 553597 : atom *a = (atom*) e->l;
489 553597 : if (!a->isnull) {
490 490281 : set_minmax_property(sql, e, PROP_MAX, a);
491 490281 : set_minmax_property(sql, e, PROP_MIN, a);
492 : }
493 553597 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
494 553597 : p->value.dval = 1;
495 16719 : } else if (e->f) {
496 3194 : list *vals = (list *) e->f;
497 3194 : sql_exp *first = vals->h ? vals->h->data : NULL;
498 3194 : atom *max = NULL, *min = NULL; /* all child values must have a valid min/max */
499 3194 : int has_nil = 0;
500 :
501 3194 : if (first) {
502 3194 : max = ((lval = find_prop_and_get(first->p, PROP_MAX))) ? lval : NULL;
503 3194 : min = ((lval = find_prop_and_get(first->p, PROP_MIN))) ? lval : NULL;
504 3194 : has_nil |= has_nil(first);
505 : }
506 :
507 13934 : for (node *n = vals->h ? vals->h->next : NULL ; n ; n = n->next) {
508 7546 : sql_exp *ee = n->data;
509 :
510 7546 : if (min && max) {
511 7093 : if ((lval = find_prop_and_get(ee->p, PROP_MAX))) {
512 7047 : max = atom_cmp(lval, max) > 0 ? lval : max;
513 : } else {
514 : max = NULL;
515 : }
516 7093 : if ((lval = find_prop_and_get(ee->p, PROP_MIN))) {
517 7047 : min = atom_cmp(min, lval) > 0 ? lval : min;
518 : } else {
519 : min = NULL;
520 : }
521 : }
522 7546 : has_nil |= has_nil(ee);
523 : }
524 :
525 3194 : if (!has_nil)
526 2828 : set_has_no_nil(e);
527 3194 : if (min && max) {
528 2805 : set_minmax_property(sql, e, PROP_MAX, max);
529 2805 : set_minmax_property(sql, e, PROP_MIN, min);
530 : }
531 3194 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
532 3194 : p->value.dval = (dbl) list_length(vals);
533 : } else {
534 13525 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
535 13525 : p->value.dval = 1;
536 : }
537 : break;
538 287170 : case e_cmp:
539 : /* TODO? propagating min/max/unique of booleans is not very worth it */
540 287170 : if (e->flag == cmp_or || e->flag == cmp_filter) {
541 18059 : if (!have_nil(e->l) && !have_nil(e->r))
542 13589 : set_has_no_nil(e);
543 269111 : } else if (e->flag == cmp_in || e->flag == cmp_notin) {
544 21242 : sql_exp *le = e->l;
545 21242 : if (!has_nil(le) && !have_nil(e->r))
546 18196 : set_has_no_nil(e);
547 : } else {
548 247869 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
549 247869 : if (!has_nil(le) && !has_nil(re) && (!e->f || !has_nil(fe)))
550 177932 : set_has_no_nil(e);
551 : }
552 : break;
553 : case e_psm:
554 : break;
555 : }
556 :
557 : #ifndef NDEBUG
558 : {
559 : /* min and max cannot be NULL and min must be <= than max, if it happens the inner relation must be empty! */
560 3478811 : atom *min = find_prop_and_get(e->p, PROP_MIN), *max = find_prop_and_get(e->p, PROP_MAX);
561 :
562 3478809 : (void) min;
563 3478809 : (void) max;
564 3478809 : assert(!min || !min->isnull);
565 3478809 : assert(!max || !max->isnull);
566 3478809 : assert(!min || !max || (min && max)/* atom_cmp(min, max) <= 0*/);
567 : }
568 : #endif
569 3478809 : return e;
570 : }
571 :
572 : static list * /* Remove predicates always false from min/max values */
573 139283 : rel_prune_predicates(visitor *v, sql_rel *rel)
574 : {
575 139283 : if (rel->l) {
576 139283 : sql_rel *l = rel->l;
577 139283 : if (is_single(l))
578 3391 : return rel->exps;
579 : }
580 135892 : if (!list_empty(rel->attr))
581 2961 : return rel->exps;
582 281968 : for (node *n = rel->exps->h ; n ; n = n->next) {
583 149037 : sql_exp *e = n->data;
584 :
585 149037 : if (e->type == e_cmp && is_theta_exp(e->flag)) {
586 122640 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
587 122640 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
588 122640 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
589 122640 : bool always_false = false, always_true = false;
590 :
591 122640 : if (fe && !is_symmetric(e)) {
592 3058 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
593 3058 : comp_type lower = range2lcompare(e->flag), higher = range2rcompare(e->flag);
594 3660 : int not_int1 = rval_min && lval_max && /* the middle and left intervals don't overlap */
595 1663 : (!is_anti(e) && (lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0)),
596 4072 : not_int2 = lval_min && fval_max && /* the middle and right intervals don't overlap */
597 2485 : (!is_anti(e) && (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0)),
598 3058 : not_int3 = rval_min && fval_max && /* the left interval is after the right one */
599 1288 : (!is_anti(e) && (atom_cmp(rval_min, fval_max) > 0));
600 :
601 3058 : always_false |= not_int1 || not_int2 || not_int3;
602 : /* for anti the middle must be before the left or after the right or the right after the left, for the other the middle must be always between the left and right intervals */
603 4113 : always_true |= !has_nil(le) && !has_nil(re) && !has_nil(fe) &&
604 3957 : lval_min && lval_max && rval_min && rval_max && fval_min && fval_max &&
605 575 : (is_anti(e) ? ((lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0) || (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0) || atom_cmp(rval_min, fval_max) > 0) :
606 488 : ((lower == cmp_gte ? atom_cmp(lval_min, rval_max) >= 0 : atom_cmp(lval_min, rval_max) > 0) && (higher == cmp_lte ? atom_cmp(fval_min, lval_max) >= 0 : atom_cmp(fval_min, lval_max) > 0)));
607 : } else if (!fe) {
608 119564 : if (!is_semantics(e)) /* trivial not null cmp null case */
609 227704 : always_false |= !is_anti(e) && ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re)));
610 119564 : switch (e->flag) {
611 104093 : case cmp_equal:
612 104093 : if (lval_min && lval_max && rval_min && rval_max && (!is_semantics(e) || !has_nil(le) || !has_nil(re)))
613 133448 : always_false |= (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
614 104093 : if (is_semantics(e)) { /* prune *= NULL cases */
615 5686 : always_false |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
616 11372 : always_true |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
617 : }
618 : break;
619 7208 : case cmp_notequal:
620 7208 : if (lval_min && lval_max && rval_min && rval_max)
621 11348 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
622 7208 : if (is_semantics(e)) {
623 26 : always_true |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
624 52 : always_false |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
625 : }
626 : break;
627 5632 : case cmp_gt:
628 5632 : if (lval_max && rval_min)
629 1947 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
630 5632 : if (lval_min && rval_max)
631 3894 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
632 : break;
633 119 : case cmp_gte:
634 119 : if (lval_max && rval_min)
635 49 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
636 119 : if (lval_min && rval_max)
637 98 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
638 : break;
639 2428 : case cmp_lt:
640 2428 : if (lval_min && rval_max)
641 1376 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
642 2428 : if (lval_max && rval_min)
643 2800 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
644 : break;
645 84 : case cmp_lte:
646 84 : if (lval_min && rval_max)
647 11 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
648 84 : if (lval_max && rval_min)
649 22 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
650 : break;
651 : default: /* Maybe later I can do cmp_in and cmp_notin but don't forget to remove is_theta_exp check up there */
652 : break;
653 : }
654 : }
655 122640 : assert(!always_false || !always_true);
656 122640 : if (always_false || always_true) {
657 3651 : sql_exp *ne = exp_atom_bool(v->sql->sa, always_true /* if always_true set then true else false */);
658 3651 : if (exp_name(e))
659 0 : exp_prop_alias(v->sql->sa, ne, e);
660 3651 : n->data = ne;
661 3651 : v->changes++;
662 : }
663 : }
664 : }
665 132931 : return rel->exps;
666 : }
667 :
668 : static sql_rel *
669 14 : set_setop_side(visitor *v, sql_rel *rel, sql_rel *side)
670 : {
671 14 : if (side == rel->r)
672 0 : rel->r = NULL;
673 : else
674 14 : rel->l = NULL;
675 14 : if (!is_simple_project(side->op) || rel_is_ref(side) || !list_empty(side->r)) {
676 0 : side = rel_project(v->sql->sa, side, rel_projections(v->sql, side, NULL, 0, 1));
677 0 : set_count_prop(v->sql->sa, side, get_rel_count(side->l));
678 0 : side->exps = exps_exp_visitor_bottomup(v, side, side->exps, 0, &rel_propagate_statistics, false);
679 : }
680 35 : for (node *n = rel->exps->h, *m = side->exps->h ; n && m ; n = n->next, m = m->next) {
681 21 : sql_exp *e1 = n->data, *e2 = m->data;
682 21 : exp_prop_alias(v->sql->sa, e2, e1);
683 : }
684 14 : list_hash_clear(side->exps);
685 :
686 14 : if (need_distinct(rel))
687 0 : set_distinct(side);
688 14 : side->p = prop_copy(v->sql->sa, rel->p);
689 14 : rel_destroy(rel);
690 14 : return side;
691 : }
692 :
693 : static BUN
694 21876 : trivial_project_exp_card(sql_exp *e)
695 : {
696 22170 : if (e->type == e_convert)
697 294 : return trivial_project_exp_card(e->l);
698 21876 : return e->type == e_atom && e->f ? (BUN) list_length(e->f) : 1;
699 : }
700 :
701 : static BUN
702 24248 : rel_calc_nuniques(mvc *sql, sql_rel *l, list *exps)
703 : {
704 24248 : BUN lv = get_rel_count(l);
705 :
706 24248 : if (lv == 0)
707 : return 0;
708 21539 : if (!list_empty(exps)) {
709 21539 : BUN nuniques = 0;
710 : /* compute the highest number of unique values */
711 51130 : for (node *n = exps->h ; n && nuniques != BUN_NONE ; n = n->next) {
712 29592 : sql_exp *e = n->data;
713 29592 : sql_rel *bt = NULL;
714 29592 : prop *p = NULL;
715 29592 : BUN euniques = BUN_NONE;
716 29592 : atom *min, *max, *sub = NULL;
717 29592 : sql_subtype *tp = exp_subtype(e);
718 29592 : sql_class ec = tp ? tp->type->eclass : EC_STRING; /* if 'e' has no type (eg parameter), use a non-number type to fail condition */
719 :
720 29592 : if ((p = find_prop(e->p, PROP_NUNIQUES))) {
721 20956 : euniques = (BUN) p->value.dval;
722 8636 : } else if (e->type == e_column && e->nid && rel_find_exp_and_corresponding_rel(l, e, false, &bt, NULL) && bt && (p = find_prop(bt->p, PROP_COUNT))) {
723 6593 : euniques = (BUN) p->value.lval;
724 : }
725 : /* use min to max range to compute number of possible values in the domain for number types */
726 29592 : if ((EC_TEMP(ec)||ec==EC_NUM||ec==EC_MONTH||ec==EC_POS) &&
727 22363 : (min = find_prop_and_get(e->p, PROP_MIN)) && (max = find_prop_and_get(e->p, PROP_MAX))) {
728 : /* the range includes min and max, so the atom_inc call is needed */
729 : /* if 'euniques' has number of distinct values, compute min between both */
730 17543 : if ((sub = atom_sub(sql->sa, max, min)) && (sub = atom_inc(sql->sa, sub)) && (sub = atom_cast(sql->sa, sub, sql_bind_localtype("oid"))))
731 0 : euniques = MIN(euniques, (BUN) sub->data.val.oval);
732 : }
733 29591 : if (euniques != BUN_NONE)
734 27548 : nuniques = MAX(nuniques, euniques); /* the highest cardinality sets the estimation */
735 : else
736 : nuniques = BUN_NONE;
737 : }
738 21538 : if (nuniques != BUN_NONE)
739 : return nuniques;
740 : }
741 : return lv;
742 : }
743 :
744 : static sql_rel *
745 2072523 : rel_get_statistics_(visitor *v, sql_rel *rel)
746 : {
747 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
748 2072523 : uint8_t has_special_modify = *(uint8_t*) v->data;
749 2072523 : bool can_be_pruned = !has_special_modify && v->storage_based_opt;
750 :
751 : /* Don't look at the same relation twice */
752 2072523 : if (are_statistics_gathered(rel->used))
753 : return rel;
754 1331304 : rel->used |= statistics_gathered;
755 :
756 1331304 : switch (rel->op) {
757 312616 : case op_basetable: {
758 312616 : sql_table *t = (sql_table *) rel->l;
759 312616 : sqlstore *store = v->sql->session->tr->store;
760 :
761 312616 : if (!list_empty(rel->exps)) {
762 1248541 : for (node *n = rel->exps->h ; n ; n = n->next)
763 935974 : rel_basetable_column_get_statistics(v->sql, rel, n->data);
764 : }
765 : /* Set table row count. TODO? look for remote tables. Don't look at storage for declared tables, because it won't be cleaned */
766 312616 : if (isTable(t) && t->s && !isDeclaredTable(t)) /* count active rows only */
767 259073 : set_count_prop(v->sql->sa, rel, (BUN)store->storage_api.count_col(v->sql->session->tr, ol_first_node(t->columns)->data, 10));
768 : break;
769 : }
770 2790 : case op_union:
771 : case op_inter:
772 : case op_except: {
773 2790 : bool empty_cross = false;
774 2790 : int i = 0;
775 2790 : sql_rel *l = rel->l, *pl = rel->l, *r = rel->r, *pr = rel->r;
776 :
777 2790 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
778 0 : pl = pl->l;
779 2790 : while (is_sample(pr->op) || is_topn(pr->op))
780 0 : pr = pr->l;
781 : /* if it's not a projection, then project and propagate statistics */
782 2790 : if (!is_project(pl->op) && !is_base(pl->op)) {
783 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
784 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
785 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
786 : }
787 2790 : if (!is_project(pr->op) && !is_base(pr->op)) {
788 0 : pr = rel_project(v->sql->sa, pr, rel_projections(v->sql, pr, NULL, 0, 1));
789 0 : set_count_prop(v->sql->sa, pr, get_rel_count(pr->l));
790 0 : pr->exps = exps_exp_visitor_bottomup(v, pr, pr->exps, 0, &rel_propagate_statistics, false);
791 : }
792 :
793 11662 : for (node *n = rel->exps->h ; n ; n = n->next) {
794 8872 : empty_cross |= rel_setop_get_statistics(v->sql, rel, pl->exps, pr->exps, n->data, i);
795 8872 : i++;
796 : }
797 :
798 : /* propagate row count */
799 2790 : if (is_union(rel->op)) {
800 277 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
801 277 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
802 :
803 277 : if (lv == 0 && rv == 0) { /* both sides empty */
804 2 : if (can_be_pruned)
805 : empty_cross = true;
806 : else
807 2 : set_count_prop(v->sql->sa, rel, 0);
808 275 : } else if (can_be_pruned && lv == 0 && !rel_is_ref(rel)) { /* left side empty */
809 0 : rel = set_setop_side(v, rel, r);
810 0 : empty_cross = false; /* don't rewrite again */
811 275 : } else if (can_be_pruned && rv == 0 && !rel_is_ref(rel)) { /* right side empty */
812 0 : rel = set_setop_side(v, rel, l);
813 0 : empty_cross = false; /* don't rewrite again */
814 275 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
815 7 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
816 : }
817 2513 : } else if (is_inter(rel->op) || is_except(rel->op)) {
818 2513 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
819 2513 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
820 :
821 2513 : if (lv == 0) { /* left side empty */
822 62 : if (can_be_pruned)
823 : empty_cross = true;
824 : else
825 5 : set_count_prop(v->sql->sa, rel, 0);
826 2451 : } else if (rv == 0) { /* right side empty */
827 17 : if (is_inter(rel->op)) {
828 3 : if (can_be_pruned)
829 : empty_cross = true;
830 : else
831 0 : set_count_prop(v->sql->sa, rel, 0);
832 14 : } else if (can_be_pruned && !rel_is_ref(rel)) {
833 14 : rel = set_setop_side(v, rel, l);
834 14 : empty_cross = false; /* don't rewrite again */
835 : } else {
836 0 : set_count_prop(v->sql->sa, rel, lv);
837 : }
838 : } else {
839 2434 : set_count_prop(v->sql->sa, rel, lv);
840 : }
841 : }
842 2790 : if (can_be_pruned && empty_cross) {
843 80 : rel_destroy(rel->l);
844 80 : rel->l = NULL;
845 80 : rel_destroy(rel->r);
846 80 : rel->r = NULL;
847 244 : for (node *n = rel->exps->h ; n ; n = n->next) {
848 164 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
849 164 : exp_prop_alias(v->sql->sa, a, e);
850 164 : n->data = a;
851 : }
852 80 : list_hash_clear(rel->exps);
853 80 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
854 80 : set_count_prop(v->sql->sa, l, 1);
855 80 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
856 80 : set_count_prop(v->sql->sa, l, 0);
857 80 : rel->op = op_project;
858 80 : rel->l = l;
859 80 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
860 80 : set_count_prop(v->sql->sa, rel, 0);
861 80 : set_nodistinct(rel); /* set relations may have distinct flag set */
862 80 : v->changes++;
863 : }
864 : break;
865 : }
866 12864 : case op_munion: {
867 12864 : list *l = rel->l, *nrels = sa_list(v->sql->sa);
868 12864 : BUN cnt = 0;
869 12864 : bool needs_pruning = false;
870 :
871 49201 : for (node *n = l->h; n; n = n->next) {
872 36337 : sql_rel *r = n->data, *pl = r;
873 :
874 36337 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
875 0 : pl = pl->l;
876 : /* if it's not a projection, then project and propagate statistics */
877 36337 : if (!is_project(pl->op) && !is_base(pl->op)) {
878 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
879 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
880 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
881 : }
882 36337 : nrels = append(nrels, pl);
883 : /* we need new munion statistics */
884 : /* propagate row count */
885 36337 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
886 : /* if PROP_COUNT does not exist we assume at least a row (see get_rel_count def) */
887 36337 : if (rv == BUN_NONE) {
888 1156 : cnt++;
889 1156 : continue;
890 : }
891 35181 : if (!rv && can_be_pruned)
892 6617 : needs_pruning = true;
893 : /* overflow check */
894 35181 : if (rv > (BUN_MAX - cnt))
895 36337 : rv = BUN_MAX;
896 : else
897 35181 : cnt += rv;
898 : }
899 12864 : int i = 0;
900 73552 : for (node *n = rel->exps->h ; n ; n = n->next, i++)
901 60688 : rel_munion_get_statistics(v->sql, rel, nrels, n->data, i);
902 :
903 12864 : if (needs_pruning && !rel_is_ref(rel)) {
904 4456 : v->changes++;
905 4456 : list *nl = sa_list(l->sa);
906 :
907 16441 : for (node *n = nrels->h; n; n = n->next) {
908 11985 : sql_rel *r = n->data;
909 11985 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
910 :
911 11985 : if (!rv) { /* keep last for now */
912 6147 : rel_destroy(r);
913 6147 : continue;
914 : }
915 5838 : nl = append(nl, r);
916 : }
917 4456 : rel->l = nl;
918 4456 : if (list_length(nl) == 1) {
919 4130 : sql_rel *l = rel->l = nl->h->data; /* ugh */
920 4130 : rel->r = NULL;
921 4130 : rel->op = op_project;
922 :
923 20287 : for (node *n = rel->exps->h, *m = l->exps->h ; n && m ; n = n->next, m = m->next) {
924 16157 : sql_exp *pe = n->data, *ie = m->data;
925 16157 : sql_exp *ne = exp_ref(v->sql, ie);
926 16157 : exp_prop_alias(v->sql->sa, ne, pe);
927 16157 : n->data = ne;
928 : }
929 4130 : list_hash_clear(rel->exps);
930 326 : } else if (list_empty(nl)) {
931 : /* empty select (project [ nils ] ) */
932 454 : for (node *n = rel->exps->h ; n ; n = n->next) {
933 354 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
934 354 : exp_prop_alias(v->sql->sa, a, e);
935 354 : n->data = a;
936 : }
937 100 : list_hash_clear(rel->exps);
938 100 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
939 100 : set_count_prop(v->sql->sa, l, 1);
940 100 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
941 100 : set_count_prop(v->sql->sa, l, 0);
942 100 : rel->op = op_project;
943 100 : rel->r = NULL;
944 100 : rel->l = l;
945 100 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
946 100 : set_count_prop(v->sql->sa, rel, 0);
947 100 : set_nodistinct(rel); /* set relations may have distinct flag set */
948 : }
949 : } else {
950 8408 : set_count_prop(v->sql->sa, rel, cnt);
951 : }
952 : break;
953 : }
954 532113 : case op_join:
955 : case op_left:
956 : case op_right:
957 : case op_full:
958 : case op_semi:
959 : case op_anti:
960 : case op_select:
961 : case op_project:
962 : case op_groupby: {
963 532113 : if (is_groupby(rel->op) && !list_empty(rel->r))
964 15531 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
965 532113 : rel->exps = exps_exp_visitor_bottomup(v, rel, rel->exps, 0, &rel_propagate_statistics, false);
966 532113 : if ((is_simple_project(rel->op) || is_groupby(rel->op)) && !list_empty(rel->r))
967 39252 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
968 : /* The following optimizations can only be applied after propagating the statistics to rel->exps */
969 532113 : if (can_be_pruned && (is_join(rel->op) || is_select(rel->op)) && !list_empty(rel->exps)) {
970 139283 : int changes = v->changes;
971 139283 : rel->exps = rel_prune_predicates(v, rel);
972 139283 : if (v->changes > changes)
973 3618 : rel = rewrite_simplify(v, 0, v->value_based_opt, rel);
974 : }
975 :
976 : /* propagate row count */
977 532113 : sql_rel *l = rel->l, *r = rel->r;
978 532113 : switch (rel->op) {
979 134385 : case op_join:
980 : case op_left:
981 : case op_right:
982 : case op_full: {
983 134385 : BUN lv = get_rel_count(l), rv = get_rel_count(r), uniques_estimate = BUN_MAX, join_idx_estimate = BUN_MAX;
984 :
985 134385 : if (!list_empty(rel->exps) && !is_single(rel)) {
986 245807 : for (node *n = rel->exps->h ; n ; n = n->next) {
987 125486 : sql_exp *e = n->data, *el = e->l, *er = e->r;
988 :
989 125486 : if (find_prop(e->p, PROP_JOINIDX)) {
990 667 : join_idx_estimate = lv>rv?lv:rv;
991 124819 : } else if (e->type == e_cmp && e->flag == cmp_equal) {
992 : /* if one of the sides is unique, the cardinality will be that exact number, but look for nulls */
993 120935 : if (!is_semantics(e) || !has_nil(el) || !has_nil(er)) {
994 120539 : BUN lu = 0, ru = 0;
995 120539 : prop *p = NULL;
996 120539 : if ((p = find_prop(el->p, PROP_NUNIQUES)))
997 89461 : lu = (BUN) p->value.dval;
998 120539 : if ((p = find_prop(er->p, PROP_NUNIQUES)))
999 103849 : ru = (BUN) p->value.dval;
1000 120539 : if (is_unique(el) || lu > lv) {
1001 73609 : BUN ncount = (is_right(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : lv;
1002 73609 : uniques_estimate = MIN(uniques_estimate, ncount);
1003 46930 : } else if (is_unique(er) || ru > rv) {
1004 29328 : BUN ncount = (is_left(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : rv;
1005 29328 : uniques_estimate = MIN(uniques_estimate, ncount);
1006 : }
1007 : }
1008 : }
1009 : }
1010 : }
1011 134385 : if (is_single(rel)) {
1012 4731 : set_count_prop(v->sql->sa, rel, lv);
1013 129654 : } else if (join_idx_estimate != BUN_MAX) {
1014 665 : set_count_prop(v->sql->sa, rel, join_idx_estimate);
1015 128989 : } else if (uniques_estimate != BUN_MAX) {
1016 96491 : set_count_prop(v->sql->sa, rel, uniques_estimate);
1017 32498 : } else if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1018 : /* corner cases for outer joins */
1019 126 : if (is_left(rel->op)) {
1020 114 : set_count_prop(v->sql->sa, rel, lv);
1021 12 : } else if (is_right(rel->op)) {
1022 3 : set_count_prop(v->sql->sa, rel, rv);
1023 9 : } else if (is_full(rel->op) && lv != BUN_NONE && rv != BUN_NONE) {
1024 9 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
1025 0 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1026 0 : set_count_prop(v->sql->sa, rel, 0);
1027 : }
1028 32372 : } else if (lv == 0) {
1029 1166 : set_count_prop(v->sql->sa, rel, (is_right(rel->op) || is_full(rel->op)) ? rv : 0);
1030 31775 : } else if (rv == 0) {
1031 1568 : set_count_prop(v->sql->sa, rel, (is_left(rel->op) || is_full(rel->op)) ? lv : 0);
1032 30701 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1033 18251 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX / lv)) ? BUN_MAX : (lv * rv)); /* overflow check */
1034 : }
1035 : break;
1036 : }
1037 2635 : case op_anti:
1038 2635 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1039 2635 : break;
1040 108879 : case op_semi:
1041 : case op_select:
1042 : /* TODO calculate cardinalities using selectivities */
1043 108879 : if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1044 2743 : set_count_prop(v->sql->sa, rel, 0);
1045 : } else {
1046 209024 : if (!list_empty(rel->exps) && !is_single(rel)) {
1047 102888 : BUN cnt = get_rel_count(l), u = 1;
1048 171648 : for (node *n = rel->exps->h ; n ; n = n->next) {
1049 112729 : sql_exp *e = n->data, *el = e->l, *er = e->r;
1050 :
1051 : /* simple expressions first */
1052 112729 : if (e->type == e_cmp && e->flag == cmp_equal && exp_is_atom(er)) {
1053 : /* use selectivity */
1054 57159 : prop *p;
1055 57159 : if ((p = find_prop(el->p, PROP_NUNIQUES))) {
1056 43969 : u = (BUN) p->value.dval;
1057 43969 : break;
1058 : }
1059 : }
1060 : }
1061 : /* u is an *estimate*, so don't set count_prop to 0 unless cnt is 0 */
1062 102888 : set_count_prop(v->sql->sa, rel, cnt == 0 ? 0 : u == 0 || u > cnt ? 1 : cnt/u);
1063 : } else {
1064 3248 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1065 : }
1066 : }
1067 : break;
1068 262856 : case op_project:
1069 262856 : if (l) {
1070 252742 : if (need_distinct(rel)) {
1071 0 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->exps));
1072 : } else {
1073 252742 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1074 : }
1075 : } else {
1076 10114 : BUN card = 1;
1077 :
1078 10114 : if (!list_empty(rel->exps)) {
1079 30661 : for (node *n = rel->exps->h ; n ; n = n->next) {
1080 20547 : sql_exp *e = n->data;
1081 20547 : card = MAX(card, trivial_project_exp_card(e));
1082 : }
1083 : }
1084 10114 : set_count_prop(v->sql->sa, rel, card);
1085 : }
1086 : break;
1087 22953 : case op_groupby:
1088 22953 : if (list_empty(rel->r)) {
1089 7421 : set_count_prop(v->sql->sa, rel, 1);
1090 : } else {
1091 15532 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->r));
1092 : }
1093 : break;
1094 : default:
1095 : break;
1096 : }
1097 : break;
1098 : }
1099 16831 : case op_topn: {
1100 16831 : BUN lv = get_rel_count(rel->l);
1101 :
1102 16831 : if (lv != BUN_NONE) {
1103 16814 : sql_exp *le = rel->exps->h->data, *oe = list_length(rel->exps) > 1 ? rel->exps->h->next->data : NULL;
1104 96 : if (oe && oe->l && exp_is_not_null(oe)) { /* no parameters */
1105 96 : BUN offset = (BUN) ((atom*)oe->l)->data.val.lval;
1106 96 : lv = offset >= lv ? 0 : lv - offset;
1107 : }
1108 16814 : if (le->l && exp_is_not_null(le)) {
1109 16776 : BUN limit = (BUN) ((atom*)le->l)->data.val.lval;
1110 16776 : lv = MIN(lv, limit);
1111 : }
1112 16814 : set_count_prop(v->sql->sa, rel, lv);
1113 : }
1114 : break;
1115 : }
1116 22 : case op_sample: {
1117 22 : BUN lv = get_rel_count(rel->l);
1118 :
1119 22 : if (lv != BUN_NONE) {
1120 4 : sql_exp *se = rel->exps->h->data;
1121 4 : sql_subtype *tp = exp_subtype(se);
1122 :
1123 4 : if (se->l && tp->type->eclass == EC_NUM) { /* sample is a number of rows */
1124 4 : BUN sample = (BUN) ((atom*)se->l)->data.val.lval;
1125 4 : lv = MIN(lv, sample);
1126 0 : } else if (se->l) { /* sample is a percentage of rows */
1127 0 : dbl percent = ((atom*)se->l)->data.val.dval;
1128 0 : assert(tp->type->eclass == EC_FLT);
1129 0 : lv = (BUN) ceil((dbl)lv * percent);
1130 : }
1131 4 : set_count_prop(v->sql->sa, rel, lv);
1132 : }
1133 : break;
1134 : }
1135 5975 : case op_table: {
1136 5975 : sql_exp *op = rel->r;
1137 5975 : if (rel->flag != TRIGGER_WRAPPER && op) {
1138 5663 : sql_subfunc *f = op->f;
1139 5663 : if (f->func->lang == FUNC_LANG_SQL) {
1140 97 : set_count_prop(v->sql->sa, rel, 1000 /* just some fallback value */);
1141 5566 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "storage") == 0) {
1142 829 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of catalog */);
1143 4737 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "db_users") == 0) {
1144 0 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of users */);
1145 4737 : } else if (f->func->lang == FUNC_LANG_MAL && strncmp(f->func->base.name, "querylog", 8) == 0) {
1146 1085 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of querylog */);
1147 3652 : } else if (f->func->lang == FUNC_LANG_MAL &&
1148 3542 : (strcmp(f->func->base.name, "queue") == 0 ||
1149 3278 : strcmp(f->func->base.name, "optimizers") == 0 ||
1150 2966 : strcmp(f->func->base.name, "env") == 0 ||
1151 2675 : strcmp(f->func->base.name, "keywords") == 0 ||
1152 2675 : strcmp(f->func->base.name, "statistics") == 0 ||
1153 2008 : strcmp(f->func->base.name, "rejects") == 0 ||
1154 1754 : strcmp(f->func->base.name, "schemastorage") == 0 ||
1155 1754 : strncmp(f->func->base.name, "storage", 7) == 0 ||
1156 1754 : strcmp(f->func->base.name, "sessions") == 0) ) {
1157 2078 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of queue */);
1158 : }
1159 : /* else {
1160 : printf("%%func needs stats : %s\n", f->func->base.name);
1161 : } */
1162 : }
1163 : break;
1164 : }
1165 : /*These relations are less important for now
1166 : TODO later we can tune it */
1167 : case op_insert:
1168 : case op_update:
1169 : case op_delete:
1170 : case op_truncate:
1171 : case op_ddl:
1172 : default:
1173 : break;
1174 : }
1175 :
1176 : return rel;
1177 : }
1178 :
1179 : static sql_rel *
1180 539864 : rel_get_statistics(visitor *v, global_props *gp, sql_rel *rel)
1181 : {
1182 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
1183 539864 : uint8_t has_special_modify = (uint8_t) gp->has_special_modify;
1184 539864 : v->data = &has_special_modify;
1185 539864 : rel = rel_visitor_bottomup(v, rel, &rel_get_statistics_);
1186 539864 : v->data = gp;
1187 539864 : return rel;
1188 : }
1189 :
1190 : run_optimizer
1191 705687 : bind_get_statistics(visitor *v, global_props *gp)
1192 : {
1193 705687 : (void) v;
1194 705687 : return (gp->opt_level == 1 && !gp->cnt[op_insert]) ? rel_get_statistics : NULL;
1195 : }
1196 :
1197 :
1198 : static bool
1199 94898 : point_select_on_unique_column(sql_rel *rel)
1200 : {
1201 94898 : if (is_select(rel->op) && !list_empty(rel->exps)) {
1202 131375 : for (node *n = rel->exps->h; n ; n = n->next) {
1203 75652 : sql_exp *e = n->data, *el = e->l, *er = e->r, *found = NULL;
1204 :
1205 75652 : if (is_compare(e->type) && e->flag == cmp_equal) {
1206 33623 : if (is_numeric_upcast(el))
1207 0 : el = el->l;
1208 33623 : if (is_numeric_upcast(er))
1209 0 : er = er->l;
1210 33623 : if (is_alias(el->type) && exp_is_atom(er) && (found = rel_find_exp(rel->l, el)) &&
1211 749 : is_unique(found) && (!is_semantics(e) || !has_nil(found) || !has_nil(er)))
1212 : return true;
1213 32874 : if (is_alias(er->type) && exp_is_atom(el) && (found = rel_find_exp(rel->l, er)) &&
1214 0 : is_unique(found) && (!is_semantics(e) || !has_nil(el) || !has_nil(found)))
1215 : return true;
1216 : }
1217 : }
1218 : }
1219 : return false;
1220 : }
1221 :
1222 : /*
1223 : * A point select on an unique column reduces the number of rows to 1. If the same select is under a
1224 : * join, the opposite side's select can be pushed above the join.
1225 : */
1226 : static inline sql_rel *
1227 1244764 : rel_push_select_up(visitor *v, sql_rel *rel)
1228 : {
1229 1244764 : if ((is_innerjoin(rel->op) || is_left(rel->op) || is_right(rel->op) || is_semi(rel->op)) && !is_single(rel)) {
1230 253995 : sql_rel *l = rel->l, *r = rel->r;
1231 253995 : bool can_pushup_left = is_select(l->op) && !rel_is_ref(l) && !is_single(l) && (is_innerjoin(rel->op) || is_left(rel->op) || is_semi(rel->op)),
1232 253995 : can_pushup_right = is_select(r->op) && !rel_is_ref(r) && !is_single(r) && (is_innerjoin(rel->op) || is_right(rel->op));
1233 :
1234 253995 : if (can_pushup_left || can_pushup_right) {
1235 66899 : if (can_pushup_left)
1236 45431 : can_pushup_left = point_select_on_unique_column(r);
1237 66899 : if (can_pushup_right)
1238 49467 : can_pushup_right = point_select_on_unique_column(l);
1239 :
1240 : /* if both selects retrieve one row each, it's not worth it to push both up */
1241 66899 : if (can_pushup_left && !can_pushup_right) {
1242 683 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1243 683 : nrel->l = l->l;
1244 683 : rel = rel_inplace_select(rel, nrel, l->exps);
1245 683 : assert(is_select(rel->op));
1246 683 : v->changes++;
1247 66216 : } else if (!can_pushup_left && can_pushup_right) {
1248 14 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1249 14 : nrel->r = r->l;
1250 14 : rel = rel_inplace_select(rel, nrel, r->exps);
1251 14 : assert(is_select(rel->op));
1252 14 : v->changes++;
1253 : }
1254 : }
1255 : }
1256 1244764 : return rel;
1257 : }
1258 :
1259 : static int
1260 93237 : sql_class_base_score(visitor *v, sql_column *c, sql_subtype *t, bool equality_based)
1261 : {
1262 93237 : int de;
1263 :
1264 93237 : if (!t)
1265 : return 0;
1266 93237 : switch (ATOMstorage(t->type->localtype)) {
1267 : case TYPE_bte:
1268 : return 150 - 8;
1269 1593 : case TYPE_sht:
1270 1593 : return 150 - 16;
1271 37290 : case TYPE_int:
1272 37290 : return 150 - 32;
1273 518 : case TYPE_void:
1274 : case TYPE_lng:
1275 518 : return 150 - 64;
1276 : case TYPE_uuid:
1277 : #ifdef HAVE_HGE
1278 : case TYPE_hge:
1279 : #endif
1280 : return 150 - 128;
1281 2 : case TYPE_flt:
1282 2 : return 75 - 24;
1283 : case TYPE_dbl:
1284 : return 75 - 53;
1285 30567 : default:
1286 30567 : if (equality_based && c && v->storage_based_opt && (de = mvc_is_duplicate_eliminated(v->sql, c)))
1287 1554 : return 150 - de * 8;
1288 : /* strings and blobs not duplicate eliminated don't get any points here */
1289 : return 0;
1290 : }
1291 : }
1292 :
1293 : static int
1294 34338 : score_se_base(visitor *v, sql_rel *rel, sql_exp *e)
1295 : {
1296 34338 : int res = 0;
1297 34338 : sql_subtype *t = exp_subtype(e);
1298 34338 : sql_column *c = NULL;
1299 :
1300 34338 : if (e->type == e_convert) /* keep unsafes at the end (TODO improve) */
1301 : return -1000;
1302 : /* can we find out if the underlying table is sorted */
1303 33737 : if ((c = exp_find_column(rel, e, -2)) && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1304 33737 : res += 600;
1305 :
1306 : /* prefer the shorter var types over the longer ones */
1307 33737 : res += sql_class_base_score(v, c, t, is_equality_or_inequality_exp(e->flag)); /* smaller the type, better */
1308 33737 : return res;
1309 : }
1310 :
1311 : static int
1312 58417 : score_se(visitor *v, sql_rel *rel, sql_exp *e)
1313 : {
1314 58417 : int score = 0;
1315 58417 : if (e->type == e_cmp && !is_complex_exp(e->flag)) {
1316 34338 : sql_exp *l = e->l;
1317 :
1318 34338 : while (l->type == e_cmp) { /* go through nested comparisons */
1319 2 : sql_exp *ll;
1320 :
1321 2 : if (l->flag == cmp_filter || l->flag == cmp_or)
1322 0 : ll = ((list*)l->l)->h->data;
1323 : else
1324 2 : ll = l->l;
1325 2 : if (ll->type != e_cmp)
1326 : break;
1327 : l = ll;
1328 : }
1329 34338 : score += score_se_base(v, rel, l);
1330 : }
1331 58417 : score += exp_keyvalue(e);
1332 58417 : return score;
1333 : }
1334 :
1335 : static inline sql_rel *
1336 1244764 : rel_select_order(visitor *v, sql_rel *rel)
1337 : {
1338 1244764 : int *scores = NULL;
1339 1244764 : sql_exp **exps = NULL;
1340 :
1341 1244764 : if (is_select(rel->op) && list_length(rel->exps) > 1) {
1342 27326 : node *n;
1343 27326 : int i, nexps = list_length(rel->exps);
1344 27326 : scores = SA_NEW_ARRAY(v->sql->ta, int, nexps);
1345 27326 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, nexps);
1346 :
1347 85743 : for (i = 0, n = rel->exps->h; n; i++, n = n->next) {
1348 58447 : exps[i] = n->data;
1349 58447 : if (find_prop(exps[i]->p, PROP_HASHCOL))
1350 : return rel;
1351 58417 : scores[i] = score_se(v, rel, n->data);
1352 : }
1353 27296 : GDKqsort(scores, exps, NULL, nexps, sizeof(int), sizeof(void *), TYPE_int, true, true);
1354 :
1355 85713 : for (i = 0, n = rel->exps->h; n; i++, n = n->next)
1356 58417 : n->data = exps[i];
1357 : }
1358 :
1359 : return rel;
1360 : }
1361 :
1362 : /* Compute the efficiency of using this expression early in a group by list */
1363 : static int
1364 59500 : score_gbe(visitor *v, sql_rel *rel, sql_exp *e)
1365 : {
1366 59500 : int res = 0;
1367 59500 : sql_subtype *t = exp_subtype(e);
1368 59500 : sql_column *c = exp_find_column(rel, e, -2);
1369 :
1370 59500 : if (e->card == CARD_ATOM) /* constants are trivial to group */
1371 38 : res += 1000;
1372 : /* can we find out if the underlying table is sorted */
1373 59500 : if (is_unique(e) || find_prop(e->p, PROP_HASHCOL) || (c && v->storage_based_opt && mvc_is_unique(v->sql, c))) /* distinct columns */
1374 3584 : res += 700;
1375 38910 : if (c && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1376 3710 : res += 500;
1377 59500 : if (find_prop(e->p, PROP_HASHIDX)) /* has hash index */
1378 0 : res += 200;
1379 :
1380 : /* prefer the shorter var types over the longer ones */
1381 59500 : res += sql_class_base_score(v, c, t, true); /* smaller the type, better */
1382 59500 : return res;
1383 : }
1384 :
1385 : /* reorder group by expressions */
1386 : static inline sql_rel *
1387 1244764 : rel_groupby_order(visitor *v, sql_rel *rel)
1388 : {
1389 1244764 : int *scores = NULL;
1390 1244764 : sql_exp **exps = NULL;
1391 :
1392 1244764 : if (is_groupby(rel->op) && list_length(rel->r) > 1) {
1393 25587 : node *n;
1394 25587 : list *gbe = rel->r;
1395 25587 : int i, ngbe = list_length(gbe);
1396 25587 : scores = SA_NEW_ARRAY(v->sql->ta, int, ngbe);
1397 25587 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, ngbe);
1398 :
1399 : /* first sorting step, give priority for integers and sorted columns */
1400 85087 : for (i = 0, n = gbe->h; n; i++, n = n->next) {
1401 59500 : exps[i] = n->data;
1402 59500 : scores[i] = score_gbe(v, rel, exps[i]);
1403 : }
1404 25587 : GDKqsort(scores, exps, NULL, ngbe, sizeof(int), sizeof(void *), TYPE_int, true, true);
1405 :
1406 : /* second sorting step, give priority to strings with lower number of digits */
1407 50504 : for (i = ngbe - 1; i && !scores[i]; i--); /* find expressions with no score from the first round */
1408 25587 : if (scores[i])
1409 24589 : i++;
1410 25587 : if (ngbe - i > 1) {
1411 8604 : for (int j = i; j < ngbe; j++) {
1412 6597 : sql_subtype *t = exp_subtype(exps[j]);
1413 6597 : scores[j] = t ? t->digits : 0;
1414 : }
1415 : /* the less number of digits the better, order ascending */
1416 2007 : GDKqsort(scores + i, exps + i, NULL, ngbe - i, sizeof(int), sizeof(void *), TYPE_int, false, true);
1417 : }
1418 :
1419 85087 : for (i = 0, n = gbe->h; n; i++, n = n->next)
1420 59500 : n->data = exps[i];
1421 : }
1422 :
1423 1244764 : return rel;
1424 : }
1425 :
1426 : /* This optimization loop contains optimizations that can potentially use statistics */
1427 : static sql_rel *
1428 1244764 : rel_final_optimization_loop_(visitor *v, sql_rel *rel)
1429 : {
1430 : /* Run rel_push_select_up only once at the end to avoid an infinite optimization loop */
1431 1244764 : rel = rel_push_select_up(v, rel);
1432 1244764 : rel = rel_select_order(v, rel);
1433 :
1434 : /* TODO? Maybe later add rel_simplify_count, rel_join2semijoin, rel_simplify_fk_joins,
1435 : rel_distinct_project2groupby, rel_simplify_predicates, rel_simplify_math,
1436 : rel_distinct_aggregate_on_unique_values */
1437 :
1438 1244764 : rel = rel_groupby_order(v, rel);
1439 1244763 : return rel;
1440 : }
1441 :
1442 : static sql_rel *
1443 66051 : rel_final_optimization_loop(visitor *v, global_props *gp, sql_rel *rel)
1444 : {
1445 66051 : (void) gp;
1446 66051 : return rel_visitor_bottomup(v, rel, &rel_final_optimization_loop_);
1447 : }
1448 :
1449 : run_optimizer
1450 705682 : bind_final_optimization_loop(visitor *v, global_props *gp)
1451 : {
1452 705682 : int flag = v->sql->sql_optimizer;
1453 : /* At the moment, this optimizer has dependency on 3 flags */
1454 547453 : return gp->opt_level == 1 && (gp->cnt[op_groupby] || gp->cnt[op_select]) &&
1455 771735 : (flag & push_select_up) && (flag & optimize_select_and_joins_topdown) && (flag & optimize_projections) ? rel_final_optimization_loop : NULL;
1456 : }
|