Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "rel_optimizer_private.h"
15 : #include "rel_statistics.h"
16 : #include "rel_basetable.h"
17 : #include "rel_rewriter.h"
18 :
19 : static sql_exp *
20 3453542 : comparison_find_column(sql_exp *input, sql_exp *e)
21 : {
22 3453542 : switch (input->type) {
23 97926 : case e_convert: {
24 97926 : list *types = (list *)input->r;
25 97926 : sql_class from = ((sql_subtype*)types->h->data)->type->eclass, to = ((sql_subtype*)types->h->next->data)->type->eclass;
26 97926 : if (from == to)
27 188870 : return comparison_find_column(input->l, e) ? input : NULL;
28 : return NULL;
29 : }
30 3043519 : case e_column:
31 3043519 : return exp_match(e, input) ? input : NULL;
32 : default:
33 : return NULL;
34 : }
35 : }
36 :
37 : /* multi lo <= col <= hi, maybe still be false even if lo or hi are NULL, possibly similar for filter on multiple
38 : * columns */
39 : #define comp_single_column(c) (!c->f)
40 :
41 : static sql_exp *
42 8705869 : rel_propagate_column_ref_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
43 : {
44 8788747 : assert(e->type == e_column);
45 8788747 : if (rel) {
46 8788693 : switch(rel->op) {
47 4072480 : case op_left:
48 : case op_right:
49 : case op_full:
50 : case op_join:
51 : case op_select:
52 : case op_anti:
53 : case op_semi: {
54 4072480 : bool found_without_semantics = false, found_left = false, found_right = false, still_unique = false;
55 :
56 4072480 : if ((is_innerjoin(rel->op) || is_select(rel->op)) && list_length(rel->exps) == 1 && exp_is_false(rel->exps->h->data))
57 : return NULL; /* nothing will pass, skip */
58 :
59 : /* propagate from the bottom first */
60 4058404 : if (rel_propagate_column_ref_statistics(sql, rel->l, e))
61 : found_left = true;
62 2581504 : if (!found_left && is_join(rel->op) && rel_propagate_column_ref_statistics(sql, rel->r, e))
63 4058404 : found_right = true;
64 :
65 4058404 : if (!found_left && !found_right)
66 : return NULL;
67 1769764 : if (!list_empty(rel->exps) && rel->op != op_anti) { /* if there's an or, the MIN and MAX get difficult to propagate */
68 3484820 : for (node *n = rel->exps->h ; n ; n = n->next) {
69 1804381 : sql_exp *comp = n->data, *le = comp->l, *lne = NULL, *re = comp->r, *rne = NULL, *fe = comp->f, *fne = NULL;
70 :
71 1804381 : if (comp->type == e_cmp) {
72 1803379 : if (is_theta_exp(comp->flag) && ((lne = comparison_find_column(le, e)) || (rne = comparison_find_column(re, e)) || (fe && (fne = comparison_find_column(fe, e))))) {
73 124672 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
74 124672 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
75 :
76 : /* not semantics found or if explicitly filtering not null values from the column */
77 124672 : found_without_semantics |= (!is_semantics(comp) && comp_single_column(comp)) || (comp->flag == cmp_equal && lne && is_anti(comp) && exp_is_null(re));
78 124672 : still_unique |= comp->flag == cmp_equal && is_unique(le) && is_unique(re); /* unique if only equi-joins on unique columns are there */
79 124672 : if (is_full(rel->op) || (is_left(rel->op) && found_left) || (is_right(rel->op) && found_right)) /* on outer joins, min and max cannot be propagated on some cases */
80 19339 : continue;
81 : /* if (end2 >= start1 && start2 <= end1) then the 2 intervals are intersected */
82 105333 : if (fe && lval_min && lval_max) { /* range case, the middle expression must intersect the other two */
83 4622 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
84 4622 : int int1 = rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0,
85 4622 : int2 = fval_min && fval_max && atom_cmp(fval_max, lval_min) >= 0 && atom_cmp(fval_min, lval_max) <= 0,
86 4622 : symmetric = is_symmetric(comp);
87 :
88 4622 : if (is_anti(comp) || (!symmetric && fval_min && rval_max && atom_cmp(fval_min, rval_max) < 0)) /* for asymmetric case the re range must be after the fe range */
89 1817 : continue;
90 2805 : if (lne && int1 && int2) {
91 104 : if (symmetric) {
92 0 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
93 0 : atom *nmin = statistics_atom_min(sql, rval_min, fval_min), *nmax = statistics_atom_max(sql, rval_max, fval_max);
94 : /* max is min from le and (max from re and fe max) */
95 0 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, nmax, p2->value.pval) : nmax);
96 : /* min is max from le and (min from re and fe min) */
97 0 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, nmin, p1->value.pval) : nmin);
98 : } else {
99 104 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
100 : /* max is min from le and fe max */
101 104 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, fval_max, p2->value.pval) : fval_max);
102 : /* min is max from le and re min */
103 104 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, rval_min, p1->value.pval) : rval_min);
104 : }
105 2701 : } else if (rne) {
106 590 : if (symmetric && int1 && int2) { /* min is max from le and (min from re and fe min) */
107 0 : prop *p = find_prop(e->p, PROP_MIN);
108 0 : atom *nmin = p ? statistics_atom_min(sql, p->value.pval, fval_min) : fval_min;
109 0 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, nmin, lval_min));
110 590 : } else if (int1) { /* min is max from le and re min */
111 100 : prop *p = find_prop(e->p, PROP_MIN);
112 100 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
113 : }
114 2111 : } else if (fne) {
115 549 : if (symmetric && int1 && int2) { /* max is min from le and (max from re and fe max) */
116 0 : prop *p = find_prop(e->p, PROP_MAX);
117 0 : atom *nmax = p ? statistics_atom_max(sql, p->value.pval, rval_max) : rval_max;
118 0 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, nmax, lval_max) : nmax);
119 549 : } else if (int2) { /* max is min from le and fe max */
120 266 : prop *p = find_prop(e->p, PROP_MAX);
121 266 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
122 : }
123 : }
124 100711 : } else if (lval_min && lval_max && rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0) {
125 : /* both min and max must be set and the intervals must overlap */
126 42731 : switch (comp->flag) {
127 28522 : case cmp_equal: /* for equality reduce */
128 28522 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_max(sql, lval_max, rval_max) : statistics_atom_min(sql, lval_max, rval_max));
129 28522 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_min(sql, lval_min, rval_min) : statistics_atom_max(sql, lval_min, rval_min));
130 28522 : break;
131 4611 : case cmp_notequal: /* for inequality expand */
132 4611 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_min(sql, lval_max, rval_max) : statistics_atom_max(sql, lval_max, rval_max));
133 4611 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_max(sql, lval_min, rval_min) : statistics_atom_min(sql, lval_min, rval_min));
134 4611 : break;
135 5595 : case cmp_gt:
136 : case cmp_gte:
137 10294 : if (!is_anti(comp) && lne) { /* min is max from both min */
138 4699 : prop *p = find_prop(e->p, PROP_MIN);
139 4699 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, rval_min, p->value.pval) : rval_min);
140 896 : } else if (!is_anti(comp)) { /* max is min from both max */
141 896 : prop *p = find_prop(e->p, PROP_MAX);
142 896 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
143 : }
144 : break;
145 4003 : case cmp_lt:
146 : case cmp_lte:
147 7200 : if (!is_anti(comp) && lne) { /* max is min from both max */
148 3197 : prop *p = find_prop(e->p, PROP_MAX);
149 3197 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, rval_max, p->value.pval) : rval_max);
150 806 : } else if (!is_anti(comp)) { /* min is max from both min */
151 806 : prop *p = find_prop(e->p, PROP_MIN);
152 806 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
153 : }
154 : break;
155 : default: /* Maybe later I can do cmp_in and cmp_notin */
156 : break;
157 : }
158 : }
159 : }
160 : }
161 : }
162 : }
163 1769764 : if (is_full(rel->op) || (is_left(rel->op) && found_right) || (is_right(rel->op) && found_left))
164 37254 : set_has_nil(e);
165 1769764 : if (!is_outerjoin(rel->op) && found_without_semantics) /* at an outer join, null values pass */
166 94005 : set_has_no_nil(e);
167 1769764 : if (is_join(rel->op) && is_unique(e) && !still_unique)
168 117241 : set_not_unique(e);
169 1769764 : return e;
170 : }
171 4613974 : case op_table:
172 : case op_basetable:
173 : case op_union:
174 : case op_except:
175 : case op_inter:
176 : case op_munion:
177 : case op_project:
178 : case op_groupby: {
179 4613974 : sql_exp *found;
180 4613974 : atom *fval;
181 4613974 : prop *est;
182 4613974 : if ((found = rel_find_exp(rel, e))) {
183 2160349 : if (rel->op != op_table) { /* At the moment don't propagate statistics for table relations */
184 2116083 : if ((fval = find_prop_and_get(found->p, PROP_MAX)))
185 1121739 : set_minmax_property(sql, e, PROP_MAX, fval);
186 2116082 : if ((fval = find_prop_and_get(found->p, PROP_MIN)))
187 1128780 : set_minmax_property(sql, e, PROP_MIN, fval);
188 2116080 : if (!has_nil(found))
189 1344630 : set_has_no_nil(e);
190 2116080 : if (is_unique(found) || (need_distinct(rel) && list_length(rel->exps) == 1) ||
191 1710576 : (is_groupby(rel->op) && (list_empty(rel->r) || (list_length(rel->r) == 1 && exps_find_exp(rel->r, e)))))
192 416219 : set_unique(e);
193 : /* propagate unique estimation for known cases */
194 2116080 : if (is_groupby(rel->op) && list_empty(rel->r) && !find_prop(e->p, PROP_NUNIQUES)) { /* global aggregate case */
195 7460 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
196 7460 : p->value.dval = 1;
197 2108621 : } else if (((is_basetable(rel->op) || is_except(rel->op) || is_inter(rel->op) || is_simple_project(rel->op) ||
198 65209 : (is_groupby(rel->op) && exps_find_exp(rel->r, e))) &&
199 2051429 : (est = find_prop(found->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES))) {
200 187324 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
201 187323 : p->value.dval = est->value.dval;
202 : }
203 : }
204 2160344 : return e;
205 : }
206 : return NULL;
207 : }
208 82878 : case op_topn:
209 : case op_sample:
210 82878 : return rel_propagate_column_ref_statistics(sql, rel->l, e);
211 : default:
212 : break;
213 : }
214 : }
215 : return NULL;
216 : }
217 :
218 : static atom *
219 4504191 : atom_from_valptr( allocator *sa, sql_subtype *tpe, ValRecord *v)
220 : {
221 4504191 : atom *a = SA_NEW(sa, atom);
222 :
223 4504169 : assert(!VALisnil(v));
224 4504219 : *a = (atom) {.tpe = *tpe,};
225 4504219 : SA_VALcopy(sa, &a->data, v);
226 4504318 : return a;
227 : }
228 :
229 : void
230 4137734 : sql_column_get_statistics(mvc *sql, sql_column *c, sql_exp *e)
231 : {
232 4137734 : bool nonil = false, unique = false;
233 4137734 : double unique_est = 0.0;
234 4137734 : ValRecord min, max;
235 4137734 : int ok = mvc_col_stats(sql, c, &nonil, &unique, &unique_est, &min, &max);
236 :
237 4138993 : if (has_nil(e) && nonil)
238 2599467 : set_has_no_nil(e);
239 4138993 : if (!is_unique(e) && unique)
240 1093527 : set_unique(e);
241 4138993 : if (unique_est != 0.0) {
242 2906603 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
243 2906447 : p->value.dval = unique_est;
244 : }
245 4138837 : unsigned int digits = 0;
246 4138837 : sql_subtype *et = exp_subtype(e);
247 4138776 : if (et->type->eclass == EC_DEC || et->type->eclass == EC_NUM)
248 2687376 : digits = et->digits;
249 4138776 : if ((ok & 2) == 2) {
250 2249449 : if (!VALisnil(&max)) {
251 2249410 : prop *p = e->p = prop_create(sql->sa, PROP_MAX, e->p);
252 2249366 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &max);
253 2249329 : if (digits) {
254 1673659 : unsigned int nd = atom_digits(p->value.pval);
255 1673571 : if (nd < digits)
256 : digits = nd;
257 1673571 : if (!digits)
258 : digits = 1;
259 : }
260 : }
261 2249199 : VALclear(&max);
262 : }
263 4138511 : if ((ok & 1) == 1) {
264 2255234 : if (!VALisnil(&min)) {
265 2255238 : prop *p = e->p = prop_create(sql->sa, PROP_MIN, e->p);
266 2255284 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &min);
267 2255426 : if (digits) {
268 1680822 : unsigned int nd = atom_digits(p->value.pval);
269 1680829 : if (nd > digits) /* possibly set to low by max value */
270 : digits = nd;
271 : if (!digits)
272 : digits = 1;
273 : }
274 : }
275 2255419 : VALclear(&min);
276 : }
277 4138753 : if (digits)
278 2687323 : et->digits = digits;
279 4138753 : if (et->type->eclass == EC_DEC && et->digits <= et->scale)
280 216 : et->digits = et->scale + 1;
281 4138753 : }
282 :
283 : static void
284 931615 : rel_basetable_column_get_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
285 : {
286 931615 : if (e->p)
287 : return;
288 301132 : sql_column *c = NULL;
289 :
290 301132 : if ((c = rel_base_find_column(rel, e->nid))) {
291 203712 : sql_column_get_statistics(sql, c, e);
292 : }
293 : }
294 :
295 : static bool
296 8834 : rel_setop_get_statistics(mvc *sql, sql_rel *rel, list *lexps, list *rexps, sql_exp *e, int i)
297 : {
298 8834 : sql_exp *le = list_fetch(lexps, i), *re = list_fetch(rexps, i);
299 8834 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
300 8834 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
301 8834 : prop *est;
302 :
303 : /* for the intersection, if both expressions don't overlap, it can be pruned */
304 8834 : if (is_inter(rel->op) && !has_nil(le) && !has_nil(re) &&
305 504 : ((rval_max && lval_min && atom_cmp(rval_max, lval_min) < 0) || (rval_min && lval_max && atom_cmp(rval_min, lval_max) > 0)))
306 26 : return true;
307 :
308 8808 : if (lval_max && rval_max) {
309 2523 : if (is_union(rel->op))
310 3 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
311 2520 : else if (is_inter(rel->op))
312 391 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_min(sql, lval_max, rval_max)); /* for intersect the new max will be the min of the two */
313 : else /* except */
314 2129 : set_minmax_property(sql, e, PROP_MAX, lval_max);
315 : }
316 8808 : if (lval_min && rval_min) {
317 2523 : if (is_union(rel->op))
318 3 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
319 2520 : else if (is_inter(rel->op))
320 391 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, lval_min, rval_min)); /* for intersect the new min will be the max of the two */
321 : else /* except */
322 2129 : set_minmax_property(sql, e, PROP_MIN, lval_min);
323 : }
324 :
325 8808 : if (is_union(rel->op) || is_munion(rel->op)) {
326 5986 : if (!has_nil(le) && !has_nil(re))
327 879 : set_has_no_nil(e);
328 5986 : if (need_distinct(rel) && list_length(rel->exps) == 1)
329 0 : set_unique(e);
330 2822 : } else if (is_inter(rel->op)) {
331 555 : if (!has_nil(le) || !has_nil(re))
332 501 : set_has_no_nil(e);
333 555 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
334 342 : set_unique(e);
335 : } else {
336 2267 : assert(is_except(rel->op));
337 2267 : if (!has_nil(le))
338 2208 : set_has_no_nil(e);
339 2267 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
340 2006 : set_unique(e);
341 : }
342 : /* propagate unique estimation for known cases */
343 8808 : if (!is_union(rel->op) && (est = find_prop(le->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES)) {
344 205 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
345 205 : p->value.dval = est->value.dval;
346 : }
347 : return false;
348 : }
349 :
350 :
351 : static void
352 60732 : rel_munion_get_statistics(mvc *sql, sql_rel *rel, list *rels, sql_exp *e, int i)
353 : {
354 60732 : assert(is_munion(rel->op));
355 :
356 60732 : sql_rel *l = rels->h->data;
357 60732 : sql_exp *le = list_fetch(l->exps, i);
358 60732 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX);
359 60732 : bool has_nonil = !has_nil(le);
360 :
361 175853 : for(node *n = rels->h->next; n; n = n->next) {
362 115121 : sql_rel *r = n->data;
363 115121 : sql_exp *re = list_fetch(r->exps, i);
364 115121 : atom *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
365 :
366 115121 : if (lval_max && rval_max) {
367 42431 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
368 42431 : lval_max = find_prop_and_get(e->p, PROP_MAX);
369 : }
370 115121 : if (lval_min && rval_min) {
371 41906 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
372 41906 : lval_min = find_prop_and_get(e->p, PROP_MIN);
373 : }
374 115121 : has_nonil &= !has_nil(re);
375 :
376 : }
377 :
378 60732 : if (has_nonil)
379 40886 : set_has_no_nil(e);
380 :
381 60732 : if (need_distinct(rel) && list_length(rel->exps) == 1)
382 1593 : set_unique(e);
383 60732 : }
384 :
385 :
386 : static sql_exp *
387 3455045 : rel_propagate_statistics(visitor *v, sql_rel *rel, sql_exp *e, int depth)
388 : {
389 3455045 : mvc *sql = v->sql;
390 3455045 : atom *lval;
391 :
392 3455045 : (void) depth;
393 3455045 : switch(e->type) {
394 2165905 : case e_column:
395 2165905 : switch (rel->op) { /* set relations don't call rel_propagate_statistics */
396 276265 : case op_join:
397 : case op_left:
398 : case op_right:
399 : case op_full:
400 : case op_semi:
401 : case op_anti: {
402 276265 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e);
403 276265 : if (!found)
404 138575 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
405 : break;
406 : }
407 1889640 : case op_select:
408 : case op_project:
409 : case op_groupby: {
410 1889640 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e); /* labels may be found on the same projection, ugh */
411 1889637 : if (!found && is_simple_project(rel->op))
412 123259 : (void) rel_propagate_column_ref_statistics(sql, rel, e);
413 : break;
414 : }
415 0 : case op_insert:
416 : case op_update:
417 : case op_delete:
418 0 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
419 0 : break;
420 : default:
421 : break;
422 : }
423 : break;
424 97869 : case e_convert: {
425 97869 : sql_subtype *to = exp_totype(e), *from = exp_fromtype(e);
426 97869 : sql_exp *l = e->l;
427 97869 : sql_class fr = from->type->eclass, too = to->type->eclass;
428 97869 : prop *est;
429 :
430 97869 : if (fr == too) {
431 88918 : if ((lval = find_prop_and_get(l->p, PROP_MAX))) {
432 58483 : atom *res = atom_copy(sql->sa, lval);
433 58483 : if ((res = atom_cast(sql->sa, res, to)))
434 58460 : set_minmax_property(sql, e, PROP_MAX, res);
435 : }
436 88918 : if ((lval = find_prop_and_get(l->p, PROP_MIN))) {
437 59098 : atom *res = atom_copy(sql->sa, lval);
438 59098 : if ((res = atom_cast(sql->sa, res, to)))
439 59075 : set_minmax_property(sql, e, PROP_MIN, res);
440 : }
441 : }
442 : /* propagating nuniques through conversions is tricky. I am adding just the general cases */
443 97870 : if ((est = find_prop(l->p, PROP_NUNIQUES)) &&
444 60683 : (((fr == EC_TIME || fr == EC_TIME_TZ || fr == EC_DATE) && (too == EC_TIMESTAMP || too == EC_TIMESTAMP_TZ)) ||
445 60658 : ((fr == EC_NUM || fr == EC_BIT) && too == EC_NUM) || (fr == EC_DEC && too == EC_DEC) || (fr != EC_EXTERNAL && too == EC_STRING))) {
446 57285 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
447 57285 : p->value.dval = est->value.dval;
448 : }
449 97869 : if (!has_nil(l))
450 55025 : set_has_no_nil(e);
451 : break;
452 : }
453 339577 : case e_aggr:
454 : case e_func: {
455 339577 : BUN lv;
456 339577 : sql_subfunc *f = e->f;
457 :
458 339577 : if (!f->func->s) {
459 313243 : int key = hash_key(f->func->base.name); /* Using hash lookup */
460 313243 : sql_hash_e *he = sql_functions_lookup->buckets[key&(sql_functions_lookup->size-1)];
461 313243 : lookup_function look = NULL;
462 :
463 684478 : for (; he && !look; he = he->chain) {
464 371235 : struct function_properties* fp = (struct function_properties*) he->value;
465 :
466 371235 : if (!strcmp(f->func->base.name, fp->name))
467 107207 : look = fp->func;
468 : }
469 313243 : if (look)
470 107208 : look(sql, e);
471 : }
472 : /* for global aggregates with no semantics, if the left relation has values, then the output won't be NULL */
473 339579 : if (!is_semantics(e) && e->l && !have_nil(e->l) &&
474 88211 : (e->type != e_aggr || (is_groupby(rel->op) && list_length(rel->r)) || ((lv = get_rel_count(rel->l)) != BUN_NONE && lv > 0)))
475 87885 : set_has_no_nil(e);
476 : /* set properties for global aggregates */
477 339579 : if (e->type == e_aggr && is_groupby(rel->op) && list_empty(rel->r)) {
478 7828 : if (!find_prop(e->p, PROP_NUNIQUES)) {
479 7828 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
480 7828 : p->value.dval = 1;
481 : }
482 7828 : set_unique(e);
483 : }
484 : break;
485 : }
486 568330 : case e_atom:
487 568330 : if (e->l) {
488 550751 : atom *a = (atom*) e->l;
489 550751 : if (!a->isnull) {
490 489138 : set_minmax_property(sql, e, PROP_MAX, a);
491 489137 : set_minmax_property(sql, e, PROP_MIN, a);
492 : }
493 550750 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
494 550750 : p->value.dval = 1;
495 17579 : } else if (e->f) {
496 4117 : list *vals = (list *) e->f;
497 4117 : sql_exp *first = vals->h ? vals->h->data : NULL;
498 4117 : atom *max = NULL, *min = NULL; /* all child values must have a valid min/max */
499 4117 : int has_nil = 0;
500 :
501 4117 : if (first) {
502 4117 : max = ((lval = find_prop_and_get(first->p, PROP_MAX))) ? lval : NULL;
503 4117 : min = ((lval = find_prop_and_get(first->p, PROP_MIN))) ? lval : NULL;
504 4117 : has_nil |= has_nil(first);
505 : }
506 :
507 17730 : for (node *n = vals->h ? vals->h->next : NULL ; n ; n = n->next) {
508 9496 : sql_exp *ee = n->data;
509 :
510 9496 : if (min && max) {
511 9005 : if ((lval = find_prop_and_get(ee->p, PROP_MAX))) {
512 8959 : max = atom_cmp(lval, max) > 0 ? lval : max;
513 : } else {
514 : max = NULL;
515 : }
516 9005 : if ((lval = find_prop_and_get(ee->p, PROP_MIN))) {
517 8959 : min = atom_cmp(min, lval) > 0 ? lval : min;
518 : } else {
519 : min = NULL;
520 : }
521 : }
522 9496 : has_nil |= has_nil(ee);
523 : }
524 :
525 4117 : if (!has_nil)
526 3750 : set_has_no_nil(e);
527 4117 : if (min && max) {
528 3708 : set_minmax_property(sql, e, PROP_MAX, max);
529 3708 : set_minmax_property(sql, e, PROP_MIN, min);
530 : }
531 4117 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
532 4117 : p->value.dval = (dbl) list_length(vals);
533 : } else {
534 13462 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
535 13462 : p->value.dval = 1;
536 : }
537 : break;
538 283364 : case e_cmp:
539 : /* TODO? propagating min/max/unique of booleans is not very worth it */
540 283364 : if (e->flag == cmp_or || e->flag == cmp_filter) {
541 17362 : if (!have_nil(e->l) && !have_nil(e->r))
542 12920 : set_has_no_nil(e);
543 266002 : } else if (e->flag == cmp_in || e->flag == cmp_notin) {
544 21693 : sql_exp *le = e->l;
545 21693 : if (!has_nil(le) && !have_nil(e->r))
546 18491 : set_has_no_nil(e);
547 : } else {
548 244309 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
549 244309 : if (!has_nil(le) && !has_nil(re) && (!e->f || !has_nil(fe)))
550 169713 : set_has_no_nil(e);
551 : }
552 : break;
553 : case e_psm:
554 : break;
555 : }
556 :
557 : #ifndef NDEBUG
558 : {
559 : /* min and max cannot be NULL and min must be <= than max, if it happens the inner relation must be empty! */
560 3455043 : atom *min = find_prop_and_get(e->p, PROP_MIN), *max = find_prop_and_get(e->p, PROP_MAX);
561 :
562 3455035 : (void) min;
563 3455035 : (void) max;
564 3455035 : assert(!min || !min->isnull);
565 3455035 : assert(!max || !max->isnull);
566 3455035 : assert(!min || !max || (min && max)/* atom_cmp(min, max) <= 0*/);
567 : }
568 : #endif
569 3455035 : return e;
570 : }
571 :
572 : static list * /* Remove predicates always false from min/max values */
573 137650 : rel_prune_predicates(visitor *v, sql_rel *rel)
574 : {
575 137650 : if (rel->l) {
576 137650 : sql_rel *l = rel->l;
577 137650 : if (is_single(l))
578 3259 : return rel->exps;
579 : }
580 134391 : if (!list_empty(rel->attr))
581 2868 : return rel->exps;
582 279067 : for (node *n = rel->exps->h ; n ; n = n->next) {
583 147544 : sql_exp *e = n->data;
584 :
585 147544 : if (e->type == e_cmp && is_theta_exp(e->flag)) {
586 121455 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
587 121455 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
588 121455 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
589 121455 : bool always_false = false, always_true = false;
590 :
591 121455 : if (fe && !is_symmetric(e)) {
592 3058 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
593 3058 : comp_type lower = range2lcompare(e->flag), higher = range2rcompare(e->flag);
594 3660 : int not_int1 = rval_min && lval_max && /* the middle and left intervals don't overlap */
595 1663 : (!is_anti(e) && (lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0)),
596 4072 : not_int2 = lval_min && fval_max && /* the middle and right intervals don't overlap */
597 2485 : (!is_anti(e) && (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0)),
598 3058 : not_int3 = rval_min && fval_max && /* the left interval is after the right one */
599 1287 : (!is_anti(e) && (atom_cmp(rval_min, fval_max) > 0));
600 :
601 3058 : always_false |= not_int1 || not_int2 || not_int3;
602 : /* for anti the middle must be before the left or after the right or the right after the left, for the other the middle must be always between the left and right intervals */
603 4113 : always_true |= !has_nil(le) && !has_nil(re) && !has_nil(fe) &&
604 3957 : lval_min && lval_max && rval_min && rval_max && fval_min && fval_max &&
605 575 : (is_anti(e) ? ((lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0) || (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0) || atom_cmp(rval_min, fval_max) > 0) :
606 488 : ((lower == cmp_gte ? atom_cmp(lval_min, rval_max) >= 0 : atom_cmp(lval_min, rval_max) > 0) && (higher == cmp_lte ? atom_cmp(fval_min, lval_max) >= 0 : atom_cmp(fval_min, lval_max) > 0)));
607 : } else if (!fe) {
608 118379 : if (!is_semantics(e)) /* trivial not null cmp null case */
609 225476 : always_false |= !is_anti(e) && ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re)));
610 118379 : switch (e->flag) {
611 102930 : case cmp_equal:
612 102930 : if (lval_min && lval_max && rval_min && rval_max && (!is_semantics(e) || !has_nil(le) || !has_nil(re)))
613 131024 : always_false |= (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
614 102930 : if (is_semantics(e)) { /* prune *= NULL cases */
615 5612 : always_false |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
616 11224 : always_true |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
617 : }
618 : break;
619 7210 : case cmp_notequal:
620 7210 : if (lval_min && lval_max && rval_min && rval_max)
621 11318 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
622 7210 : if (is_semantics(e)) {
623 29 : always_true |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
624 58 : always_false |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
625 : }
626 : break;
627 5598 : case cmp_gt:
628 5598 : if (lval_max && rval_min)
629 1916 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
630 5598 : if (lval_min && rval_max)
631 3832 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
632 : break;
633 119 : case cmp_gte:
634 119 : if (lval_max && rval_min)
635 49 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
636 119 : if (lval_min && rval_max)
637 98 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
638 : break;
639 2440 : case cmp_lt:
640 2440 : if (lval_min && rval_max)
641 1382 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
642 2440 : if (lval_max && rval_min)
643 2812 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
644 : break;
645 82 : case cmp_lte:
646 82 : if (lval_min && rval_max)
647 10 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
648 82 : if (lval_max && rval_min)
649 20 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
650 : break;
651 : default: /* Maybe later I can do cmp_in and cmp_notin but don't forget to remove is_theta_exp check up there */
652 : break;
653 : }
654 : }
655 121455 : assert(!always_false || !always_true);
656 121455 : if (always_false || always_true) {
657 3534 : sql_exp *ne = exp_atom_bool(v->sql->sa, always_true /* if always_true set then true else false */);
658 3534 : if (exp_name(e))
659 0 : exp_prop_alias(v->sql->sa, ne, e);
660 3534 : n->data = ne;
661 3534 : v->changes++;
662 : }
663 : }
664 : }
665 131523 : return rel->exps;
666 : }
667 :
668 : static sql_rel *
669 14 : set_setop_side(visitor *v, sql_rel *rel, sql_rel *side)
670 : {
671 14 : if (side == rel->r)
672 0 : rel->r = NULL;
673 : else
674 14 : rel->l = NULL;
675 14 : if (!is_simple_project(side->op) || rel_is_ref(side) || !list_empty(side->r)) {
676 0 : side = rel_project(v->sql->sa, side, rel_projections(v->sql, side, NULL, 0, 1));
677 0 : set_count_prop(v->sql->sa, side, get_rel_count(side->l));
678 0 : side->exps = exps_exp_visitor_bottomup(v, side, side->exps, 0, &rel_propagate_statistics, false);
679 : }
680 35 : for (node *n = rel->exps->h, *m = side->exps->h ; n && m ; n = n->next, m = m->next) {
681 21 : sql_exp *e1 = n->data, *e2 = m->data;
682 21 : exp_prop_alias(v->sql->sa, e2, e1);
683 : }
684 14 : list_hash_clear(side->exps);
685 :
686 14 : if (need_distinct(rel))
687 0 : set_distinct(side);
688 14 : side->p = prop_copy(v->sql->sa, rel->p);
689 14 : rel_destroy(rel);
690 14 : return side;
691 : }
692 :
693 : static BUN
694 20042 : trivial_project_exp_card(sql_exp *e)
695 : {
696 20343 : if (e->type == e_convert)
697 301 : return trivial_project_exp_card(e->l);
698 20042 : return e->type == e_atom && e->f ? (BUN) list_length(e->f) : 1;
699 : }
700 :
701 : static BUN
702 23800 : rel_calc_nuniques(mvc *sql, sql_rel *l, list *exps)
703 : {
704 23800 : BUN lv = get_rel_count(l);
705 :
706 23800 : if (lv == 0)
707 : return 0;
708 21099 : if (!list_empty(exps)) {
709 21099 : BUN nuniques = 0;
710 : /* compute the highest number of unique values */
711 50072 : for (node *n = exps->h ; n && nuniques != BUN_NONE ; n = n->next) {
712 28972 : sql_exp *e = n->data;
713 28972 : sql_rel *bt = NULL;
714 28972 : prop *p = NULL;
715 28972 : BUN euniques = BUN_NONE;
716 28972 : atom *min, *max, *sub = NULL;
717 28972 : sql_subtype *tp = exp_subtype(e);
718 28972 : sql_class ec = tp ? tp->type->eclass : EC_STRING; /* if 'e' has no type (eg parameter), use a non-number type to fail condition */
719 :
720 28972 : if ((p = find_prop(e->p, PROP_NUNIQUES))) {
721 20449 : euniques = (BUN) p->value.dval;
722 8523 : } else if (e->type == e_column && e->nid && rel_find_exp_and_corresponding_rel(l, e, false, &bt, NULL) && bt && (p = find_prop(bt->p, PROP_COUNT))) {
723 6497 : euniques = (BUN) p->value.lval;
724 : }
725 : /* use min to max range to compute number of possible values in the domain for number types */
726 28972 : if ((EC_TEMP(ec)||ec==EC_NUM||ec==EC_MONTH||ec==EC_POS) &&
727 21773 : (min = find_prop_and_get(e->p, PROP_MIN)) && (max = find_prop_and_get(e->p, PROP_MAX))) {
728 : /* the range includes min and max, so the atom_inc call is needed */
729 : /* if 'euniques' has number of distinct values, compute min between both */
730 16976 : if ((sub = atom_sub(sql->sa, max, min)) && (sub = atom_inc(sql->sa, sub)) && (sub = atom_cast(sql->sa, sub, sql_bind_localtype("oid"))))
731 0 : euniques = MIN(euniques, (BUN) sub->data.val.oval);
732 : }
733 28973 : if (euniques != BUN_NONE)
734 26947 : nuniques = MAX(nuniques, euniques); /* the highest cardinality sets the estimation */
735 : else
736 : nuniques = BUN_NONE;
737 : }
738 21100 : if (nuniques != BUN_NONE)
739 : return nuniques;
740 : }
741 : return lv;
742 : }
743 :
744 : static sql_rel *
745 2061074 : rel_get_statistics_(visitor *v, sql_rel *rel)
746 : {
747 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
748 2061074 : uint8_t has_special_modify = *(uint8_t*) v->data;
749 2061074 : bool can_be_pruned = !has_special_modify && v->storage_based_opt;
750 :
751 : /* Don't look at the same relation twice */
752 2061074 : if (are_statistics_gathered(rel->used))
753 : return rel;
754 1322991 : rel->used |= statistics_gathered;
755 :
756 1322991 : switch (rel->op) {
757 310036 : case op_basetable: {
758 310036 : sql_table *t = (sql_table *) rel->l;
759 310036 : sqlstore *store = v->sql->session->tr->store;
760 :
761 310036 : if (!list_empty(rel->exps)) {
762 1241800 : for (node *n = rel->exps->h ; n ; n = n->next)
763 931585 : rel_basetable_column_get_statistics(v->sql, rel, n->data);
764 : }
765 : /* Set table row count. TODO? look for remote tables. Don't look at storage for declared tables, because it won't be cleaned */
766 310225 : if (isTable(t) && t->s && !isDeclaredTable(t)) /* count active rows only */
767 257022 : set_count_prop(v->sql->sa, rel, (BUN)store->storage_api.count_col(v->sql->session->tr, ol_first_node(t->columns)->data, 10));
768 : break;
769 : }
770 2779 : case op_union:
771 : case op_inter:
772 : case op_except: {
773 2779 : bool empty_cross = false;
774 2779 : int i = 0;
775 2779 : sql_rel *l = rel->l, *pl = rel->l, *r = rel->r, *pr = rel->r;
776 :
777 2779 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
778 0 : pl = pl->l;
779 2779 : while (is_sample(pr->op) || is_topn(pr->op))
780 0 : pr = pr->l;
781 : /* if it's not a projection, then project and propagate statistics */
782 2779 : if (!is_project(pl->op) && !is_base(pl->op)) {
783 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
784 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
785 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
786 : }
787 2779 : if (!is_project(pr->op) && !is_base(pr->op)) {
788 0 : pr = rel_project(v->sql->sa, pr, rel_projections(v->sql, pr, NULL, 0, 1));
789 0 : set_count_prop(v->sql->sa, pr, get_rel_count(pr->l));
790 0 : pr->exps = exps_exp_visitor_bottomup(v, pr, pr->exps, 0, &rel_propagate_statistics, false);
791 : }
792 :
793 11613 : for (node *n = rel->exps->h ; n ; n = n->next) {
794 8834 : empty_cross |= rel_setop_get_statistics(v->sql, rel, pl->exps, pr->exps, n->data, i);
795 8834 : i++;
796 : }
797 :
798 : /* propagate row count */
799 2779 : if (is_union(rel->op)) {
800 277 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
801 277 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
802 :
803 277 : if (lv == 0 && rv == 0) { /* both sides empty */
804 2 : if (can_be_pruned)
805 : empty_cross = true;
806 : else
807 2 : set_count_prop(v->sql->sa, rel, 0);
808 275 : } else if (can_be_pruned && lv == 0 && !rel_is_ref(rel)) { /* left side empty */
809 0 : rel = set_setop_side(v, rel, r);
810 0 : empty_cross = false; /* don't rewrite again */
811 275 : } else if (can_be_pruned && rv == 0 && !rel_is_ref(rel)) { /* right side empty */
812 0 : rel = set_setop_side(v, rel, l);
813 0 : empty_cross = false; /* don't rewrite again */
814 275 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
815 7 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
816 : }
817 2502 : } else if (is_inter(rel->op) || is_except(rel->op)) {
818 2502 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
819 2502 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
820 :
821 2502 : if (lv == 0) { /* left side empty */
822 62 : if (can_be_pruned)
823 : empty_cross = true;
824 : else
825 5 : set_count_prop(v->sql->sa, rel, 0);
826 2440 : } else if (rv == 0) { /* right side empty */
827 17 : if (is_inter(rel->op)) {
828 3 : if (can_be_pruned)
829 : empty_cross = true;
830 : else
831 0 : set_count_prop(v->sql->sa, rel, 0);
832 14 : } else if (can_be_pruned && !rel_is_ref(rel)) {
833 14 : rel = set_setop_side(v, rel, l);
834 14 : empty_cross = false; /* don't rewrite again */
835 : } else {
836 0 : set_count_prop(v->sql->sa, rel, lv);
837 : }
838 : } else {
839 2423 : set_count_prop(v->sql->sa, rel, lv);
840 : }
841 : }
842 2779 : if (can_be_pruned && empty_cross) {
843 80 : rel_destroy(rel->l);
844 80 : rel->l = NULL;
845 80 : rel_destroy(rel->r);
846 80 : rel->r = NULL;
847 244 : for (node *n = rel->exps->h ; n ; n = n->next) {
848 164 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
849 164 : exp_prop_alias(v->sql->sa, a, e);
850 164 : n->data = a;
851 : }
852 80 : list_hash_clear(rel->exps);
853 80 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
854 80 : set_count_prop(v->sql->sa, l, 1);
855 80 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
856 80 : set_count_prop(v->sql->sa, l, 0);
857 80 : rel->op = op_project;
858 80 : rel->l = l;
859 80 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
860 80 : set_count_prop(v->sql->sa, rel, 0);
861 80 : set_nodistinct(rel); /* set relations may have distinct flag set */
862 80 : v->changes++;
863 : }
864 : break;
865 : }
866 12838 : case op_munion: {
867 12838 : list *l = rel->l, *nrels = sa_list(v->sql->sa);
868 12838 : BUN cnt = 0;
869 12838 : bool needs_pruning = false;
870 :
871 49097 : for (node *n = l->h; n; n = n->next) {
872 36259 : sql_rel *r = n->data, *pl = r;
873 :
874 36259 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
875 0 : pl = pl->l;
876 : /* if it's not a projection, then project and propagate statistics */
877 36259 : if (!is_project(pl->op) && !is_base(pl->op)) {
878 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
879 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
880 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
881 : }
882 36259 : nrels = append(nrels, pl);
883 : /* we need new munion statistics */
884 : /* propagate row count */
885 36259 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
886 : /* if PROP_COUNT does not exist we assume at least a row (see get_rel_count def) */
887 36259 : if (rv == BUN_NONE) {
888 1152 : cnt++;
889 1152 : continue;
890 : }
891 35107 : if (!rv && can_be_pruned)
892 6625 : needs_pruning = true;
893 : /* overflow check */
894 35107 : if (rv > (BUN_MAX - cnt))
895 36259 : rv = BUN_MAX;
896 : else
897 35107 : cnt += rv;
898 : }
899 12838 : int i = 0;
900 73570 : for (node *n = rel->exps->h ; n ; n = n->next, i++)
901 60732 : rel_munion_get_statistics(v->sql, rel, nrels, n->data, i);
902 :
903 12838 : if (needs_pruning && !rel_is_ref(rel)) {
904 4464 : v->changes++;
905 4464 : list *nl = sa_list(l->sa);
906 :
907 16464 : for (node *n = nrels->h; n; n = n->next) {
908 12000 : sql_rel *r = n->data;
909 12000 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
910 :
911 12000 : if (!rv) { /* keep last for now */
912 6155 : rel_destroy(r);
913 6155 : continue;
914 : }
915 5845 : nl = append(nl, r);
916 : }
917 4464 : rel->l = nl;
918 4464 : if (list_length(nl) == 1) {
919 4139 : sql_rel *l = rel->l = nl->h->data; /* ugh */
920 4139 : rel->r = NULL;
921 4139 : rel->op = op_project;
922 :
923 20405 : for (node *n = rel->exps->h, *m = l->exps->h ; n && m ; n = n->next, m = m->next) {
924 16266 : sql_exp *pe = n->data, *ie = m->data;
925 16266 : sql_exp *ne = exp_ref(v->sql, ie);
926 16266 : exp_prop_alias(v->sql->sa, ne, pe);
927 16266 : n->data = ne;
928 : }
929 4139 : list_hash_clear(rel->exps);
930 325 : } else if (list_empty(nl)) {
931 : /* empty select (project [ nils ] ) */
932 454 : for (node *n = rel->exps->h ; n ; n = n->next) {
933 354 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
934 354 : exp_prop_alias(v->sql->sa, a, e);
935 354 : n->data = a;
936 : }
937 100 : list_hash_clear(rel->exps);
938 100 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
939 100 : set_count_prop(v->sql->sa, l, 1);
940 100 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
941 100 : set_count_prop(v->sql->sa, l, 0);
942 100 : rel->op = op_project;
943 100 : rel->r = NULL;
944 100 : rel->l = l;
945 100 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
946 100 : set_count_prop(v->sql->sa, rel, 0);
947 100 : set_nodistinct(rel); /* set relations may have distinct flag set */
948 : }
949 : } else {
950 8374 : set_count_prop(v->sql->sa, rel, cnt);
951 : }
952 : break;
953 : }
954 526077 : case op_join:
955 : case op_left:
956 : case op_right:
957 : case op_full:
958 : case op_semi:
959 : case op_anti:
960 : case op_select:
961 : case op_project:
962 : case op_groupby: {
963 526077 : if (is_groupby(rel->op) && !list_empty(rel->r))
964 15116 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
965 526076 : rel->exps = exps_exp_visitor_bottomup(v, rel, rel->exps, 0, &rel_propagate_statistics, false);
966 526067 : if ((is_simple_project(rel->op) || is_groupby(rel->op)) && !list_empty(rel->r))
967 38422 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
968 : /* The following optimizations can only be applied after propagating the statistics to rel->exps */
969 526068 : if (can_be_pruned && (is_join(rel->op) || is_select(rel->op)) && !list_empty(rel->exps)) {
970 137650 : int changes = v->changes;
971 137650 : rel->exps = rel_prune_predicates(v, rel);
972 137650 : if (v->changes > changes)
973 3501 : rel = rewrite_simplify(v, 0, v->value_based_opt, rel);
974 : }
975 :
976 : /* propagate row count */
977 526068 : sql_rel *l = rel->l, *r = rel->r;
978 526068 : switch (rel->op) {
979 132804 : case op_join:
980 : case op_left:
981 : case op_right:
982 : case op_full: {
983 132804 : BUN lv = get_rel_count(l), rv = get_rel_count(r), uniques_estimate = BUN_MAX, join_idx_estimate = BUN_MAX;
984 :
985 132804 : if (!list_empty(rel->exps) && !is_single(rel)) {
986 243710 : for (node *n = rel->exps->h ; n ; n = n->next) {
987 124374 : sql_exp *e = n->data, *el = e->l, *er = e->r;
988 :
989 124374 : if (find_prop(e->p, PROP_JOINIDX)) {
990 667 : join_idx_estimate = lv>rv?lv:rv;
991 123707 : } else if (e->type == e_cmp && e->flag == cmp_equal) {
992 : /* if one of the sides is unique, the cardinality will be that exact number, but look for nulls */
993 119886 : if (!is_semantics(e) || !has_nil(el) || !has_nil(er)) {
994 119557 : BUN lu = 0, ru = 0;
995 119557 : prop *p = NULL;
996 119557 : if ((p = find_prop(el->p, PROP_NUNIQUES)))
997 88577 : lu = (BUN) p->value.dval;
998 119557 : if ((p = find_prop(er->p, PROP_NUNIQUES)))
999 102982 : ru = (BUN) p->value.dval;
1000 119557 : if (is_unique(el) || lu > lv) {
1001 72871 : BUN ncount = (is_right(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : lv;
1002 72871 : uniques_estimate = MIN(uniques_estimate, ncount);
1003 46686 : } else if (is_unique(er) || ru > rv) {
1004 29181 : BUN ncount = (is_left(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : rv;
1005 29181 : uniques_estimate = MIN(uniques_estimate, ncount);
1006 : }
1007 : }
1008 : }
1009 : }
1010 : }
1011 132804 : if (is_single(rel)) {
1012 4510 : set_count_prop(v->sql->sa, rel, lv);
1013 128294 : } else if (join_idx_estimate != BUN_MAX) {
1014 665 : set_count_prop(v->sql->sa, rel, join_idx_estimate);
1015 127629 : } else if (uniques_estimate != BUN_MAX) {
1016 95691 : set_count_prop(v->sql->sa, rel, uniques_estimate);
1017 31938 : } else if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1018 : /* corner cases for outer joins */
1019 124 : if (is_left(rel->op)) {
1020 112 : set_count_prop(v->sql->sa, rel, lv);
1021 12 : } else if (is_right(rel->op)) {
1022 3 : set_count_prop(v->sql->sa, rel, rv);
1023 9 : } else if (is_full(rel->op) && lv != BUN_NONE && rv != BUN_NONE) {
1024 9 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
1025 0 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1026 0 : set_count_prop(v->sql->sa, rel, 0);
1027 : }
1028 31814 : } else if (lv == 0) {
1029 1164 : set_count_prop(v->sql->sa, rel, (is_right(rel->op) || is_full(rel->op)) ? rv : 0);
1030 31218 : } else if (rv == 0) {
1031 1556 : set_count_prop(v->sql->sa, rel, (is_left(rel->op) || is_full(rel->op)) ? lv : 0);
1032 30156 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1033 17758 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX / lv)) ? BUN_MAX : (lv * rv)); /* overflow check */
1034 : }
1035 : break;
1036 : }
1037 2618 : case op_anti:
1038 2618 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1039 2618 : break;
1040 107905 : case op_semi:
1041 : case op_select:
1042 : /* TODO calculate cardinalities using selectivities */
1043 107905 : if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1044 2560 : set_count_prop(v->sql->sa, rel, 0);
1045 : } else {
1046 207466 : if (!list_empty(rel->exps) && !is_single(rel)) {
1047 102121 : BUN cnt = get_rel_count(l), u = 1;
1048 170557 : for (node *n = rel->exps->h ; n ; n = n->next) {
1049 111959 : sql_exp *e = n->data, *el = e->l, *er = e->r;
1050 :
1051 : /* simple expressions first */
1052 111959 : if (e->type == e_cmp && e->flag == cmp_equal && exp_is_atom(er)) {
1053 : /* use selectivity */
1054 56649 : prop *p;
1055 56649 : if ((p = find_prop(el->p, PROP_NUNIQUES))) {
1056 43523 : u = (BUN) p->value.dval;
1057 43523 : break;
1058 : }
1059 : }
1060 : }
1061 : /* u is an *estimate*, so don't set count_prop to 0 unless cnt is 0 */
1062 102121 : set_count_prop(v->sql->sa, rel, cnt == 0 ? 0 : u == 0 || u > cnt ? 1 : cnt/u);
1063 : } else {
1064 3224 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1065 : }
1066 : }
1067 : break;
1068 259922 : case op_project:
1069 259922 : if (l) {
1070 250284 : if (need_distinct(rel)) {
1071 0 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->exps));
1072 : } else {
1073 250284 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1074 : }
1075 : } else {
1076 9638 : BUN card = 1;
1077 :
1078 9638 : if (!list_empty(rel->exps)) {
1079 28379 : for (node *n = rel->exps->h ; n ; n = n->next) {
1080 18741 : sql_exp *e = n->data;
1081 18741 : card = MAX(card, trivial_project_exp_card(e));
1082 : }
1083 : }
1084 9638 : set_count_prop(v->sql->sa, rel, card);
1085 : }
1086 : break;
1087 22454 : case op_groupby:
1088 22454 : if (list_empty(rel->r)) {
1089 7338 : set_count_prop(v->sql->sa, rel, 1);
1090 : } else {
1091 15116 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->r));
1092 : }
1093 : break;
1094 : default:
1095 : break;
1096 : }
1097 : break;
1098 : }
1099 16744 : case op_topn: {
1100 16744 : BUN lv = get_rel_count(rel->l);
1101 :
1102 16744 : if (lv != BUN_NONE) {
1103 16727 : sql_exp *le = rel->exps->h->data, *oe = list_length(rel->exps) > 1 ? rel->exps->h->next->data : NULL;
1104 95 : if (oe && oe->l && exp_is_not_null(oe)) { /* no parameters */
1105 95 : BUN offset = (BUN) ((atom*)oe->l)->data.val.lval;
1106 95 : lv = offset >= lv ? 0 : lv - offset;
1107 : }
1108 16727 : if (le->l && exp_is_not_null(le)) {
1109 16689 : BUN limit = (BUN) ((atom*)le->l)->data.val.lval;
1110 16689 : lv = MIN(lv, limit);
1111 : }
1112 16727 : set_count_prop(v->sql->sa, rel, lv);
1113 : }
1114 : break;
1115 : }
1116 21 : case op_sample: {
1117 21 : BUN lv = get_rel_count(rel->l);
1118 :
1119 21 : if (lv != BUN_NONE) {
1120 3 : sql_exp *se = rel->exps->h->data;
1121 3 : sql_subtype *tp = exp_subtype(se);
1122 :
1123 3 : if (se->l && tp->type->eclass == EC_NUM) { /* sample is a number of rows */
1124 3 : BUN sample = (BUN) ((atom*)se->l)->data.val.lval;
1125 3 : lv = MIN(lv, sample);
1126 0 : } else if (se->l) { /* sample is a percentage of rows */
1127 0 : dbl percent = ((atom*)se->l)->data.val.dval;
1128 0 : assert(tp->type->eclass == EC_FLT);
1129 0 : lv = (BUN) ceil((dbl)lv * percent);
1130 : }
1131 3 : set_count_prop(v->sql->sa, rel, lv);
1132 : }
1133 : break;
1134 : }
1135 6050 : case op_table: {
1136 6050 : sql_exp *op = rel->r;
1137 6050 : if (rel->flag != TRIGGER_WRAPPER && op) {
1138 5738 : sql_subfunc *f = op->f;
1139 5738 : if (f->func->lang == FUNC_LANG_SQL) {
1140 78 : set_count_prop(v->sql->sa, rel, 1000 /* just some fallback value */);
1141 5660 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "storage") == 0) {
1142 827 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of catalog */);
1143 4833 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "db_users") == 0) {
1144 0 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of users */);
1145 4833 : } else if (f->func->lang == FUNC_LANG_MAL && strncmp(f->func->base.name, "querylog", 8) == 0) {
1146 1081 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of querylog */);
1147 3752 : } else if (f->func->lang == FUNC_LANG_MAL &&
1148 3642 : (strcmp(f->func->base.name, "queue") == 0 ||
1149 3379 : strcmp(f->func->base.name, "optimizers") == 0 ||
1150 3068 : strcmp(f->func->base.name, "env") == 0 ||
1151 2777 : strcmp(f->func->base.name, "keywords") == 0 ||
1152 2777 : strcmp(f->func->base.name, "statistics") == 0 ||
1153 2112 : strcmp(f->func->base.name, "rejects") == 0 ||
1154 1861 : strcmp(f->func->base.name, "schemastorage") == 0 ||
1155 1861 : strncmp(f->func->base.name, "storage", 7) == 0 ||
1156 1861 : strcmp(f->func->base.name, "sessions") == 0) ) {
1157 2070 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of queue */);
1158 : }
1159 : /* else {
1160 : printf("%%func needs stats : %s\n", f->func->base.name);
1161 : } */
1162 : }
1163 : break;
1164 : }
1165 : /*These relations are less important for now
1166 : TODO later we can tune it */
1167 : case op_insert:
1168 : case op_update:
1169 : case op_delete:
1170 : case op_truncate:
1171 : case op_ddl:
1172 : default:
1173 : break;
1174 : }
1175 :
1176 : return rel;
1177 : }
1178 :
1179 : static sql_rel *
1180 538896 : rel_get_statistics(visitor *v, global_props *gp, sql_rel *rel)
1181 : {
1182 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
1183 538896 : uint8_t has_special_modify = (uint8_t) gp->has_special_modify;
1184 538896 : v->data = &has_special_modify;
1185 538896 : rel = rel_visitor_bottomup(v, rel, &rel_get_statistics_);
1186 539387 : v->data = gp;
1187 539387 : return rel;
1188 : }
1189 :
1190 : run_optimizer
1191 714728 : bind_get_statistics(visitor *v, global_props *gp)
1192 : {
1193 714728 : (void) v;
1194 714728 : return (gp->opt_level == 1 && !gp->cnt[op_insert]) ? rel_get_statistics : NULL;
1195 : }
1196 :
1197 :
1198 : static bool
1199 94163 : point_select_on_unique_column(sql_rel *rel)
1200 : {
1201 94163 : if (is_select(rel->op) && !list_empty(rel->exps)) {
1202 130326 : for (node *n = rel->exps->h; n ; n = n->next) {
1203 75084 : sql_exp *e = n->data, *el = e->l, *er = e->r, *found = NULL;
1204 :
1205 75084 : if (is_compare(e->type) && e->flag == cmp_equal) {
1206 33230 : if (is_numeric_upcast(el))
1207 0 : el = el->l;
1208 33230 : if (is_numeric_upcast(er))
1209 0 : er = er->l;
1210 33230 : if (is_alias(el->type) && exp_is_atom(er) && (found = rel_find_exp(rel->l, el)) &&
1211 746 : is_unique(found) && (!is_semantics(e) || !has_nil(found) || !has_nil(er)))
1212 : return true;
1213 32484 : if (is_alias(er->type) && exp_is_atom(el) && (found = rel_find_exp(rel->l, er)) &&
1214 0 : is_unique(found) && (!is_semantics(e) || !has_nil(el) || !has_nil(found)))
1215 : return true;
1216 : }
1217 : }
1218 : }
1219 : return false;
1220 : }
1221 :
1222 : /*
1223 : * A point select on an unique column reduces the number of rows to 1. If the same select is under a
1224 : * join, the opposite side's select can be pushed above the join.
1225 : */
1226 : static inline sql_rel *
1227 1234619 : rel_push_select_up(visitor *v, sql_rel *rel)
1228 : {
1229 1234619 : if ((is_innerjoin(rel->op) || is_left(rel->op) || is_right(rel->op) || is_semi(rel->op)) && !is_single(rel)) {
1230 252069 : sql_rel *l = rel->l, *r = rel->r;
1231 252069 : bool can_pushup_left = is_select(l->op) && !rel_is_ref(l) && !is_single(l) && (is_innerjoin(rel->op) || is_left(rel->op) || is_semi(rel->op)),
1232 252069 : can_pushup_right = is_select(r->op) && !rel_is_ref(r) && !is_single(r) && (is_innerjoin(rel->op) || is_right(rel->op));
1233 :
1234 252069 : if (can_pushup_left || can_pushup_right) {
1235 66405 : if (can_pushup_left)
1236 45087 : can_pushup_left = point_select_on_unique_column(r);
1237 66405 : if (can_pushup_right)
1238 49076 : can_pushup_right = point_select_on_unique_column(l);
1239 :
1240 : /* if both selects retrieve one row each, it's not worth it to push both up */
1241 66405 : if (can_pushup_left && !can_pushup_right) {
1242 680 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1243 680 : nrel->l = l->l;
1244 680 : rel = rel_inplace_select(rel, nrel, l->exps);
1245 680 : assert(is_select(rel->op));
1246 680 : v->changes++;
1247 65725 : } else if (!can_pushup_left && can_pushup_right) {
1248 14 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1249 14 : nrel->r = r->l;
1250 14 : rel = rel_inplace_select(rel, nrel, r->exps);
1251 14 : assert(is_select(rel->op));
1252 14 : v->changes++;
1253 : }
1254 : }
1255 : }
1256 1234619 : return rel;
1257 : }
1258 :
1259 : static int
1260 92654 : sql_class_base_score(visitor *v, sql_column *c, sql_subtype *t, bool equality_based)
1261 : {
1262 92654 : int de;
1263 :
1264 92654 : if (!t)
1265 : return 0;
1266 92654 : switch (ATOMstorage(t->type->localtype)) {
1267 : case TYPE_bte:
1268 : return 150 - 8;
1269 1671 : case TYPE_sht:
1270 1671 : return 150 - 16;
1271 36762 : case TYPE_int:
1272 36762 : return 150 - 32;
1273 515 : case TYPE_void:
1274 : case TYPE_lng:
1275 515 : return 150 - 64;
1276 : case TYPE_uuid:
1277 : #ifdef HAVE_HGE
1278 : case TYPE_hge:
1279 : #endif
1280 : return 150 - 128;
1281 2 : case TYPE_flt:
1282 2 : return 75 - 24;
1283 : case TYPE_dbl:
1284 : return 75 - 53;
1285 30466 : default:
1286 30466 : if (equality_based && c && v->storage_based_opt && (de = mvc_is_duplicate_eliminated(v->sql, c)))
1287 1550 : return 150 - de * 8;
1288 : /* strings and blobs not duplicate eliminated don't get any points here */
1289 : return 0;
1290 : }
1291 : }
1292 :
1293 : static int
1294 34432 : score_se_base(visitor *v, sql_rel *rel, sql_exp *e)
1295 : {
1296 34432 : int res = 0;
1297 34432 : sql_subtype *t = exp_subtype(e);
1298 34432 : sql_column *c = NULL;
1299 :
1300 34432 : if (e->type == e_convert) /* keep unsafes at the end (TODO improve) */
1301 : return -1000;
1302 : /* can we find out if the underlying table is sorted */
1303 33831 : if ((c = exp_find_column(rel, e, -2)) && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1304 33831 : res += 600;
1305 :
1306 : /* prefer the shorter var types over the longer ones */
1307 33831 : res += sql_class_base_score(v, c, t, is_equality_or_inequality_exp(e->flag)); /* smaller the type, better */
1308 33831 : return res;
1309 : }
1310 :
1311 : static int
1312 58345 : score_se(visitor *v, sql_rel *rel, sql_exp *e)
1313 : {
1314 58345 : int score = 0;
1315 58345 : if (e->type == e_cmp && !is_complex_exp(e->flag)) {
1316 34432 : sql_exp *l = e->l;
1317 :
1318 34432 : while (l->type == e_cmp) { /* go through nested comparisons */
1319 2 : sql_exp *ll;
1320 :
1321 2 : if (l->flag == cmp_filter || l->flag == cmp_or)
1322 0 : ll = ((list*)l->l)->h->data;
1323 : else
1324 2 : ll = l->l;
1325 2 : if (ll->type != e_cmp)
1326 : break;
1327 : l = ll;
1328 : }
1329 34432 : score += score_se_base(v, rel, l);
1330 : }
1331 58345 : score += exp_keyvalue(e);
1332 58345 : return score;
1333 : }
1334 :
1335 : static inline sql_rel *
1336 1234620 : rel_select_order(visitor *v, sql_rel *rel)
1337 : {
1338 1234620 : int *scores = NULL;
1339 1234620 : sql_exp **exps = NULL;
1340 :
1341 1234620 : if (is_select(rel->op) && list_length(rel->exps) > 1) {
1342 27292 : node *n;
1343 27292 : int i, nexps = list_length(rel->exps);
1344 27292 : scores = SA_NEW_ARRAY(v->sql->ta, int, nexps);
1345 27292 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, nexps);
1346 :
1347 85637 : for (i = 0, n = rel->exps->h; n; i++, n = n->next) {
1348 58375 : exps[i] = n->data;
1349 58375 : if (find_prop(exps[i]->p, PROP_HASHCOL))
1350 : return rel;
1351 58345 : scores[i] = score_se(v, rel, n->data);
1352 : }
1353 27262 : GDKqsort(scores, exps, NULL, nexps, sizeof(int), sizeof(void *), TYPE_int, true, true);
1354 :
1355 85607 : for (i = 0, n = rel->exps->h; n; i++, n = n->next)
1356 58345 : n->data = exps[i];
1357 : }
1358 :
1359 : return rel;
1360 : }
1361 :
1362 : /* Compute the efficiency of using this expression early in a group by list */
1363 : static int
1364 58823 : score_gbe(visitor *v, sql_rel *rel, sql_exp *e)
1365 : {
1366 58823 : int res = 0;
1367 58823 : sql_subtype *t = exp_subtype(e);
1368 58823 : sql_column *c = exp_find_column(rel, e, -2);
1369 :
1370 58823 : if (e->card == CARD_ATOM) /* constants are trivial to group */
1371 38 : res += 1000;
1372 : /* can we find out if the underlying table is sorted */
1373 58823 : if (is_unique(e) || find_prop(e->p, PROP_HASHCOL) || (c && v->storage_based_opt && mvc_is_unique(v->sql, c))) /* distinct columns */
1374 3216 : res += 700;
1375 38412 : if (c && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1376 3357 : res += 500;
1377 58823 : if (find_prop(e->p, PROP_HASHIDX)) /* has hash index */
1378 0 : res += 200;
1379 :
1380 : /* prefer the shorter var types over the longer ones */
1381 58823 : res += sql_class_base_score(v, c, t, true); /* smaller the type, better */
1382 58823 : return res;
1383 : }
1384 :
1385 : /* reorder group by expressions */
1386 : static inline sql_rel *
1387 1234620 : rel_groupby_order(visitor *v, sql_rel *rel)
1388 : {
1389 1234620 : int *scores = NULL;
1390 1234620 : sql_exp **exps = NULL;
1391 :
1392 1234620 : if (is_groupby(rel->op) && list_length(rel->r) > 1) {
1393 25321 : node *n;
1394 25321 : list *gbe = rel->r;
1395 25321 : int i, ngbe = list_length(gbe);
1396 25321 : scores = SA_NEW_ARRAY(v->sql->ta, int, ngbe);
1397 25321 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, ngbe);
1398 :
1399 : /* first sorting step, give priority for integers and sorted columns */
1400 84144 : for (i = 0, n = gbe->h; n; i++, n = n->next) {
1401 58823 : exps[i] = n->data;
1402 58823 : scores[i] = score_gbe(v, rel, exps[i]);
1403 : }
1404 25321 : GDKqsort(scores, exps, NULL, ngbe, sizeof(int), sizeof(void *), TYPE_int, true, true);
1405 :
1406 : /* second sorting step, give priority to strings with lower number of digits */
1407 50131 : for (i = ngbe - 1; i && !scores[i]; i--); /* find expressions with no score from the first round */
1408 25321 : if (scores[i])
1409 24327 : i++;
1410 25321 : if (ngbe - i > 1) {
1411 8569 : for (int j = i; j < ngbe; j++) {
1412 6570 : sql_subtype *t = exp_subtype(exps[j]);
1413 6570 : scores[j] = t ? t->digits : 0;
1414 : }
1415 : /* the less number of digits the better, order ascending */
1416 1999 : GDKqsort(scores + i, exps + i, NULL, ngbe - i, sizeof(int), sizeof(void *), TYPE_int, false, true);
1417 : }
1418 :
1419 84144 : for (i = 0, n = gbe->h; n; i++, n = n->next)
1420 58823 : n->data = exps[i];
1421 : }
1422 :
1423 1234620 : return rel;
1424 : }
1425 :
1426 : /* This optimization loop contains optimizations that can potentially use statistics */
1427 : static sql_rel *
1428 1234619 : rel_final_optimization_loop_(visitor *v, sql_rel *rel)
1429 : {
1430 : /* Run rel_push_select_up only once at the end to avoid an infinite optimization loop */
1431 1234619 : rel = rel_push_select_up(v, rel);
1432 1234620 : rel = rel_select_order(v, rel);
1433 :
1434 : /* TODO? Maybe later add rel_simplify_count, rel_join2semijoin, rel_simplify_fk_joins,
1435 : rel_distinct_project2groupby, rel_simplify_predicates, rel_simplify_math,
1436 : rel_distinct_aggregate_on_unique_values */
1437 :
1438 1234620 : rel = rel_groupby_order(v, rel);
1439 1234619 : return rel;
1440 : }
1441 :
1442 : static sql_rel *
1443 65446 : rel_final_optimization_loop(visitor *v, global_props *gp, sql_rel *rel)
1444 : {
1445 65446 : (void) gp;
1446 65446 : return rel_visitor_bottomup(v, rel, &rel_final_optimization_loop_);
1447 : }
1448 :
1449 : run_optimizer
1450 715333 : bind_final_optimization_loop(visitor *v, global_props *gp)
1451 : {
1452 715333 : int flag = v->sql->sql_optimizer;
1453 : /* At the moment, this optimizer has dependency on 3 flags */
1454 546863 : return gp->opt_level == 1 && (gp->cnt[op_groupby] || gp->cnt[op_select]) &&
1455 780781 : (flag & push_select_up) && (flag & optimize_select_and_joins_topdown) && (flag & optimize_projections) ? rel_final_optimization_loop : NULL;
1456 : }
|