Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "rel_optimizer_private.h"
15 : #include "rel_statistics.h"
16 : #include "rel_basetable.h"
17 : #include "rel_rewriter.h"
18 :
19 : static sql_exp *
20 3462283 : comparison_find_column(sql_exp *input, sql_exp *e)
21 : {
22 3462283 : switch (input->type) {
23 98263 : case e_convert: {
24 98263 : list *types = (list *)input->r;
25 98263 : sql_class from = ((sql_subtype*)types->h->data)->type->eclass, to = ((sql_subtype*)types->h->next->data)->type->eclass;
26 98263 : if (from == to)
27 189472 : return comparison_find_column(input->l, e) ? input : NULL;
28 : return NULL;
29 : }
30 3051104 : case e_column:
31 3051104 : return exp_match(e, input) ? input : NULL;
32 : default:
33 : return NULL;
34 : }
35 : }
36 :
37 : /* multi lo <= col <= hi, maybe still be false even if lo or hi are NULL, possibly similar for filter on multiple
38 : * columns */
39 : #define comp_single_column(c) (!c->f)
40 :
41 : static sql_exp *
42 8732090 : rel_propagate_column_ref_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
43 : {
44 8815172 : assert(e->type == e_column);
45 8815172 : if (rel) {
46 8815098 : switch(rel->op) {
47 4080719 : case op_left:
48 : case op_right:
49 : case op_full:
50 : case op_join:
51 : case op_select:
52 : case op_anti:
53 : case op_semi: {
54 4080719 : bool found_without_semantics = false, found_left = false, found_right = false, still_unique = false;
55 :
56 4080719 : if ((is_innerjoin(rel->op) || is_select(rel->op)) && list_length(rel->exps) == 1 && exp_is_false(rel->exps->h->data))
57 : return NULL; /* nothing will pass, skip */
58 :
59 : /* propagate from the bottom first */
60 4066605 : if (rel_propagate_column_ref_statistics(sql, rel->l, e))
61 : found_left = true;
62 2585583 : if (!found_left && is_join(rel->op) && rel_propagate_column_ref_statistics(sql, rel->r, e))
63 4066605 : found_right = true;
64 :
65 4066605 : if (!found_left && !found_right)
66 : return NULL;
67 1775396 : if (!list_empty(rel->exps) && rel->op != op_anti) { /* if there's an or, the MIN and MAX get difficult to propagate */
68 3493738 : for (node *n = rel->exps->h ; n ; n = n->next) {
69 1809115 : sql_exp *comp = n->data, *le = comp->l, *lne = NULL, *re = comp->r, *rne = NULL, *fe = comp->f, *fne = NULL;
70 :
71 1809115 : if (comp->type == e_cmp) {
72 1808113 : if (is_theta_exp(comp->flag) && ((lne = comparison_find_column(le, e)) || (rne = comparison_find_column(re, e)) || (fe && (fne = comparison_find_column(fe, e))))) {
73 125605 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
74 125605 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
75 :
76 : /* not semantics found or if explicitly filtering not null values from the column */
77 125605 : found_without_semantics |= (!is_semantics(comp) && comp_single_column(comp)) || (comp->flag == cmp_equal && lne && is_anti(comp) && exp_is_null(re));
78 125605 : still_unique |= comp->flag == cmp_equal && is_unique(le) && is_unique(re); /* unique if only equi-joins on unique columns are there */
79 125605 : if (is_full(rel->op) || (is_left(rel->op) && found_left) || (is_right(rel->op) && found_right)) /* on outer joins, min and max cannot be propagated on some cases */
80 19749 : continue;
81 : /* if (end2 >= start1 && start2 <= end1) then the 2 intervals are intersected */
82 105856 : if (fe && lval_min && lval_max) { /* range case, the middle expression must intersect the other two */
83 4622 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
84 4622 : int int1 = rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0,
85 4622 : int2 = fval_min && fval_max && atom_cmp(fval_max, lval_min) >= 0 && atom_cmp(fval_min, lval_max) <= 0,
86 4622 : symmetric = is_symmetric(comp);
87 :
88 4622 : if (is_anti(comp) || (!symmetric && fval_min && rval_max && atom_cmp(fval_min, rval_max) < 0)) /* for asymmetric case the re range must be after the fe range */
89 1817 : continue;
90 2805 : if (lne && int1 && int2) {
91 104 : if (symmetric) {
92 0 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
93 0 : atom *nmin = statistics_atom_min(sql, rval_min, fval_min), *nmax = statistics_atom_max(sql, rval_max, fval_max);
94 : /* max is min from le and (max from re and fe max) */
95 0 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, nmax, p2->value.pval) : nmax);
96 : /* min is max from le and (min from re and fe min) */
97 0 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, nmin, p1->value.pval) : nmin);
98 : } else {
99 104 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
100 : /* max is min from le and fe max */
101 104 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, fval_max, p2->value.pval) : fval_max);
102 : /* min is max from le and re min */
103 104 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, rval_min, p1->value.pval) : rval_min);
104 : }
105 2701 : } else if (rne) {
106 590 : if (symmetric && int1 && int2) { /* min is max from le and (min from re and fe min) */
107 0 : prop *p = find_prop(e->p, PROP_MIN);
108 0 : atom *nmin = p ? statistics_atom_min(sql, p->value.pval, fval_min) : fval_min;
109 0 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, nmin, lval_min));
110 590 : } else if (int1) { /* min is max from le and re min */
111 100 : prop *p = find_prop(e->p, PROP_MIN);
112 100 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
113 : }
114 2111 : } else if (fne) {
115 549 : if (symmetric && int1 && int2) { /* max is min from le and (max from re and fe max) */
116 0 : prop *p = find_prop(e->p, PROP_MAX);
117 0 : atom *nmax = p ? statistics_atom_max(sql, p->value.pval, rval_max) : rval_max;
118 0 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, nmax, lval_max) : nmax);
119 549 : } else if (int2) { /* max is min from le and fe max */
120 266 : prop *p = find_prop(e->p, PROP_MAX);
121 266 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
122 : }
123 : }
124 101234 : } else if (lval_min && lval_max && rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0) {
125 : /* both min and max must be set and the intervals must overlap */
126 43126 : switch (comp->flag) {
127 28790 : case cmp_equal: /* for equality reduce */
128 28790 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_max(sql, lval_max, rval_max) : statistics_atom_min(sql, lval_max, rval_max));
129 28790 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_min(sql, lval_min, rval_min) : statistics_atom_max(sql, lval_min, rval_min));
130 28790 : break;
131 4679 : case cmp_notequal: /* for inequality expand */
132 4679 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_min(sql, lval_max, rval_max) : statistics_atom_max(sql, lval_max, rval_max));
133 4679 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_max(sql, lval_min, rval_min) : statistics_atom_min(sql, lval_min, rval_min));
134 4679 : break;
135 5651 : case cmp_gt:
136 : case cmp_gte:
137 10364 : if (!is_anti(comp) && lne) { /* min is max from both min */
138 4713 : prop *p = find_prop(e->p, PROP_MIN);
139 4713 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, rval_min, p->value.pval) : rval_min);
140 938 : } else if (!is_anti(comp)) { /* max is min from both max */
141 938 : prop *p = find_prop(e->p, PROP_MAX);
142 938 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
143 : }
144 : break;
145 4006 : case cmp_lt:
146 : case cmp_lte:
147 7205 : if (!is_anti(comp) && lne) { /* max is min from both max */
148 3199 : prop *p = find_prop(e->p, PROP_MAX);
149 3199 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, rval_max, p->value.pval) : rval_max);
150 807 : } else if (!is_anti(comp)) { /* min is max from both min */
151 807 : prop *p = find_prop(e->p, PROP_MIN);
152 807 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
153 : }
154 : break;
155 : default: /* Maybe later I can do cmp_in and cmp_notin */
156 : break;
157 : }
158 : }
159 : }
160 : }
161 : }
162 : }
163 1775396 : if (is_full(rel->op) || (is_left(rel->op) && found_right) || (is_right(rel->op) && found_left))
164 37930 : set_has_nil(e);
165 1775396 : if (!is_outerjoin(rel->op) && found_without_semantics) /* at an outer join, null values pass */
166 94467 : set_has_no_nil(e);
167 1775396 : if (is_join(rel->op) && is_unique(e) && !still_unique)
168 118627 : set_not_unique(e);
169 1775396 : return e;
170 : }
171 4631950 : case op_table:
172 : case op_basetable:
173 : case op_union:
174 : case op_except:
175 : case op_inter:
176 : case op_munion:
177 : case op_project:
178 : case op_groupby: {
179 4631950 : sql_exp *found;
180 4631950 : atom *fval;
181 4631950 : prop *est;
182 4631950 : if ((found = rel_find_exp(rel, e))) {
183 2173278 : if (rel->op != op_table) { /* At the moment don't propagate statistics for table relations */
184 2128849 : if ((fval = find_prop_and_get(found->p, PROP_MAX)))
185 1130712 : set_minmax_property(sql, e, PROP_MAX, fval);
186 2128845 : if ((fval = find_prop_and_get(found->p, PROP_MIN)))
187 1138005 : set_minmax_property(sql, e, PROP_MIN, fval);
188 2128841 : if (!has_nil(found))
189 1351406 : set_has_no_nil(e);
190 2128841 : if (is_unique(found) || (need_distinct(rel) && list_length(rel->exps) == 1) ||
191 1718543 : (is_groupby(rel->op) && (list_empty(rel->r) || (list_length(rel->r) == 1 && exps_find_exp(rel->r, e)))))
192 421174 : set_unique(e);
193 : /* propagate unique estimation for known cases */
194 2128842 : if (is_groupby(rel->op) && list_empty(rel->r) && !find_prop(e->p, PROP_NUNIQUES)) { /* global aggregate case */
195 7769 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
196 7769 : p->value.dval = 1;
197 2121073 : } else if (((is_basetable(rel->op) || is_except(rel->op) || is_inter(rel->op) || is_simple_project(rel->op) ||
198 66649 : (is_groupby(rel->op) && exps_find_exp(rel->r, e))) &&
199 2063427 : (est = find_prop(found->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES))) {
200 188124 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
201 188124 : p->value.dval = est->value.dval;
202 : }
203 : }
204 2173273 : return e;
205 : }
206 : return NULL;
207 : }
208 83082 : case op_topn:
209 : case op_sample:
210 83082 : return rel_propagate_column_ref_statistics(sql, rel->l, e);
211 : default:
212 : break;
213 : }
214 : }
215 : return NULL;
216 : }
217 :
218 : static atom *
219 4440786 : atom_from_valptr( allocator *sa, sql_subtype *tpe, ValRecord *v)
220 : {
221 4440786 : atom *a = SA_NEW(sa, atom);
222 :
223 4440730 : assert(!VALisnil(v));
224 4440595 : *a = (atom) {.tpe = *tpe,};
225 4440595 : SA_VALcopy(sa, &a->data, v);
226 4440549 : return a;
227 : }
228 :
229 : void
230 4053783 : sql_column_get_statistics(mvc *sql, sql_column *c, sql_exp *e)
231 : {
232 4053783 : bool nonil = false, unique = false;
233 4053783 : double unique_est = 0.0;
234 4053783 : ValRecord min, max;
235 4053783 : int ok = mvc_col_stats(sql, c, &nonil, &unique, &unique_est, &min, &max);
236 :
237 4053924 : if (has_nil(e) && nonil)
238 2565399 : set_has_no_nil(e);
239 4053924 : if (!is_unique(e) && unique)
240 1098858 : set_unique(e);
241 4053924 : if (unique_est != 0.0) {
242 2817694 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
243 2817676 : p->value.dval = unique_est;
244 : }
245 4053906 : unsigned int digits = 0;
246 4053906 : sql_subtype *et = exp_subtype(e);
247 4053928 : if (et->type->eclass == EC_DEC || et->type->eclass == EC_NUM)
248 2637245 : digits = et->digits;
249 4053928 : if ((ok & 2) == 2) {
250 2217376 : if (!VALisnil(&max)) {
251 2217371 : prop *p = e->p = prop_create(sql->sa, PROP_MAX, e->p);
252 2217338 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &max);
253 2217287 : if (digits) {
254 1652189 : unsigned int nd = atom_digits(p->value.pval);
255 1652211 : if (nd < digits)
256 : digits = nd;
257 1652211 : if (!digits)
258 : digits = 1;
259 : }
260 : }
261 2217317 : VALclear(&max);
262 : }
263 4053847 : if ((ok & 1) == 1) {
264 2223686 : if (!VALisnil(&min)) {
265 2223672 : prop *p = e->p = prop_create(sql->sa, PROP_MIN, e->p);
266 2223658 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &min);
267 2223632 : if (digits) {
268 1659603 : unsigned int nd = atom_digits(p->value.pval);
269 1659601 : if (nd > digits) /* possibly set to low by max value */
270 : digits = nd;
271 : if (!digits)
272 : digits = 1;
273 : }
274 : }
275 2223626 : VALclear(&min);
276 : }
277 4053774 : if (digits)
278 2637091 : et->digits = digits;
279 4053774 : if (et->type->eclass == EC_DEC && et->digits <= et->scale)
280 216 : et->digits = et->scale + 1;
281 4053774 : }
282 :
283 : static void
284 934632 : rel_basetable_column_get_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
285 : {
286 934632 : if (e->p)
287 : return;
288 302290 : sql_column *c = NULL;
289 :
290 302290 : if ((c = rel_base_find_column(rel, e->nid))) {
291 204237 : sql_column_get_statistics(sql, c, e);
292 : }
293 : }
294 :
295 : static bool
296 8872 : rel_setop_get_statistics(mvc *sql, sql_rel *rel, list *lexps, list *rexps, sql_exp *e, int i)
297 : {
298 8872 : sql_exp *le = list_fetch(lexps, i), *re = list_fetch(rexps, i);
299 8872 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
300 8872 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
301 8872 : prop *est;
302 :
303 : /* for the intersection, if both expressions don't overlap, it can be pruned */
304 8872 : if (is_inter(rel->op) && !has_nil(le) && !has_nil(re) &&
305 511 : ((rval_max && lval_min && atom_cmp(rval_max, lval_min) < 0) || (rval_min && lval_max && atom_cmp(rval_min, lval_max) > 0)))
306 26 : return true;
307 :
308 8846 : if (lval_max && rval_max) {
309 2557 : if (is_union(rel->op))
310 3 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
311 2554 : else if (is_inter(rel->op))
312 399 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_min(sql, lval_max, rval_max)); /* for intersect the new max will be the min of the two */
313 : else /* except */
314 2155 : set_minmax_property(sql, e, PROP_MAX, lval_max);
315 : }
316 8846 : if (lval_min && rval_min) {
317 2557 : if (is_union(rel->op))
318 3 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
319 2554 : else if (is_inter(rel->op))
320 399 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, lval_min, rval_min)); /* for intersect the new min will be the max of the two */
321 : else /* except */
322 2155 : set_minmax_property(sql, e, PROP_MIN, lval_min);
323 : }
324 :
325 8846 : if (is_union(rel->op) || is_munion(rel->op)) {
326 5986 : if (!has_nil(le) && !has_nil(re))
327 879 : set_has_no_nil(e);
328 5986 : if (need_distinct(rel) && list_length(rel->exps) == 1)
329 0 : set_unique(e);
330 2860 : } else if (is_inter(rel->op)) {
331 564 : if (!has_nil(le) || !has_nil(re))
332 509 : set_has_no_nil(e);
333 564 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
334 345 : set_unique(e);
335 : } else {
336 2296 : assert(is_except(rel->op));
337 2296 : if (!has_nil(le))
338 2233 : set_has_no_nil(e);
339 2296 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
340 2009 : set_unique(e);
341 : }
342 : /* propagate unique estimation for known cases */
343 8846 : if (!is_union(rel->op) && (est = find_prop(le->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES)) {
344 205 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
345 205 : p->value.dval = est->value.dval;
346 : }
347 : return false;
348 : }
349 :
350 :
351 : static void
352 60906 : rel_munion_get_statistics(mvc *sql, sql_rel *rel, list *rels, sql_exp *e, int i)
353 : {
354 60906 : assert(is_munion(rel->op));
355 :
356 60906 : sql_rel *l = rels->h->data;
357 60906 : sql_exp *le = list_fetch(l->exps, i);
358 60906 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX);
359 60906 : bool has_nonil = !has_nil(le);
360 :
361 176339 : for(node *n = rels->h->next; n; n = n->next) {
362 115433 : sql_rel *r = n->data;
363 115433 : sql_exp *re = list_fetch(r->exps, i);
364 115433 : atom *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
365 :
366 115433 : if (lval_max && rval_max) {
367 42595 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
368 42595 : lval_max = find_prop_and_get(e->p, PROP_MAX);
369 : }
370 115433 : if (lval_min && rval_min) {
371 42097 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
372 42097 : lval_min = find_prop_and_get(e->p, PROP_MIN);
373 : }
374 115433 : has_nonil &= !has_nil(re);
375 :
376 : }
377 :
378 60906 : if (has_nonil)
379 40993 : set_has_no_nil(e);
380 :
381 60906 : if (need_distinct(rel) && list_length(rel->exps) == 1)
382 1594 : set_unique(e);
383 60906 : }
384 :
385 :
386 : static sql_exp *
387 3473333 : rel_propagate_statistics(visitor *v, sql_rel *rel, sql_exp *e, int depth)
388 : {
389 3473333 : mvc *sql = v->sql;
390 3473333 : atom *lval;
391 :
392 3473333 : (void) depth;
393 3473333 : switch(e->type) {
394 2178896 : case e_column:
395 2178896 : switch (rel->op) { /* set relations don't call rel_propagate_statistics */
396 277903 : case op_join:
397 : case op_left:
398 : case op_right:
399 : case op_full:
400 : case op_semi:
401 : case op_anti: {
402 277903 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e);
403 277903 : if (!found)
404 139396 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
405 : break;
406 : }
407 1900993 : case op_select:
408 : case op_project:
409 : case op_groupby: {
410 1900993 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e); /* labels may be found on the same projection, ugh */
411 1900988 : if (!found && is_simple_project(rel->op))
412 124031 : (void) rel_propagate_column_ref_statistics(sql, rel, e);
413 : break;
414 : }
415 0 : case op_insert:
416 : case op_update:
417 : case op_delete:
418 0 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
419 0 : break;
420 : default:
421 : break;
422 : }
423 : break;
424 98450 : case e_convert: {
425 98450 : sql_subtype *to = exp_totype(e), *from = exp_fromtype(e);
426 98450 : sql_exp *l = e->l;
427 98450 : sql_class fr = from->type->eclass, too = to->type->eclass;
428 98450 : prop *est;
429 :
430 98450 : if (fr == too) {
431 89427 : if ((lval = find_prop_and_get(l->p, PROP_MAX))) {
432 58820 : atom *res = atom_copy(sql->sa, lval);
433 58820 : if ((res = atom_cast(sql->sa, res, to)))
434 58797 : set_minmax_property(sql, e, PROP_MAX, res);
435 : }
436 89427 : if ((lval = find_prop_and_get(l->p, PROP_MIN))) {
437 59434 : atom *res = atom_copy(sql->sa, lval);
438 59434 : if ((res = atom_cast(sql->sa, res, to)))
439 59411 : set_minmax_property(sql, e, PROP_MIN, res);
440 : }
441 : }
442 : /* propagating nuniques through conversions is tricky. I am adding just the general cases */
443 98450 : if ((est = find_prop(l->p, PROP_NUNIQUES)) &&
444 61046 : (((fr == EC_TIME || fr == EC_TIME_TZ || fr == EC_DATE) && (too == EC_TIMESTAMP || too == EC_TIMESTAMP_TZ)) ||
445 61021 : ((fr == EC_NUM || fr == EC_BIT) && too == EC_NUM) || (fr == EC_DEC && too == EC_DEC) || (fr != EC_EXTERNAL && too == EC_STRING))) {
446 57625 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
447 57625 : p->value.dval = est->value.dval;
448 : }
449 98450 : if (!has_nil(l))
450 55382 : set_has_no_nil(e);
451 : break;
452 : }
453 341951 : case e_aggr:
454 : case e_func: {
455 341951 : BUN lv;
456 341951 : sql_subfunc *f = e->f;
457 :
458 341951 : if (!f->func->s) {
459 315514 : int key = hash_key(f->func->base.name); /* Using hash lookup */
460 315514 : sql_hash_e *he = sql_functions_lookup->buckets[key&(sql_functions_lookup->size-1)];
461 315514 : lookup_function look = NULL;
462 :
463 688994 : for (; he && !look; he = he->chain) {
464 373480 : struct function_properties* fp = (struct function_properties*) he->value;
465 :
466 373480 : if (!strcmp(f->func->base.name, fp->name))
467 107858 : look = fp->func;
468 : }
469 315514 : if (look)
470 107858 : look(sql, e);
471 : }
472 : /* for global aggregates with no semantics, if the left relation has values, then the output won't be NULL */
473 341951 : if (!is_semantics(e) && e->l && !have_nil(e->l) &&
474 88656 : (e->type != e_aggr || (is_groupby(rel->op) && list_length(rel->r)) || ((lv = get_rel_count(rel->l)) != BUN_NONE && lv > 0)))
475 88333 : set_has_no_nil(e);
476 : /* set properties for global aggregates */
477 341951 : if (e->type == e_aggr && is_groupby(rel->op) && list_empty(rel->r)) {
478 8091 : if (!find_prop(e->p, PROP_NUNIQUES)) {
479 8091 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
480 8091 : p->value.dval = 1;
481 : }
482 8091 : set_unique(e);
483 : }
484 : break;
485 : }
486 567982 : case e_atom:
487 567982 : if (e->l) {
488 551308 : atom *a = (atom*) e->l;
489 551308 : if (!a->isnull) {
490 489346 : set_minmax_property(sql, e, PROP_MAX, a);
491 489346 : set_minmax_property(sql, e, PROP_MIN, a);
492 : }
493 551308 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
494 551308 : p->value.dval = 1;
495 16674 : } else if (e->f) {
496 3149 : list *vals = (list *) e->f;
497 3149 : sql_exp *first = vals->h ? vals->h->data : NULL;
498 3149 : atom *max = NULL, *min = NULL; /* all child values must have a valid min/max */
499 3149 : int has_nil = 0;
500 :
501 3149 : if (first) {
502 3149 : max = ((lval = find_prop_and_get(first->p, PROP_MAX))) ? lval : NULL;
503 3149 : min = ((lval = find_prop_and_get(first->p, PROP_MIN))) ? lval : NULL;
504 3149 : has_nil |= has_nil(first);
505 : }
506 :
507 13760 : for (node *n = vals->h ? vals->h->next : NULL ; n ; n = n->next) {
508 7462 : sql_exp *ee = n->data;
509 :
510 7462 : if (min && max) {
511 7009 : if ((lval = find_prop_and_get(ee->p, PROP_MAX))) {
512 6963 : max = atom_cmp(lval, max) > 0 ? lval : max;
513 : } else {
514 : max = NULL;
515 : }
516 7009 : if ((lval = find_prop_and_get(ee->p, PROP_MIN))) {
517 6963 : min = atom_cmp(min, lval) > 0 ? lval : min;
518 : } else {
519 : min = NULL;
520 : }
521 : }
522 7462 : has_nil |= has_nil(ee);
523 : }
524 :
525 3149 : if (!has_nil)
526 2783 : set_has_no_nil(e);
527 3149 : if (min && max) {
528 2760 : set_minmax_property(sql, e, PROP_MAX, max);
529 2760 : set_minmax_property(sql, e, PROP_MIN, min);
530 : }
531 3149 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
532 3149 : p->value.dval = (dbl) list_length(vals);
533 : } else {
534 13525 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
535 13525 : p->value.dval = 1;
536 : }
537 : break;
538 286054 : case e_cmp:
539 : /* TODO? propagating min/max/unique of booleans is not very worth it */
540 286054 : if (e->flag == cmp_or || e->flag == cmp_filter) {
541 18009 : if (!have_nil(e->l) && !have_nil(e->r))
542 13554 : set_has_no_nil(e);
543 268045 : } else if (e->flag == cmp_in || e->flag == cmp_notin) {
544 21193 : sql_exp *le = e->l;
545 21193 : if (!has_nil(le) && !have_nil(e->r))
546 17994 : set_has_no_nil(e);
547 : } else {
548 246852 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
549 246852 : if (!has_nil(le) && !has_nil(re) && (!e->f || !has_nil(fe)))
550 171615 : set_has_no_nil(e);
551 : }
552 : break;
553 : case e_psm:
554 : break;
555 : }
556 :
557 : #ifndef NDEBUG
558 : {
559 : /* min and max cannot be NULL and min must be <= than max, if it happens the inner relation must be empty! */
560 3473328 : atom *min = find_prop_and_get(e->p, PROP_MIN), *max = find_prop_and_get(e->p, PROP_MAX);
561 :
562 3473326 : (void) min;
563 3473326 : (void) max;
564 3473326 : assert(!min || !min->isnull);
565 3473326 : assert(!max || !max->isnull);
566 3473326 : assert(!min || !max || (min && max)/* atom_cmp(min, max) <= 0*/);
567 : }
568 : #endif
569 3473326 : return e;
570 : }
571 :
572 : static list * /* Remove predicates always false from min/max values */
573 138136 : rel_prune_predicates(visitor *v, sql_rel *rel)
574 : {
575 138136 : if (rel->l) {
576 138136 : sql_rel *l = rel->l;
577 138136 : if (is_single(l))
578 3263 : return rel->exps;
579 : }
580 134873 : if (!list_empty(rel->attr))
581 2959 : return rel->exps;
582 279905 : for (node *n = rel->exps->h ; n ; n = n->next) {
583 147991 : sql_exp *e = n->data;
584 :
585 147991 : if (e->type == e_cmp && is_theta_exp(e->flag)) {
586 121840 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
587 121840 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
588 121840 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
589 121840 : bool always_false = false, always_true = false;
590 :
591 121840 : if (fe && !is_symmetric(e)) {
592 3058 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
593 3058 : comp_type lower = range2lcompare(e->flag), higher = range2rcompare(e->flag);
594 3660 : int not_int1 = rval_min && lval_max && /* the middle and left intervals don't overlap */
595 1663 : (!is_anti(e) && (lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0)),
596 4072 : not_int2 = lval_min && fval_max && /* the middle and right intervals don't overlap */
597 2485 : (!is_anti(e) && (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0)),
598 3058 : not_int3 = rval_min && fval_max && /* the left interval is after the right one */
599 1288 : (!is_anti(e) && (atom_cmp(rval_min, fval_max) > 0));
600 :
601 3058 : always_false |= not_int1 || not_int2 || not_int3;
602 : /* for anti the middle must be before the left or after the right or the right after the left, for the other the middle must be always between the left and right intervals */
603 4113 : always_true |= !has_nil(le) && !has_nil(re) && !has_nil(fe) &&
604 3957 : lval_min && lval_max && rval_min && rval_max && fval_min && fval_max &&
605 575 : (is_anti(e) ? ((lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0) || (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0) || atom_cmp(rval_min, fval_max) > 0) :
606 488 : ((lower == cmp_gte ? atom_cmp(lval_min, rval_max) >= 0 : atom_cmp(lval_min, rval_max) > 0) && (higher == cmp_lte ? atom_cmp(fval_min, lval_max) >= 0 : atom_cmp(fval_min, lval_max) > 0)));
607 : } else if (!fe) {
608 118764 : if (!is_semantics(e)) /* trivial not null cmp null case */
609 225762 : always_false |= !is_anti(e) && ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re)));
610 118764 : switch (e->flag) {
611 103286 : case cmp_equal:
612 103286 : if (lval_min && lval_max && rval_min && rval_max && (!is_semantics(e) || !has_nil(le) || !has_nil(re)))
613 131450 : always_false |= (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
614 103286 : if (is_semantics(e)) { /* prune *= NULL cases */
615 5857 : always_false |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
616 11714 : always_true |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
617 : }
618 : break;
619 7215 : case cmp_notequal:
620 7215 : if (lval_min && lval_max && rval_min && rval_max)
621 11374 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
622 7215 : if (is_semantics(e)) {
623 26 : always_true |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
624 52 : always_false |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
625 : }
626 : break;
627 5632 : case cmp_gt:
628 5632 : if (lval_max && rval_min)
629 1947 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
630 5632 : if (lval_min && rval_max)
631 3894 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
632 : break;
633 119 : case cmp_gte:
634 119 : if (lval_max && rval_min)
635 49 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
636 119 : if (lval_min && rval_max)
637 98 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
638 : break;
639 2428 : case cmp_lt:
640 2428 : if (lval_min && rval_max)
641 1376 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
642 2428 : if (lval_max && rval_min)
643 2800 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
644 : break;
645 84 : case cmp_lte:
646 84 : if (lval_min && rval_max)
647 11 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
648 84 : if (lval_max && rval_min)
649 22 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
650 : break;
651 : default: /* Maybe later I can do cmp_in and cmp_notin but don't forget to remove is_theta_exp check up there */
652 : break;
653 : }
654 : }
655 121840 : assert(!always_false || !always_true);
656 121840 : if (always_false || always_true) {
657 3554 : sql_exp *ne = exp_atom_bool(v->sql->sa, always_true /* if always_true set then true else false */);
658 3554 : if (exp_name(e))
659 0 : exp_prop_alias(v->sql->sa, ne, e);
660 3554 : n->data = ne;
661 3554 : v->changes++;
662 : }
663 : }
664 : }
665 131914 : return rel->exps;
666 : }
667 :
668 : static sql_rel *
669 14 : set_setop_side(visitor *v, sql_rel *rel, sql_rel *side)
670 : {
671 14 : if (side == rel->r)
672 0 : rel->r = NULL;
673 : else
674 14 : rel->l = NULL;
675 14 : if (!is_simple_project(side->op) || rel_is_ref(side) || !list_empty(side->r)) {
676 0 : side = rel_project(v->sql->sa, side, rel_projections(v->sql, side, NULL, 0, 1));
677 0 : set_count_prop(v->sql->sa, side, get_rel_count(side->l));
678 0 : side->exps = exps_exp_visitor_bottomup(v, side, side->exps, 0, &rel_propagate_statistics, false);
679 : }
680 35 : for (node *n = rel->exps->h, *m = side->exps->h ; n && m ; n = n->next, m = m->next) {
681 21 : sql_exp *e1 = n->data, *e2 = m->data;
682 21 : exp_prop_alias(v->sql->sa, e2, e1);
683 : }
684 14 : list_hash_clear(side->exps);
685 :
686 14 : if (need_distinct(rel))
687 0 : set_distinct(side);
688 14 : side->p = prop_copy(v->sql->sa, rel->p);
689 14 : rel_destroy(rel);
690 14 : return side;
691 : }
692 :
693 : static BUN
694 20332 : trivial_project_exp_card(sql_exp *e)
695 : {
696 20623 : if (e->type == e_convert)
697 291 : return trivial_project_exp_card(e->l);
698 20332 : return e->type == e_atom && e->f ? (BUN) list_length(e->f) : 1;
699 : }
700 :
701 : static BUN
702 24248 : rel_calc_nuniques(mvc *sql, sql_rel *l, list *exps)
703 : {
704 24248 : BUN lv = get_rel_count(l);
705 :
706 24248 : if (lv == 0)
707 : return 0;
708 21539 : if (!list_empty(exps)) {
709 21539 : BUN nuniques = 0;
710 : /* compute the highest number of unique values */
711 51131 : for (node *n = exps->h ; n && nuniques != BUN_NONE ; n = n->next) {
712 29592 : sql_exp *e = n->data;
713 29592 : sql_rel *bt = NULL;
714 29592 : prop *p = NULL;
715 29592 : BUN euniques = BUN_NONE;
716 29592 : atom *min, *max, *sub = NULL;
717 29592 : sql_subtype *tp = exp_subtype(e);
718 29592 : sql_class ec = tp ? tp->type->eclass : EC_STRING; /* if 'e' has no type (eg parameter), use a non-number type to fail condition */
719 :
720 29592 : if ((p = find_prop(e->p, PROP_NUNIQUES))) {
721 20956 : euniques = (BUN) p->value.dval;
722 8636 : } else if (e->type == e_column && e->nid && rel_find_exp_and_corresponding_rel(l, e, false, &bt, NULL) && bt && (p = find_prop(bt->p, PROP_COUNT))) {
723 6593 : euniques = (BUN) p->value.lval;
724 : }
725 : /* use min to max range to compute number of possible values in the domain for number types */
726 29592 : if ((EC_TEMP(ec)||ec==EC_NUM||ec==EC_MONTH||ec==EC_POS) &&
727 22363 : (min = find_prop_and_get(e->p, PROP_MIN)) && (max = find_prop_and_get(e->p, PROP_MAX))) {
728 : /* the range includes min and max, so the atom_inc call is needed */
729 : /* if 'euniques' has number of distinct values, compute min between both */
730 17543 : if ((sub = atom_sub(sql->sa, max, min)) && (sub = atom_inc(sql->sa, sub)) && (sub = atom_cast(sql->sa, sub, sql_bind_localtype("oid"))))
731 0 : euniques = MIN(euniques, (BUN) sub->data.val.oval);
732 : }
733 29592 : if (euniques != BUN_NONE)
734 27549 : nuniques = MAX(nuniques, euniques); /* the highest cardinality sets the estimation */
735 : else
736 : nuniques = BUN_NONE;
737 : }
738 21539 : if (nuniques != BUN_NONE)
739 : return nuniques;
740 : }
741 : return lv;
742 : }
743 :
744 : static sql_rel *
745 2070084 : rel_get_statistics_(visitor *v, sql_rel *rel)
746 : {
747 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
748 2070084 : uint8_t has_special_modify = *(uint8_t*) v->data;
749 2070084 : bool can_be_pruned = !has_special_modify && v->storage_based_opt;
750 :
751 : /* Don't look at the same relation twice */
752 2070084 : if (are_statistics_gathered(rel->used))
753 : return rel;
754 1328861 : rel->used |= statistics_gathered;
755 :
756 1328861 : switch (rel->op) {
757 311866 : case op_basetable: {
758 311866 : sql_table *t = (sql_table *) rel->l;
759 311866 : sqlstore *store = v->sql->session->tr->store;
760 :
761 311866 : if (!list_empty(rel->exps)) {
762 1246450 : for (node *n = rel->exps->h ; n ; n = n->next)
763 934633 : rel_basetable_column_get_statistics(v->sql, rel, n->data);
764 : }
765 : /* Set table row count. TODO? look for remote tables. Don't look at storage for declared tables, because it won't be cleaned */
766 311866 : if (isTable(t) && t->s && !isDeclaredTable(t)) /* count active rows only */
767 258348 : set_count_prop(v->sql->sa, rel, (BUN)store->storage_api.count_col(v->sql->session->tr, ol_first_node(t->columns)->data, 10));
768 : break;
769 : }
770 2790 : case op_union:
771 : case op_inter:
772 : case op_except: {
773 2790 : bool empty_cross = false;
774 2790 : int i = 0;
775 2790 : sql_rel *l = rel->l, *pl = rel->l, *r = rel->r, *pr = rel->r;
776 :
777 2790 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
778 0 : pl = pl->l;
779 2790 : while (is_sample(pr->op) || is_topn(pr->op))
780 0 : pr = pr->l;
781 : /* if it's not a projection, then project and propagate statistics */
782 2790 : if (!is_project(pl->op) && !is_base(pl->op)) {
783 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
784 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
785 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
786 : }
787 2790 : if (!is_project(pr->op) && !is_base(pr->op)) {
788 0 : pr = rel_project(v->sql->sa, pr, rel_projections(v->sql, pr, NULL, 0, 1));
789 0 : set_count_prop(v->sql->sa, pr, get_rel_count(pr->l));
790 0 : pr->exps = exps_exp_visitor_bottomup(v, pr, pr->exps, 0, &rel_propagate_statistics, false);
791 : }
792 :
793 11662 : for (node *n = rel->exps->h ; n ; n = n->next) {
794 8872 : empty_cross |= rel_setop_get_statistics(v->sql, rel, pl->exps, pr->exps, n->data, i);
795 8872 : i++;
796 : }
797 :
798 : /* propagate row count */
799 2790 : if (is_union(rel->op)) {
800 277 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
801 277 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
802 :
803 277 : if (lv == 0 && rv == 0) { /* both sides empty */
804 2 : if (can_be_pruned)
805 : empty_cross = true;
806 : else
807 2 : set_count_prop(v->sql->sa, rel, 0);
808 275 : } else if (can_be_pruned && lv == 0 && !rel_is_ref(rel)) { /* left side empty */
809 0 : rel = set_setop_side(v, rel, r);
810 0 : empty_cross = false; /* don't rewrite again */
811 275 : } else if (can_be_pruned && rv == 0 && !rel_is_ref(rel)) { /* right side empty */
812 0 : rel = set_setop_side(v, rel, l);
813 0 : empty_cross = false; /* don't rewrite again */
814 275 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
815 7 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
816 : }
817 2513 : } else if (is_inter(rel->op) || is_except(rel->op)) {
818 2513 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
819 2513 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
820 :
821 2513 : if (lv == 0) { /* left side empty */
822 62 : if (can_be_pruned)
823 : empty_cross = true;
824 : else
825 5 : set_count_prop(v->sql->sa, rel, 0);
826 2451 : } else if (rv == 0) { /* right side empty */
827 17 : if (is_inter(rel->op)) {
828 3 : if (can_be_pruned)
829 : empty_cross = true;
830 : else
831 0 : set_count_prop(v->sql->sa, rel, 0);
832 14 : } else if (can_be_pruned && !rel_is_ref(rel)) {
833 14 : rel = set_setop_side(v, rel, l);
834 14 : empty_cross = false; /* don't rewrite again */
835 : } else {
836 0 : set_count_prop(v->sql->sa, rel, lv);
837 : }
838 : } else {
839 2434 : set_count_prop(v->sql->sa, rel, lv);
840 : }
841 : }
842 2790 : if (can_be_pruned && empty_cross) {
843 80 : rel_destroy(rel->l);
844 80 : rel->l = NULL;
845 80 : rel_destroy(rel->r);
846 80 : rel->r = NULL;
847 244 : for (node *n = rel->exps->h ; n ; n = n->next) {
848 164 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
849 164 : exp_prop_alias(v->sql->sa, a, e);
850 164 : n->data = a;
851 : }
852 80 : list_hash_clear(rel->exps);
853 80 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
854 80 : set_count_prop(v->sql->sa, l, 1);
855 80 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
856 80 : set_count_prop(v->sql->sa, l, 0);
857 80 : rel->op = op_project;
858 80 : rel->l = l;
859 80 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
860 80 : set_count_prop(v->sql->sa, rel, 0);
861 80 : set_nodistinct(rel); /* set relations may have distinct flag set */
862 80 : v->changes++;
863 : }
864 : break;
865 : }
866 12886 : case op_munion: {
867 12886 : list *l = rel->l, *nrels = sa_list(v->sql->sa);
868 12886 : BUN cnt = 0;
869 12886 : bool needs_pruning = false;
870 :
871 49266 : for (node *n = l->h; n; n = n->next) {
872 36380 : sql_rel *r = n->data, *pl = r;
873 :
874 36380 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
875 0 : pl = pl->l;
876 : /* if it's not a projection, then project and propagate statistics */
877 36380 : if (!is_project(pl->op) && !is_base(pl->op)) {
878 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
879 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
880 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
881 : }
882 36380 : nrels = append(nrels, pl);
883 : /* we need new munion statistics */
884 : /* propagate row count */
885 36380 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
886 : /* if PROP_COUNT does not exist we assume at least a row (see get_rel_count def) */
887 36380 : if (rv == BUN_NONE) {
888 1156 : cnt++;
889 1156 : continue;
890 : }
891 35224 : if (!rv && can_be_pruned)
892 6625 : needs_pruning = true;
893 : /* overflow check */
894 35224 : if (rv > (BUN_MAX - cnt))
895 36380 : rv = BUN_MAX;
896 : else
897 35224 : cnt += rv;
898 : }
899 12886 : int i = 0;
900 73792 : for (node *n = rel->exps->h ; n ; n = n->next, i++)
901 60906 : rel_munion_get_statistics(v->sql, rel, nrels, n->data, i);
902 :
903 12886 : if (needs_pruning && !rel_is_ref(rel)) {
904 4464 : v->changes++;
905 4464 : list *nl = sa_list(l->sa);
906 :
907 16464 : for (node *n = nrels->h; n; n = n->next) {
908 12000 : sql_rel *r = n->data;
909 12000 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
910 :
911 12000 : if (!rv) { /* keep last for now */
912 6155 : rel_destroy(r);
913 6155 : continue;
914 : }
915 5845 : nl = append(nl, r);
916 : }
917 4464 : rel->l = nl;
918 4464 : if (list_length(nl) == 1) {
919 4139 : sql_rel *l = rel->l = nl->h->data; /* ugh */
920 4139 : rel->r = NULL;
921 4139 : rel->op = op_project;
922 :
923 20389 : for (node *n = rel->exps->h, *m = l->exps->h ; n && m ; n = n->next, m = m->next) {
924 16250 : sql_exp *pe = n->data, *ie = m->data;
925 16250 : sql_exp *ne = exp_ref(v->sql, ie);
926 16250 : exp_prop_alias(v->sql->sa, ne, pe);
927 16250 : n->data = ne;
928 : }
929 4139 : list_hash_clear(rel->exps);
930 325 : } else if (list_empty(nl)) {
931 : /* empty select (project [ nils ] ) */
932 454 : for (node *n = rel->exps->h ; n ; n = n->next) {
933 354 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
934 354 : exp_prop_alias(v->sql->sa, a, e);
935 354 : n->data = a;
936 : }
937 100 : list_hash_clear(rel->exps);
938 100 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
939 100 : set_count_prop(v->sql->sa, l, 1);
940 100 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
941 100 : set_count_prop(v->sql->sa, l, 0);
942 100 : rel->op = op_project;
943 100 : rel->r = NULL;
944 100 : rel->l = l;
945 100 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
946 100 : set_count_prop(v->sql->sa, rel, 0);
947 100 : set_nodistinct(rel); /* set relations may have distinct flag set */
948 : }
949 : } else {
950 8422 : set_count_prop(v->sql->sa, rel, cnt);
951 : }
952 : break;
953 : }
954 530375 : case op_join:
955 : case op_left:
956 : case op_right:
957 : case op_full:
958 : case op_semi:
959 : case op_anti:
960 : case op_select:
961 : case op_project:
962 : case op_groupby: {
963 530375 : if (is_groupby(rel->op) && !list_empty(rel->r))
964 15531 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
965 530375 : rel->exps = exps_exp_visitor_bottomup(v, rel, rel->exps, 0, &rel_propagate_statistics, false);
966 530374 : if ((is_simple_project(rel->op) || is_groupby(rel->op)) && !list_empty(rel->r))
967 39124 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
968 : /* The following optimizations can only be applied after propagating the statistics to rel->exps */
969 530375 : if (can_be_pruned && (is_join(rel->op) || is_select(rel->op)) && !list_empty(rel->exps)) {
970 138136 : int changes = v->changes;
971 138136 : rel->exps = rel_prune_predicates(v, rel);
972 138136 : if (v->changes > changes)
973 3521 : rel = rewrite_simplify(v, 0, v->value_based_opt, rel);
974 : }
975 :
976 : /* propagate row count */
977 530375 : sql_rel *l = rel->l, *r = rel->r;
978 530375 : switch (rel->op) {
979 133836 : case op_join:
980 : case op_left:
981 : case op_right:
982 : case op_full: {
983 133836 : BUN lv = get_rel_count(l), rv = get_rel_count(r), uniques_estimate = BUN_MAX, join_idx_estimate = BUN_MAX;
984 :
985 133836 : if (!list_empty(rel->exps) && !is_single(rel)) {
986 244944 : for (node *n = rel->exps->h ; n ; n = n->next) {
987 125044 : sql_exp *e = n->data, *el = e->l, *er = e->r;
988 :
989 125044 : if (find_prop(e->p, PROP_JOINIDX)) {
990 667 : join_idx_estimate = lv>rv?lv:rv;
991 124377 : } else if (e->type == e_cmp && e->flag == cmp_equal) {
992 : /* if one of the sides is unique, the cardinality will be that exact number, but look for nulls */
993 120493 : if (!is_semantics(e) || !has_nil(el) || !has_nil(er)) {
994 120097 : BUN lu = 0, ru = 0;
995 120097 : prop *p = NULL;
996 120097 : if ((p = find_prop(el->p, PROP_NUNIQUES)))
997 88970 : lu = (BUN) p->value.dval;
998 120097 : if ((p = find_prop(er->p, PROP_NUNIQUES)))
999 103407 : ru = (BUN) p->value.dval;
1000 120097 : if (is_unique(el) || lu > lv) {
1001 73153 : BUN ncount = (is_right(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : lv;
1002 73153 : uniques_estimate = MIN(uniques_estimate, ncount);
1003 46944 : } else if (is_unique(er) || ru > rv) {
1004 29342 : BUN ncount = (is_left(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : rv;
1005 29342 : uniques_estimate = MIN(uniques_estimate, ncount);
1006 : }
1007 : }
1008 : }
1009 : }
1010 : }
1011 133836 : if (is_single(rel)) {
1012 4603 : set_count_prop(v->sql->sa, rel, lv);
1013 129233 : } else if (join_idx_estimate != BUN_MAX) {
1014 665 : set_count_prop(v->sql->sa, rel, join_idx_estimate);
1015 128568 : } else if (uniques_estimate != BUN_MAX) {
1016 96072 : set_count_prop(v->sql->sa, rel, uniques_estimate);
1017 32496 : } else if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1018 : /* corner cases for outer joins */
1019 126 : if (is_left(rel->op)) {
1020 114 : set_count_prop(v->sql->sa, rel, lv);
1021 12 : } else if (is_right(rel->op)) {
1022 3 : set_count_prop(v->sql->sa, rel, rv);
1023 9 : } else if (is_full(rel->op) && lv != BUN_NONE && rv != BUN_NONE) {
1024 9 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
1025 0 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1026 0 : set_count_prop(v->sql->sa, rel, 0);
1027 : }
1028 32370 : } else if (lv == 0) {
1029 1166 : set_count_prop(v->sql->sa, rel, (is_right(rel->op) || is_full(rel->op)) ? rv : 0);
1030 31773 : } else if (rv == 0) {
1031 1568 : set_count_prop(v->sql->sa, rel, (is_left(rel->op) || is_full(rel->op)) ? lv : 0);
1032 30699 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1033 18251 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX / lv)) ? BUN_MAX : (lv * rv)); /* overflow check */
1034 : }
1035 : break;
1036 : }
1037 2626 : case op_anti:
1038 2626 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1039 2626 : break;
1040 108189 : case op_semi:
1041 : case op_select:
1042 : /* TODO calculate cardinalities using selectivities */
1043 108189 : if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1044 2588 : set_count_prop(v->sql->sa, rel, 0);
1045 : } else {
1046 207954 : if (!list_empty(rel->exps) && !is_single(rel)) {
1047 102353 : BUN cnt = get_rel_count(l), u = 1;
1048 171015 : for (node *n = rel->exps->h ; n ; n = n->next) {
1049 112256 : sql_exp *e = n->data, *el = e->l, *er = e->r;
1050 :
1051 : /* simple expressions first */
1052 112256 : if (e->type == e_cmp && e->flag == cmp_equal && exp_is_atom(er)) {
1053 : /* use selectivity */
1054 56787 : prop *p;
1055 56787 : if ((p = find_prop(el->p, PROP_NUNIQUES))) {
1056 43594 : u = (BUN) p->value.dval;
1057 43594 : break;
1058 : }
1059 : }
1060 : }
1061 : /* u is an *estimate*, so don't set count_prop to 0 unless cnt is 0 */
1062 102353 : set_count_prop(v->sql->sa, rel, cnt == 0 ? 0 : u == 0 || u > cnt ? 1 : cnt/u);
1063 : } else {
1064 3248 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1065 : }
1066 : }
1067 : break;
1068 262311 : case op_project:
1069 262311 : if (l) {
1070 252407 : if (need_distinct(rel)) {
1071 0 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->exps));
1072 : } else {
1073 252407 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1074 : }
1075 : } else {
1076 9904 : BUN card = 1;
1077 :
1078 9904 : if (!list_empty(rel->exps)) {
1079 28928 : for (node *n = rel->exps->h ; n ; n = n->next) {
1080 19024 : sql_exp *e = n->data;
1081 19024 : card = MAX(card, trivial_project_exp_card(e));
1082 : }
1083 : }
1084 9904 : set_count_prop(v->sql->sa, rel, card);
1085 : }
1086 : break;
1087 23057 : case op_groupby:
1088 23057 : if (list_empty(rel->r)) {
1089 7525 : set_count_prop(v->sql->sa, rel, 1);
1090 : } else {
1091 15532 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->r));
1092 : }
1093 : break;
1094 : default:
1095 : break;
1096 : }
1097 : break;
1098 : }
1099 16788 : case op_topn: {
1100 16788 : BUN lv = get_rel_count(rel->l);
1101 :
1102 16788 : if (lv != BUN_NONE) {
1103 16771 : sql_exp *le = rel->exps->h->data, *oe = list_length(rel->exps) > 1 ? rel->exps->h->next->data : NULL;
1104 96 : if (oe && oe->l && exp_is_not_null(oe)) { /* no parameters */
1105 96 : BUN offset = (BUN) ((atom*)oe->l)->data.val.lval;
1106 96 : lv = offset >= lv ? 0 : lv - offset;
1107 : }
1108 16771 : if (le->l && exp_is_not_null(le)) {
1109 16733 : BUN limit = (BUN) ((atom*)le->l)->data.val.lval;
1110 16733 : lv = MIN(lv, limit);
1111 : }
1112 16771 : set_count_prop(v->sql->sa, rel, lv);
1113 : }
1114 : break;
1115 : }
1116 22 : case op_sample: {
1117 22 : BUN lv = get_rel_count(rel->l);
1118 :
1119 22 : if (lv != BUN_NONE) {
1120 4 : sql_exp *se = rel->exps->h->data;
1121 4 : sql_subtype *tp = exp_subtype(se);
1122 :
1123 4 : if (se->l && tp->type->eclass == EC_NUM) { /* sample is a number of rows */
1124 4 : BUN sample = (BUN) ((atom*)se->l)->data.val.lval;
1125 4 : lv = MIN(lv, sample);
1126 0 : } else if (se->l) { /* sample is a percentage of rows */
1127 0 : dbl percent = ((atom*)se->l)->data.val.dval;
1128 0 : assert(tp->type->eclass == EC_FLT);
1129 0 : lv = (BUN) ceil((dbl)lv * percent);
1130 : }
1131 4 : set_count_prop(v->sql->sa, rel, lv);
1132 : }
1133 : break;
1134 : }
1135 6086 : case op_table: {
1136 6086 : sql_exp *op = rel->r;
1137 6086 : if (rel->flag != TRIGGER_WRAPPER && op) {
1138 5774 : sql_subfunc *f = op->f;
1139 5774 : if (f->func->lang == FUNC_LANG_SQL) {
1140 97 : set_count_prop(v->sql->sa, rel, 1000 /* just some fallback value */);
1141 5677 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "storage") == 0) {
1142 829 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of catalog */);
1143 4848 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "db_users") == 0) {
1144 0 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of users */);
1145 4848 : } else if (f->func->lang == FUNC_LANG_MAL && strncmp(f->func->base.name, "querylog", 8) == 0) {
1146 1085 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of querylog */);
1147 3763 : } else if (f->func->lang == FUNC_LANG_MAL &&
1148 3653 : (strcmp(f->func->base.name, "queue") == 0 ||
1149 3389 : strcmp(f->func->base.name, "optimizers") == 0 ||
1150 3077 : strcmp(f->func->base.name, "env") == 0 ||
1151 2786 : strcmp(f->func->base.name, "keywords") == 0 ||
1152 2786 : strcmp(f->func->base.name, "statistics") == 0 ||
1153 2119 : strcmp(f->func->base.name, "rejects") == 0 ||
1154 1867 : strcmp(f->func->base.name, "schemastorage") == 0 ||
1155 1867 : strncmp(f->func->base.name, "storage", 7) == 0 ||
1156 1867 : strcmp(f->func->base.name, "sessions") == 0) ) {
1157 2076 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of queue */);
1158 : }
1159 : /* else {
1160 : printf("%%func needs stats : %s\n", f->func->base.name);
1161 : } */
1162 : }
1163 : break;
1164 : }
1165 : /*These relations are less important for now
1166 : TODO later we can tune it */
1167 : case op_insert:
1168 : case op_update:
1169 : case op_delete:
1170 : case op_truncate:
1171 : case op_ddl:
1172 : default:
1173 : break;
1174 : }
1175 :
1176 : return rel;
1177 : }
1178 :
1179 : static sql_rel *
1180 539569 : rel_get_statistics(visitor *v, global_props *gp, sql_rel *rel)
1181 : {
1182 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
1183 539569 : uint8_t has_special_modify = (uint8_t) gp->has_special_modify;
1184 539569 : v->data = &has_special_modify;
1185 539569 : rel = rel_visitor_bottomup(v, rel, &rel_get_statistics_);
1186 539570 : v->data = gp;
1187 539570 : return rel;
1188 : }
1189 :
1190 : run_optimizer
1191 705405 : bind_get_statistics(visitor *v, global_props *gp)
1192 : {
1193 705405 : (void) v;
1194 705405 : return (gp->opt_level == 1 && !gp->cnt[op_insert]) ? rel_get_statistics : NULL;
1195 : }
1196 :
1197 :
1198 : static bool
1199 94482 : point_select_on_unique_column(sql_rel *rel)
1200 : {
1201 94482 : if (is_select(rel->op) && !list_empty(rel->exps)) {
1202 130805 : for (node *n = rel->exps->h; n ; n = n->next) {
1203 75366 : sql_exp *e = n->data, *el = e->l, *er = e->r, *found = NULL;
1204 :
1205 75366 : if (is_compare(e->type) && e->flag == cmp_equal) {
1206 33337 : if (is_numeric_upcast(el))
1207 0 : el = el->l;
1208 33337 : if (is_numeric_upcast(er))
1209 0 : er = er->l;
1210 33337 : if (is_alias(el->type) && exp_is_atom(er) && (found = rel_find_exp(rel->l, el)) &&
1211 749 : is_unique(found) && (!is_semantics(e) || !has_nil(found) || !has_nil(er)))
1212 : return true;
1213 32588 : if (is_alias(er->type) && exp_is_atom(el) && (found = rel_find_exp(rel->l, er)) &&
1214 0 : is_unique(found) && (!is_semantics(e) || !has_nil(el) || !has_nil(found)))
1215 : return true;
1216 : }
1217 : }
1218 : }
1219 : return false;
1220 : }
1221 :
1222 : /*
1223 : * A point select on an unique column reduces the number of rows to 1. If the same select is under a
1224 : * join, the opposite side's select can be pushed above the join.
1225 : */
1226 : static inline sql_rel *
1227 1242625 : rel_push_select_up(visitor *v, sql_rel *rel)
1228 : {
1229 1242625 : if ((is_innerjoin(rel->op) || is_left(rel->op) || is_right(rel->op) || is_semi(rel->op)) && !is_single(rel)) {
1230 253575 : sql_rel *l = rel->l, *r = rel->r;
1231 253575 : bool can_pushup_left = is_select(l->op) && !rel_is_ref(l) && !is_single(l) && (is_innerjoin(rel->op) || is_left(rel->op) || is_semi(rel->op)),
1232 253575 : can_pushup_right = is_select(r->op) && !rel_is_ref(r) && !is_single(r) && (is_innerjoin(rel->op) || is_right(rel->op));
1233 :
1234 253575 : if (can_pushup_left || can_pushup_right) {
1235 66625 : if (can_pushup_left)
1236 45243 : can_pushup_left = point_select_on_unique_column(r);
1237 66625 : if (can_pushup_right)
1238 49239 : can_pushup_right = point_select_on_unique_column(l);
1239 :
1240 : /* if both selects retrieve one row each, it's not worth it to push both up */
1241 66625 : if (can_pushup_left && !can_pushup_right) {
1242 683 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1243 683 : nrel->l = l->l;
1244 683 : rel = rel_inplace_select(rel, nrel, l->exps);
1245 683 : assert(is_select(rel->op));
1246 683 : v->changes++;
1247 65942 : } else if (!can_pushup_left && can_pushup_right) {
1248 14 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1249 14 : nrel->r = r->l;
1250 14 : rel = rel_inplace_select(rel, nrel, r->exps);
1251 14 : assert(is_select(rel->op));
1252 14 : v->changes++;
1253 : }
1254 : }
1255 : }
1256 1242625 : return rel;
1257 : }
1258 :
1259 : static int
1260 93335 : sql_class_base_score(visitor *v, sql_column *c, sql_subtype *t, bool equality_based)
1261 : {
1262 93335 : int de;
1263 :
1264 93335 : if (!t)
1265 : return 0;
1266 93335 : switch (ATOMstorage(t->type->localtype)) {
1267 : case TYPE_bte:
1268 : return 150 - 8;
1269 1665 : case TYPE_sht:
1270 1665 : return 150 - 16;
1271 37285 : case TYPE_int:
1272 37285 : return 150 - 32;
1273 516 : case TYPE_void:
1274 : case TYPE_lng:
1275 516 : return 150 - 64;
1276 : case TYPE_uuid:
1277 : #ifdef HAVE_HGE
1278 : case TYPE_hge:
1279 : #endif
1280 : return 150 - 128;
1281 2 : case TYPE_flt:
1282 2 : return 75 - 24;
1283 : case TYPE_dbl:
1284 : return 75 - 53;
1285 30551 : default:
1286 30551 : if (equality_based && c && v->storage_based_opt && (de = mvc_is_duplicate_eliminated(v->sql, c)))
1287 1554 : return 150 - de * 8;
1288 : /* strings and blobs not duplicate eliminated don't get any points here */
1289 : return 0;
1290 : }
1291 : }
1292 :
1293 : static int
1294 34436 : score_se_base(visitor *v, sql_rel *rel, sql_exp *e)
1295 : {
1296 34436 : int res = 0;
1297 34436 : sql_subtype *t = exp_subtype(e);
1298 34436 : sql_column *c = NULL;
1299 :
1300 34436 : if (e->type == e_convert) /* keep unsafes at the end (TODO improve) */
1301 : return -1000;
1302 : /* can we find out if the underlying table is sorted */
1303 33835 : if ((c = exp_find_column(rel, e, -2)) && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1304 33835 : res += 600;
1305 :
1306 : /* prefer the shorter var types over the longer ones */
1307 33835 : res += sql_class_base_score(v, c, t, is_equality_or_inequality_exp(e->flag)); /* smaller the type, better */
1308 33835 : return res;
1309 : }
1310 :
1311 : static int
1312 58510 : score_se(visitor *v, sql_rel *rel, sql_exp *e)
1313 : {
1314 58510 : int score = 0;
1315 58510 : if (e->type == e_cmp && !is_complex_exp(e->flag)) {
1316 34436 : sql_exp *l = e->l;
1317 :
1318 34436 : while (l->type == e_cmp) { /* go through nested comparisons */
1319 2 : sql_exp *ll;
1320 :
1321 2 : if (l->flag == cmp_filter || l->flag == cmp_or)
1322 0 : ll = ((list*)l->l)->h->data;
1323 : else
1324 2 : ll = l->l;
1325 2 : if (ll->type != e_cmp)
1326 : break;
1327 : l = ll;
1328 : }
1329 34436 : score += score_se_base(v, rel, l);
1330 : }
1331 58510 : score += exp_keyvalue(e);
1332 58510 : return score;
1333 : }
1334 :
1335 : static inline sql_rel *
1336 1242625 : rel_select_order(visitor *v, sql_rel *rel)
1337 : {
1338 1242625 : int *scores = NULL;
1339 1242625 : sql_exp **exps = NULL;
1340 :
1341 1242625 : if (is_select(rel->op) && list_length(rel->exps) > 1) {
1342 27372 : node *n;
1343 27372 : int i, nexps = list_length(rel->exps);
1344 27372 : scores = SA_NEW_ARRAY(v->sql->ta, int, nexps);
1345 27372 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, nexps);
1346 :
1347 85882 : for (i = 0, n = rel->exps->h; n; i++, n = n->next) {
1348 58540 : exps[i] = n->data;
1349 58540 : if (find_prop(exps[i]->p, PROP_HASHCOL))
1350 : return rel;
1351 58510 : scores[i] = score_se(v, rel, n->data);
1352 : }
1353 27342 : GDKqsort(scores, exps, NULL, nexps, sizeof(int), sizeof(void *), TYPE_int, true, true);
1354 :
1355 85852 : for (i = 0, n = rel->exps->h; n; i++, n = n->next)
1356 58510 : n->data = exps[i];
1357 : }
1358 :
1359 : return rel;
1360 : }
1361 :
1362 : /* Compute the efficiency of using this expression early in a group by list */
1363 : static int
1364 59500 : score_gbe(visitor *v, sql_rel *rel, sql_exp *e)
1365 : {
1366 59500 : int res = 0;
1367 59500 : sql_subtype *t = exp_subtype(e);
1368 59500 : sql_column *c = exp_find_column(rel, e, -2);
1369 :
1370 59500 : if (e->card == CARD_ATOM) /* constants are trivial to group */
1371 38 : res += 1000;
1372 : /* can we find out if the underlying table is sorted */
1373 59500 : if (is_unique(e) || find_prop(e->p, PROP_HASHCOL) || (c && v->storage_based_opt && mvc_is_unique(v->sql, c))) /* distinct columns */
1374 3584 : res += 700;
1375 38910 : if (c && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1376 3710 : res += 500;
1377 59500 : if (find_prop(e->p, PROP_HASHIDX)) /* has hash index */
1378 0 : res += 200;
1379 :
1380 : /* prefer the shorter var types over the longer ones */
1381 59500 : res += sql_class_base_score(v, c, t, true); /* smaller the type, better */
1382 59500 : return res;
1383 : }
1384 :
1385 : /* reorder group by expressions */
1386 : static inline sql_rel *
1387 1242625 : rel_groupby_order(visitor *v, sql_rel *rel)
1388 : {
1389 1242625 : int *scores = NULL;
1390 1242625 : sql_exp **exps = NULL;
1391 :
1392 1242625 : if (is_groupby(rel->op) && list_length(rel->r) > 1) {
1393 25587 : node *n;
1394 25587 : list *gbe = rel->r;
1395 25587 : int i, ngbe = list_length(gbe);
1396 25587 : scores = SA_NEW_ARRAY(v->sql->ta, int, ngbe);
1397 25587 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, ngbe);
1398 :
1399 : /* first sorting step, give priority for integers and sorted columns */
1400 85087 : for (i = 0, n = gbe->h; n; i++, n = n->next) {
1401 59500 : exps[i] = n->data;
1402 59500 : scores[i] = score_gbe(v, rel, exps[i]);
1403 : }
1404 25587 : GDKqsort(scores, exps, NULL, ngbe, sizeof(int), sizeof(void *), TYPE_int, true, true);
1405 :
1406 : /* second sorting step, give priority to strings with lower number of digits */
1407 50504 : for (i = ngbe - 1; i && !scores[i]; i--); /* find expressions with no score from the first round */
1408 25587 : if (scores[i])
1409 24589 : i++;
1410 25587 : if (ngbe - i > 1) {
1411 8604 : for (int j = i; j < ngbe; j++) {
1412 6597 : sql_subtype *t = exp_subtype(exps[j]);
1413 6597 : scores[j] = t ? t->digits : 0;
1414 : }
1415 : /* the less number of digits the better, order ascending */
1416 2007 : GDKqsort(scores + i, exps + i, NULL, ngbe - i, sizeof(int), sizeof(void *), TYPE_int, false, true);
1417 : }
1418 :
1419 85087 : for (i = 0, n = gbe->h; n; i++, n = n->next)
1420 59500 : n->data = exps[i];
1421 : }
1422 :
1423 1242625 : return rel;
1424 : }
1425 :
1426 : /* This optimization loop contains optimizations that can potentially use statistics */
1427 : static sql_rel *
1428 1242625 : rel_final_optimization_loop_(visitor *v, sql_rel *rel)
1429 : {
1430 : /* Run rel_push_select_up only once at the end to avoid an infinite optimization loop */
1431 1242625 : rel = rel_push_select_up(v, rel);
1432 1242625 : rel = rel_select_order(v, rel);
1433 :
1434 : /* TODO? Maybe later add rel_simplify_count, rel_join2semijoin, rel_simplify_fk_joins,
1435 : rel_distinct_project2groupby, rel_simplify_predicates, rel_simplify_math,
1436 : rel_distinct_aggregate_on_unique_values */
1437 :
1438 1242625 : rel = rel_groupby_order(v, rel);
1439 1242625 : return rel;
1440 : }
1441 :
1442 : static sql_rel *
1443 65868 : rel_final_optimization_loop(visitor *v, global_props *gp, sql_rel *rel)
1444 : {
1445 65868 : (void) gp;
1446 65868 : return rel_visitor_bottomup(v, rel, &rel_final_optimization_loop_);
1447 : }
1448 :
1449 : run_optimizer
1450 705403 : bind_final_optimization_loop(visitor *v, global_props *gp)
1451 : {
1452 705403 : int flag = v->sql->sql_optimizer;
1453 : /* At the moment, this optimizer has dependency on 3 flags */
1454 547158 : return gp->opt_level == 1 && (gp->cnt[op_groupby] || gp->cnt[op_select]) &&
1455 771273 : (flag & push_select_up) && (flag & optimize_select_and_joins_topdown) && (flag & optimize_projections) ? rel_final_optimization_loop : NULL;
1456 : }
|