Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "rel_optimizer_private.h"
15 : #include "rel_statistics.h"
16 : #include "rel_basetable.h"
17 : #include "rel_rewriter.h"
18 :
19 : static sql_exp *
20 3462911 : comparison_find_column(sql_exp *input, sql_exp *e)
21 : {
22 3462911 : switch (input->type) {
23 98422 : case e_convert: {
24 98422 : list *types = (list *)input->r;
25 98422 : sql_class from = ((sql_subtype*)types->h->data)->type->eclass, to = ((sql_subtype*)types->h->next->data)->type->eclass;
26 98422 : if (from == to)
27 189776 : return comparison_find_column(input->l, e) ? input : NULL;
28 : return NULL;
29 : }
30 3052820 : case e_column:
31 3052820 : return exp_match(e, input) ? input : NULL;
32 : default:
33 : return NULL;
34 : }
35 : }
36 :
37 : /* multi lo <= col <= hi, maybe still be false even if lo or hi are NULL, possibly similar for filter on multiple
38 : * columns */
39 : #define comp_single_column(c) (!c->f)
40 :
41 : static sql_exp *
42 8744252 : rel_propagate_column_ref_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
43 : {
44 8827329 : assert(e->type == e_column);
45 8827329 : if (rel) {
46 8827250 : switch(rel->op) {
47 4087964 : case op_left:
48 : case op_right:
49 : case op_full:
50 : case op_join:
51 : case op_select:
52 : case op_anti:
53 : case op_semi: {
54 4087964 : bool found_without_semantics = false, found_left = false, found_right = false, still_unique = false;
55 :
56 4087964 : if ((is_innerjoin(rel->op) || is_select(rel->op)) && list_length(rel->exps) == 1 && exp_is_false(rel->exps->h->data))
57 : return NULL; /* nothing will pass, skip */
58 :
59 : /* propagate from the bottom first */
60 4072411 : if (rel_propagate_column_ref_statistics(sql, rel->l, e))
61 : found_left = true;
62 2590746 : if (!found_left && is_join(rel->op) && rel_propagate_column_ref_statistics(sql, rel->r, e))
63 4072411 : found_right = true;
64 :
65 4072411 : if (!found_left && !found_right)
66 : return NULL;
67 1776981 : if (!list_empty(rel->exps) && rel->op != op_anti) { /* if there's an or, the MIN and MAX get difficult to propagate */
68 3495151 : for (node *n = rel->exps->h ; n ; n = n->next) {
69 1809468 : sql_exp *comp = n->data, *le = comp->l, *lne = NULL, *re = comp->r, *rne = NULL, *fe = comp->f, *fne = NULL;
70 :
71 1809468 : if (comp->type == e_cmp) {
72 1808466 : if (is_theta_exp(comp->flag) && ((lne = comparison_find_column(le, e)) || (rne = comparison_find_column(re, e)) || (fe && (fne = comparison_find_column(fe, e))))) {
73 125119 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
74 125119 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
75 :
76 : /* not semantics found or if explicitly filtering not null values from the column */
77 125119 : found_without_semantics |= (!is_semantics(comp) && comp_single_column(comp)) || (comp->flag == cmp_equal && lne && is_anti(comp) && exp_is_null(re));
78 125119 : still_unique |= comp->flag == cmp_equal && is_unique(le) && is_unique(re); /* unique if only equi-joins on unique columns are there */
79 125119 : if (is_full(rel->op) || (is_left(rel->op) && found_left) || (is_right(rel->op) && found_right)) /* on outer joins, min and max cannot be propagated on some cases */
80 19791 : continue;
81 : /* if (end2 >= start1 && start2 <= end1) then the 2 intervals are intersected */
82 105328 : if (fe && lval_min && lval_max) { /* range case, the middle expression must intersect the other two */
83 4622 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
84 4622 : int int1 = rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0,
85 4622 : int2 = fval_min && fval_max && atom_cmp(fval_max, lval_min) >= 0 && atom_cmp(fval_min, lval_max) <= 0,
86 4622 : symmetric = is_symmetric(comp);
87 :
88 4622 : if (is_anti(comp) || (!symmetric && fval_min && rval_max && atom_cmp(fval_min, rval_max) < 0)) /* for asymmetric case the re range must be after the fe range */
89 1817 : continue;
90 2805 : if (lne && int1 && int2) {
91 104 : if (symmetric) {
92 0 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
93 0 : atom *nmin = statistics_atom_min(sql, rval_min, fval_min), *nmax = statistics_atom_max(sql, rval_max, fval_max);
94 : /* max is min from le and (max from re and fe max) */
95 0 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, nmax, p2->value.pval) : nmax);
96 : /* min is max from le and (min from re and fe min) */
97 0 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, nmin, p1->value.pval) : nmin);
98 : } else {
99 104 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
100 : /* max is min from le and fe max */
101 104 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, fval_max, p2->value.pval) : fval_max);
102 : /* min is max from le and re min */
103 104 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, rval_min, p1->value.pval) : rval_min);
104 : }
105 2701 : } else if (rne) {
106 590 : if (symmetric && int1 && int2) { /* min is max from le and (min from re and fe min) */
107 0 : prop *p = find_prop(e->p, PROP_MIN);
108 0 : atom *nmin = p ? statistics_atom_min(sql, p->value.pval, fval_min) : fval_min;
109 0 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, nmin, lval_min));
110 590 : } else if (int1) { /* min is max from le and re min */
111 100 : prop *p = find_prop(e->p, PROP_MIN);
112 100 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
113 : }
114 2111 : } else if (fne) {
115 549 : if (symmetric && int1 && int2) { /* max is min from le and (max from re and fe max) */
116 0 : prop *p = find_prop(e->p, PROP_MAX);
117 0 : atom *nmax = p ? statistics_atom_max(sql, p->value.pval, rval_max) : rval_max;
118 0 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, nmax, lval_max) : nmax);
119 549 : } else if (int2) { /* max is min from le and fe max */
120 266 : prop *p = find_prop(e->p, PROP_MAX);
121 266 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
122 : }
123 : }
124 100706 : } else if (lval_min && lval_max && rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0) {
125 : /* both min and max must be set and the intervals must overlap */
126 43314 : switch (comp->flag) {
127 28978 : case cmp_equal: /* for equality reduce */
128 28978 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_max(sql, lval_max, rval_max) : statistics_atom_min(sql, lval_max, rval_max));
129 28978 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_min(sql, lval_min, rval_min) : statistics_atom_max(sql, lval_min, rval_min));
130 28978 : break;
131 4679 : case cmp_notequal: /* for inequality expand */
132 4679 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_min(sql, lval_max, rval_max) : statistics_atom_max(sql, lval_max, rval_max));
133 4679 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_max(sql, lval_min, rval_min) : statistics_atom_min(sql, lval_min, rval_min));
134 4679 : break;
135 5651 : case cmp_gt:
136 : case cmp_gte:
137 10364 : if (!is_anti(comp) && lne) { /* min is max from both min */
138 4713 : prop *p = find_prop(e->p, PROP_MIN);
139 4713 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, rval_min, p->value.pval) : rval_min);
140 938 : } else if (!is_anti(comp)) { /* max is min from both max */
141 938 : prop *p = find_prop(e->p, PROP_MAX);
142 938 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
143 : }
144 : break;
145 4006 : case cmp_lt:
146 : case cmp_lte:
147 7205 : if (!is_anti(comp) && lne) { /* max is min from both max */
148 3199 : prop *p = find_prop(e->p, PROP_MAX);
149 3199 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, rval_max, p->value.pval) : rval_max);
150 807 : } else if (!is_anti(comp)) { /* min is max from both min */
151 807 : prop *p = find_prop(e->p, PROP_MIN);
152 807 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
153 : }
154 : break;
155 : default: /* Maybe later I can do cmp_in and cmp_notin */
156 : break;
157 : }
158 : }
159 : }
160 : }
161 : }
162 : }
163 1776981 : if (is_full(rel->op) || (is_left(rel->op) && found_right) || (is_right(rel->op) && found_left))
164 38014 : set_has_nil(e);
165 1776981 : if (!is_outerjoin(rel->op) && found_without_semantics) /* at an outer join, null values pass */
166 94272 : set_has_no_nil(e);
167 1776981 : if (is_join(rel->op) && is_unique(e) && !still_unique)
168 118915 : set_not_unique(e);
169 1776981 : return e;
170 : }
171 4636862 : case op_table:
172 : case op_basetable:
173 : case op_union:
174 : case op_except:
175 : case op_inter:
176 : case op_munion:
177 : case op_project:
178 : case op_groupby: {
179 4636862 : sql_exp *found;
180 4636862 : atom *fval;
181 4636862 : prop *est;
182 4636862 : if ((found = rel_find_exp(rel, e))) {
183 2175093 : if (rel->op != op_table) { /* At the moment don't propagate statistics for table relations */
184 2130783 : if ((fval = find_prop_and_get(found->p, PROP_MAX)))
185 1133067 : set_minmax_property(sql, e, PROP_MAX, fval);
186 2130779 : if ((fval = find_prop_and_get(found->p, PROP_MIN)))
187 1140374 : set_minmax_property(sql, e, PROP_MIN, fval);
188 2130785 : if (!has_nil(found))
189 1377434 : set_has_no_nil(e);
190 2130785 : if (is_unique(found) || (need_distinct(rel) && list_length(rel->exps) == 1) ||
191 1720382 : (is_groupby(rel->op) && (list_empty(rel->r) || (list_length(rel->r) == 1 && exps_find_exp(rel->r, e)))))
192 421278 : set_unique(e);
193 : /* propagate unique estimation for known cases */
194 2130784 : if (is_groupby(rel->op) && list_empty(rel->r) && !find_prop(e->p, PROP_NUNIQUES)) { /* global aggregate case */
195 7538 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
196 7538 : p->value.dval = 1;
197 2123246 : } else if (((is_basetable(rel->op) || is_except(rel->op) || is_inter(rel->op) || is_simple_project(rel->op) ||
198 66648 : (is_groupby(rel->op) && exps_find_exp(rel->r, e))) &&
199 2065733 : (est = find_prop(found->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES))) {
200 187991 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
201 187991 : p->value.dval = est->value.dval;
202 : }
203 : }
204 2175099 : return e;
205 : }
206 : return NULL;
207 : }
208 83077 : case op_topn:
209 : case op_sample:
210 83077 : return rel_propagate_column_ref_statistics(sql, rel->l, e);
211 : default:
212 : break;
213 : }
214 : }
215 : return NULL;
216 : }
217 :
218 : static atom *
219 4451390 : atom_from_valptr( allocator *sa, sql_subtype *tpe, ValRecord *v)
220 : {
221 4451390 : atom *a = SA_NEW(sa, atom);
222 :
223 4451375 : assert(!VALisnil(v));
224 4451410 : *a = (atom) {.tpe = *tpe,};
225 4451410 : SA_VALcopy(sa, &a->data, v);
226 4451447 : return a;
227 : }
228 :
229 : void
230 4058648 : sql_column_get_statistics(mvc *sql, sql_column *c, sql_exp *e)
231 : {
232 4058648 : bool nonil = false, unique = false;
233 4058648 : double unique_est = 0.0;
234 4058648 : ValRecord min, max;
235 4058648 : int ok = mvc_col_stats(sql, c, &nonil, &unique, &unique_est, &min, &max);
236 :
237 4059801 : if (has_nil(e) && nonil)
238 2654932 : set_has_no_nil(e);
239 4059801 : if (!is_unique(e) && unique)
240 1100600 : set_unique(e);
241 4059801 : if (unique_est != 0.0) {
242 2823637 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
243 2823508 : p->value.dval = unique_est;
244 : }
245 4059672 : unsigned int digits = 0;
246 4059672 : sql_subtype *et = exp_subtype(e);
247 4059784 : if (et->type->eclass == EC_DEC || et->type->eclass == EC_NUM)
248 2640950 : digits = et->digits;
249 4059784 : if ((ok & 2) == 2) {
250 2222783 : if (!VALisnil(&max)) {
251 2222729 : prop *p = e->p = prop_create(sql->sa, PROP_MAX, e->p);
252 2222667 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &max);
253 2222650 : if (digits) {
254 1655602 : unsigned int nd = atom_digits(p->value.pval);
255 1655565 : if (nd < digits)
256 : digits = nd;
257 1655565 : if (!digits)
258 : digits = 1;
259 : }
260 : }
261 2222575 : VALclear(&max);
262 : }
263 4059567 : if ((ok & 1) == 1) {
264 2228988 : if (!VALisnil(&min)) {
265 2229006 : prop *p = e->p = prop_create(sql->sa, PROP_MIN, e->p);
266 2229088 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &min);
267 2229193 : if (digits) {
268 1663219 : unsigned int nd = atom_digits(p->value.pval);
269 1663219 : if (nd > digits) /* possibly set to low by max value */
270 : digits = nd;
271 : if (!digits)
272 : digits = 1;
273 : }
274 : }
275 2229183 : VALclear(&min);
276 : }
277 4059778 : if (digits)
278 2640916 : et->digits = digits;
279 4059778 : if (et->type->eclass == EC_DEC && et->digits <= et->scale)
280 216 : et->digits = et->scale + 1;
281 4059778 : }
282 :
283 : static void
284 935414 : rel_basetable_column_get_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
285 : {
286 935414 : if (e->p)
287 : return;
288 302070 : sql_column *c = NULL;
289 :
290 302070 : if ((c = rel_base_find_column(rel, e->nid))) {
291 204204 : sql_column_get_statistics(sql, c, e);
292 : }
293 : }
294 :
295 : static bool
296 8872 : rel_setop_get_statistics(mvc *sql, sql_rel *rel, list *lexps, list *rexps, sql_exp *e, int i)
297 : {
298 8872 : sql_exp *le = list_fetch(lexps, i), *re = list_fetch(rexps, i);
299 8872 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
300 8872 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
301 8872 : prop *est;
302 :
303 : /* for the intersection, if both expressions don't overlap, it can be pruned */
304 8872 : if (is_inter(rel->op) && !has_nil(le) && !has_nil(re) &&
305 511 : ((rval_max && lval_min && atom_cmp(rval_max, lval_min) < 0) || (rval_min && lval_max && atom_cmp(rval_min, lval_max) > 0)))
306 26 : return true;
307 :
308 8846 : if (lval_max && rval_max) {
309 2557 : if (is_union(rel->op))
310 3 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
311 2554 : else if (is_inter(rel->op))
312 399 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_min(sql, lval_max, rval_max)); /* for intersect the new max will be the min of the two */
313 : else /* except */
314 2155 : set_minmax_property(sql, e, PROP_MAX, lval_max);
315 : }
316 8846 : if (lval_min && rval_min) {
317 2557 : if (is_union(rel->op))
318 3 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
319 2554 : else if (is_inter(rel->op))
320 399 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, lval_min, rval_min)); /* for intersect the new min will be the max of the two */
321 : else /* except */
322 2155 : set_minmax_property(sql, e, PROP_MIN, lval_min);
323 : }
324 :
325 8846 : if (is_union(rel->op) || is_munion(rel->op)) {
326 5986 : if (!has_nil(le) && !has_nil(re))
327 879 : set_has_no_nil(e);
328 5986 : if (need_distinct(rel) && list_length(rel->exps) == 1)
329 0 : set_unique(e);
330 2860 : } else if (is_inter(rel->op)) {
331 564 : if (!has_nil(le) || !has_nil(re))
332 509 : set_has_no_nil(e);
333 564 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
334 345 : set_unique(e);
335 : } else {
336 2296 : assert(is_except(rel->op));
337 2296 : if (!has_nil(le))
338 2233 : set_has_no_nil(e);
339 2296 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
340 2009 : set_unique(e);
341 : }
342 : /* propagate unique estimation for known cases */
343 8846 : if (!is_union(rel->op) && (est = find_prop(le->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES)) {
344 205 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
345 205 : p->value.dval = est->value.dval;
346 : }
347 : return false;
348 : }
349 :
350 :
351 : static void
352 60685 : rel_munion_get_statistics(mvc *sql, sql_rel *rel, list *rels, sql_exp *e, int i)
353 : {
354 60685 : assert(is_munion(rel->op));
355 :
356 60685 : sql_rel *l = rels->h->data;
357 60685 : sql_exp *le = list_fetch(l->exps, i);
358 60685 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX);
359 60685 : bool has_nonil = !has_nil(le);
360 :
361 175898 : for(node *n = rels->h->next; n; n = n->next) {
362 115213 : sql_rel *r = n->data;
363 115213 : sql_exp *re = list_fetch(r->exps, i);
364 115213 : atom *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
365 :
366 115213 : if (lval_max && rval_max) {
367 42562 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
368 42562 : lval_max = find_prop_and_get(e->p, PROP_MAX);
369 : }
370 115213 : if (lval_min && rval_min) {
371 42063 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
372 42063 : lval_min = find_prop_and_get(e->p, PROP_MIN);
373 : }
374 115213 : has_nonil &= !has_nil(re);
375 :
376 : }
377 :
378 60685 : if (has_nonil)
379 41540 : set_has_no_nil(e);
380 :
381 60685 : if (need_distinct(rel) && list_length(rel->exps) == 1)
382 1594 : set_unique(e);
383 60685 : }
384 :
385 :
386 : static sql_exp *
387 3478426 : rel_propagate_statistics(visitor *v, sql_rel *rel, sql_exp *e, int depth)
388 : {
389 3478426 : mvc *sql = v->sql;
390 3478426 : atom *lval;
391 :
392 3478426 : (void) depth;
393 3478426 : switch(e->type) {
394 2180916 : case e_column:
395 2180916 : switch (rel->op) { /* set relations don't call rel_propagate_statistics */
396 278805 : case op_join:
397 : case op_left:
398 : case op_right:
399 : case op_full:
400 : case op_semi:
401 : case op_anti: {
402 278805 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e);
403 278805 : if (!found)
404 139847 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
405 : break;
406 : }
407 1902111 : case op_select:
408 : case op_project:
409 : case op_groupby: {
410 1902111 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e); /* labels may be found on the same projection, ugh */
411 1902112 : if (!found && is_simple_project(rel->op))
412 125476 : (void) rel_propagate_column_ref_statistics(sql, rel, e);
413 : break;
414 : }
415 0 : case op_insert:
416 : case op_update:
417 : case op_delete:
418 0 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
419 0 : break;
420 : default:
421 : break;
422 : }
423 : break;
424 98453 : case e_convert: {
425 98453 : sql_subtype *to = exp_totype(e), *from = exp_fromtype(e);
426 98453 : sql_exp *l = e->l;
427 98453 : sql_class fr = from->type->eclass, too = to->type->eclass;
428 98453 : prop *est;
429 :
430 98453 : if (fr == too) {
431 89381 : if ((lval = find_prop_and_get(l->p, PROP_MAX))) {
432 58826 : atom *res = atom_copy(sql->sa, lval);
433 58826 : if ((res = atom_cast(sql->sa, res, to)))
434 58803 : set_minmax_property(sql, e, PROP_MAX, res);
435 : }
436 89381 : if ((lval = find_prop_and_get(l->p, PROP_MIN))) {
437 59443 : atom *res = atom_copy(sql->sa, lval);
438 59443 : if ((res = atom_cast(sql->sa, res, to)))
439 59420 : set_minmax_property(sql, e, PROP_MIN, res);
440 : }
441 : }
442 : /* propagating nuniques through conversions is tricky. I am adding just the general cases */
443 98453 : if ((est = find_prop(l->p, PROP_NUNIQUES)) &&
444 61091 : (((fr == EC_TIME || fr == EC_TIME_TZ || fr == EC_DATE) && (too == EC_TIMESTAMP || too == EC_TIMESTAMP_TZ)) ||
445 61066 : ((fr == EC_NUM || fr == EC_BIT) && too == EC_NUM) || (fr == EC_DEC && too == EC_DEC) || (fr != EC_EXTERNAL && too == EC_STRING))) {
446 57637 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
447 57637 : p->value.dval = est->value.dval;
448 : }
449 98453 : if (!has_nil(l))
450 55516 : set_has_no_nil(e);
451 : break;
452 : }
453 341627 : case e_aggr:
454 : case e_func: {
455 341627 : BUN lv;
456 341627 : sql_subfunc *f = e->f;
457 :
458 341627 : if (!f->func->s) {
459 315154 : int key = hash_key(f->func->base.name); /* Using hash lookup */
460 315154 : sql_hash_e *he = sql_functions_lookup->buckets[key&(sql_functions_lookup->size-1)];
461 315154 : lookup_function look = NULL;
462 :
463 688375 : for (; he && !look; he = he->chain) {
464 373221 : struct function_properties* fp = (struct function_properties*) he->value;
465 :
466 373221 : if (!strcmp(f->func->base.name, fp->name))
467 107712 : look = fp->func;
468 : }
469 315154 : if (look)
470 107712 : look(sql, e);
471 : }
472 : /* for global aggregates with no semantics, if the left relation has values, then the output won't be NULL */
473 341627 : if (!is_semantics(e) && e->l && !have_nil(e->l) &&
474 89129 : (e->type != e_aggr || (is_groupby(rel->op) && list_length(rel->r)) || ((lv = get_rel_count(rel->l)) != BUN_NONE && lv > 0)))
475 88806 : set_has_no_nil(e);
476 : /* set properties for global aggregates */
477 341627 : if (e->type == e_aggr && is_groupby(rel->op) && list_empty(rel->r)) {
478 7871 : if (!find_prop(e->p, PROP_NUNIQUES)) {
479 7871 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
480 7871 : p->value.dval = 1;
481 : }
482 7871 : set_unique(e);
483 : }
484 : break;
485 : }
486 570278 : case e_atom:
487 570278 : if (e->l) {
488 553559 : atom *a = (atom*) e->l;
489 553559 : if (!a->isnull) {
490 490243 : set_minmax_property(sql, e, PROP_MAX, a);
491 490243 : set_minmax_property(sql, e, PROP_MIN, a);
492 : }
493 553559 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
494 553558 : p->value.dval = 1;
495 16719 : } else if (e->f) {
496 3194 : list *vals = (list *) e->f;
497 3194 : sql_exp *first = vals->h ? vals->h->data : NULL;
498 3194 : atom *max = NULL, *min = NULL; /* all child values must have a valid min/max */
499 3194 : int has_nil = 0;
500 :
501 3194 : if (first) {
502 3194 : max = ((lval = find_prop_and_get(first->p, PROP_MAX))) ? lval : NULL;
503 3194 : min = ((lval = find_prop_and_get(first->p, PROP_MIN))) ? lval : NULL;
504 3194 : has_nil |= has_nil(first);
505 : }
506 :
507 13934 : for (node *n = vals->h ? vals->h->next : NULL ; n ; n = n->next) {
508 7546 : sql_exp *ee = n->data;
509 :
510 7546 : if (min && max) {
511 7093 : if ((lval = find_prop_and_get(ee->p, PROP_MAX))) {
512 7047 : max = atom_cmp(lval, max) > 0 ? lval : max;
513 : } else {
514 : max = NULL;
515 : }
516 7093 : if ((lval = find_prop_and_get(ee->p, PROP_MIN))) {
517 7047 : min = atom_cmp(min, lval) > 0 ? lval : min;
518 : } else {
519 : min = NULL;
520 : }
521 : }
522 7546 : has_nil |= has_nil(ee);
523 : }
524 :
525 3194 : if (!has_nil)
526 2828 : set_has_no_nil(e);
527 3194 : if (min && max) {
528 2805 : set_minmax_property(sql, e, PROP_MAX, max);
529 2805 : set_minmax_property(sql, e, PROP_MIN, min);
530 : }
531 3194 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
532 3194 : p->value.dval = (dbl) list_length(vals);
533 : } else {
534 13525 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
535 13525 : p->value.dval = 1;
536 : }
537 : break;
538 287152 : case e_cmp:
539 : /* TODO? propagating min/max/unique of booleans is not very worth it */
540 287152 : if (e->flag == cmp_or || e->flag == cmp_filter) {
541 18058 : if (!have_nil(e->l) && !have_nil(e->r))
542 13588 : set_has_no_nil(e);
543 269094 : } else if (e->flag == cmp_in || e->flag == cmp_notin) {
544 21237 : sql_exp *le = e->l;
545 21237 : if (!has_nil(le) && !have_nil(e->r))
546 18194 : set_has_no_nil(e);
547 : } else {
548 247857 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
549 247857 : if (!has_nil(le) && !has_nil(re) && (!e->f || !has_nil(fe)))
550 177921 : set_has_no_nil(e);
551 : }
552 : break;
553 : case e_psm:
554 : break;
555 : }
556 :
557 : #ifndef NDEBUG
558 : {
559 : /* min and max cannot be NULL and min must be <= than max, if it happens the inner relation must be empty! */
560 3478426 : atom *min = find_prop_and_get(e->p, PROP_MIN), *max = find_prop_and_get(e->p, PROP_MAX);
561 :
562 3478419 : (void) min;
563 3478419 : (void) max;
564 3478419 : assert(!min || !min->isnull);
565 3478419 : assert(!max || !max->isnull);
566 3478419 : assert(!min || !max || (min && max)/* atom_cmp(min, max) <= 0*/);
567 : }
568 : #endif
569 3478419 : return e;
570 : }
571 :
572 : static list * /* Remove predicates always false from min/max values */
573 139266 : rel_prune_predicates(visitor *v, sql_rel *rel)
574 : {
575 139266 : if (rel->l) {
576 139266 : sql_rel *l = rel->l;
577 139266 : if (is_single(l))
578 3391 : return rel->exps;
579 : }
580 135875 : if (!list_empty(rel->attr))
581 2960 : return rel->exps;
582 281935 : for (node *n = rel->exps->h ; n ; n = n->next) {
583 149020 : sql_exp *e = n->data;
584 :
585 149020 : if (e->type == e_cmp && is_theta_exp(e->flag)) {
586 122629 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
587 122629 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
588 122629 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
589 122629 : bool always_false = false, always_true = false;
590 :
591 122629 : if (fe && !is_symmetric(e)) {
592 3058 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
593 3058 : comp_type lower = range2lcompare(e->flag), higher = range2rcompare(e->flag);
594 3660 : int not_int1 = rval_min && lval_max && /* the middle and left intervals don't overlap */
595 1663 : (!is_anti(e) && (lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0)),
596 4072 : not_int2 = lval_min && fval_max && /* the middle and right intervals don't overlap */
597 2485 : (!is_anti(e) && (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0)),
598 3058 : not_int3 = rval_min && fval_max && /* the left interval is after the right one */
599 1288 : (!is_anti(e) && (atom_cmp(rval_min, fval_max) > 0));
600 :
601 3058 : always_false |= not_int1 || not_int2 || not_int3;
602 : /* for anti the middle must be before the left or after the right or the right after the left, for the other the middle must be always between the left and right intervals */
603 4113 : always_true |= !has_nil(le) && !has_nil(re) && !has_nil(fe) &&
604 3957 : lval_min && lval_max && rval_min && rval_max && fval_min && fval_max &&
605 575 : (is_anti(e) ? ((lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0) || (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0) || atom_cmp(rval_min, fval_max) > 0) :
606 488 : ((lower == cmp_gte ? atom_cmp(lval_min, rval_max) >= 0 : atom_cmp(lval_min, rval_max) > 0) && (higher == cmp_lte ? atom_cmp(fval_min, lval_max) >= 0 : atom_cmp(fval_min, lval_max) > 0)));
607 : } else if (!fe) {
608 119553 : if (!is_semantics(e)) /* trivial not null cmp null case */
609 227682 : always_false |= !is_anti(e) && ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re)));
610 119553 : switch (e->flag) {
611 104083 : case cmp_equal:
612 104083 : if (lval_min && lval_max && rval_min && rval_max && (!is_semantics(e) || !has_nil(le) || !has_nil(re)))
613 133444 : always_false |= (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
614 104083 : if (is_semantics(e)) { /* prune *= NULL cases */
615 5686 : always_false |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
616 11372 : always_true |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
617 : }
618 : break;
619 7207 : case cmp_notequal:
620 7207 : if (lval_min && lval_max && rval_min && rval_max)
621 11348 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
622 7207 : if (is_semantics(e)) {
623 26 : always_true |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
624 52 : always_false |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
625 : }
626 : break;
627 5632 : case cmp_gt:
628 5632 : if (lval_max && rval_min)
629 1947 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
630 5632 : if (lval_min && rval_max)
631 3894 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
632 : break;
633 119 : case cmp_gte:
634 119 : if (lval_max && rval_min)
635 49 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
636 119 : if (lval_min && rval_max)
637 98 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
638 : break;
639 2428 : case cmp_lt:
640 2428 : if (lval_min && rval_max)
641 1376 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
642 2428 : if (lval_max && rval_min)
643 2800 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
644 : break;
645 84 : case cmp_lte:
646 84 : if (lval_min && rval_max)
647 11 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
648 84 : if (lval_max && rval_min)
649 22 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
650 : break;
651 : default: /* Maybe later I can do cmp_in and cmp_notin but don't forget to remove is_theta_exp check up there */
652 : break;
653 : }
654 : }
655 122629 : assert(!always_false || !always_true);
656 122629 : if (always_false || always_true) {
657 3651 : sql_exp *ne = exp_atom_bool(v->sql->sa, always_true /* if always_true set then true else false */);
658 3651 : if (exp_name(e))
659 0 : exp_prop_alias(v->sql->sa, ne, e);
660 3651 : n->data = ne;
661 3651 : v->changes++;
662 : }
663 : }
664 : }
665 132915 : return rel->exps;
666 : }
667 :
668 : static sql_rel *
669 14 : set_setop_side(visitor *v, sql_rel *rel, sql_rel *side)
670 : {
671 14 : if (side == rel->r)
672 0 : rel->r = NULL;
673 : else
674 14 : rel->l = NULL;
675 14 : if (!is_simple_project(side->op) || rel_is_ref(side) || !list_empty(side->r)) {
676 0 : side = rel_project(v->sql->sa, side, rel_projections(v->sql, side, NULL, 0, 1));
677 0 : set_count_prop(v->sql->sa, side, get_rel_count(side->l));
678 0 : side->exps = exps_exp_visitor_bottomup(v, side, side->exps, 0, &rel_propagate_statistics, false);
679 : }
680 35 : for (node *n = rel->exps->h, *m = side->exps->h ; n && m ; n = n->next, m = m->next) {
681 21 : sql_exp *e1 = n->data, *e2 = m->data;
682 21 : exp_prop_alias(v->sql->sa, e2, e1);
683 : }
684 14 : list_hash_clear(side->exps);
685 :
686 14 : if (need_distinct(rel))
687 0 : set_distinct(side);
688 14 : side->p = prop_copy(v->sql->sa, rel->p);
689 14 : rel_destroy(rel);
690 14 : return side;
691 : }
692 :
693 : static BUN
694 21876 : trivial_project_exp_card(sql_exp *e)
695 : {
696 22170 : if (e->type == e_convert)
697 294 : return trivial_project_exp_card(e->l);
698 21876 : return e->type == e_atom && e->f ? (BUN) list_length(e->f) : 1;
699 : }
700 :
701 : static BUN
702 24248 : rel_calc_nuniques(mvc *sql, sql_rel *l, list *exps)
703 : {
704 24248 : BUN lv = get_rel_count(l);
705 :
706 24247 : if (lv == 0)
707 : return 0;
708 21538 : if (!list_empty(exps)) {
709 21538 : BUN nuniques = 0;
710 : /* compute the highest number of unique values */
711 51130 : for (node *n = exps->h ; n && nuniques != BUN_NONE ; n = n->next) {
712 29591 : sql_exp *e = n->data;
713 29591 : sql_rel *bt = NULL;
714 29591 : prop *p = NULL;
715 29591 : BUN euniques = BUN_NONE;
716 29591 : atom *min, *max, *sub = NULL;
717 29591 : sql_subtype *tp = exp_subtype(e);
718 29591 : sql_class ec = tp ? tp->type->eclass : EC_STRING; /* if 'e' has no type (eg parameter), use a non-number type to fail condition */
719 :
720 29591 : if ((p = find_prop(e->p, PROP_NUNIQUES))) {
721 20955 : euniques = (BUN) p->value.dval;
722 8636 : } else if (e->type == e_column && e->nid && rel_find_exp_and_corresponding_rel(l, e, false, &bt, NULL) && bt && (p = find_prop(bt->p, PROP_COUNT))) {
723 6593 : euniques = (BUN) p->value.lval;
724 : }
725 : /* use min to max range to compute number of possible values in the domain for number types */
726 29591 : if ((EC_TEMP(ec)||ec==EC_NUM||ec==EC_MONTH||ec==EC_POS) &&
727 22362 : (min = find_prop_and_get(e->p, PROP_MIN)) && (max = find_prop_and_get(e->p, PROP_MAX))) {
728 : /* the range includes min and max, so the atom_inc call is needed */
729 : /* if 'euniques' has number of distinct values, compute min between both */
730 17543 : if ((sub = atom_sub(sql->sa, max, min)) && (sub = atom_inc(sql->sa, sub)) && (sub = atom_cast(sql->sa, sub, sql_bind_localtype("oid"))))
731 0 : euniques = MIN(euniques, (BUN) sub->data.val.oval);
732 : }
733 29592 : if (euniques != BUN_NONE)
734 27549 : nuniques = MAX(nuniques, euniques); /* the highest cardinality sets the estimation */
735 : else
736 : nuniques = BUN_NONE;
737 : }
738 21539 : if (nuniques != BUN_NONE)
739 : return nuniques;
740 : }
741 : return lv;
742 : }
743 :
744 : static sql_rel *
745 2071789 : rel_get_statistics_(visitor *v, sql_rel *rel)
746 : {
747 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
748 2071789 : uint8_t has_special_modify = *(uint8_t*) v->data;
749 2071789 : bool can_be_pruned = !has_special_modify && v->storage_based_opt;
750 :
751 : /* Don't look at the same relation twice */
752 2071789 : if (are_statistics_gathered(rel->used))
753 : return rel;
754 1330571 : rel->used |= statistics_gathered;
755 :
756 1330571 : switch (rel->op) {
757 312090 : case op_basetable: {
758 312090 : sql_table *t = (sql_table *) rel->l;
759 312090 : sqlstore *store = v->sql->session->tr->store;
760 :
761 312090 : if (!list_empty(rel->exps)) {
762 1247607 : for (node *n = rel->exps->h ; n ; n = n->next)
763 935380 : rel_basetable_column_get_statistics(v->sql, rel, n->data);
764 : }
765 : /* Set table row count. TODO? look for remote tables. Don't look at storage for declared tables, because it won't be cleaned */
766 312220 : if (isTable(t) && t->s && !isDeclaredTable(t)) /* count active rows only */
767 258801 : set_count_prop(v->sql->sa, rel, (BUN)store->storage_api.count_col(v->sql->session->tr, ol_first_node(t->columns)->data, 10));
768 : break;
769 : }
770 2790 : case op_union:
771 : case op_inter:
772 : case op_except: {
773 2790 : bool empty_cross = false;
774 2790 : int i = 0;
775 2790 : sql_rel *l = rel->l, *pl = rel->l, *r = rel->r, *pr = rel->r;
776 :
777 2790 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
778 0 : pl = pl->l;
779 2790 : while (is_sample(pr->op) || is_topn(pr->op))
780 0 : pr = pr->l;
781 : /* if it's not a projection, then project and propagate statistics */
782 2790 : if (!is_project(pl->op) && !is_base(pl->op)) {
783 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
784 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
785 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
786 : }
787 2790 : if (!is_project(pr->op) && !is_base(pr->op)) {
788 0 : pr = rel_project(v->sql->sa, pr, rel_projections(v->sql, pr, NULL, 0, 1));
789 0 : set_count_prop(v->sql->sa, pr, get_rel_count(pr->l));
790 0 : pr->exps = exps_exp_visitor_bottomup(v, pr, pr->exps, 0, &rel_propagate_statistics, false);
791 : }
792 :
793 11662 : for (node *n = rel->exps->h ; n ; n = n->next) {
794 8872 : empty_cross |= rel_setop_get_statistics(v->sql, rel, pl->exps, pr->exps, n->data, i);
795 8872 : i++;
796 : }
797 :
798 : /* propagate row count */
799 2790 : if (is_union(rel->op)) {
800 277 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
801 277 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
802 :
803 277 : if (lv == 0 && rv == 0) { /* both sides empty */
804 2 : if (can_be_pruned)
805 : empty_cross = true;
806 : else
807 2 : set_count_prop(v->sql->sa, rel, 0);
808 275 : } else if (can_be_pruned && lv == 0 && !rel_is_ref(rel)) { /* left side empty */
809 0 : rel = set_setop_side(v, rel, r);
810 0 : empty_cross = false; /* don't rewrite again */
811 275 : } else if (can_be_pruned && rv == 0 && !rel_is_ref(rel)) { /* right side empty */
812 0 : rel = set_setop_side(v, rel, l);
813 0 : empty_cross = false; /* don't rewrite again */
814 275 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
815 7 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
816 : }
817 2513 : } else if (is_inter(rel->op) || is_except(rel->op)) {
818 2513 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
819 2513 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
820 :
821 2513 : if (lv == 0) { /* left side empty */
822 62 : if (can_be_pruned)
823 : empty_cross = true;
824 : else
825 5 : set_count_prop(v->sql->sa, rel, 0);
826 2451 : } else if (rv == 0) { /* right side empty */
827 17 : if (is_inter(rel->op)) {
828 3 : if (can_be_pruned)
829 : empty_cross = true;
830 : else
831 0 : set_count_prop(v->sql->sa, rel, 0);
832 14 : } else if (can_be_pruned && !rel_is_ref(rel)) {
833 14 : rel = set_setop_side(v, rel, l);
834 14 : empty_cross = false; /* don't rewrite again */
835 : } else {
836 0 : set_count_prop(v->sql->sa, rel, lv);
837 : }
838 : } else {
839 2434 : set_count_prop(v->sql->sa, rel, lv);
840 : }
841 : }
842 2790 : if (can_be_pruned && empty_cross) {
843 80 : rel_destroy(rel->l);
844 80 : rel->l = NULL;
845 80 : rel_destroy(rel->r);
846 80 : rel->r = NULL;
847 244 : for (node *n = rel->exps->h ; n ; n = n->next) {
848 164 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
849 164 : exp_prop_alias(v->sql->sa, a, e);
850 164 : n->data = a;
851 : }
852 80 : list_hash_clear(rel->exps);
853 80 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
854 80 : set_count_prop(v->sql->sa, l, 1);
855 80 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
856 80 : set_count_prop(v->sql->sa, l, 0);
857 80 : rel->op = op_project;
858 80 : rel->l = l;
859 80 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
860 80 : set_count_prop(v->sql->sa, rel, 0);
861 80 : set_nodistinct(rel); /* set relations may have distinct flag set */
862 80 : v->changes++;
863 : }
864 : break;
865 : }
866 12863 : case op_munion: {
867 12863 : list *l = rel->l, *nrels = sa_list(v->sql->sa);
868 12863 : BUN cnt = 0;
869 12863 : bool needs_pruning = false;
870 :
871 49198 : for (node *n = l->h; n; n = n->next) {
872 36335 : sql_rel *r = n->data, *pl = r;
873 :
874 36335 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
875 0 : pl = pl->l;
876 : /* if it's not a projection, then project and propagate statistics */
877 36335 : if (!is_project(pl->op) && !is_base(pl->op)) {
878 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
879 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
880 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
881 : }
882 36335 : nrels = append(nrels, pl);
883 : /* we need new munion statistics */
884 : /* propagate row count */
885 36335 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
886 : /* if PROP_COUNT does not exist we assume at least a row (see get_rel_count def) */
887 36335 : if (rv == BUN_NONE) {
888 1156 : cnt++;
889 1156 : continue;
890 : }
891 35179 : if (!rv && can_be_pruned)
892 6616 : needs_pruning = true;
893 : /* overflow check */
894 35179 : if (rv > (BUN_MAX - cnt))
895 36335 : rv = BUN_MAX;
896 : else
897 35179 : cnt += rv;
898 : }
899 12863 : int i = 0;
900 73548 : for (node *n = rel->exps->h ; n ; n = n->next, i++)
901 60685 : rel_munion_get_statistics(v->sql, rel, nrels, n->data, i);
902 :
903 12863 : if (needs_pruning && !rel_is_ref(rel)) {
904 4455 : v->changes++;
905 4455 : list *nl = sa_list(l->sa);
906 :
907 16438 : for (node *n = nrels->h; n; n = n->next) {
908 11983 : sql_rel *r = n->data;
909 11983 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
910 :
911 11983 : if (!rv) { /* keep last for now */
912 6146 : rel_destroy(r);
913 6146 : continue;
914 : }
915 5837 : nl = append(nl, r);
916 : }
917 4455 : rel->l = nl;
918 4455 : if (list_length(nl) == 1) {
919 4129 : sql_rel *l = rel->l = nl->h->data; /* ugh */
920 4129 : rel->r = NULL;
921 4129 : rel->op = op_project;
922 :
923 20283 : for (node *n = rel->exps->h, *m = l->exps->h ; n && m ; n = n->next, m = m->next) {
924 16154 : sql_exp *pe = n->data, *ie = m->data;
925 16154 : sql_exp *ne = exp_ref(v->sql, ie);
926 16154 : exp_prop_alias(v->sql->sa, ne, pe);
927 16154 : n->data = ne;
928 : }
929 4129 : list_hash_clear(rel->exps);
930 326 : } else if (list_empty(nl)) {
931 : /* empty select (project [ nils ] ) */
932 454 : for (node *n = rel->exps->h ; n ; n = n->next) {
933 354 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
934 354 : exp_prop_alias(v->sql->sa, a, e);
935 354 : n->data = a;
936 : }
937 100 : list_hash_clear(rel->exps);
938 100 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
939 100 : set_count_prop(v->sql->sa, l, 1);
940 100 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
941 100 : set_count_prop(v->sql->sa, l, 0);
942 100 : rel->op = op_project;
943 100 : rel->r = NULL;
944 100 : rel->l = l;
945 100 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
946 100 : set_count_prop(v->sql->sa, rel, 0);
947 100 : set_nodistinct(rel); /* set relations may have distinct flag set */
948 : }
949 : } else {
950 8408 : set_count_prop(v->sql->sa, rel, cnt);
951 : }
952 : break;
953 : }
954 532042 : case op_join:
955 : case op_left:
956 : case op_right:
957 : case op_full:
958 : case op_semi:
959 : case op_anti:
960 : case op_select:
961 : case op_project:
962 : case op_groupby: {
963 532042 : if (is_groupby(rel->op) && !list_empty(rel->r))
964 15531 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
965 532042 : rel->exps = exps_exp_visitor_bottomup(v, rel, rel->exps, 0, &rel_propagate_statistics, false);
966 532031 : if ((is_simple_project(rel->op) || is_groupby(rel->op)) && !list_empty(rel->r))
967 39252 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
968 : /* The following optimizations can only be applied after propagating the statistics to rel->exps */
969 532032 : if (can_be_pruned && (is_join(rel->op) || is_select(rel->op)) && !list_empty(rel->exps)) {
970 139266 : int changes = v->changes;
971 139266 : rel->exps = rel_prune_predicates(v, rel);
972 139266 : if (v->changes > changes)
973 3618 : rel = rewrite_simplify(v, 0, v->value_based_opt, rel);
974 : }
975 :
976 : /* propagate row count */
977 532032 : sql_rel *l = rel->l, *r = rel->r;
978 532032 : switch (rel->op) {
979 134379 : case op_join:
980 : case op_left:
981 : case op_right:
982 : case op_full: {
983 134379 : BUN lv = get_rel_count(l), rv = get_rel_count(r), uniques_estimate = BUN_MAX, join_idx_estimate = BUN_MAX;
984 :
985 134379 : if (!list_empty(rel->exps) && !is_single(rel)) {
986 245795 : for (node *n = rel->exps->h ; n ; n = n->next) {
987 125480 : sql_exp *e = n->data, *el = e->l, *er = e->r;
988 :
989 125480 : if (find_prop(e->p, PROP_JOINIDX)) {
990 667 : join_idx_estimate = lv>rv?lv:rv;
991 124813 : } else if (e->type == e_cmp && e->flag == cmp_equal) {
992 : /* if one of the sides is unique, the cardinality will be that exact number, but look for nulls */
993 120929 : if (!is_semantics(e) || !has_nil(el) || !has_nil(er)) {
994 120533 : BUN lu = 0, ru = 0;
995 120533 : prop *p = NULL;
996 120533 : if ((p = find_prop(el->p, PROP_NUNIQUES)))
997 89458 : lu = (BUN) p->value.dval;
998 120533 : if ((p = find_prop(er->p, PROP_NUNIQUES)))
999 103843 : ru = (BUN) p->value.dval;
1000 120533 : if (is_unique(el) || lu > lv) {
1001 73582 : BUN ncount = (is_right(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : lv;
1002 73582 : uniques_estimate = MIN(uniques_estimate, ncount);
1003 46951 : } else if (is_unique(er) || ru > rv) {
1004 29324 : BUN ncount = (is_left(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : rv;
1005 29324 : uniques_estimate = MIN(uniques_estimate, ncount);
1006 : }
1007 : }
1008 : }
1009 : }
1010 : }
1011 134379 : if (is_single(rel)) {
1012 4731 : set_count_prop(v->sql->sa, rel, lv);
1013 129648 : } else if (join_idx_estimate != BUN_MAX) {
1014 665 : set_count_prop(v->sql->sa, rel, join_idx_estimate);
1015 128983 : } else if (uniques_estimate != BUN_MAX) {
1016 96461 : set_count_prop(v->sql->sa, rel, uniques_estimate);
1017 32522 : } else if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1018 : /* corner cases for outer joins */
1019 126 : if (is_left(rel->op)) {
1020 114 : set_count_prop(v->sql->sa, rel, lv);
1021 12 : } else if (is_right(rel->op)) {
1022 3 : set_count_prop(v->sql->sa, rel, rv);
1023 9 : } else if (is_full(rel->op) && lv != BUN_NONE && rv != BUN_NONE) {
1024 9 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
1025 0 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1026 0 : set_count_prop(v->sql->sa, rel, 0);
1027 : }
1028 32396 : } else if (lv == 0) {
1029 1166 : set_count_prop(v->sql->sa, rel, (is_right(rel->op) || is_full(rel->op)) ? rv : 0);
1030 31799 : } else if (rv == 0) {
1031 1568 : set_count_prop(v->sql->sa, rel, (is_left(rel->op) || is_full(rel->op)) ? lv : 0);
1032 30725 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1033 18276 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX / lv)) ? BUN_MAX : (lv * rv)); /* overflow check */
1034 : }
1035 : break;
1036 : }
1037 2635 : case op_anti:
1038 2635 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1039 2635 : break;
1040 108868 : case op_semi:
1041 : case op_select:
1042 : /* TODO calculate cardinalities using selectivities */
1043 108868 : if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1044 2743 : set_count_prop(v->sql->sa, rel, 0);
1045 : } else {
1046 209002 : if (!list_empty(rel->exps) && !is_single(rel)) {
1047 102877 : BUN cnt = get_rel_count(l), u = 1;
1048 171629 : for (node *n = rel->exps->h ; n ; n = n->next) {
1049 112717 : sql_exp *e = n->data, *el = e->l, *er = e->r;
1050 :
1051 : /* simple expressions first */
1052 112717 : if (e->type == e_cmp && e->flag == cmp_equal && exp_is_atom(er)) {
1053 : /* use selectivity */
1054 57154 : prop *p;
1055 57154 : if ((p = find_prop(el->p, PROP_NUNIQUES))) {
1056 43965 : u = (BUN) p->value.dval;
1057 43965 : break;
1058 : }
1059 : }
1060 : }
1061 : /* u is an *estimate*, so don't set count_prop to 0 unless cnt is 0 */
1062 102877 : set_count_prop(v->sql->sa, rel, cnt == 0 ? 0 : u == 0 || u > cnt ? 1 : cnt/u);
1063 : } else {
1064 3248 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1065 : }
1066 : }
1067 : break;
1068 262792 : case op_project:
1069 262792 : if (l) {
1070 252678 : if (need_distinct(rel)) {
1071 0 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->exps));
1072 : } else {
1073 252678 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1074 : }
1075 : } else {
1076 10114 : BUN card = 1;
1077 :
1078 10114 : if (!list_empty(rel->exps)) {
1079 30661 : for (node *n = rel->exps->h ; n ; n = n->next) {
1080 20547 : sql_exp *e = n->data;
1081 20547 : card = MAX(card, trivial_project_exp_card(e));
1082 : }
1083 : }
1084 10114 : set_count_prop(v->sql->sa, rel, card);
1085 : }
1086 : break;
1087 22953 : case op_groupby:
1088 22953 : if (list_empty(rel->r)) {
1089 7421 : set_count_prop(v->sql->sa, rel, 1);
1090 : } else {
1091 15532 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->r));
1092 : }
1093 : break;
1094 : default:
1095 : break;
1096 : }
1097 : break;
1098 : }
1099 16788 : case op_topn: {
1100 16788 : BUN lv = get_rel_count(rel->l);
1101 :
1102 16787 : if (lv != BUN_NONE) {
1103 16770 : sql_exp *le = rel->exps->h->data, *oe = list_length(rel->exps) > 1 ? rel->exps->h->next->data : NULL;
1104 96 : if (oe && oe->l && exp_is_not_null(oe)) { /* no parameters */
1105 96 : BUN offset = (BUN) ((atom*)oe->l)->data.val.lval;
1106 96 : lv = offset >= lv ? 0 : lv - offset;
1107 : }
1108 16771 : if (le->l && exp_is_not_null(le)) {
1109 16733 : BUN limit = (BUN) ((atom*)le->l)->data.val.lval;
1110 16733 : lv = MIN(lv, limit);
1111 : }
1112 16771 : set_count_prop(v->sql->sa, rel, lv);
1113 : }
1114 : break;
1115 : }
1116 22 : case op_sample: {
1117 22 : BUN lv = get_rel_count(rel->l);
1118 :
1119 22 : if (lv != BUN_NONE) {
1120 4 : sql_exp *se = rel->exps->h->data;
1121 4 : sql_subtype *tp = exp_subtype(se);
1122 :
1123 4 : if (se->l && tp->type->eclass == EC_NUM) { /* sample is a number of rows */
1124 4 : BUN sample = (BUN) ((atom*)se->l)->data.val.lval;
1125 4 : lv = MIN(lv, sample);
1126 0 : } else if (se->l) { /* sample is a percentage of rows */
1127 0 : dbl percent = ((atom*)se->l)->data.val.dval;
1128 0 : assert(tp->type->eclass == EC_FLT);
1129 0 : lv = (BUN) ceil((dbl)lv * percent);
1130 : }
1131 4 : set_count_prop(v->sql->sa, rel, lv);
1132 : }
1133 : break;
1134 : }
1135 5972 : case op_table: {
1136 5972 : sql_exp *op = rel->r;
1137 5972 : if (rel->flag != TRIGGER_WRAPPER && op) {
1138 5660 : sql_subfunc *f = op->f;
1139 5660 : if (f->func->lang == FUNC_LANG_SQL) {
1140 97 : set_count_prop(v->sql->sa, rel, 1000 /* just some fallback value */);
1141 5563 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "storage") == 0) {
1142 829 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of catalog */);
1143 4734 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "db_users") == 0) {
1144 0 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of users */);
1145 4734 : } else if (f->func->lang == FUNC_LANG_MAL && strncmp(f->func->base.name, "querylog", 8) == 0) {
1146 1085 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of querylog */);
1147 3649 : } else if (f->func->lang == FUNC_LANG_MAL &&
1148 3539 : (strcmp(f->func->base.name, "queue") == 0 ||
1149 3275 : strcmp(f->func->base.name, "optimizers") == 0 ||
1150 2963 : strcmp(f->func->base.name, "env") == 0 ||
1151 2672 : strcmp(f->func->base.name, "keywords") == 0 ||
1152 2672 : strcmp(f->func->base.name, "statistics") == 0 ||
1153 2005 : strcmp(f->func->base.name, "rejects") == 0 ||
1154 1751 : strcmp(f->func->base.name, "schemastorage") == 0 ||
1155 1751 : strncmp(f->func->base.name, "storage", 7) == 0 ||
1156 1751 : strcmp(f->func->base.name, "sessions") == 0) ) {
1157 2078 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of queue */);
1158 : }
1159 : /* else {
1160 : printf("%%func needs stats : %s\n", f->func->base.name);
1161 : } */
1162 : }
1163 : break;
1164 : }
1165 : /*These relations are less important for now
1166 : TODO later we can tune it */
1167 : case op_insert:
1168 : case op_update:
1169 : case op_delete:
1170 : case op_truncate:
1171 : case op_ddl:
1172 : default:
1173 : break;
1174 : }
1175 :
1176 : return rel;
1177 : }
1178 :
1179 : static sql_rel *
1180 539410 : rel_get_statistics(visitor *v, global_props *gp, sql_rel *rel)
1181 : {
1182 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
1183 539410 : uint8_t has_special_modify = (uint8_t) gp->has_special_modify;
1184 539410 : v->data = &has_special_modify;
1185 539410 : rel = rel_visitor_bottomup(v, rel, &rel_get_statistics_);
1186 539765 : v->data = gp;
1187 539765 : return rel;
1188 : }
1189 :
1190 : run_optimizer
1191 704556 : bind_get_statistics(visitor *v, global_props *gp)
1192 : {
1193 704556 : (void) v;
1194 704556 : return (gp->opt_level == 1 && !gp->cnt[op_insert]) ? rel_get_statistics : NULL;
1195 : }
1196 :
1197 :
1198 : static bool
1199 94897 : point_select_on_unique_column(sql_rel *rel)
1200 : {
1201 94897 : if (is_select(rel->op) && !list_empty(rel->exps)) {
1202 131375 : for (node *n = rel->exps->h; n ; n = n->next) {
1203 75652 : sql_exp *e = n->data, *el = e->l, *er = e->r, *found = NULL;
1204 :
1205 75652 : if (is_compare(e->type) && e->flag == cmp_equal) {
1206 33623 : if (is_numeric_upcast(el))
1207 0 : el = el->l;
1208 33623 : if (is_numeric_upcast(er))
1209 0 : er = er->l;
1210 33623 : if (is_alias(el->type) && exp_is_atom(er) && (found = rel_find_exp(rel->l, el)) &&
1211 749 : is_unique(found) && (!is_semantics(e) || !has_nil(found) || !has_nil(er)))
1212 : return true;
1213 32874 : if (is_alias(er->type) && exp_is_atom(el) && (found = rel_find_exp(rel->l, er)) &&
1214 0 : is_unique(found) && (!is_semantics(e) || !has_nil(el) || !has_nil(found)))
1215 : return true;
1216 : }
1217 : }
1218 : }
1219 : return false;
1220 : }
1221 :
1222 : /*
1223 : * A point select on an unique column reduces the number of rows to 1. If the same select is under a
1224 : * join, the opposite side's select can be pushed above the join.
1225 : */
1226 : static inline sql_rel *
1227 1244722 : rel_push_select_up(visitor *v, sql_rel *rel)
1228 : {
1229 1244722 : if ((is_innerjoin(rel->op) || is_left(rel->op) || is_right(rel->op) || is_semi(rel->op)) && !is_single(rel)) {
1230 253991 : sql_rel *l = rel->l, *r = rel->r;
1231 253991 : bool can_pushup_left = is_select(l->op) && !rel_is_ref(l) && !is_single(l) && (is_innerjoin(rel->op) || is_left(rel->op) || is_semi(rel->op)),
1232 253991 : can_pushup_right = is_select(r->op) && !rel_is_ref(r) && !is_single(r) && (is_innerjoin(rel->op) || is_right(rel->op));
1233 :
1234 253991 : if (can_pushup_left || can_pushup_right) {
1235 66898 : if (can_pushup_left)
1236 45430 : can_pushup_left = point_select_on_unique_column(r);
1237 66898 : if (can_pushup_right)
1238 49467 : can_pushup_right = point_select_on_unique_column(l);
1239 :
1240 : /* if both selects retrieve one row each, it's not worth it to push both up */
1241 66898 : if (can_pushup_left && !can_pushup_right) {
1242 683 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1243 683 : nrel->l = l->l;
1244 683 : rel = rel_inplace_select(rel, nrel, l->exps);
1245 683 : assert(is_select(rel->op));
1246 683 : v->changes++;
1247 66215 : } else if (!can_pushup_left && can_pushup_right) {
1248 14 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1249 14 : nrel->r = r->l;
1250 14 : rel = rel_inplace_select(rel, nrel, r->exps);
1251 14 : assert(is_select(rel->op));
1252 14 : v->changes++;
1253 : }
1254 : }
1255 : }
1256 1244722 : return rel;
1257 : }
1258 :
1259 : static int
1260 93235 : sql_class_base_score(visitor *v, sql_column *c, sql_subtype *t, bool equality_based)
1261 : {
1262 93235 : int de;
1263 :
1264 93235 : if (!t)
1265 : return 0;
1266 93235 : switch (ATOMstorage(t->type->localtype)) {
1267 : case TYPE_bte:
1268 : return 150 - 8;
1269 1592 : case TYPE_sht:
1270 1592 : return 150 - 16;
1271 37290 : case TYPE_int:
1272 37290 : return 150 - 32;
1273 518 : case TYPE_void:
1274 : case TYPE_lng:
1275 518 : return 150 - 64;
1276 : case TYPE_uuid:
1277 : #ifdef HAVE_HGE
1278 : case TYPE_hge:
1279 : #endif
1280 : return 150 - 128;
1281 2 : case TYPE_flt:
1282 2 : return 75 - 24;
1283 : case TYPE_dbl:
1284 : return 75 - 53;
1285 30567 : default:
1286 30567 : if (equality_based && c && v->storage_based_opt && (de = mvc_is_duplicate_eliminated(v->sql, c)))
1287 1554 : return 150 - de * 8;
1288 : /* strings and blobs not duplicate eliminated don't get any points here */
1289 : return 0;
1290 : }
1291 : }
1292 :
1293 : static int
1294 34336 : score_se_base(visitor *v, sql_rel *rel, sql_exp *e)
1295 : {
1296 34336 : int res = 0;
1297 34336 : sql_subtype *t = exp_subtype(e);
1298 34336 : sql_column *c = NULL;
1299 :
1300 34336 : if (e->type == e_convert) /* keep unsafes at the end (TODO improve) */
1301 : return -1000;
1302 : /* can we find out if the underlying table is sorted */
1303 33735 : if ((c = exp_find_column(rel, e, -2)) && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1304 33735 : res += 600;
1305 :
1306 : /* prefer the shorter var types over the longer ones */
1307 33735 : res += sql_class_base_score(v, c, t, is_equality_or_inequality_exp(e->flag)); /* smaller the type, better */
1308 33735 : return res;
1309 : }
1310 :
1311 : static int
1312 58415 : score_se(visitor *v, sql_rel *rel, sql_exp *e)
1313 : {
1314 58415 : int score = 0;
1315 58415 : if (e->type == e_cmp && !is_complex_exp(e->flag)) {
1316 34336 : sql_exp *l = e->l;
1317 :
1318 34336 : while (l->type == e_cmp) { /* go through nested comparisons */
1319 2 : sql_exp *ll;
1320 :
1321 2 : if (l->flag == cmp_filter || l->flag == cmp_or)
1322 0 : ll = ((list*)l->l)->h->data;
1323 : else
1324 2 : ll = l->l;
1325 2 : if (ll->type != e_cmp)
1326 : break;
1327 : l = ll;
1328 : }
1329 34336 : score += score_se_base(v, rel, l);
1330 : }
1331 58415 : score += exp_keyvalue(e);
1332 58415 : return score;
1333 : }
1334 :
1335 : static inline sql_rel *
1336 1244722 : rel_select_order(visitor *v, sql_rel *rel)
1337 : {
1338 1244722 : int *scores = NULL;
1339 1244722 : sql_exp **exps = NULL;
1340 :
1341 1244722 : if (is_select(rel->op) && list_length(rel->exps) > 1) {
1342 27325 : node *n;
1343 27325 : int i, nexps = list_length(rel->exps);
1344 27325 : scores = SA_NEW_ARRAY(v->sql->ta, int, nexps);
1345 27325 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, nexps);
1346 :
1347 85740 : for (i = 0, n = rel->exps->h; n; i++, n = n->next) {
1348 58445 : exps[i] = n->data;
1349 58445 : if (find_prop(exps[i]->p, PROP_HASHCOL))
1350 : return rel;
1351 58415 : scores[i] = score_se(v, rel, n->data);
1352 : }
1353 27295 : GDKqsort(scores, exps, NULL, nexps, sizeof(int), sizeof(void *), TYPE_int, true, true);
1354 :
1355 85710 : for (i = 0, n = rel->exps->h; n; i++, n = n->next)
1356 58415 : n->data = exps[i];
1357 : }
1358 :
1359 : return rel;
1360 : }
1361 :
1362 : /* Compute the efficiency of using this expression early in a group by list */
1363 : static int
1364 59500 : score_gbe(visitor *v, sql_rel *rel, sql_exp *e)
1365 : {
1366 59500 : int res = 0;
1367 59500 : sql_subtype *t = exp_subtype(e);
1368 59500 : sql_column *c = exp_find_column(rel, e, -2);
1369 :
1370 59500 : if (e->card == CARD_ATOM) /* constants are trivial to group */
1371 38 : res += 1000;
1372 : /* can we find out if the underlying table is sorted */
1373 59500 : if (is_unique(e) || find_prop(e->p, PROP_HASHCOL) || (c && v->storage_based_opt && mvc_is_unique(v->sql, c))) /* distinct columns */
1374 3584 : res += 700;
1375 38910 : if (c && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1376 3710 : res += 500;
1377 59500 : if (find_prop(e->p, PROP_HASHIDX)) /* has hash index */
1378 0 : res += 200;
1379 :
1380 : /* prefer the shorter var types over the longer ones */
1381 59500 : res += sql_class_base_score(v, c, t, true); /* smaller the type, better */
1382 59500 : return res;
1383 : }
1384 :
1385 : /* reorder group by expressions */
1386 : static inline sql_rel *
1387 1244722 : rel_groupby_order(visitor *v, sql_rel *rel)
1388 : {
1389 1244722 : int *scores = NULL;
1390 1244722 : sql_exp **exps = NULL;
1391 :
1392 1244722 : if (is_groupby(rel->op) && list_length(rel->r) > 1) {
1393 25587 : node *n;
1394 25587 : list *gbe = rel->r;
1395 25587 : int i, ngbe = list_length(gbe);
1396 25587 : scores = SA_NEW_ARRAY(v->sql->ta, int, ngbe);
1397 25587 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, ngbe);
1398 :
1399 : /* first sorting step, give priority for integers and sorted columns */
1400 85087 : for (i = 0, n = gbe->h; n; i++, n = n->next) {
1401 59500 : exps[i] = n->data;
1402 59500 : scores[i] = score_gbe(v, rel, exps[i]);
1403 : }
1404 25587 : GDKqsort(scores, exps, NULL, ngbe, sizeof(int), sizeof(void *), TYPE_int, true, true);
1405 :
1406 : /* second sorting step, give priority to strings with lower number of digits */
1407 50504 : for (i = ngbe - 1; i && !scores[i]; i--); /* find expressions with no score from the first round */
1408 25587 : if (scores[i])
1409 24589 : i++;
1410 25587 : if (ngbe - i > 1) {
1411 8604 : for (int j = i; j < ngbe; j++) {
1412 6597 : sql_subtype *t = exp_subtype(exps[j]);
1413 6597 : scores[j] = t ? t->digits : 0;
1414 : }
1415 : /* the less number of digits the better, order ascending */
1416 2007 : GDKqsort(scores + i, exps + i, NULL, ngbe - i, sizeof(int), sizeof(void *), TYPE_int, false, true);
1417 : }
1418 :
1419 85087 : for (i = 0, n = gbe->h; n; i++, n = n->next)
1420 59500 : n->data = exps[i];
1421 : }
1422 :
1423 1244722 : return rel;
1424 : }
1425 :
1426 : /* This optimization loop contains optimizations that can potentially use statistics */
1427 : static sql_rel *
1428 1244721 : rel_final_optimization_loop_(visitor *v, sql_rel *rel)
1429 : {
1430 : /* Run rel_push_select_up only once at the end to avoid an infinite optimization loop */
1431 1244721 : rel = rel_push_select_up(v, rel);
1432 1244722 : rel = rel_select_order(v, rel);
1433 :
1434 : /* TODO? Maybe later add rel_simplify_count, rel_join2semijoin, rel_simplify_fk_joins,
1435 : rel_distinct_project2groupby, rel_simplify_predicates, rel_simplify_math,
1436 : rel_distinct_aggregate_on_unique_values */
1437 :
1438 1244722 : rel = rel_groupby_order(v, rel);
1439 1244720 : return rel;
1440 : }
1441 :
1442 : static sql_rel *
1443 66043 : rel_final_optimization_loop(visitor *v, global_props *gp, sql_rel *rel)
1444 : {
1445 66043 : (void) gp;
1446 66043 : return rel_visitor_bottomup(v, rel, &rel_final_optimization_loop_);
1447 : }
1448 :
1449 : run_optimizer
1450 705107 : bind_final_optimization_loop(visitor *v, global_props *gp)
1451 : {
1452 705107 : int flag = v->sql->sql_optimizer;
1453 : /* At the moment, this optimizer has dependency on 3 flags */
1454 547277 : return gp->opt_level == 1 && (gp->cnt[op_groupby] || gp->cnt[op_select]) &&
1455 771152 : (flag & push_select_up) && (flag & optimize_select_and_joins_topdown) && (flag & optimize_projections) ? rel_final_optimization_loop : NULL;
1456 : }
|