Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "rel_optimizer_private.h"
15 : #include "rel_statistics.h"
16 : #include "rel_basetable.h"
17 : #include "rel_rewriter.h"
18 :
19 : static sql_exp *
20 3466573 : comparison_find_column(sql_exp *input, sql_exp *e)
21 : {
22 3466573 : switch (input->type) {
23 98466 : case e_convert: {
24 98466 : list *types = (list *)input->r;
25 98466 : sql_class from = ((sql_subtype*)types->h->data)->type->eclass, to = ((sql_subtype*)types->h->next->data)->type->eclass;
26 98466 : if (from == to)
27 189864 : return comparison_find_column(input->l, e) ? input : NULL;
28 : return NULL;
29 : }
30 3055002 : case e_column:
31 3055002 : return exp_match(e, input) ? input : NULL;
32 : default:
33 : return NULL;
34 : }
35 : }
36 :
37 : /* multi lo <= col <= hi, maybe still be false even if lo or hi are NULL, possibly similar for filter on multiple
38 : * columns */
39 : #define comp_single_column(c) (!c->f)
40 :
41 : static sql_exp *
42 8749961 : rel_propagate_column_ref_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
43 : {
44 8833121 : assert(e->type == e_column);
45 8833121 : if (rel) {
46 8833042 : switch(rel->op) {
47 4090202 : case op_left:
48 : case op_right:
49 : case op_full:
50 : case op_join:
51 : case op_select:
52 : case op_anti:
53 : case op_semi: {
54 4090202 : bool found_without_semantics = false, found_left = false, found_right = false, still_unique = false;
55 :
56 4090202 : if ((is_innerjoin(rel->op) || is_select(rel->op)) && list_length(rel->exps) == 1 && exp_is_false(rel->exps->h->data))
57 : return NULL; /* nothing will pass, skip */
58 :
59 : /* propagate from the bottom first */
60 4074649 : if (rel_propagate_column_ref_statistics(sql, rel->l, e))
61 : found_left = true;
62 2591285 : if (!found_left && is_join(rel->op) && rel_propagate_column_ref_statistics(sql, rel->r, e))
63 4074649 : found_right = true;
64 :
65 4074649 : if (!found_left && !found_right)
66 : return NULL;
67 1778799 : if (!list_empty(rel->exps) && rel->op != op_anti) { /* if there's an or, the MIN and MAX get difficult to propagate */
68 3498992 : for (node *n = rel->exps->h ; n ; n = n->next) {
69 1811527 : sql_exp *comp = n->data, *le = comp->l, *lne = NULL, *re = comp->r, *rne = NULL, *fe = comp->f, *fne = NULL;
70 :
71 1811527 : if (comp->type == e_cmp) {
72 1810525 : if (is_theta_exp(comp->flag) && ((lne = comparison_find_column(le, e)) || (rne = comparison_find_column(re, e)) || (fe && (fne = comparison_find_column(fe, e))))) {
73 125177 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
74 125177 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
75 :
76 : /* not semantics found or if explicitly filtering not null values from the column */
77 125177 : found_without_semantics |= (!is_semantics(comp) && comp_single_column(comp)) || (comp->flag == cmp_equal && lne && is_anti(comp) && exp_is_null(re));
78 125177 : still_unique |= comp->flag == cmp_equal && is_unique(le) && is_unique(re); /* unique if only equi-joins on unique columns are there */
79 125177 : if (is_full(rel->op) || (is_left(rel->op) && found_left) || (is_right(rel->op) && found_right)) /* on outer joins, min and max cannot be propagated on some cases */
80 19791 : continue;
81 : /* if (end2 >= start1 && start2 <= end1) then the 2 intervals are intersected */
82 105386 : if (fe && lval_min && lval_max) { /* range case, the middle expression must intersect the other two */
83 4622 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
84 4622 : int int1 = rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0,
85 4622 : int2 = fval_min && fval_max && atom_cmp(fval_max, lval_min) >= 0 && atom_cmp(fval_min, lval_max) <= 0,
86 4622 : symmetric = is_symmetric(comp);
87 :
88 4622 : if (is_anti(comp) || (!symmetric && fval_min && rval_max && atom_cmp(fval_min, rval_max) < 0)) /* for asymmetric case the re range must be after the fe range */
89 1817 : continue;
90 2805 : if (lne && int1 && int2) {
91 104 : if (symmetric) {
92 0 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
93 0 : atom *nmin = statistics_atom_min(sql, rval_min, fval_min), *nmax = statistics_atom_max(sql, rval_max, fval_max);
94 : /* max is min from le and (max from re and fe max) */
95 0 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, nmax, p2->value.pval) : nmax);
96 : /* min is max from le and (min from re and fe min) */
97 0 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, nmin, p1->value.pval) : nmin);
98 : } else {
99 104 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
100 : /* max is min from le and fe max */
101 104 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, fval_max, p2->value.pval) : fval_max);
102 : /* min is max from le and re min */
103 104 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, rval_min, p1->value.pval) : rval_min);
104 : }
105 2701 : } else if (rne) {
106 590 : if (symmetric && int1 && int2) { /* min is max from le and (min from re and fe min) */
107 0 : prop *p = find_prop(e->p, PROP_MIN);
108 0 : atom *nmin = p ? statistics_atom_min(sql, p->value.pval, fval_min) : fval_min;
109 0 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, nmin, lval_min));
110 590 : } else if (int1) { /* min is max from le and re min */
111 100 : prop *p = find_prop(e->p, PROP_MIN);
112 100 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
113 : }
114 2111 : } else if (fne) {
115 549 : if (symmetric && int1 && int2) { /* max is min from le and (max from re and fe max) */
116 0 : prop *p = find_prop(e->p, PROP_MAX);
117 0 : atom *nmax = p ? statistics_atom_max(sql, p->value.pval, rval_max) : rval_max;
118 0 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, nmax, lval_max) : nmax);
119 549 : } else if (int2) { /* max is min from le and fe max */
120 266 : prop *p = find_prop(e->p, PROP_MAX);
121 266 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
122 : }
123 : }
124 100764 : } else if (lval_min && lval_max && rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0) {
125 : /* both min and max must be set and the intervals must overlap */
126 43338 : switch (comp->flag) {
127 29002 : case cmp_equal: /* for equality reduce */
128 29002 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_max(sql, lval_max, rval_max) : statistics_atom_min(sql, lval_max, rval_max));
129 29002 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_min(sql, lval_min, rval_min) : statistics_atom_max(sql, lval_min, rval_min));
130 29002 : break;
131 4679 : case cmp_notequal: /* for inequality expand */
132 4679 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_min(sql, lval_max, rval_max) : statistics_atom_max(sql, lval_max, rval_max));
133 4679 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_max(sql, lval_min, rval_min) : statistics_atom_min(sql, lval_min, rval_min));
134 4679 : break;
135 5651 : case cmp_gt:
136 : case cmp_gte:
137 10364 : if (!is_anti(comp) && lne) { /* min is max from both min */
138 4713 : prop *p = find_prop(e->p, PROP_MIN);
139 4713 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, rval_min, p->value.pval) : rval_min);
140 938 : } else if (!is_anti(comp)) { /* max is min from both max */
141 938 : prop *p = find_prop(e->p, PROP_MAX);
142 938 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
143 : }
144 : break;
145 4006 : case cmp_lt:
146 : case cmp_lte:
147 7205 : if (!is_anti(comp) && lne) { /* max is min from both max */
148 3199 : prop *p = find_prop(e->p, PROP_MAX);
149 3199 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, rval_max, p->value.pval) : rval_max);
150 807 : } else if (!is_anti(comp)) { /* min is max from both min */
151 807 : prop *p = find_prop(e->p, PROP_MIN);
152 807 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
153 : }
154 : break;
155 : default: /* Maybe later I can do cmp_in and cmp_notin */
156 : break;
157 : }
158 : }
159 : }
160 : }
161 : }
162 : }
163 1778799 : if (is_full(rel->op) || (is_left(rel->op) && found_right) || (is_right(rel->op) && found_left))
164 38016 : set_has_nil(e);
165 1778799 : if (!is_outerjoin(rel->op) && found_without_semantics) /* at an outer join, null values pass */
166 94338 : set_has_no_nil(e);
167 1778799 : if (is_join(rel->op) && is_unique(e) && !still_unique)
168 118981 : set_not_unique(e);
169 1778799 : return e;
170 : }
171 4640327 : case op_table:
172 : case op_basetable:
173 : case op_union:
174 : case op_except:
175 : case op_inter:
176 : case op_munion:
177 : case op_project:
178 : case op_groupby: {
179 4640327 : sql_exp *found;
180 4640327 : atom *fval;
181 4640327 : prop *est;
182 4640327 : if ((found = rel_find_exp(rel, e))) {
183 2178128 : if (rel->op != op_table) { /* At the moment don't propagate statistics for table relations */
184 2133793 : if ((fval = find_prop_and_get(found->p, PROP_MAX)))
185 1135339 : set_minmax_property(sql, e, PROP_MAX, fval);
186 2133792 : if ((fval = find_prop_and_get(found->p, PROP_MIN)))
187 1142696 : set_minmax_property(sql, e, PROP_MIN, fval);
188 2133789 : if (!has_nil(found))
189 1380055 : set_has_no_nil(e);
190 2133789 : if (is_unique(found) || (need_distinct(rel) && list_length(rel->exps) == 1) ||
191 1722671 : (is_groupby(rel->op) && (list_empty(rel->r) || (list_length(rel->r) == 1 && exps_find_exp(rel->r, e)))))
192 421996 : set_unique(e);
193 : /* propagate unique estimation for known cases */
194 2133789 : if (is_groupby(rel->op) && list_empty(rel->r) && !find_prop(e->p, PROP_NUNIQUES)) { /* global aggregate case */
195 7538 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
196 7538 : p->value.dval = 1;
197 2126251 : } else if (((is_basetable(rel->op) || is_except(rel->op) || is_inter(rel->op) || is_simple_project(rel->op) ||
198 66654 : (is_groupby(rel->op) && exps_find_exp(rel->r, e))) &&
199 2068740 : (est = find_prop(found->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES))) {
200 188069 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
201 188069 : p->value.dval = est->value.dval;
202 : }
203 : }
204 2178121 : return e;
205 : }
206 : return NULL;
207 : }
208 83160 : case op_topn:
209 : case op_sample:
210 83160 : return rel_propagate_column_ref_statistics(sql, rel->l, e);
211 : default:
212 : break;
213 : }
214 : }
215 : return NULL;
216 : }
217 :
218 : static atom *
219 4459857 : atom_from_valptr( allocator *sa, sql_subtype *tpe, ValRecord *v)
220 : {
221 4459857 : atom *a = SA_NEW(sa, atom);
222 :
223 4459790 : assert(!VALisnil(v));
224 4459713 : *a = (atom) {.tpe = *tpe,};
225 4459713 : SA_VALcopy(sa, &a->data, v);
226 4459664 : return a;
227 : }
228 :
229 : void
230 4065030 : sql_column_get_statistics(mvc *sql, sql_column *c, sql_exp *e)
231 : {
232 4065030 : bool nonil = false, unique = false;
233 4065030 : double unique_est = 0.0;
234 4065030 : ValRecord min, max;
235 4065030 : int ok = mvc_col_stats(sql, c, &nonil, &unique, &unique_est, &min, &max);
236 :
237 4065165 : if (has_nil(e) && nonil)
238 2657750 : set_has_no_nil(e);
239 4065165 : if (!is_unique(e) && unique)
240 1101458 : set_unique(e);
241 4065165 : if (unique_est != 0.0) {
242 2828248 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
243 2828240 : p->value.dval = unique_est;
244 : }
245 4065157 : unsigned int digits = 0;
246 4065157 : sql_subtype *et = exp_subtype(e);
247 4065164 : if (et->type->eclass == EC_DEC || et->type->eclass == EC_NUM)
248 2644222 : digits = et->digits;
249 4065164 : if ((ok & 2) == 2) {
250 2226825 : if (!VALisnil(&max)) {
251 2226819 : prop *p = e->p = prop_create(sql->sa, PROP_MAX, e->p);
252 2226790 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &max);
253 2226754 : if (digits) {
254 1658274 : unsigned int nd = atom_digits(p->value.pval);
255 1658304 : if (nd < digits)
256 : digits = nd;
257 1658304 : if (!digits)
258 : digits = 1;
259 : }
260 : }
261 2226782 : VALclear(&max);
262 : }
263 4065101 : if ((ok & 1) == 1) {
264 2233231 : if (!VALisnil(&min)) {
265 2233210 : prop *p = e->p = prop_create(sql->sa, PROP_MIN, e->p);
266 2233204 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &min);
267 2233172 : if (digits) {
268 1665780 : unsigned int nd = atom_digits(p->value.pval);
269 1665775 : if (nd > digits) /* possibly set to low by max value */
270 : digits = nd;
271 : if (!digits)
272 : digits = 1;
273 : }
274 : }
275 2233166 : VALclear(&min);
276 : }
277 4065029 : if (digits)
278 2644092 : et->digits = digits;
279 4065029 : if (et->type->eclass == EC_DEC && et->digits <= et->scale)
280 216 : et->digits = et->scale + 1;
281 4065029 : }
282 :
283 : static void
284 937751 : rel_basetable_column_get_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
285 : {
286 937751 : if (e->p)
287 : return;
288 302484 : sql_column *c = NULL;
289 :
290 302484 : if ((c = rel_base_find_column(rel, e->nid))) {
291 204384 : sql_column_get_statistics(sql, c, e);
292 : }
293 : }
294 :
295 : static bool
296 8872 : rel_setop_get_statistics(mvc *sql, sql_rel *rel, list *lexps, list *rexps, sql_exp *e, int i)
297 : {
298 8872 : sql_exp *le = list_fetch(lexps, i), *re = list_fetch(rexps, i);
299 8872 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
300 8872 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
301 8872 : prop *est;
302 :
303 : /* for the intersection, if both expressions don't overlap, it can be pruned */
304 8872 : if (is_inter(rel->op) && !has_nil(le) && !has_nil(re) &&
305 511 : ((rval_max && lval_min && atom_cmp(rval_max, lval_min) < 0) || (rval_min && lval_max && atom_cmp(rval_min, lval_max) > 0)))
306 26 : return true;
307 :
308 8846 : if (lval_max && rval_max) {
309 2557 : if (is_union(rel->op))
310 3 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
311 2554 : else if (is_inter(rel->op))
312 399 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_min(sql, lval_max, rval_max)); /* for intersect the new max will be the min of the two */
313 : else /* except */
314 2155 : set_minmax_property(sql, e, PROP_MAX, lval_max);
315 : }
316 8846 : if (lval_min && rval_min) {
317 2557 : if (is_union(rel->op))
318 3 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
319 2554 : else if (is_inter(rel->op))
320 399 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, lval_min, rval_min)); /* for intersect the new min will be the max of the two */
321 : else /* except */
322 2155 : set_minmax_property(sql, e, PROP_MIN, lval_min);
323 : }
324 :
325 8846 : if (is_union(rel->op) || is_munion(rel->op)) {
326 5986 : if (!has_nil(le) && !has_nil(re))
327 879 : set_has_no_nil(e);
328 5986 : if (need_distinct(rel) && list_length(rel->exps) == 1)
329 0 : set_unique(e);
330 2860 : } else if (is_inter(rel->op)) {
331 564 : if (!has_nil(le) || !has_nil(re))
332 509 : set_has_no_nil(e);
333 564 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
334 345 : set_unique(e);
335 : } else {
336 2296 : assert(is_except(rel->op));
337 2296 : if (!has_nil(le))
338 2233 : set_has_no_nil(e);
339 2296 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
340 2009 : set_unique(e);
341 : }
342 : /* propagate unique estimation for known cases */
343 8846 : if (!is_union(rel->op) && (est = find_prop(le->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES)) {
344 205 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
345 205 : p->value.dval = est->value.dval;
346 : }
347 : return false;
348 : }
349 :
350 :
351 : static void
352 60761 : rel_munion_get_statistics(mvc *sql, sql_rel *rel, list *rels, sql_exp *e, int i)
353 : {
354 60761 : assert(is_munion(rel->op));
355 :
356 60761 : sql_rel *l = rels->h->data;
357 60761 : sql_exp *le = list_fetch(l->exps, i);
358 60761 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX);
359 60761 : bool has_nonil = !has_nil(le);
360 :
361 176050 : for(node *n = rels->h->next; n; n = n->next) {
362 115289 : sql_rel *r = n->data;
363 115289 : sql_exp *re = list_fetch(r->exps, i);
364 115289 : atom *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
365 :
366 115289 : if (lval_max && rval_max) {
367 42578 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
368 42578 : lval_max = find_prop_and_get(e->p, PROP_MAX);
369 : }
370 115289 : if (lval_min && rval_min) {
371 42079 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
372 42079 : lval_min = find_prop_and_get(e->p, PROP_MIN);
373 : }
374 115289 : has_nonil &= !has_nil(re);
375 :
376 : }
377 :
378 60761 : if (has_nonil)
379 41592 : set_has_no_nil(e);
380 :
381 60761 : if (need_distinct(rel) && list_length(rel->exps) == 1)
382 1594 : set_unique(e);
383 60761 : }
384 :
385 :
386 : static sql_exp *
387 3483314 : rel_propagate_statistics(visitor *v, sql_rel *rel, sql_exp *e, int depth)
388 : {
389 3483314 : mvc *sql = v->sql;
390 3483314 : atom *lval;
391 :
392 3483314 : (void) depth;
393 3483314 : switch(e->type) {
394 2183957 : case e_column:
395 2183957 : switch (rel->op) { /* set relations don't call rel_propagate_statistics */
396 279077 : case op_join:
397 : case op_left:
398 : case op_right:
399 : case op_full:
400 : case op_semi:
401 : case op_anti: {
402 279077 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e);
403 279077 : if (!found)
404 139981 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
405 : break;
406 : }
407 1904880 : case op_select:
408 : case op_project:
409 : case op_groupby: {
410 1904880 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e); /* labels may be found on the same projection, ugh */
411 1904874 : if (!found && is_simple_project(rel->op))
412 125463 : (void) rel_propagate_column_ref_statistics(sql, rel, e);
413 : break;
414 : }
415 0 : case op_insert:
416 : case op_update:
417 : case op_delete:
418 0 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
419 0 : break;
420 : default:
421 : break;
422 : }
423 : break;
424 98370 : case e_convert: {
425 98370 : sql_subtype *to = exp_totype(e), *from = exp_fromtype(e);
426 98370 : sql_exp *l = e->l;
427 98370 : sql_class fr = from->type->eclass, too = to->type->eclass;
428 98370 : prop *est;
429 :
430 98370 : if (fr == too) {
431 89291 : if ((lval = find_prop_and_get(l->p, PROP_MAX))) {
432 58695 : atom *res = atom_copy(sql->sa, lval);
433 58695 : if ((res = atom_cast(sql->sa, res, to)))
434 58672 : set_minmax_property(sql, e, PROP_MAX, res);
435 : }
436 89291 : if ((lval = find_prop_and_get(l->p, PROP_MIN))) {
437 59318 : atom *res = atom_copy(sql->sa, lval);
438 59318 : if ((res = atom_cast(sql->sa, res, to)))
439 59295 : set_minmax_property(sql, e, PROP_MIN, res);
440 : }
441 : }
442 : /* propagating nuniques through conversions is tricky. I am adding just the general cases */
443 98370 : if ((est = find_prop(l->p, PROP_NUNIQUES)) &&
444 60962 : (((fr == EC_TIME || fr == EC_TIME_TZ || fr == EC_DATE) && (too == EC_TIMESTAMP || too == EC_TIMESTAMP_TZ)) ||
445 60937 : ((fr == EC_NUM || fr == EC_BIT) && too == EC_NUM) || (fr == EC_DEC && too == EC_DEC) || (fr != EC_EXTERNAL && too == EC_STRING))) {
446 57510 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
447 57510 : p->value.dval = est->value.dval;
448 : }
449 98370 : if (!has_nil(l))
450 55416 : set_has_no_nil(e);
451 : break;
452 : }
453 342059 : case e_aggr:
454 : case e_func: {
455 342059 : BUN lv;
456 342059 : sql_subfunc *f = e->f;
457 :
458 342059 : if (!f->func->s) {
459 315556 : int key = hash_key(f->func->base.name); /* Using hash lookup */
460 315556 : sql_hash_e *he = sql_functions_lookup->buckets[key&(sql_functions_lookup->size-1)];
461 315556 : lookup_function look = NULL;
462 :
463 689229 : for (; he && !look; he = he->chain) {
464 373673 : struct function_properties* fp = (struct function_properties*) he->value;
465 :
466 373673 : if (!strcmp(f->func->base.name, fp->name))
467 107764 : look = fp->func;
468 : }
469 315556 : if (look)
470 107764 : look(sql, e);
471 : }
472 : /* for global aggregates with no semantics, if the left relation has values, then the output won't be NULL */
473 342059 : if (!is_semantics(e) && e->l && !have_nil(e->l) &&
474 89234 : (e->type != e_aggr || (is_groupby(rel->op) && list_length(rel->r)) || ((lv = get_rel_count(rel->l)) != BUN_NONE && lv > 0)))
475 88911 : set_has_no_nil(e);
476 : /* set properties for global aggregates */
477 342059 : if (e->type == e_aggr && is_groupby(rel->op) && list_empty(rel->r)) {
478 7871 : if (!find_prop(e->p, PROP_NUNIQUES)) {
479 7871 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
480 7871 : p->value.dval = 1;
481 : }
482 7871 : set_unique(e);
483 : }
484 : break;
485 : }
486 571199 : case e_atom:
487 571199 : if (e->l) {
488 554468 : atom *a = (atom*) e->l;
489 554468 : if (!a->isnull) {
490 491149 : set_minmax_property(sql, e, PROP_MAX, a);
491 491149 : set_minmax_property(sql, e, PROP_MIN, a);
492 : }
493 554468 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
494 554468 : p->value.dval = 1;
495 16731 : } else if (e->f) {
496 3206 : list *vals = (list *) e->f;
497 3206 : sql_exp *first = vals->h ? vals->h->data : NULL;
498 3206 : atom *max = NULL, *min = NULL; /* all child values must have a valid min/max */
499 3206 : int has_nil = 0;
500 :
501 3206 : if (first) {
502 3206 : max = ((lval = find_prop_and_get(first->p, PROP_MAX))) ? lval : NULL;
503 3206 : min = ((lval = find_prop_and_get(first->p, PROP_MIN))) ? lval : NULL;
504 3206 : has_nil |= has_nil(first);
505 : }
506 :
507 13958 : for (node *n = vals->h ? vals->h->next : NULL ; n ; n = n->next) {
508 7546 : sql_exp *ee = n->data;
509 :
510 7546 : if (min && max) {
511 7093 : if ((lval = find_prop_and_get(ee->p, PROP_MAX))) {
512 7047 : max = atom_cmp(lval, max) > 0 ? lval : max;
513 : } else {
514 : max = NULL;
515 : }
516 7093 : if ((lval = find_prop_and_get(ee->p, PROP_MIN))) {
517 7047 : min = atom_cmp(min, lval) > 0 ? lval : min;
518 : } else {
519 : min = NULL;
520 : }
521 : }
522 7546 : has_nil |= has_nil(ee);
523 : }
524 :
525 3206 : if (!has_nil)
526 2840 : set_has_no_nil(e);
527 3206 : if (min && max) {
528 2817 : set_minmax_property(sql, e, PROP_MAX, max);
529 2817 : set_minmax_property(sql, e, PROP_MIN, min);
530 : }
531 3206 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
532 3206 : p->value.dval = (dbl) list_length(vals);
533 : } else {
534 13525 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
535 13525 : p->value.dval = 1;
536 : }
537 : break;
538 287729 : case e_cmp:
539 : /* TODO? propagating min/max/unique of booleans is not very worth it */
540 287729 : if (e->flag == cmp_or || e->flag == cmp_filter) {
541 18103 : if (!have_nil(e->l) && !have_nil(e->r))
542 13621 : set_has_no_nil(e);
543 269626 : } else if (e->flag == cmp_in || e->flag == cmp_notin) {
544 21270 : sql_exp *le = e->l;
545 21270 : if (!has_nil(le) && !have_nil(e->r))
546 18224 : set_has_no_nil(e);
547 : } else {
548 248356 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
549 248356 : if (!has_nil(le) && !has_nil(re) && (!e->f || !has_nil(fe)))
550 178405 : set_has_no_nil(e);
551 : }
552 : break;
553 : case e_psm:
554 : break;
555 : }
556 :
557 : #ifndef NDEBUG
558 : {
559 : /* min and max cannot be NULL and min must be <= than max, if it happens the inner relation must be empty! */
560 3483308 : atom *min = find_prop_and_get(e->p, PROP_MIN), *max = find_prop_and_get(e->p, PROP_MAX);
561 :
562 3483307 : (void) min;
563 3483307 : (void) max;
564 3483307 : assert(!min || !min->isnull);
565 3483307 : assert(!max || !max->isnull);
566 3483307 : assert(!min || !max || (min && max)/* atom_cmp(min, max) <= 0*/);
567 : }
568 : #endif
569 3483307 : return e;
570 : }
571 :
572 : static list * /* Remove predicates always false from min/max values */
573 139699 : rel_prune_predicates(visitor *v, sql_rel *rel)
574 : {
575 139699 : if (rel->l) {
576 139699 : sql_rel *l = rel->l;
577 139699 : if (is_single(l))
578 3399 : return rel->exps;
579 : }
580 136300 : if (!list_empty(rel->attr))
581 2972 : return rel->exps;
582 282870 : for (node *n = rel->exps->h ; n ; n = n->next) {
583 149542 : sql_exp *e = n->data;
584 :
585 149542 : if (e->type == e_cmp && is_theta_exp(e->flag)) {
586 123074 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
587 123074 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
588 123074 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
589 123074 : bool always_false = false, always_true = false;
590 :
591 123074 : if (fe && !is_symmetric(e)) {
592 3058 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
593 3058 : comp_type lower = range2lcompare(e->flag), higher = range2rcompare(e->flag);
594 3660 : int not_int1 = rval_min && lval_max && /* the middle and left intervals don't overlap */
595 1663 : (!is_anti(e) && (lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0)),
596 4072 : not_int2 = lval_min && fval_max && /* the middle and right intervals don't overlap */
597 2485 : (!is_anti(e) && (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0)),
598 3058 : not_int3 = rval_min && fval_max && /* the left interval is after the right one */
599 1287 : (!is_anti(e) && (atom_cmp(rval_min, fval_max) > 0));
600 :
601 3058 : always_false |= not_int1 || not_int2 || not_int3;
602 : /* for anti the middle must be before the left or after the right or the right after the left, for the other the middle must be always between the left and right intervals */
603 4113 : always_true |= !has_nil(le) && !has_nil(re) && !has_nil(fe) &&
604 3957 : lval_min && lval_max && rval_min && rval_max && fval_min && fval_max &&
605 575 : (is_anti(e) ? ((lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0) || (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0) || atom_cmp(rval_min, fval_max) > 0) :
606 488 : ((lower == cmp_gte ? atom_cmp(lval_min, rval_max) >= 0 : atom_cmp(lval_min, rval_max) > 0) && (higher == cmp_lte ? atom_cmp(fval_min, lval_max) >= 0 : atom_cmp(fval_min, lval_max) > 0)));
607 : } else if (!fe) {
608 119998 : if (!is_semantics(e)) /* trivial not null cmp null case */
609 228578 : always_false |= !is_anti(e) && ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re)));
610 119998 : switch (e->flag) {
611 104508 : case cmp_equal:
612 104508 : if (lval_min && lval_max && rval_min && rval_max && (!is_semantics(e) || !has_nil(le) || !has_nil(re)))
613 134024 : always_false |= (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
614 104508 : if (is_semantics(e)) { /* prune *= NULL cases */
615 5683 : always_false |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
616 11366 : always_true |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
617 : }
618 : break;
619 7227 : case cmp_notequal:
620 7227 : if (lval_min && lval_max && rval_min && rval_max)
621 11376 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
622 7227 : if (is_semantics(e)) {
623 26 : always_true |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
624 52 : always_false |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
625 : }
626 : break;
627 5632 : case cmp_gt:
628 5632 : if (lval_max && rval_min)
629 1947 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
630 5632 : if (lval_min && rval_max)
631 3894 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
632 : break;
633 119 : case cmp_gte:
634 119 : if (lval_max && rval_min)
635 49 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
636 119 : if (lval_min && rval_max)
637 98 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
638 : break;
639 2428 : case cmp_lt:
640 2428 : if (lval_min && rval_max)
641 1376 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
642 2428 : if (lval_max && rval_min)
643 2800 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
644 : break;
645 84 : case cmp_lte:
646 84 : if (lval_min && rval_max)
647 11 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
648 84 : if (lval_max && rval_min)
649 22 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
650 : break;
651 : default: /* Maybe later I can do cmp_in and cmp_notin but don't forget to remove is_theta_exp check up there */
652 : break;
653 : }
654 : }
655 123074 : assert(!always_false || !always_true);
656 123074 : if (always_false || always_true) {
657 3665 : sql_exp *ne = exp_atom_bool(v->sql->sa, always_true /* if always_true set then true else false */);
658 3665 : if (exp_name(e))
659 0 : exp_prop_alias(v->sql->sa, ne, e);
660 3665 : n->data = ne;
661 3665 : v->changes++;
662 : }
663 : }
664 : }
665 133328 : return rel->exps;
666 : }
667 :
668 : static sql_rel *
669 14 : set_setop_side(visitor *v, sql_rel *rel, sql_rel *side)
670 : {
671 14 : if (side == rel->r)
672 0 : rel->r = NULL;
673 : else
674 14 : rel->l = NULL;
675 14 : if (!is_simple_project(side->op) || rel_is_ref(side) || !list_empty(side->r)) {
676 0 : side = rel_project(v->sql->sa, side, rel_projections(v->sql, side, NULL, 0, 1));
677 0 : set_count_prop(v->sql->sa, side, get_rel_count(side->l));
678 0 : side->exps = exps_exp_visitor_bottomup(v, side, side->exps, 0, &rel_propagate_statistics, false);
679 : }
680 35 : for (node *n = rel->exps->h, *m = side->exps->h ; n && m ; n = n->next, m = m->next) {
681 21 : sql_exp *e1 = n->data, *e2 = m->data;
682 21 : exp_prop_alias(v->sql->sa, e2, e1);
683 : }
684 14 : list_hash_clear(side->exps);
685 :
686 14 : if (need_distinct(rel))
687 0 : set_distinct(side);
688 14 : side->p = prop_copy(v->sql->sa, rel->p);
689 14 : rel_destroy(rel);
690 14 : return side;
691 : }
692 :
693 : static BUN
694 21887 : trivial_project_exp_card(sql_exp *e)
695 : {
696 22193 : if (e->type == e_convert)
697 306 : return trivial_project_exp_card(e->l);
698 21887 : return e->type == e_atom && e->f ? (BUN) list_length(e->f) : 1;
699 : }
700 :
701 : static BUN
702 24249 : rel_calc_nuniques(mvc *sql, sql_rel *l, list *exps)
703 : {
704 24249 : BUN lv = get_rel_count(l);
705 :
706 24249 : if (lv == 0)
707 : return 0;
708 21540 : if (!list_empty(exps)) {
709 21540 : BUN nuniques = 0;
710 : /* compute the highest number of unique values */
711 51133 : for (node *n = exps->h ; n && nuniques != BUN_NONE ; n = n->next) {
712 29593 : sql_exp *e = n->data;
713 29593 : sql_rel *bt = NULL;
714 29593 : prop *p = NULL;
715 29593 : BUN euniques = BUN_NONE;
716 29593 : atom *min, *max, *sub = NULL;
717 29593 : sql_subtype *tp = exp_subtype(e);
718 29593 : sql_class ec = tp ? tp->type->eclass : EC_STRING; /* if 'e' has no type (eg parameter), use a non-number type to fail condition */
719 :
720 29593 : if ((p = find_prop(e->p, PROP_NUNIQUES))) {
721 20957 : euniques = (BUN) p->value.dval;
722 8636 : } else if (e->type == e_column && e->nid && rel_find_exp_and_corresponding_rel(l, e, false, &bt, NULL) && bt && (p = find_prop(bt->p, PROP_COUNT))) {
723 6593 : euniques = (BUN) p->value.lval;
724 : }
725 : /* use min to max range to compute number of possible values in the domain for number types */
726 29593 : if ((EC_TEMP(ec)||ec==EC_NUM||ec==EC_MONTH||ec==EC_POS) &&
727 22363 : (min = find_prop_and_get(e->p, PROP_MIN)) && (max = find_prop_and_get(e->p, PROP_MAX))) {
728 : /* the range includes min and max, so the atom_inc call is needed */
729 : /* if 'euniques' has number of distinct values, compute min between both */
730 17543 : if ((sub = atom_sub(sql->sa, max, min)) && (sub = atom_inc(sql->sa, sub)) && (sub = atom_cast(sql->sa, sub, sql_bind_localtype("oid"))))
731 0 : euniques = MIN(euniques, (BUN) sub->data.val.oval);
732 : }
733 29593 : if (euniques != BUN_NONE)
734 27550 : nuniques = MAX(nuniques, euniques); /* the highest cardinality sets the estimation */
735 : else
736 : nuniques = BUN_NONE;
737 : }
738 21540 : if (nuniques != BUN_NONE)
739 : return nuniques;
740 : }
741 : return lv;
742 : }
743 :
744 : static sql_rel *
745 2073754 : rel_get_statistics_(visitor *v, sql_rel *rel)
746 : {
747 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
748 2073754 : uint8_t has_special_modify = *(uint8_t*) v->data;
749 2073754 : bool can_be_pruned = !has_special_modify && v->storage_based_opt;
750 :
751 : /* Don't look at the same relation twice */
752 2073754 : if (are_statistics_gathered(rel->used))
753 : return rel;
754 1332515 : rel->used |= statistics_gathered;
755 :
756 1332515 : switch (rel->op) {
757 312996 : case op_basetable: {
758 312996 : sql_table *t = (sql_table *) rel->l;
759 312996 : sqlstore *store = v->sql->session->tr->store;
760 :
761 312996 : if (!list_empty(rel->exps)) {
762 1250698 : for (node *n = rel->exps->h ; n ; n = n->next)
763 937751 : rel_basetable_column_get_statistics(v->sql, rel, n->data);
764 : }
765 : /* Set table row count. TODO? look for remote tables. Don't look at storage for declared tables, because it won't be cleaned */
766 312996 : if (isTable(t) && t->s && !isDeclaredTable(t)) /* count active rows only */
767 259440 : set_count_prop(v->sql->sa, rel, (BUN)store->storage_api.count_col(v->sql->session->tr, ol_first_node(t->columns)->data, 10));
768 : break;
769 : }
770 2790 : case op_union:
771 : case op_inter:
772 : case op_except: {
773 2790 : bool empty_cross = false;
774 2790 : int i = 0;
775 2790 : sql_rel *l = rel->l, *pl = rel->l, *r = rel->r, *pr = rel->r;
776 :
777 2790 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
778 0 : pl = pl->l;
779 2790 : while (is_sample(pr->op) || is_topn(pr->op))
780 0 : pr = pr->l;
781 : /* if it's not a projection, then project and propagate statistics */
782 2790 : if (!is_project(pl->op) && !is_base(pl->op)) {
783 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
784 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
785 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
786 : }
787 2790 : if (!is_project(pr->op) && !is_base(pr->op)) {
788 0 : pr = rel_project(v->sql->sa, pr, rel_projections(v->sql, pr, NULL, 0, 1));
789 0 : set_count_prop(v->sql->sa, pr, get_rel_count(pr->l));
790 0 : pr->exps = exps_exp_visitor_bottomup(v, pr, pr->exps, 0, &rel_propagate_statistics, false);
791 : }
792 :
793 11662 : for (node *n = rel->exps->h ; n ; n = n->next) {
794 8872 : empty_cross |= rel_setop_get_statistics(v->sql, rel, pl->exps, pr->exps, n->data, i);
795 8872 : i++;
796 : }
797 :
798 : /* propagate row count */
799 2790 : if (is_union(rel->op)) {
800 277 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
801 277 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
802 :
803 277 : if (lv == 0 && rv == 0) { /* both sides empty */
804 2 : if (can_be_pruned)
805 : empty_cross = true;
806 : else
807 2 : set_count_prop(v->sql->sa, rel, 0);
808 275 : } else if (can_be_pruned && lv == 0 && !rel_is_ref(rel)) { /* left side empty */
809 0 : rel = set_setop_side(v, rel, r);
810 0 : empty_cross = false; /* don't rewrite again */
811 275 : } else if (can_be_pruned && rv == 0 && !rel_is_ref(rel)) { /* right side empty */
812 0 : rel = set_setop_side(v, rel, l);
813 0 : empty_cross = false; /* don't rewrite again */
814 275 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
815 7 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
816 : }
817 2513 : } else if (is_inter(rel->op) || is_except(rel->op)) {
818 2513 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
819 2513 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
820 :
821 2513 : if (lv == 0) { /* left side empty */
822 62 : if (can_be_pruned)
823 : empty_cross = true;
824 : else
825 5 : set_count_prop(v->sql->sa, rel, 0);
826 2451 : } else if (rv == 0) { /* right side empty */
827 17 : if (is_inter(rel->op)) {
828 3 : if (can_be_pruned)
829 : empty_cross = true;
830 : else
831 0 : set_count_prop(v->sql->sa, rel, 0);
832 14 : } else if (can_be_pruned && !rel_is_ref(rel)) {
833 14 : rel = set_setop_side(v, rel, l);
834 14 : empty_cross = false; /* don't rewrite again */
835 : } else {
836 0 : set_count_prop(v->sql->sa, rel, lv);
837 : }
838 : } else {
839 2434 : set_count_prop(v->sql->sa, rel, lv);
840 : }
841 : }
842 2790 : if (can_be_pruned && empty_cross) {
843 80 : rel_destroy(rel->l);
844 80 : rel->l = NULL;
845 80 : rel_destroy(rel->r);
846 80 : rel->r = NULL;
847 244 : for (node *n = rel->exps->h ; n ; n = n->next) {
848 164 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
849 164 : exp_prop_alias(v->sql->sa, a, e);
850 164 : n->data = a;
851 : }
852 80 : list_hash_clear(rel->exps);
853 80 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
854 80 : set_count_prop(v->sql->sa, l, 1);
855 80 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
856 80 : set_count_prop(v->sql->sa, l, 0);
857 80 : rel->op = op_project;
858 80 : rel->l = l;
859 80 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
860 80 : set_count_prop(v->sql->sa, rel, 0);
861 80 : set_nodistinct(rel); /* set relations may have distinct flag set */
862 80 : v->changes++;
863 : }
864 : break;
865 : }
866 12883 : case op_munion: {
867 12883 : list *l = rel->l, *nrels = sa_list(v->sql->sa);
868 12883 : BUN cnt = 0;
869 12883 : bool needs_pruning = false;
870 :
871 49258 : for (node *n = l->h; n; n = n->next) {
872 36375 : sql_rel *r = n->data, *pl = r;
873 :
874 36375 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
875 0 : pl = pl->l;
876 : /* if it's not a projection, then project and propagate statistics */
877 36375 : if (!is_project(pl->op) && !is_base(pl->op)) {
878 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
879 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
880 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
881 : }
882 36375 : nrels = append(nrels, pl);
883 : /* we need new munion statistics */
884 : /* propagate row count */
885 36375 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
886 : /* if PROP_COUNT does not exist we assume at least a row (see get_rel_count def) */
887 36375 : if (rv == BUN_NONE) {
888 1156 : cnt++;
889 1156 : continue;
890 : }
891 35219 : if (!rv && can_be_pruned)
892 6636 : needs_pruning = true;
893 : /* overflow check */
894 35219 : if (rv > (BUN_MAX - cnt))
895 36375 : rv = BUN_MAX;
896 : else
897 35219 : cnt += rv;
898 : }
899 12883 : int i = 0;
900 73644 : for (node *n = rel->exps->h ; n ; n = n->next, i++)
901 60761 : rel_munion_get_statistics(v->sql, rel, nrels, n->data, i);
902 :
903 12883 : if (needs_pruning && !rel_is_ref(rel)) {
904 4475 : v->changes++;
905 4475 : list *nl = sa_list(l->sa);
906 :
907 16498 : for (node *n = nrels->h; n; n = n->next) {
908 12023 : sql_rel *r = n->data;
909 12023 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
910 :
911 12023 : if (!rv) { /* keep last for now */
912 6166 : rel_destroy(r);
913 6166 : continue;
914 : }
915 5857 : nl = append(nl, r);
916 : }
917 4475 : rel->l = nl;
918 4475 : if (list_length(nl) == 1) {
919 4149 : sql_rel *l = rel->l = nl->h->data; /* ugh */
920 4149 : rel->r = NULL;
921 4149 : rel->op = op_project;
922 :
923 20379 : for (node *n = rel->exps->h, *m = l->exps->h ; n && m ; n = n->next, m = m->next) {
924 16230 : sql_exp *pe = n->data, *ie = m->data;
925 16230 : sql_exp *ne = exp_ref(v->sql, ie);
926 16230 : exp_prop_alias(v->sql->sa, ne, pe);
927 16230 : n->data = ne;
928 : }
929 4149 : list_hash_clear(rel->exps);
930 326 : } else if (list_empty(nl)) {
931 : /* empty select (project [ nils ] ) */
932 454 : for (node *n = rel->exps->h ; n ; n = n->next) {
933 354 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
934 354 : exp_prop_alias(v->sql->sa, a, e);
935 354 : n->data = a;
936 : }
937 100 : list_hash_clear(rel->exps);
938 100 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
939 100 : set_count_prop(v->sql->sa, l, 1);
940 100 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
941 100 : set_count_prop(v->sql->sa, l, 0);
942 100 : rel->op = op_project;
943 100 : rel->r = NULL;
944 100 : rel->l = l;
945 100 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
946 100 : set_count_prop(v->sql->sa, rel, 0);
947 100 : set_nodistinct(rel); /* set relations may have distinct flag set */
948 : }
949 : } else {
950 8408 : set_count_prop(v->sql->sa, rel, cnt);
951 : }
952 : break;
953 : }
954 532922 : case op_join:
955 : case op_left:
956 : case op_right:
957 : case op_full:
958 : case op_semi:
959 : case op_anti:
960 : case op_select:
961 : case op_project:
962 : case op_groupby: {
963 532922 : if (is_groupby(rel->op) && !list_empty(rel->r))
964 15532 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
965 532922 : rel->exps = exps_exp_visitor_bottomup(v, rel, rel->exps, 0, &rel_propagate_statistics, false);
966 532922 : if ((is_simple_project(rel->op) || is_groupby(rel->op)) && !list_empty(rel->r))
967 39250 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
968 : /* The following optimizations can only be applied after propagating the statistics to rel->exps */
969 532922 : if (can_be_pruned && (is_join(rel->op) || is_select(rel->op)) && !list_empty(rel->exps)) {
970 139699 : int changes = v->changes;
971 139699 : rel->exps = rel_prune_predicates(v, rel);
972 139699 : if (v->changes > changes)
973 3632 : rel = rewrite_simplify(v, 0, v->value_based_opt, rel);
974 : }
975 :
976 : /* propagate row count */
977 532922 : sql_rel *l = rel->l, *r = rel->r;
978 532922 : switch (rel->op) {
979 134501 : case op_join:
980 : case op_left:
981 : case op_right:
982 : case op_full: {
983 134501 : BUN lv = get_rel_count(l), rv = get_rel_count(r), uniques_estimate = BUN_MAX, join_idx_estimate = BUN_MAX;
984 :
985 134501 : if (!list_empty(rel->exps) && !is_single(rel)) {
986 246019 : for (node *n = rel->exps->h ; n ; n = n->next) {
987 125592 : sql_exp *e = n->data, *el = e->l, *er = e->r;
988 :
989 125592 : if (find_prop(e->p, PROP_JOINIDX)) {
990 667 : join_idx_estimate = lv>rv?lv:rv;
991 124925 : } else if (e->type == e_cmp && e->flag == cmp_equal) {
992 : /* if one of the sides is unique, the cardinality will be that exact number, but look for nulls */
993 121041 : if (!is_semantics(e) || !has_nil(el) || !has_nil(er)) {
994 120645 : BUN lu = 0, ru = 0;
995 120645 : prop *p = NULL;
996 120645 : if ((p = find_prop(el->p, PROP_NUNIQUES)))
997 89535 : lu = (BUN) p->value.dval;
998 120645 : if ((p = find_prop(er->p, PROP_NUNIQUES)))
999 103955 : ru = (BUN) p->value.dval;
1000 120645 : if (is_unique(el) || lu > lv) {
1001 73671 : BUN ncount = (is_right(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : lv;
1002 73671 : uniques_estimate = MIN(uniques_estimate, ncount);
1003 46974 : } else if (is_unique(er) || ru > rv) {
1004 29347 : BUN ncount = (is_left(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : rv;
1005 29347 : uniques_estimate = MIN(uniques_estimate, ncount);
1006 : }
1007 : }
1008 : }
1009 : }
1010 : }
1011 134501 : if (is_single(rel)) {
1012 4739 : set_count_prop(v->sql->sa, rel, lv);
1013 129762 : } else if (join_idx_estimate != BUN_MAX) {
1014 665 : set_count_prop(v->sql->sa, rel, join_idx_estimate);
1015 129097 : } else if (uniques_estimate != BUN_MAX) {
1016 96561 : set_count_prop(v->sql->sa, rel, uniques_estimate);
1017 32536 : } else if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1018 : /* corner cases for outer joins */
1019 126 : if (is_left(rel->op)) {
1020 114 : set_count_prop(v->sql->sa, rel, lv);
1021 12 : } else if (is_right(rel->op)) {
1022 3 : set_count_prop(v->sql->sa, rel, rv);
1023 9 : } else if (is_full(rel->op) && lv != BUN_NONE && rv != BUN_NONE) {
1024 9 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
1025 0 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1026 0 : set_count_prop(v->sql->sa, rel, 0);
1027 : }
1028 32410 : } else if (lv == 0) {
1029 1170 : set_count_prop(v->sql->sa, rel, (is_right(rel->op) || is_full(rel->op)) ? rv : 0);
1030 31811 : } else if (rv == 0) {
1031 1568 : set_count_prop(v->sql->sa, rel, (is_left(rel->op) || is_full(rel->op)) ? lv : 0);
1032 30737 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1033 18276 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX / lv)) ? BUN_MAX : (lv * rv)); /* overflow check */
1034 : }
1035 : break;
1036 : }
1037 2635 : case op_anti:
1038 2635 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1039 2635 : break;
1040 109201 : case op_semi:
1041 : case op_select:
1042 : /* TODO calculate cardinalities using selectivities */
1043 109201 : if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1044 2742 : set_count_prop(v->sql->sa, rel, 0);
1045 : } else {
1046 209670 : if (!list_empty(rel->exps) && !is_single(rel)) {
1047 103211 : BUN cnt = get_rel_count(l), u = 1;
1048 172080 : for (node *n = rel->exps->h ; n ; n = n->next) {
1049 113082 : sql_exp *e = n->data, *el = e->l, *er = e->r;
1050 :
1051 : /* simple expressions first */
1052 113082 : if (e->type == e_cmp && e->flag == cmp_equal && exp_is_atom(er)) {
1053 : /* use selectivity */
1054 57423 : prop *p;
1055 57423 : if ((p = find_prop(el->p, PROP_NUNIQUES))) {
1056 44213 : u = (BUN) p->value.dval;
1057 44213 : break;
1058 : }
1059 : }
1060 : }
1061 : /* u is an *estimate*, so don't set count_prop to 0 unless cnt is 0 */
1062 103211 : set_count_prop(v->sql->sa, rel, cnt == 0 ? 0 : u == 0 || u > cnt ? 1 : cnt/u);
1063 : } else {
1064 3248 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1065 : }
1066 : }
1067 : break;
1068 263226 : case op_project:
1069 263226 : if (l) {
1070 253109 : if (need_distinct(rel)) {
1071 0 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->exps));
1072 : } else {
1073 253109 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1074 : }
1075 : } else {
1076 10117 : BUN card = 1;
1077 :
1078 10117 : if (!list_empty(rel->exps)) {
1079 30675 : for (node *n = rel->exps->h ; n ; n = n->next) {
1080 20558 : sql_exp *e = n->data;
1081 20558 : card = MAX(card, trivial_project_exp_card(e));
1082 : }
1083 : }
1084 10117 : set_count_prop(v->sql->sa, rel, card);
1085 : }
1086 : break;
1087 22954 : case op_groupby:
1088 22954 : if (list_empty(rel->r)) {
1089 7421 : set_count_prop(v->sql->sa, rel, 1);
1090 : } else {
1091 15533 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->r));
1092 : }
1093 : break;
1094 : default:
1095 : break;
1096 : }
1097 : break;
1098 : }
1099 16792 : case op_topn: {
1100 16792 : BUN lv = get_rel_count(rel->l);
1101 :
1102 16792 : if (lv != BUN_NONE) {
1103 16775 : sql_exp *le = rel->exps->h->data, *oe = list_length(rel->exps) > 1 ? rel->exps->h->next->data : NULL;
1104 84 : if (oe && oe->l && exp_is_not_null(oe)) { /* no parameters */
1105 84 : BUN offset = (BUN) ((atom*)oe->l)->data.val.lval;
1106 84 : lv = offset >= lv ? 0 : lv - offset;
1107 : }
1108 16775 : if (le->l && exp_is_not_null(le)) {
1109 16741 : BUN limit = (BUN) ((atom*)le->l)->data.val.lval;
1110 16741 : lv = MIN(lv, limit);
1111 : }
1112 16775 : set_count_prop(v->sql->sa, rel, lv);
1113 : }
1114 : break;
1115 : }
1116 22 : case op_sample: {
1117 22 : BUN lv = get_rel_count(rel->l);
1118 :
1119 22 : if (lv != BUN_NONE) {
1120 4 : sql_exp *se = rel->exps->h->data;
1121 4 : sql_subtype *tp = exp_subtype(se);
1122 :
1123 4 : if (se->l && tp->type->eclass == EC_NUM) { /* sample is a number of rows */
1124 4 : BUN sample = (BUN) ((atom*)se->l)->data.val.lval;
1125 4 : lv = MIN(lv, sample);
1126 0 : } else if (se->l) { /* sample is a percentage of rows */
1127 0 : dbl percent = ((atom*)se->l)->data.val.dval;
1128 0 : assert(tp->type->eclass == EC_FLT);
1129 0 : lv = (BUN) ceil((dbl)lv * percent);
1130 : }
1131 4 : set_count_prop(v->sql->sa, rel, lv);
1132 : }
1133 : break;
1134 : }
1135 5977 : case op_table: {
1136 5977 : sql_exp *op = rel->r;
1137 5977 : if (rel->flag != TRIGGER_WRAPPER && op) {
1138 5665 : sql_subfunc *f = op->f;
1139 5665 : if (f->func->lang == FUNC_LANG_SQL) {
1140 97 : set_count_prop(v->sql->sa, rel, 1000 /* just some fallback value */);
1141 5568 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "storage") == 0) {
1142 829 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of catalog */);
1143 4739 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "db_users") == 0) {
1144 0 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of users */);
1145 4739 : } else if (f->func->lang == FUNC_LANG_MAL && strncmp(f->func->base.name, "querylog", 8) == 0) {
1146 1085 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of querylog */);
1147 3654 : } else if (f->func->lang == FUNC_LANG_MAL &&
1148 3542 : (strcmp(f->func->base.name, "queue") == 0 ||
1149 3278 : strcmp(f->func->base.name, "optimizers") == 0 ||
1150 2966 : strcmp(f->func->base.name, "env") == 0 ||
1151 2675 : strcmp(f->func->base.name, "keywords") == 0 ||
1152 2675 : strcmp(f->func->base.name, "statistics") == 0 ||
1153 2008 : strcmp(f->func->base.name, "rejects") == 0 ||
1154 1754 : strcmp(f->func->base.name, "schemastorage") == 0 ||
1155 1754 : strncmp(f->func->base.name, "storage", 7) == 0 ||
1156 1754 : strcmp(f->func->base.name, "sessions") == 0) ) {
1157 2078 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of queue */);
1158 : }
1159 : /* else {
1160 : printf("%%func needs stats : %s\n", f->func->base.name);
1161 : } */
1162 : }
1163 : break;
1164 : }
1165 : /*These relations are less important for now
1166 : TODO later we can tune it */
1167 : case op_insert:
1168 : case op_update:
1169 : case op_delete:
1170 : case op_truncate:
1171 : case op_ddl:
1172 : default:
1173 : break;
1174 : }
1175 :
1176 : return rel;
1177 : }
1178 :
1179 : static sql_rel *
1180 540130 : rel_get_statistics(visitor *v, global_props *gp, sql_rel *rel)
1181 : {
1182 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
1183 540130 : uint8_t has_special_modify = (uint8_t) gp->has_special_modify;
1184 540130 : v->data = &has_special_modify;
1185 540130 : rel = rel_visitor_bottomup(v, rel, &rel_get_statistics_);
1186 540131 : v->data = gp;
1187 540131 : return rel;
1188 : }
1189 :
1190 : run_optimizer
1191 705970 : bind_get_statistics(visitor *v, global_props *gp)
1192 : {
1193 705970 : (void) v;
1194 705970 : return (gp->opt_level == 1 && !gp->cnt[op_insert]) ? rel_get_statistics : NULL;
1195 : }
1196 :
1197 :
1198 : static bool
1199 94940 : point_select_on_unique_column(sql_rel *rel)
1200 : {
1201 94940 : if (is_select(rel->op) && !list_empty(rel->exps)) {
1202 131415 : for (node *n = rel->exps->h; n ; n = n->next) {
1203 75676 : sql_exp *e = n->data, *el = e->l, *er = e->r, *found = NULL;
1204 :
1205 75676 : if (is_compare(e->type) && e->flag == cmp_equal) {
1206 33647 : if (is_numeric_upcast(el))
1207 0 : el = el->l;
1208 33647 : if (is_numeric_upcast(er))
1209 0 : er = er->l;
1210 33647 : if (is_alias(el->type) && exp_is_atom(er) && (found = rel_find_exp(rel->l, el)) &&
1211 749 : is_unique(found) && (!is_semantics(e) || !has_nil(found) || !has_nil(er)))
1212 : return true;
1213 32898 : if (is_alias(er->type) && exp_is_atom(el) && (found = rel_find_exp(rel->l, er)) &&
1214 0 : is_unique(found) && (!is_semantics(e) || !has_nil(el) || !has_nil(found)))
1215 : return true;
1216 : }
1217 : }
1218 : }
1219 : return false;
1220 : }
1221 :
1222 : /*
1223 : * A point select on an unique column reduces the number of rows to 1. If the same select is under a
1224 : * join, the opposite side's select can be pushed above the join.
1225 : */
1226 : static inline sql_rel *
1227 1245850 : rel_push_select_up(visitor *v, sql_rel *rel)
1228 : {
1229 1245850 : if ((is_innerjoin(rel->op) || is_left(rel->op) || is_right(rel->op) || is_semi(rel->op)) && !is_single(rel)) {
1230 254093 : sql_rel *l = rel->l, *r = rel->r;
1231 254093 : bool can_pushup_left = is_select(l->op) && !rel_is_ref(l) && !is_single(l) && (is_innerjoin(rel->op) || is_left(rel->op) || is_semi(rel->op)),
1232 254093 : can_pushup_right = is_select(r->op) && !rel_is_ref(r) && !is_single(r) && (is_innerjoin(rel->op) || is_right(rel->op));
1233 :
1234 254093 : if (can_pushup_left || can_pushup_right) {
1235 66933 : if (can_pushup_left)
1236 45458 : can_pushup_left = point_select_on_unique_column(r);
1237 66933 : if (can_pushup_right)
1238 49482 : can_pushup_right = point_select_on_unique_column(l);
1239 :
1240 : /* if both selects retrieve one row each, it's not worth it to push both up */
1241 66933 : if (can_pushup_left && !can_pushup_right) {
1242 683 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1243 683 : nrel->l = l->l;
1244 683 : rel = rel_inplace_select(rel, nrel, l->exps);
1245 683 : assert(is_select(rel->op));
1246 683 : v->changes++;
1247 66250 : } else if (!can_pushup_left && can_pushup_right) {
1248 14 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1249 14 : nrel->r = r->l;
1250 14 : rel = rel_inplace_select(rel, nrel, r->exps);
1251 14 : assert(is_select(rel->op));
1252 14 : v->changes++;
1253 : }
1254 : }
1255 : }
1256 1245850 : return rel;
1257 : }
1258 :
1259 : static int
1260 93351 : sql_class_base_score(visitor *v, sql_column *c, sql_subtype *t, bool equality_based)
1261 : {
1262 93351 : int de;
1263 :
1264 93351 : if (!t)
1265 : return 0;
1266 93351 : switch (ATOMstorage(t->type->localtype)) {
1267 : case TYPE_bte:
1268 : return 150 - 8;
1269 1598 : case TYPE_sht:
1270 1598 : return 150 - 16;
1271 37326 : case TYPE_int:
1272 37326 : return 150 - 32;
1273 518 : case TYPE_void:
1274 : case TYPE_lng:
1275 518 : return 150 - 64;
1276 : case TYPE_uuid:
1277 : #ifdef HAVE_HGE
1278 : case TYPE_hge:
1279 : #endif
1280 : return 150 - 128;
1281 2 : case TYPE_flt:
1282 2 : return 75 - 24;
1283 : case TYPE_dbl:
1284 : return 75 - 53;
1285 30631 : default:
1286 30631 : if (equality_based && c && v->storage_based_opt && (de = mvc_is_duplicate_eliminated(v->sql, c)))
1287 1554 : return 150 - de * 8;
1288 : /* strings and blobs not duplicate eliminated don't get any points here */
1289 : return 0;
1290 : }
1291 : }
1292 :
1293 : static int
1294 34452 : score_se_base(visitor *v, sql_rel *rel, sql_exp *e)
1295 : {
1296 34452 : int res = 0;
1297 34452 : sql_subtype *t = exp_subtype(e);
1298 34452 : sql_column *c = NULL;
1299 :
1300 34452 : if (e->type == e_convert) /* keep unsafes at the end (TODO improve) */
1301 : return -1000;
1302 : /* can we find out if the underlying table is sorted */
1303 33851 : if ((c = exp_find_column(rel, e, -2)) && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1304 33851 : res += 600;
1305 :
1306 : /* prefer the shorter var types over the longer ones */
1307 33851 : res += sql_class_base_score(v, c, t, is_equality_or_inequality_exp(e->flag)); /* smaller the type, better */
1308 33851 : return res;
1309 : }
1310 :
1311 : static int
1312 58553 : score_se(visitor *v, sql_rel *rel, sql_exp *e)
1313 : {
1314 58553 : int score = 0;
1315 58553 : if (e->type == e_cmp && !is_complex_exp(e->flag)) {
1316 34452 : sql_exp *l = e->l;
1317 :
1318 34452 : while (l->type == e_cmp) { /* go through nested comparisons */
1319 2 : sql_exp *ll;
1320 :
1321 2 : if (l->flag == cmp_filter || l->flag == cmp_or)
1322 0 : ll = ((list*)l->l)->h->data;
1323 : else
1324 2 : ll = l->l;
1325 2 : if (ll->type != e_cmp)
1326 : break;
1327 : l = ll;
1328 : }
1329 34452 : score += score_se_base(v, rel, l);
1330 : }
1331 58553 : score += exp_keyvalue(e);
1332 58553 : return score;
1333 : }
1334 :
1335 : static inline sql_rel *
1336 1245850 : rel_select_order(visitor *v, sql_rel *rel)
1337 : {
1338 1245850 : int *scores = NULL;
1339 1245850 : sql_exp **exps = NULL;
1340 :
1341 1245850 : if (is_select(rel->op) && list_length(rel->exps) > 1) {
1342 27380 : node *n;
1343 27380 : int i, nexps = list_length(rel->exps);
1344 27380 : scores = SA_NEW_ARRAY(v->sql->ta, int, nexps);
1345 27380 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, nexps);
1346 :
1347 85933 : for (i = 0, n = rel->exps->h; n; i++, n = n->next) {
1348 58583 : exps[i] = n->data;
1349 58583 : if (find_prop(exps[i]->p, PROP_HASHCOL))
1350 : return rel;
1351 58553 : scores[i] = score_se(v, rel, n->data);
1352 : }
1353 27350 : GDKqsort(scores, exps, NULL, nexps, sizeof(int), sizeof(void *), TYPE_int, true, true);
1354 :
1355 85903 : for (i = 0, n = rel->exps->h; n; i++, n = n->next)
1356 58553 : n->data = exps[i];
1357 : }
1358 :
1359 : return rel;
1360 : }
1361 :
1362 : /* Compute the efficiency of using this expression early in a group by list */
1363 : static int
1364 59500 : score_gbe(visitor *v, sql_rel *rel, sql_exp *e)
1365 : {
1366 59500 : int res = 0;
1367 59500 : sql_subtype *t = exp_subtype(e);
1368 59500 : sql_column *c = exp_find_column(rel, e, -2);
1369 :
1370 59500 : if (e->card == CARD_ATOM) /* constants are trivial to group */
1371 38 : res += 1000;
1372 : /* can we find out if the underlying table is sorted */
1373 59500 : if (is_unique(e) || find_prop(e->p, PROP_HASHCOL) || (c && v->storage_based_opt && mvc_is_unique(v->sql, c))) /* distinct columns */
1374 3584 : res += 700;
1375 38910 : if (c && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1376 3710 : res += 500;
1377 59500 : if (find_prop(e->p, PROP_HASHIDX)) /* has hash index */
1378 0 : res += 200;
1379 :
1380 : /* prefer the shorter var types over the longer ones */
1381 59500 : res += sql_class_base_score(v, c, t, true); /* smaller the type, better */
1382 59500 : return res;
1383 : }
1384 :
1385 : /* reorder group by expressions */
1386 : static inline sql_rel *
1387 1245850 : rel_groupby_order(visitor *v, sql_rel *rel)
1388 : {
1389 1245850 : int *scores = NULL;
1390 1245850 : sql_exp **exps = NULL;
1391 :
1392 1245850 : if (is_groupby(rel->op) && list_length(rel->r) > 1) {
1393 25587 : node *n;
1394 25587 : list *gbe = rel->r;
1395 25587 : int i, ngbe = list_length(gbe);
1396 25587 : scores = SA_NEW_ARRAY(v->sql->ta, int, ngbe);
1397 25587 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, ngbe);
1398 :
1399 : /* first sorting step, give priority for integers and sorted columns */
1400 85087 : for (i = 0, n = gbe->h; n; i++, n = n->next) {
1401 59500 : exps[i] = n->data;
1402 59500 : scores[i] = score_gbe(v, rel, exps[i]);
1403 : }
1404 25587 : GDKqsort(scores, exps, NULL, ngbe, sizeof(int), sizeof(void *), TYPE_int, true, true);
1405 :
1406 : /* second sorting step, give priority to strings with lower number of digits */
1407 50504 : for (i = ngbe - 1; i && !scores[i]; i--); /* find expressions with no score from the first round */
1408 25587 : if (scores[i])
1409 24589 : i++;
1410 25587 : if (ngbe - i > 1) {
1411 8604 : for (int j = i; j < ngbe; j++) {
1412 6597 : sql_subtype *t = exp_subtype(exps[j]);
1413 6597 : scores[j] = t ? t->digits : 0;
1414 : }
1415 : /* the less number of digits the better, order ascending */
1416 2007 : GDKqsort(scores + i, exps + i, NULL, ngbe - i, sizeof(int), sizeof(void *), TYPE_int, false, true);
1417 : }
1418 :
1419 85087 : for (i = 0, n = gbe->h; n; i++, n = n->next)
1420 59500 : n->data = exps[i];
1421 : }
1422 :
1423 1245850 : return rel;
1424 : }
1425 :
1426 : /* This optimization loop contains optimizations that can potentially use statistics */
1427 : static sql_rel *
1428 1245850 : rel_final_optimization_loop_(visitor *v, sql_rel *rel)
1429 : {
1430 : /* Run rel_push_select_up only once at the end to avoid an infinite optimization loop */
1431 1245850 : rel = rel_push_select_up(v, rel);
1432 1245850 : rel = rel_select_order(v, rel);
1433 :
1434 : /* TODO? Maybe later add rel_simplify_count, rel_join2semijoin, rel_simplify_fk_joins,
1435 : rel_distinct_project2groupby, rel_simplify_predicates, rel_simplify_math,
1436 : rel_distinct_aggregate_on_unique_values */
1437 :
1438 1245850 : rel = rel_groupby_order(v, rel);
1439 1245850 : return rel;
1440 : }
1441 :
1442 : static sql_rel *
1443 66296 : rel_final_optimization_loop(visitor *v, global_props *gp, sql_rel *rel)
1444 : {
1445 66296 : (void) gp;
1446 66296 : return rel_visitor_bottomup(v, rel, &rel_final_optimization_loop_);
1447 : }
1448 :
1449 : run_optimizer
1450 705970 : bind_final_optimization_loop(visitor *v, global_props *gp)
1451 : {
1452 705970 : int flag = v->sql->sql_optimizer;
1453 : /* At the moment, this optimizer has dependency on 3 flags */
1454 547722 : return gp->opt_level == 1 && (gp->cnt[op_groupby] || gp->cnt[op_select]) &&
1455 772268 : (flag & push_select_up) && (flag & optimize_select_and_joins_topdown) && (flag & optimize_projections) ? rel_final_optimization_loop : NULL;
1456 : }
|