Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024, 2025 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "rel_optimizer_private.h"
15 : #include "rel_statistics.h"
16 : #include "rel_basetable.h"
17 : #include "rel_rewriter.h"
18 : #include "sql_storage.h"
19 :
20 : static sql_exp *
21 3487684 : comparison_find_column(sql_exp *input, sql_exp *e)
22 : {
23 3487684 : switch (input->type) {
24 100105 : case e_convert: {
25 100105 : list *types = (list *)input->r;
26 100105 : sql_class from = ((sql_subtype*)types->h->data)->type->eclass, to = ((sql_subtype*)types->h->next->data)->type->eclass;
27 100105 : if (from == to)
28 193445 : return comparison_find_column(input->l, e) ? input : NULL;
29 : return NULL;
30 : }
31 3067500 : case e_column:
32 3067500 : return exp_match(e, input) ? input : NULL;
33 : default:
34 : return NULL;
35 : }
36 : }
37 :
38 : /* multi lo <= col <= hi, maybe still be false even if lo or hi are NULL, possibly similar for filter on multiple
39 : * columns */
40 : #define comp_single_column(c) (!c->f)
41 :
42 : static sql_exp *
43 8806471 : rel_propagate_column_ref_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
44 : {
45 8890671 : assert(e->type == e_column);
46 8890671 : if (rel) {
47 8890590 : switch(rel->op) {
48 4110095 : case op_left:
49 : case op_right:
50 : case op_full:
51 : case op_join:
52 : case op_select:
53 : case op_anti:
54 : case op_semi: {
55 4110095 : bool found_without_semantics = false, found_left = false, found_right = false, still_unique = false;
56 :
57 4110095 : if ((is_innerjoin(rel->op) || is_select(rel->op)) && list_length(rel->exps) == 1 && exp_is_false(rel->exps->h->data))
58 : return NULL; /* nothing will pass, skip */
59 :
60 : /* propagate from the bottom first */
61 4094464 : if (rel_propagate_column_ref_statistics(sql, rel->l, e))
62 : found_left = true;
63 2601533 : if (!found_left && is_join(rel->op) && rel_propagate_column_ref_statistics(sql, rel->r, e))
64 4094464 : found_right = true;
65 :
66 4094464 : if (!found_left && !found_right)
67 : return NULL;
68 1790870 : if (!list_empty(rel->exps) && rel->op != op_anti) { /* if there's an or, the MIN and MAX get difficult to propagate */
69 3515820 : for (node *n = rel->exps->h ; n ; n = n->next) {
70 1820315 : sql_exp *comp = n->data, *le = comp->l, *lne = NULL, *re = comp->r, *rne = NULL, *fe = comp->f, *fne = NULL;
71 :
72 1820315 : if (comp->type == e_cmp) {
73 1819311 : if (is_theta_exp(comp->flag) && ((lne = comparison_find_column(le, e)) || (rne = comparison_find_column(re, e)) || (fe && (fne = comparison_find_column(fe, e))))) {
74 124770 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
75 124769 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
76 :
77 : /* not semantics found or if explicitly filtering not null values from the column */
78 124769 : found_without_semantics |= (!is_semantics(comp) && comp_single_column(comp)) || (comp->flag == cmp_equal && lne && is_anti(comp) && exp_is_null(re));
79 124769 : still_unique |= comp->flag == cmp_equal && is_unique(le) && is_unique(re); /* unique if only equi-joins on unique columns are there */
80 124769 : if (is_full(rel->op) || (is_left(rel->op) && found_left) || (is_right(rel->op) && found_right)) /* on outer joins, min and max cannot be propagated on some cases */
81 19870 : continue;
82 : /* if (end2 >= start1 && start2 <= end1) then the 2 intervals are intersected */
83 104899 : if (fe && lval_min && lval_max) { /* range case, the middle expression must intersect the other two */
84 4618 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
85 4618 : int int1 = rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0,
86 4618 : int2 = fval_min && fval_max && atom_cmp(fval_max, lval_min) >= 0 && atom_cmp(fval_min, lval_max) <= 0,
87 4618 : symmetric = is_symmetric(comp);
88 :
89 4618 : if (is_anti(comp) || (!symmetric && fval_min && rval_max && atom_cmp(fval_min, rval_max) < 0)) /* for asymmetric case the re range must be after the fe range */
90 1815 : continue;
91 2803 : if (lne && int1 && int2) {
92 103 : if (symmetric) {
93 0 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
94 0 : atom *nmin = statistics_atom_min(sql, rval_min, fval_min), *nmax = statistics_atom_max(sql, rval_max, fval_max);
95 : /* max is min from le and (max from re and fe max) */
96 0 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, nmax, p2->value.pval) : nmax);
97 : /* min is max from le and (min from re and fe min) */
98 0 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, nmin, p1->value.pval) : nmin);
99 : } else {
100 103 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
101 : /* max is min from le and fe max */
102 103 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, fval_max, p2->value.pval) : fval_max);
103 : /* min is max from le and re min */
104 103 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, rval_min, p1->value.pval) : rval_min);
105 : }
106 2700 : } else if (rne) {
107 590 : if (symmetric && int1 && int2) { /* min is max from le and (min from re and fe min) */
108 0 : prop *p = find_prop(e->p, PROP_MIN);
109 0 : atom *nmin = p ? statistics_atom_min(sql, p->value.pval, fval_min) : fval_min;
110 0 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, nmin, lval_min));
111 590 : } else if (int1) { /* min is max from le and re min */
112 100 : prop *p = find_prop(e->p, PROP_MIN);
113 100 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
114 : }
115 2110 : } else if (fne) {
116 549 : if (symmetric && int1 && int2) { /* max is min from le and (max from re and fe max) */
117 0 : prop *p = find_prop(e->p, PROP_MAX);
118 0 : atom *nmax = p ? statistics_atom_max(sql, p->value.pval, rval_max) : rval_max;
119 0 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, nmax, lval_max) : nmax);
120 549 : } else if (int2) { /* max is min from le and fe max */
121 266 : prop *p = find_prop(e->p, PROP_MAX);
122 266 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
123 : }
124 : }
125 100281 : } else if (lval_min && lval_max && rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0) {
126 : /* both min and max must be set and the intervals must overlap */
127 41897 : switch (comp->flag) {
128 28235 : case cmp_equal: /* for equality reduce */
129 28235 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_max(sql, lval_max, rval_max) : statistics_atom_min(sql, lval_max, rval_max));
130 28235 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_min(sql, lval_min, rval_min) : statistics_atom_max(sql, lval_min, rval_min));
131 28235 : break;
132 4345 : case cmp_notequal: /* for inequality expand */
133 4345 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_min(sql, lval_max, rval_max) : statistics_atom_max(sql, lval_max, rval_max));
134 4345 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_max(sql, lval_min, rval_min) : statistics_atom_min(sql, lval_min, rval_min));
135 4345 : break;
136 5311 : case cmp_gt:
137 : case cmp_gte:
138 9684 : if (!is_anti(comp) && lne) { /* min is max from both min */
139 4373 : prop *p = find_prop(e->p, PROP_MIN);
140 4373 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, rval_min, p->value.pval) : rval_min);
141 938 : } else if (!is_anti(comp)) { /* max is min from both max */
142 938 : prop *p = find_prop(e->p, PROP_MAX);
143 938 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
144 : }
145 : break;
146 4006 : case cmp_lt:
147 : case cmp_lte:
148 7205 : if (!is_anti(comp) && lne) { /* max is min from both max */
149 3199 : prop *p = find_prop(e->p, PROP_MAX);
150 3199 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, rval_max, p->value.pval) : rval_max);
151 807 : } else if (!is_anti(comp)) { /* min is max from both min */
152 807 : prop *p = find_prop(e->p, PROP_MIN);
153 807 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
154 : }
155 : break;
156 : default: /* Maybe later I can do cmp_in and cmp_notin */
157 : break;
158 : }
159 : }
160 : }
161 : }
162 : }
163 : }
164 1790871 : if (is_full(rel->op) || (is_left(rel->op) && found_right) || (is_right(rel->op) && found_left))
165 35775 : set_has_nil(e);
166 1790871 : if (!is_outerjoin(rel->op) && found_without_semantics) /* at an outer join, null values pass */
167 94052 : set_has_no_nil(e);
168 1790871 : if (is_join(rel->op) && is_unique(e) && !still_unique)
169 119363 : set_not_unique(e);
170 1790871 : return e;
171 : }
172 4676927 : case op_table:
173 : case op_basetable:
174 : case op_union:
175 : case op_except:
176 : case op_inter:
177 : case op_munion:
178 : case op_project:
179 : case op_groupby: {
180 4676927 : if (is_recursive(rel))
181 : return NULL;
182 4676730 : sql_exp *found;
183 4676730 : atom *fval;
184 4676730 : prop *est;
185 4676730 : if ((found = rel_find_exp(rel, e))) {
186 2201562 : if (rel->op != op_table) { /* At the moment don't propagate statistics for table relations */
187 2156236 : if ((fval = find_prop_and_get(found->p, PROP_MAX)))
188 1138635 : set_minmax_property(sql, e, PROP_MAX, fval);
189 2156235 : if ((fval = find_prop_and_get(found->p, PROP_MIN)))
190 1145712 : set_minmax_property(sql, e, PROP_MIN, fval);
191 2156236 : if (!has_nil(found))
192 1394473 : set_has_no_nil(e);
193 2156236 : if (is_unique(found) || (need_distinct(rel) && list_length(rel->exps) == 1) ||
194 1739671 : (is_groupby(rel->op) && (list_empty(rel->r) || (list_length(rel->r) == 1 && exps_find_exp(rel->r, e)))))
195 427845 : set_unique(e);
196 : /* propagate unique estimation for known cases */
197 2156236 : if (is_groupby(rel->op) && list_empty(rel->r) && !find_prop(e->p, PROP_NUNIQUES)) { /* global aggregate case */
198 7645 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
199 7645 : p->value.dval = 1;
200 2148591 : } else if (((is_basetable(rel->op) || is_except(rel->op) || is_inter(rel->op) || is_simple_project(rel->op) ||
201 72019 : (is_groupby(rel->op) && exps_find_exp(rel->r, e))) &&
202 2089288 : (est = find_prop(found->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES))) {
203 195320 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
204 195320 : p->value.dval = est->value.dval;
205 : }
206 : }
207 2201560 : return e;
208 : }
209 : return NULL;
210 : }
211 84200 : case op_topn:
212 : case op_sample:
213 84200 : return rel_propagate_column_ref_statistics(sql, rel->l, e);
214 : default:
215 : break;
216 : }
217 : }
218 : return NULL;
219 : }
220 :
221 : static atom *
222 4630633 : atom_from_valptr( allocator *sa, sql_subtype *tpe, ValRecord *v)
223 : {
224 4630633 : atom *a = SA_NEW(sa, atom);
225 :
226 4630560 : assert(!VALisnil(v));
227 4630360 : *a = (atom) {.tpe = *tpe,};
228 4630360 : SA_VALcopy(sa, &a->data, v);
229 4630415 : return a;
230 : }
231 :
232 : void
233 4277875 : sql_column_get_statistics(mvc *sql, sql_column *c, sql_exp *e)
234 : {
235 4277875 : bool nonil = false, unique = false;
236 4277875 : double unique_est = 0.0;
237 4277875 : ValRecord min, max;
238 4277875 : int ok = mvc_col_stats(sql, c, &nonil, &unique, &unique_est, &min, &max);
239 :
240 4278058 : if (has_nil(e) && nonil)
241 2821229 : set_has_no_nil(e);
242 4278058 : if (!is_unique(e) && unique)
243 1129232 : set_unique(e);
244 4278058 : if (unique_est != 0.0) {
245 3009192 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
246 3009198 : p->value.dval = unique_est;
247 : }
248 4278064 : unsigned int digits = 0;
249 4278064 : sql_subtype *et = exp_subtype(e);
250 4278052 : if (et->type->eclass == EC_DEC || et->type->eclass == EC_NUM)
251 2779581 : digits = et->digits;
252 4278052 : if ((ok & 2) == 2) {
253 2312323 : if (!VALisnil(&max)) {
254 2312313 : prop *p = e->p = prop_create(sql->sa, PROP_MAX, e->p);
255 2312275 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &max);
256 2312260 : if (digits) {
257 1718874 : unsigned int nd = atom_digits(p->value.pval);
258 1718900 : if (nd < digits)
259 : digits = nd;
260 1718900 : if (!digits)
261 : digits = 1;
262 : }
263 : }
264 2312292 : VALclear(&max);
265 : }
266 4277991 : if ((ok & 1) == 1) {
267 2318457 : if (!VALisnil(&min)) {
268 2318444 : prop *p = e->p = prop_create(sql->sa, PROP_MIN, e->p);
269 2318430 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &min);
270 2318374 : if (digits) {
271 1726000 : unsigned int nd = atom_digits(p->value.pval);
272 1726001 : if (nd > digits) /* possibly set to low by max value */
273 : digits = nd;
274 : if (!digits)
275 : digits = 1;
276 : }
277 : }
278 2318375 : VALclear(&min);
279 : }
280 4277903 : if (digits)
281 2779431 : et->digits = digits;
282 4277903 : if (et->type->eclass == EC_DEC && et->digits <= et->scale)
283 216 : et->digits = et->scale + 1;
284 4277903 : }
285 :
286 : static void
287 953848 : rel_basetable_column_get_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
288 : {
289 953848 : if (e->p)
290 : return;
291 306958 : sql_column *c = NULL;
292 :
293 306958 : if ((c = rel_base_find_column(rel, e->nid))) {
294 208620 : sql_column_get_statistics(sql, c, e);
295 : }
296 : }
297 :
298 : static bool
299 8876 : rel_setop_get_statistics(mvc *sql, sql_rel *rel, list *lexps, list *rexps, sql_exp *e, int i)
300 : {
301 8876 : sql_exp *le = list_fetch(lexps, i), *re = list_fetch(rexps, i);
302 8876 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
303 8876 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
304 8876 : prop *est;
305 :
306 : /* for the intersection, if both expressions don't overlap, it can be pruned */
307 8876 : if (is_inter(rel->op) && !has_nil(le) && !has_nil(re) &&
308 511 : ((rval_max && lval_min && atom_cmp(rval_max, lval_min) < 0) || (rval_min && lval_max && atom_cmp(rval_min, lval_max) > 0)))
309 26 : return true;
310 :
311 8850 : if (lval_max && rval_max) {
312 2557 : if (is_union(rel->op))
313 3 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
314 2554 : else if (is_inter(rel->op))
315 399 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_min(sql, lval_max, rval_max)); /* for intersect the new max will be the min of the two */
316 : else /* except */
317 2155 : set_minmax_property(sql, e, PROP_MAX, lval_max);
318 : }
319 8850 : if (lval_min && rval_min) {
320 2557 : if (is_union(rel->op))
321 3 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
322 2554 : else if (is_inter(rel->op))
323 399 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, lval_min, rval_min)); /* for intersect the new min will be the max of the two */
324 : else /* except */
325 2155 : set_minmax_property(sql, e, PROP_MIN, lval_min);
326 : }
327 :
328 8850 : if (is_union(rel->op) || is_munion(rel->op)) {
329 5986 : if (!has_nil(le) && !has_nil(re))
330 879 : set_has_no_nil(e);
331 5986 : if (need_distinct(rel) && list_length(rel->exps) == 1)
332 0 : set_unique(e);
333 2864 : } else if (is_inter(rel->op)) {
334 564 : if (!has_nil(le) || !has_nil(re))
335 509 : set_has_no_nil(e);
336 564 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
337 345 : set_unique(e);
338 : } else {
339 2300 : assert(is_except(rel->op));
340 2300 : if (!has_nil(le))
341 2236 : set_has_no_nil(e);
342 2300 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
343 2012 : set_unique(e);
344 : }
345 : /* propagate unique estimation for known cases */
346 8850 : if (!is_union(rel->op) && (est = find_prop(le->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES)) {
347 207 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
348 207 : p->value.dval = est->value.dval;
349 : }
350 : return false;
351 : }
352 :
353 :
354 : static void
355 62850 : rel_munion_get_statistics(mvc *sql, sql_rel *rel, list *rels, sql_exp *e, int i)
356 : {
357 62850 : if (is_recursive(rel))
358 : return ;
359 62850 : assert(is_munion(rel->op));
360 :
361 62850 : sql_rel *l = rels->h->data;
362 62850 : sql_exp *le = list_fetch(l->exps, i);
363 62850 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX);
364 62850 : bool has_nonil = !has_nil(le);
365 :
366 182776 : for(node *n = rels->h->next; n; n = n->next) {
367 119926 : sql_rel *r = n->data;
368 119926 : sql_exp *re = list_fetch(r->exps, i);
369 119926 : atom *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
370 :
371 119926 : if (lval_max && rval_max) {
372 44201 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
373 44201 : lval_max = find_prop_and_get(e->p, PROP_MAX);
374 : }
375 119926 : if (lval_min && rval_min) {
376 43652 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
377 43652 : lval_min = find_prop_and_get(e->p, PROP_MIN);
378 : }
379 119926 : has_nonil &= !has_nil(re);
380 :
381 : }
382 :
383 62850 : if (has_nonil)
384 42803 : set_has_no_nil(e);
385 :
386 62850 : if (need_distinct(rel) && list_length(rel->exps) == 1)
387 1597 : set_unique(e);
388 : }
389 :
390 :
391 : static sql_exp *
392 3555405 : rel_propagate_statistics(visitor *v, sql_rel *rel, sql_exp *e, int depth)
393 : {
394 3555405 : mvc *sql = v->sql;
395 3555405 : atom *lval;
396 :
397 3555405 : (void) depth;
398 3555405 : switch(e->type) {
399 2207547 : case e_column:
400 2207547 : switch (rel->op) { /* set relations don't call rel_propagate_statistics */
401 284692 : case op_join:
402 : case op_left:
403 : case op_right:
404 : case op_full:
405 : case op_semi:
406 : case op_anti: {
407 284692 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e);
408 284692 : if (!found)
409 142794 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
410 : break;
411 : }
412 1922855 : case op_select:
413 : case op_project:
414 : case op_groupby: {
415 1922855 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e); /* labels may be found on the same projection, ugh */
416 1922851 : if (!found && is_simple_project(rel->op))
417 128704 : (void) rel_propagate_column_ref_statistics(sql, rel, e);
418 : break;
419 : }
420 0 : case op_insert:
421 : case op_update:
422 : case op_delete:
423 0 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
424 0 : break;
425 : default:
426 : break;
427 : }
428 : break;
429 101863 : case e_convert: {
430 101863 : sql_subtype *to = exp_totype(e), *from = exp_fromtype(e);
431 101863 : sql_exp *l = e->l;
432 101863 : sql_class fr = from->type->eclass, too = to->type->eclass;
433 101863 : prop *est;
434 :
435 101863 : if (fr == too) {
436 92701 : if ((lval = find_prop_and_get(l->p, PROP_MAX))) {
437 60263 : atom *res = atom_copy(sql->sa, lval);
438 60263 : if ((res = atom_cast(sql->sa, res, to)))
439 60240 : set_minmax_property(sql, e, PROP_MAX, res);
440 : }
441 92701 : if ((lval = find_prop_and_get(l->p, PROP_MIN))) {
442 60901 : atom *res = atom_copy(sql->sa, lval);
443 60901 : if ((res = atom_cast(sql->sa, res, to)))
444 60878 : set_minmax_property(sql, e, PROP_MIN, res);
445 : }
446 : }
447 : /* propagating nuniques through conversions is tricky. I am adding just the general cases */
448 101863 : if ((est = find_prop(l->p, PROP_NUNIQUES)) &&
449 63324 : (((fr == EC_TIME || fr == EC_TIME_TZ || fr == EC_DATE) && (too == EC_TIMESTAMP || too == EC_TIMESTAMP_TZ)) ||
450 63299 : ((fr == EC_NUM || fr == EC_BIT) && too == EC_NUM) || (fr == EC_DEC && too == EC_DEC) || (fr != EC_EXTERNAL && too == EC_STRING))) {
451 59865 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
452 59865 : p->value.dval = est->value.dval;
453 : }
454 101863 : if (!has_nil(l))
455 57955 : set_has_no_nil(e);
456 : break;
457 : }
458 379261 : case e_aggr:
459 : case e_func: {
460 379261 : BUN lv;
461 379261 : sql_subfunc *f = e->f;
462 :
463 379261 : if (!f->func->s) {
464 352146 : int key = hash_key(f->func->base.name); /* Using hash lookup */
465 352146 : sql_hash_e *he = sql_functions_lookup->buckets[key&(sql_functions_lookup->size-1)];
466 352146 : lookup_function look = NULL;
467 :
468 753772 : for (; he && !look; he = he->chain) {
469 401626 : struct function_properties* fp = (struct function_properties*) he->value;
470 :
471 401626 : if (!strcmp(f->func->base.name, fp->name))
472 108572 : look = fp->func;
473 : }
474 352146 : if (look)
475 108572 : look(sql, e);
476 : }
477 : /* for global aggregates with no semantics, if the left relation has values, then the output won't be NULL */
478 379261 : if (!is_semantics(e) && e->l && !have_nil(e->l) &&
479 109271 : (e->type != e_aggr || (is_groupby(rel->op) && list_length(rel->r)) || ((lv = get_rel_count(rel->l)) != BUN_NONE && lv > 0)))
480 108935 : set_has_no_nil(e);
481 : /* set properties for global aggregates */
482 379261 : if (e->type == e_aggr && is_groupby(rel->op) && list_empty(rel->r)) {
483 7949 : if (!find_prop(e->p, PROP_NUNIQUES)) {
484 7949 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
485 7949 : p->value.dval = 1;
486 : }
487 7949 : set_unique(e);
488 : }
489 : break;
490 : }
491 584525 : case e_atom:
492 584525 : if (e->l) {
493 566496 : atom *a = (atom*) e->l;
494 566496 : if (!a->isnull) {
495 501590 : set_minmax_property(sql, e, PROP_MAX, a);
496 501590 : set_minmax_property(sql, e, PROP_MIN, a);
497 : }
498 566496 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
499 566496 : p->value.dval = 1;
500 18029 : } else if (e->f) {
501 4302 : list *vals = (list *) e->f;
502 4302 : sql_exp *first = vals->h ? vals->h->data : NULL;
503 4302 : atom *max = NULL, *min = NULL; /* all child values must have a valid min/max */
504 4302 : int has_nil = 0;
505 :
506 4302 : if (first) {
507 4302 : max = ((lval = find_prop_and_get(first->p, PROP_MAX))) ? lval : NULL;
508 4302 : min = ((lval = find_prop_and_get(first->p, PROP_MIN))) ? lval : NULL;
509 4302 : has_nil |= has_nil(first);
510 : }
511 :
512 18436 : for (node *n = vals->h ? vals->h->next : NULL ; n ; n = n->next) {
513 9832 : sql_exp *ee = n->data;
514 :
515 9832 : if (min && max) {
516 9339 : if ((lval = find_prop_and_get(ee->p, PROP_MAX))) {
517 9293 : max = atom_cmp(lval, max) > 0 ? lval : max;
518 : } else {
519 : max = NULL;
520 : }
521 9339 : if ((lval = find_prop_and_get(ee->p, PROP_MIN))) {
522 9293 : min = atom_cmp(min, lval) > 0 ? lval : min;
523 : } else {
524 : min = NULL;
525 : }
526 : }
527 9832 : has_nil |= has_nil(ee);
528 : }
529 :
530 4302 : if (!has_nil)
531 3928 : set_has_no_nil(e);
532 4302 : if (min && max) {
533 3886 : set_minmax_property(sql, e, PROP_MAX, max);
534 3886 : set_minmax_property(sql, e, PROP_MIN, min);
535 : }
536 4302 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
537 4302 : p->value.dval = (dbl) list_length(vals);
538 : } else {
539 13727 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
540 13727 : p->value.dval = 1;
541 : }
542 : break;
543 282209 : case e_cmp:
544 : /* TODO? propagating min/max/unique of booleans is not very worth it */
545 282209 : if (e->flag == cmp_or || e->flag == cmp_filter) {
546 14765 : if (!have_nil(e->l) && !have_nil(e->r))
547 10114 : set_has_no_nil(e);
548 267444 : } else if (e->flag == cmp_in || e->flag == cmp_notin) {
549 20162 : sql_exp *le = e->l;
550 20162 : if (!has_nil(le) && !have_nil(e->r))
551 16927 : set_has_no_nil(e);
552 : } else {
553 247282 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
554 247282 : if (!has_nil(le) && !has_nil(re) && (!e->f || !has_nil(fe)))
555 174382 : set_has_no_nil(e);
556 : }
557 : break;
558 : case e_psm:
559 : break;
560 : }
561 :
562 : #ifndef NDEBUG
563 : {
564 : /* min and max cannot be NULL and min must be <= than max, if it happens the inner relation must be empty! */
565 3555401 : atom *min = find_prop_and_get(e->p, PROP_MIN), *max = find_prop_and_get(e->p, PROP_MAX);
566 :
567 3555402 : (void) min;
568 3555402 : (void) max;
569 3555402 : assert(!min || !min->isnull);
570 3555402 : assert(!max || !max->isnull);
571 3555402 : assert(!min || !max || (min && max)/* atom_cmp(min, max) <= 0*/);
572 : }
573 : #endif
574 3555402 : return e;
575 : }
576 :
577 : static list * /* Remove predicates always false from min/max values */
578 141312 : rel_prune_predicates(visitor *v, sql_rel *rel)
579 : {
580 141312 : if (rel->l) {
581 141312 : sql_rel *l = rel->l;
582 141312 : if (is_single(l))
583 3372 : return rel->exps;
584 : }
585 137940 : if (!list_empty(rel->attr))
586 3003 : return rel->exps;
587 284285 : for (node *n = rel->exps->h ; n ; n = n->next) {
588 149348 : sql_exp *e = n->data;
589 :
590 149348 : if (e->type == e_cmp && is_theta_exp(e->flag)) {
591 125227 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
592 125227 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
593 125227 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
594 125227 : bool always_false = false, always_true = false;
595 :
596 125227 : if (fe && !is_symmetric(e)) {
597 3045 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
598 3045 : comp_type lower = range2lcompare(e->flag), higher = range2rcompare(e->flag);
599 3644 : int not_int1 = rval_min && lval_max && /* the middle and left intervals don't overlap */
600 1654 : (!is_anti(e) && (lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0)),
601 4059 : not_int2 = lval_min && fval_max && /* the middle and right intervals don't overlap */
602 2482 : (!is_anti(e) && (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0)),
603 3045 : not_int3 = rval_min && fval_max && /* the left interval is after the right one */
604 1285 : (!is_anti(e) && (atom_cmp(rval_min, fval_max) > 0));
605 :
606 3045 : always_false |= not_int1 || not_int2 || not_int3;
607 : /* for anti the middle must be before the left or after the right or the right after the left, for the other the middle must be always between the left and right intervals */
608 4089 : always_true |= !has_nil(le) && !has_nil(re) && !has_nil(fe) &&
609 3941 : lval_min && lval_max && rval_min && rval_max && fval_min && fval_max &&
610 573 : (is_anti(e) ? ((lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0) || (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0) || atom_cmp(rval_min, fval_max) > 0) :
611 488 : ((lower == cmp_gte ? atom_cmp(lval_min, rval_max) >= 0 : atom_cmp(lval_min, rval_max) > 0) && (higher == cmp_lte ? atom_cmp(fval_min, lval_max) >= 0 : atom_cmp(fval_min, lval_max) > 0)));
612 : } else if (!fe) {
613 122166 : if (!is_semantics(e)) /* trivial not null cmp null case */
614 232888 : always_false |= !is_anti(e) && ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re)));
615 122166 : switch (e->flag) {
616 108640 : case cmp_equal:
617 108640 : if (lval_min && lval_max && rval_min && rval_max && (!is_semantics(e) || !has_nil(le) || !has_nil(re)))
618 135960 : always_false |= (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
619 108640 : if (is_semantics(e)) { /* prune *= NULL cases */
620 5685 : always_false |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
621 11370 : always_true |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
622 : }
623 : break;
624 5334 : case cmp_notequal:
625 5334 : if (lval_min && lval_max && rval_min && rval_max)
626 7364 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
627 5334 : if (is_semantics(e)) {
628 37 : always_true |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
629 74 : always_false |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
630 : }
631 : break;
632 5487 : case cmp_gt:
633 5487 : if (lval_max && rval_min)
634 1834 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
635 5487 : if (lval_min && rval_max)
636 3668 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
637 : break;
638 121 : case cmp_gte:
639 121 : if (lval_max && rval_min)
640 51 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
641 121 : if (lval_min && rval_max)
642 102 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
643 : break;
644 2502 : case cmp_lt:
645 2502 : if (lval_min && rval_max)
646 1382 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
647 2502 : if (lval_max && rval_min)
648 2812 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
649 : break;
650 82 : case cmp_lte:
651 82 : if (lval_min && rval_max)
652 11 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
653 82 : if (lval_max && rval_min)
654 22 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
655 : break;
656 : default: /* Maybe later I can do cmp_in and cmp_notin but don't forget to remove is_theta_exp check up there */
657 : break;
658 : }
659 : }
660 125227 : assert(!always_false || !always_true);
661 125227 : if (always_false || always_true) {
662 3698 : sql_exp *ne = exp_atom_bool(v->sql->sa, always_true /* if always_true set then true else false */);
663 3698 : if (exp_name(e))
664 0 : exp_prop_alias(v->sql->sa, ne, e);
665 3698 : n->data = ne;
666 3698 : v->changes++;
667 : }
668 : }
669 : }
670 134937 : return rel->exps;
671 : }
672 :
673 : static sql_rel *
674 14 : set_setop_side(visitor *v, sql_rel *rel, sql_rel *side)
675 : {
676 14 : if (side == rel->r)
677 0 : rel->r = NULL;
678 : else
679 14 : rel->l = NULL;
680 14 : if (!is_simple_project(side->op) || rel_is_ref(side) || !list_empty(side->r)) {
681 0 : side = rel_project(v->sql->sa, side, rel_projections(v->sql, side, NULL, 0, 1));
682 0 : set_count_prop(v->sql->sa, side, get_rel_count(side->l));
683 0 : side->exps = exps_exp_visitor_bottomup(v, side, side->exps, 0, &rel_propagate_statistics, false);
684 : }
685 35 : for (node *n = rel->exps->h, *m = side->exps->h ; n && m ; n = n->next, m = m->next) {
686 21 : sql_exp *e1 = n->data, *e2 = m->data;
687 21 : exp_prop_alias(v->sql->sa, e2, e1);
688 : }
689 14 : list_hash_clear(side->exps);
690 :
691 14 : if (need_distinct(rel))
692 0 : set_distinct(side);
693 14 : side->p = prop_copy(v->sql->sa, rel->p);
694 14 : rel_destroy(rel);
695 14 : return side;
696 : }
697 :
698 : static BUN
699 22258 : trivial_project_exp_card(sql_exp *e)
700 : {
701 22633 : if (e->type == e_convert)
702 375 : return trivial_project_exp_card(e->l);
703 22258 : return e->type == e_atom && e->f ? (BUN) list_length(e->f) : 1;
704 : }
705 :
706 : static BUN
707 26198 : rel_calc_nuniques(mvc *sql, sql_rel *l, list *exps)
708 : {
709 26198 : BUN lv = get_rel_count(l);
710 :
711 26198 : if (lv == 0)
712 : return 0;
713 23462 : if (!list_empty(exps)) {
714 23462 : BUN nuniques = 0;
715 : /* compute the highest number of unique values */
716 58644 : for (node *n = exps->h ; n && nuniques != BUN_NONE ; n = n->next) {
717 35182 : sql_exp *e = n->data;
718 35182 : sql_rel *bt = NULL;
719 35182 : prop *p = NULL;
720 35182 : BUN euniques = BUN_NONE;
721 35182 : atom *min, *max, *sub = NULL;
722 35182 : sql_subtype *tp = exp_subtype(e);
723 35182 : sql_class ec = tp ? tp->type->eclass : EC_STRING; /* if 'e' has no type (eg parameter), use a non-number type to fail condition */
724 :
725 35182 : if ((p = find_prop(e->p, PROP_NUNIQUES))) {
726 25419 : euniques = (BUN) p->value.dval;
727 9763 : } else if (e->type == e_column && e->nid && rel_find_exp_and_corresponding_rel(l, e, false, &bt, NULL) && bt && (p = find_prop(bt->p, PROP_COUNT))) {
728 7665 : euniques = (BUN) p->value.lval;
729 : }
730 : /* use min to max range to compute number of possible values in the domain for number types */
731 35182 : if ((EC_TEMP(ec)||ec==EC_NUM||ec==EC_MONTH||ec==EC_POS) &&
732 24292 : (min = find_prop_and_get(e->p, PROP_MIN)) && (max = find_prop_and_get(e->p, PROP_MAX))) {
733 : /* the range includes min and max, so the atom_inc call is needed */
734 : /* if 'euniques' has number of distinct values, compute min between both */
735 19346 : if ((sub = atom_sub(sql->sa, max, min)) && (sub = atom_inc(sql->sa, sub)) && (sub = atom_cast(sql->sa, sub, sql_bind_localtype("oid"))))
736 0 : euniques = MIN(euniques, (BUN) sub->data.val.oval);
737 : }
738 35182 : if (euniques != BUN_NONE)
739 33084 : nuniques = MAX(nuniques, euniques); /* the highest cardinality sets the estimation */
740 : else
741 : nuniques = BUN_NONE;
742 : }
743 23462 : if (nuniques != BUN_NONE)
744 : return nuniques;
745 : }
746 : return lv;
747 : }
748 :
749 : static sql_rel *
750 2108733 : rel_get_statistics_(visitor *v, sql_rel *rel)
751 : {
752 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
753 2108733 : uint8_t has_special_modify = *(uint8_t*) v->data;
754 2108733 : bool can_be_pruned = !has_special_modify && v->storage_based_opt;
755 :
756 : /* Don't look at the same relation twice */
757 2108733 : if (are_statistics_gathered(rel->used))
758 : return rel;
759 1360063 : rel->used |= statistics_gathered;
760 :
761 1360063 : switch (rel->op) {
762 319466 : case op_basetable: {
763 319466 : sql_table *t = (sql_table *) rel->l;
764 319466 : sqlstore *store = v->sql->session->tr->store;
765 :
766 319466 : if (!list_empty(rel->exps)) {
767 1273265 : for (node *n = rel->exps->h ; n ; n = n->next)
768 953848 : rel_basetable_column_get_statistics(v->sql, rel, n->data);
769 : }
770 : /* Set table row count. TODO? look for remote tables. Don't look at storage for declared tables, because it won't be cleaned */
771 319466 : if (isTable(t) && t->s && !isDeclaredTable(t)) /* count active rows only */
772 264703 : set_count_prop(v->sql->sa, rel, (BUN)store->storage_api.count_col(v->sql->session->tr, ol_first_node(t->columns)->data, 10));
773 : break;
774 : }
775 2794 : case op_union:
776 : case op_inter:
777 : case op_except: {
778 2794 : bool empty_cross = false;
779 2794 : int i = 0;
780 2794 : sql_rel *l = rel->l, *pl = rel->l, *r = rel->r, *pr = rel->r;
781 :
782 2794 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
783 0 : pl = pl->l;
784 2794 : while (is_sample(pr->op) || is_topn(pr->op))
785 0 : pr = pr->l;
786 : /* if it's not a projection, then project and propagate statistics */
787 2794 : if (!is_project(pl->op) && !is_base(pl->op)) {
788 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
789 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
790 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
791 : }
792 2794 : if (!is_project(pr->op) && !is_base(pr->op)) {
793 0 : pr = rel_project(v->sql->sa, pr, rel_projections(v->sql, pr, NULL, 0, 1));
794 0 : set_count_prop(v->sql->sa, pr, get_rel_count(pr->l));
795 0 : pr->exps = exps_exp_visitor_bottomup(v, pr, pr->exps, 0, &rel_propagate_statistics, false);
796 : }
797 :
798 11670 : for (node *n = rel->exps->h ; n ; n = n->next) {
799 8876 : empty_cross |= rel_setop_get_statistics(v->sql, rel, pl->exps, pr->exps, n->data, i);
800 8876 : i++;
801 : }
802 :
803 : /* propagate row count */
804 2794 : if (is_union(rel->op)) {
805 277 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
806 277 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
807 :
808 277 : if (lv == 0 && rv == 0) { /* both sides empty */
809 2 : if (can_be_pruned)
810 : empty_cross = true;
811 : else
812 2 : set_count_prop(v->sql->sa, rel, 0);
813 275 : } else if (can_be_pruned && lv == 0 && !rel_is_ref(rel)) { /* left side empty */
814 0 : rel = set_setop_side(v, rel, r);
815 0 : empty_cross = false; /* don't rewrite again */
816 275 : } else if (can_be_pruned && rv == 0 && !rel_is_ref(rel)) { /* right side empty */
817 0 : rel = set_setop_side(v, rel, l);
818 0 : empty_cross = false; /* don't rewrite again */
819 275 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
820 7 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
821 : }
822 2517 : } else if (is_inter(rel->op) || is_except(rel->op)) {
823 2517 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
824 2517 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
825 :
826 2517 : if (lv == 0) { /* left side empty */
827 63 : if (can_be_pruned)
828 : empty_cross = true;
829 : else
830 5 : set_count_prop(v->sql->sa, rel, 0);
831 2454 : } else if (rv == 0) { /* right side empty */
832 17 : if (is_inter(rel->op)) {
833 3 : if (can_be_pruned)
834 : empty_cross = true;
835 : else
836 0 : set_count_prop(v->sql->sa, rel, 0);
837 14 : } else if (can_be_pruned && !rel_is_ref(rel)) {
838 14 : rel = set_setop_side(v, rel, l);
839 14 : empty_cross = false; /* don't rewrite again */
840 : } else {
841 0 : set_count_prop(v->sql->sa, rel, lv);
842 : }
843 : } else {
844 2437 : set_count_prop(v->sql->sa, rel, lv);
845 : }
846 : }
847 2794 : if (can_be_pruned && empty_cross) {
848 81 : rel_destroy(rel->l);
849 81 : rel->l = NULL;
850 81 : rel_destroy(rel->r);
851 81 : rel->r = NULL;
852 246 : for (node *n = rel->exps->h ; n ; n = n->next) {
853 165 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
854 165 : exp_prop_alias(v->sql->sa, a, e);
855 165 : n->data = a;
856 : }
857 81 : list_hash_clear(rel->exps);
858 81 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
859 81 : set_count_prop(v->sql->sa, l, 1);
860 81 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
861 81 : set_count_prop(v->sql->sa, l, 0);
862 81 : rel->op = op_project;
863 81 : rel->l = l;
864 81 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
865 81 : set_count_prop(v->sql->sa, rel, 0);
866 81 : set_nodistinct(rel); /* set relations may have distinct flag set */
867 81 : v->changes++;
868 : }
869 : break;
870 : }
871 13477 : case op_munion: {
872 13477 : list *l = rel->l, *nrels = sa_list(v->sql->sa);
873 13477 : BUN cnt = 0;
874 13477 : bool needs_pruning = false;
875 :
876 13477 : if (is_recursive(rel))
877 : break;
878 51248 : for (node *n = l->h; n; n = n->next) {
879 37842 : sql_rel *r = n->data, *pl = r;
880 :
881 37842 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
882 0 : pl = pl->l;
883 : /* if it's not a projection, then project and propagate statistics */
884 37842 : if (!is_project(pl->op) && !is_base(pl->op)) {
885 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
886 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
887 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
888 : }
889 37842 : nrels = append(nrels, pl);
890 : /* we need new munion statistics */
891 : /* propagate row count */
892 37842 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
893 : /* if PROP_COUNT does not exist we assume at least a row (see get_rel_count def) */
894 37842 : if (rv == BUN_NONE) {
895 1220 : cnt++;
896 1220 : continue;
897 : }
898 36622 : if (!rv && can_be_pruned)
899 6724 : needs_pruning = true;
900 : /* overflow check */
901 36622 : if (rv > (BUN_MAX - cnt))
902 37842 : rv = BUN_MAX;
903 : else
904 36622 : cnt += rv;
905 : }
906 13406 : int i = 0;
907 76256 : for (node *n = rel->exps->h ; n ; n = n->next, i++)
908 62850 : rel_munion_get_statistics(v->sql, rel, nrels, n->data, i);
909 :
910 13406 : if (needs_pruning && !rel_is_ref(rel)) {
911 4563 : v->changes++;
912 4563 : list *nl = sa_list(l->sa);
913 :
914 16832 : for (node *n = nrels->h; n; n = n->next) {
915 12269 : sql_rel *r = n->data;
916 12269 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
917 :
918 12269 : if (!rv) { /* keep last for now */
919 6254 : rel_destroy(r);
920 6254 : continue;
921 : }
922 6015 : nl = append(nl, r);
923 : }
924 4563 : rel->l = nl;
925 4563 : if (list_length(nl) == 1) {
926 4227 : sql_rel *l = rel->l = nl->h->data; /* ugh */
927 4227 : rel->r = NULL;
928 4227 : rel->op = op_project;
929 :
930 20767 : for (node *n = rel->exps->h, *m = l->exps->h ; n && m ; n = n->next, m = m->next) {
931 16540 : sql_exp *pe = n->data, *ie = m->data;
932 16540 : sql_exp *ne = exp_ref(v->sql, ie);
933 16540 : exp_prop_alias(v->sql->sa, ne, pe);
934 16540 : n->data = ne;
935 : }
936 4227 : list_hash_clear(rel->exps);
937 336 : } else if (list_empty(nl)) {
938 : /* empty select (project [ nils ] ) */
939 441 : for (node *n = rel->exps->h ; n ; n = n->next) {
940 341 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
941 341 : exp_prop_alias(v->sql->sa, a, e);
942 341 : n->data = a;
943 : }
944 100 : list_hash_clear(rel->exps);
945 100 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
946 100 : set_count_prop(v->sql->sa, l, 1);
947 100 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
948 100 : set_count_prop(v->sql->sa, l, 0);
949 100 : rel->op = op_project;
950 100 : rel->r = NULL;
951 100 : rel->l = l;
952 100 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
953 100 : set_count_prop(v->sql->sa, rel, 0);
954 100 : set_nodistinct(rel); /* set relations may have distinct flag set */
955 : }
956 : } else {
957 8843 : set_count_prop(v->sql->sa, rel, cnt);
958 : }
959 : break;
960 : }
961 542078 : case op_join:
962 : case op_left:
963 : case op_right:
964 : case op_full:
965 : case op_semi:
966 : case op_anti:
967 : case op_select:
968 : case op_project:
969 : case op_groupby: {
970 542078 : if (is_groupby(rel->op) && !list_empty(rel->r))
971 16934 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
972 542078 : rel->exps = exps_exp_visitor_bottomup(v, rel, rel->exps, 0, &rel_propagate_statistics, false);
973 542077 : if ((is_simple_project(rel->op) || is_groupby(rel->op)) && !list_empty(rel->r))
974 39574 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
975 : /* The following optimizations can only be applied after propagating the statistics to rel->exps */
976 542077 : if (can_be_pruned && (is_join(rel->op) || is_select(rel->op)) && !list_empty(rel->exps)) {
977 141312 : int changes = v->changes;
978 141312 : rel->exps = rel_prune_predicates(v, rel);
979 141312 : if (v->changes > changes)
980 3665 : rel = rewrite_simplify(v, 0, v->value_based_opt, rel);
981 : }
982 :
983 : /* propagate row count */
984 542077 : sql_rel *l = rel->l, *r = rel->r;
985 542077 : switch (rel->op) {
986 137091 : case op_join:
987 : case op_left:
988 : case op_right:
989 : case op_full: {
990 137091 : BUN lv = get_rel_count(l), rv = get_rel_count(r), uniques_estimate = BUN_MAX, join_idx_estimate = BUN_MAX;
991 :
992 137091 : if (!list_empty(rel->exps) && !is_single(rel)) {
993 250671 : for (node *n = rel->exps->h ; n ; n = n->next) {
994 127814 : sql_exp *e = n->data, *el = e->l, *er = e->r;
995 :
996 127814 : if (find_prop(e->p, PROP_JOINIDX)) {
997 667 : join_idx_estimate = lv>rv?lv:rv;
998 127147 : } else if (e->type == e_cmp && e->flag == cmp_equal) {
999 : /* if one of the sides is unique, the cardinality will be that exact number, but look for nulls */
1000 123262 : if (!is_semantics(e) || !has_nil(el) || !has_nil(er)) {
1001 123085 : BUN lu = 0, ru = 0;
1002 123085 : prop *p = NULL;
1003 123085 : if ((p = find_prop(el->p, PROP_NUNIQUES)))
1004 91312 : lu = (BUN) p->value.dval;
1005 123085 : if ((p = find_prop(er->p, PROP_NUNIQUES)))
1006 105982 : ru = (BUN) p->value.dval;
1007 123085 : if (is_unique(el) || lu > lv) {
1008 74975 : BUN ncount = (is_right(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : lv;
1009 74975 : uniques_estimate = MIN(uniques_estimate, ncount);
1010 48110 : } else if (is_unique(er) || ru > rv) {
1011 30057 : BUN ncount = (is_left(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : rv;
1012 30057 : uniques_estimate = MIN(uniques_estimate, ncount);
1013 : }
1014 : }
1015 : }
1016 : }
1017 : }
1018 137091 : if (is_single(rel)) {
1019 4801 : set_count_prop(v->sql->sa, rel, lv);
1020 132290 : } else if (join_idx_estimate != BUN_MAX) {
1021 665 : set_count_prop(v->sql->sa, rel, join_idx_estimate);
1022 131625 : } else if (uniques_estimate != BUN_MAX) {
1023 98478 : set_count_prop(v->sql->sa, rel, uniques_estimate);
1024 33147 : } else if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1025 : /* corner cases for outer joins */
1026 126 : if (is_left(rel->op)) {
1027 114 : set_count_prop(v->sql->sa, rel, lv);
1028 12 : } else if (is_right(rel->op)) {
1029 3 : set_count_prop(v->sql->sa, rel, rv);
1030 9 : } else if (is_full(rel->op) && lv != BUN_NONE && rv != BUN_NONE) {
1031 9 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
1032 0 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1033 0 : set_count_prop(v->sql->sa, rel, 0);
1034 : }
1035 33021 : } else if (lv == 0) {
1036 1204 : set_count_prop(v->sql->sa, rel, (is_right(rel->op) || is_full(rel->op)) ? rv : 0);
1037 32405 : } else if (rv == 0) {
1038 1574 : set_count_prop(v->sql->sa, rel, (is_left(rel->op) || is_full(rel->op)) ? lv : 0);
1039 31325 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1040 18541 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX / lv)) ? BUN_MAX : (lv * rv)); /* overflow check */
1041 : }
1042 : break;
1043 : }
1044 2935 : case op_anti:
1045 2935 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1046 2935 : break;
1047 110552 : case op_semi:
1048 : case op_select:
1049 : /* TODO calculate cardinalities using selectivities */
1050 110552 : if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1051 2755 : set_count_prop(v->sql->sa, rel, 0);
1052 : } else {
1053 212307 : if (!list_empty(rel->exps) && !is_single(rel)) {
1054 104511 : BUN cnt = get_rel_count(l), u = 1;
1055 170387 : for (node *n = rel->exps->h ; n ; n = n->next) {
1056 111675 : sql_exp *e = n->data, *el = e->l, *er = e->r;
1057 :
1058 : /* simple expressions first */
1059 111675 : if (e->type == e_cmp && e->flag == cmp_equal && exp_is_atom(er)) {
1060 : /* use selectivity */
1061 61262 : prop *p;
1062 61262 : if ((p = find_prop(el->p, PROP_NUNIQUES))) {
1063 45800 : u = (BUN) p->value.dval;
1064 45800 : break;
1065 : }
1066 : }
1067 : }
1068 : /* u is an *estimate*, so don't set count_prop to 0 unless cnt is 0 */
1069 104512 : set_count_prop(v->sql->sa, rel, cnt == 0 ? 0 : u == 0 || u > cnt ? 1 : cnt/u);
1070 : } else {
1071 3285 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1072 : }
1073 : }
1074 : break;
1075 266657 : case op_project:
1076 266657 : if (l) {
1077 256322 : if (need_distinct(rel)) {
1078 0 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->exps));
1079 : } else {
1080 256322 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1081 : }
1082 : } else {
1083 10335 : BUN card = 1;
1084 :
1085 10335 : if (!list_empty(rel->exps)) {
1086 31244 : for (node *n = rel->exps->h ; n ; n = n->next) {
1087 20909 : sql_exp *e = n->data;
1088 20909 : card = MAX(card, trivial_project_exp_card(e));
1089 : }
1090 : }
1091 10335 : set_count_prop(v->sql->sa, rel, card);
1092 : }
1093 : break;
1094 24431 : case op_groupby:
1095 24431 : if (list_empty(rel->r)) {
1096 7496 : set_count_prop(v->sql->sa, rel, 1);
1097 : } else {
1098 16935 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->r));
1099 : }
1100 : break;
1101 : default:
1102 : break;
1103 : }
1104 : break;
1105 : }
1106 17014 : case op_topn: {
1107 17014 : BUN lv = get_rel_count(rel->l);
1108 :
1109 17014 : if (lv != BUN_NONE) {
1110 16991 : sql_exp *le = rel->exps->h->data, *oe = list_length(rel->exps) > 1 ? rel->exps->h->next->data : NULL;
1111 84 : if (oe && oe->l && exp_is_not_null(oe)) { /* no parameters */
1112 84 : BUN offset = (BUN) ((atom*)oe->l)->data.val.lval;
1113 84 : lv = offset >= lv ? 0 : lv - offset;
1114 : }
1115 16991 : if (le->l && exp_is_not_null(le)) {
1116 16956 : BUN limit = (BUN) ((atom*)le->l)->data.val.lval;
1117 16956 : lv = MIN(lv, limit);
1118 : }
1119 16991 : set_count_prop(v->sql->sa, rel, lv);
1120 : }
1121 : break;
1122 : }
1123 22 : case op_sample: {
1124 22 : BUN lv = get_rel_count(rel->l);
1125 :
1126 22 : if (lv != BUN_NONE) {
1127 4 : sql_exp *se = rel->exps->h->data;
1128 4 : sql_subtype *tp = exp_subtype(se);
1129 :
1130 4 : if (se->l && tp->type->eclass == EC_NUM) { /* sample is a number of rows */
1131 4 : BUN sample = (BUN) ((atom*)se->l)->data.val.lval;
1132 4 : lv = MIN(lv, sample);
1133 0 : } else if (se->l) { /* sample is a percentage of rows */
1134 0 : dbl percent = ((atom*)se->l)->data.val.dval;
1135 0 : assert(tp->type->eclass == EC_FLT);
1136 0 : lv = (BUN) ceil((dbl)lv * percent);
1137 : }
1138 4 : set_count_prop(v->sql->sa, rel, lv);
1139 : }
1140 : break;
1141 : }
1142 6196 : case op_table: {
1143 6196 : sql_exp *op = rel->r;
1144 6196 : if (rel->flag != TRIGGER_WRAPPER && op) {
1145 5884 : sql_subfunc *f = op->f;
1146 5884 : if (f->func->lang == FUNC_LANG_SQL) {
1147 97 : set_count_prop(v->sql->sa, rel, 1000 /* just some fallback value */);
1148 5787 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "storage") == 0) {
1149 837 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of catalog */);
1150 4950 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "db_users") == 0) {
1151 0 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of users */);
1152 4950 : } else if (f->func->lang == FUNC_LANG_MAL && strncmp(f->func->base.name, "querylog", 8) == 0) {
1153 1101 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of querylog */);
1154 3849 : } else if (f->func->lang == FUNC_LANG_MAL &&
1155 3684 : (strcmp(f->func->base.name, "queue") == 0 ||
1156 3416 : strcmp(f->func->base.name, "optimizers") == 0 ||
1157 3100 : strcmp(f->func->base.name, "env") == 0 ||
1158 2734 : strcmp(f->func->base.name, "keywords") == 0 ||
1159 2734 : strcmp(f->func->base.name, "statistics") == 0 ||
1160 2057 : strcmp(f->func->base.name, "rejects") == 0 ||
1161 1799 : strcmp(f->func->base.name, "schemastorage") == 0 ||
1162 1799 : strncmp(f->func->base.name, "storage", 7) == 0 ||
1163 1799 : strcmp(f->func->base.name, "sessions") == 0) ) {
1164 2179 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of queue */);
1165 : }
1166 : /* else {
1167 : printf("%%func needs stats : %s\n", f->func->base.name);
1168 : } */
1169 : }
1170 : break;
1171 : }
1172 : /*These relations are less important for now
1173 : TODO later we can tune it */
1174 : case op_insert:
1175 : case op_update:
1176 : case op_delete:
1177 : case op_truncate:
1178 : case op_ddl:
1179 : default:
1180 : break;
1181 : }
1182 :
1183 : return rel;
1184 : }
1185 :
1186 : static sql_rel *
1187 552909 : rel_get_statistics(visitor *v, global_props *gp, sql_rel *rel)
1188 : {
1189 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
1190 552909 : uint8_t has_special_modify = (uint8_t) gp->has_special_modify;
1191 552909 : v->data = &has_special_modify;
1192 552909 : rel = rel_visitor_bottomup(v, rel, &rel_get_statistics_);
1193 552909 : v->data = gp;
1194 552909 : return rel;
1195 : }
1196 :
1197 : run_optimizer
1198 729811 : bind_get_statistics(visitor *v, global_props *gp)
1199 : {
1200 729811 : (void) v;
1201 729811 : return (gp->opt_level == 1 && !gp->cnt[op_insert]) ? rel_get_statistics : NULL;
1202 : }
1203 :
1204 :
1205 : static bool
1206 96527 : point_select_on_unique_column(sql_rel *rel)
1207 : {
1208 96527 : if (is_select(rel->op) && !list_empty(rel->exps)) {
1209 133131 : for (node *n = rel->exps->h; n ; n = n->next) {
1210 76721 : sql_exp *e = n->data, *el = e->l, *er = e->r, *found = NULL;
1211 :
1212 76721 : if (is_compare(e->type) && e->flag == cmp_equal) {
1213 33992 : if (is_numeric_upcast(el))
1214 0 : el = el->l;
1215 33992 : if (is_numeric_upcast(er))
1216 0 : er = er->l;
1217 33992 : if (is_alias(el->type) && exp_is_atom(er) && (found = rel_find_exp(rel->l, el)) &&
1218 761 : is_unique(found) && (!is_semantics(e) || !has_nil(found) || !has_nil(er)))
1219 : return true;
1220 33231 : if (is_alias(er->type) && exp_is_atom(el) && (found = rel_find_exp(rel->l, er)) &&
1221 0 : is_unique(found) && (!is_semantics(e) || !has_nil(el) || !has_nil(found)))
1222 : return true;
1223 : }
1224 : }
1225 : }
1226 : return false;
1227 : }
1228 :
1229 : /*
1230 : * A point select on an unique column reduces the number of rows to 1. If the same select is under a
1231 : * join, the opposite side's select can be pushed above the join.
1232 : */
1233 : static inline sql_rel *
1234 1269837 : rel_push_select_up(visitor *v, sql_rel *rel)
1235 : {
1236 1269837 : if ((is_innerjoin(rel->op) || is_left(rel->op) || is_right(rel->op) || is_semi(rel->op)) && !is_single(rel)) {
1237 258573 : sql_rel *l = rel->l, *r = rel->r;
1238 258573 : bool can_pushup_left = is_select(l->op) && !rel_is_ref(l) && !is_single(l) && (is_innerjoin(rel->op) || is_left(rel->op) || is_semi(rel->op)),
1239 258573 : can_pushup_right = is_select(r->op) && !rel_is_ref(r) && !is_single(r) && (is_innerjoin(rel->op) || is_right(rel->op));
1240 :
1241 258573 : if (can_pushup_left || can_pushup_right) {
1242 68067 : if (can_pushup_left)
1243 46246 : can_pushup_left = point_select_on_unique_column(r);
1244 68067 : if (can_pushup_right)
1245 50281 : can_pushup_right = point_select_on_unique_column(l);
1246 :
1247 : /* if both selects retrieve one row each, it's not worth it to push both up */
1248 68067 : if (can_pushup_left && !can_pushup_right) {
1249 695 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1250 695 : nrel->l = l->l;
1251 695 : rel = rel_inplace_select(rel, nrel, l->exps);
1252 695 : assert(is_select(rel->op));
1253 695 : v->changes++;
1254 67372 : } else if (!can_pushup_left && can_pushup_right) {
1255 14 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1256 14 : nrel->r = r->l;
1257 14 : rel = rel_inplace_select(rel, nrel, r->exps);
1258 14 : assert(is_select(rel->op));
1259 14 : v->changes++;
1260 : }
1261 : }
1262 : }
1263 1269837 : return rel;
1264 : }
1265 :
1266 : static int
1267 97955 : sql_class_base_score(visitor *v, sql_column *c, sql_subtype *t, bool equality_based)
1268 : {
1269 97955 : int de;
1270 :
1271 97955 : if (!t)
1272 : return 0;
1273 97955 : switch (ATOMstorage(t->type->localtype)) {
1274 : case TYPE_bte:
1275 : return 150 - 8;
1276 1642 : case TYPE_sht:
1277 1642 : return 150 - 16;
1278 37649 : case TYPE_int:
1279 37649 : return 150 - 32;
1280 523 : case TYPE_void:
1281 : case TYPE_lng:
1282 523 : return 150 - 64;
1283 : case TYPE_uuid:
1284 : #ifdef HAVE_HGE
1285 : case TYPE_hge:
1286 : #endif
1287 : return 150 - 128;
1288 2 : case TYPE_flt:
1289 2 : return 75 - 24;
1290 : case TYPE_dbl:
1291 : return 75 - 53;
1292 34302 : default:
1293 34302 : if (equality_based && c && v->storage_based_opt && (de = mvc_is_duplicate_eliminated(v->sql, c)))
1294 1656 : return 150 - de * 8;
1295 : /* strings and blobs not duplicate eliminated don't get any points here */
1296 : return 0;
1297 : }
1298 : }
1299 :
1300 : static int
1301 34506 : score_se_base(visitor *v, sql_rel *rel, sql_exp *e)
1302 : {
1303 34506 : int res = 0;
1304 34506 : sql_subtype *t = exp_subtype(e);
1305 34506 : sql_column *c = NULL;
1306 :
1307 34506 : if (e->type == e_convert) /* keep unsafes at the end (TODO improve) */
1308 : return -1000;
1309 : /* can we find out if the underlying table is sorted */
1310 33906 : if ((c = exp_find_column(rel, e, -2)) && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1311 33906 : res += 600;
1312 :
1313 : /* prefer the shorter var types over the longer ones */
1314 33906 : res += sql_class_base_score(v, c, t, is_equality_or_inequality_exp(e->flag)); /* smaller the type, better */
1315 33906 : return res;
1316 : }
1317 :
1318 : static int
1319 56536 : score_se(visitor *v, sql_rel *rel, sql_exp *e)
1320 : {
1321 56536 : int score = 0;
1322 56536 : if (e->type == e_cmp && !is_complex_exp(e->flag)) {
1323 34506 : sql_exp *l = e->l;
1324 :
1325 34506 : while (l->type == e_cmp) { /* go through nested comparisons */
1326 2 : sql_exp *ll;
1327 :
1328 2 : if (l->flag == cmp_filter || l->flag == cmp_or)
1329 0 : ll = ((list*)l->l)->h->data;
1330 : else
1331 2 : ll = l->l;
1332 2 : if (ll->type != e_cmp)
1333 : break;
1334 : l = ll;
1335 : }
1336 34506 : score += score_se_base(v, rel, l);
1337 : }
1338 56536 : score += exp_keyvalue(e);
1339 56536 : return score;
1340 : }
1341 :
1342 : static inline sql_rel *
1343 1269837 : rel_select_order(visitor *v, sql_rel *rel)
1344 : {
1345 1269837 : int *scores = NULL;
1346 1269837 : sql_exp **exps = NULL;
1347 :
1348 1269837 : if (is_select(rel->op) && list_length(rel->exps) > 1) {
1349 26852 : node *n;
1350 26852 : int i, nexps = list_length(rel->exps);
1351 26852 : scores = SA_NEW_ARRAY(v->sql->ta, int, nexps);
1352 26852 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, nexps);
1353 :
1354 83388 : for (i = 0, n = rel->exps->h; n; i++, n = n->next) {
1355 56566 : exps[i] = n->data;
1356 56566 : if (find_prop(exps[i]->p, PROP_HASHCOL))
1357 : return rel;
1358 56536 : scores[i] = score_se(v, rel, n->data);
1359 : }
1360 26822 : GDKqsort(scores, exps, NULL, nexps, sizeof(int), sizeof(void *), TYPE_int, true, true);
1361 :
1362 83358 : for (i = 0, n = rel->exps->h; n; i++, n = n->next)
1363 56536 : n->data = exps[i];
1364 : }
1365 :
1366 : return rel;
1367 : }
1368 :
1369 : /* Compute the efficiency of using this expression early in a group by list */
1370 : static int
1371 64049 : score_gbe(visitor *v, sql_rel *rel, sql_exp *e)
1372 : {
1373 64049 : int res = 0;
1374 64049 : sql_subtype *t = exp_subtype(e);
1375 64049 : sql_column *c = exp_find_column(rel, e, -2);
1376 :
1377 64049 : if (e->card == CARD_ATOM) /* constants are trivial to group */
1378 38 : res += 1000;
1379 : /* can we find out if the underlying table is sorted */
1380 64049 : if (is_unique(e) || find_prop(e->p, PROP_HASHCOL) || (c && v->storage_based_opt && mvc_is_unique(v->sql, c))) /* distinct columns */
1381 3636 : res += 700;
1382 42586 : if (c && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1383 3729 : res += 500;
1384 64049 : if (find_prop(e->p, PROP_HASHIDX)) /* has hash index */
1385 0 : res += 200;
1386 :
1387 : /* prefer the shorter var types over the longer ones */
1388 64049 : res += sql_class_base_score(v, c, t, true); /* smaller the type, better */
1389 64049 : return res;
1390 : }
1391 :
1392 : /* reorder group by expressions */
1393 : static inline sql_rel *
1394 1269838 : rel_groupby_order(visitor *v, sql_rel *rel)
1395 : {
1396 1269838 : int *scores = NULL;
1397 1269838 : sql_exp **exps = NULL;
1398 :
1399 1269838 : if (is_groupby(rel->op) && list_length(rel->r) > 1) {
1400 27017 : node *n;
1401 27017 : list *gbe = rel->r;
1402 27017 : int i, ngbe = list_length(gbe);
1403 27017 : scores = SA_NEW_ARRAY(v->sql->ta, int, ngbe);
1404 27017 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, ngbe);
1405 :
1406 : /* first sorting step, give priority for integers and sorted columns */
1407 91066 : for (i = 0, n = gbe->h; n; i++, n = n->next) {
1408 64049 : exps[i] = n->data;
1409 64049 : scores[i] = score_gbe(v, rel, exps[i]);
1410 : }
1411 27017 : GDKqsort(scores, exps, NULL, ngbe, sizeof(int), sizeof(void *), TYPE_int, true, true);
1412 :
1413 : /* second sorting step, give priority to strings with lower number of digits */
1414 54839 : for (i = ngbe - 1; i && !scores[i]; i--); /* find expressions with no score from the first round */
1415 27017 : if (scores[i])
1416 26021 : i++;
1417 27017 : if (ngbe - i > 1) {
1418 12154 : for (int j = i; j < ngbe; j++) {
1419 9143 : sql_subtype *t = exp_subtype(exps[j]);
1420 9143 : scores[j] = t ? t->digits : 0;
1421 : }
1422 : /* the less number of digits the better, order ascending */
1423 3011 : GDKqsort(scores + i, exps + i, NULL, ngbe - i, sizeof(int), sizeof(void *), TYPE_int, false, true);
1424 : }
1425 :
1426 91066 : for (i = 0, n = gbe->h; n; i++, n = n->next)
1427 64049 : n->data = exps[i];
1428 : }
1429 :
1430 1269838 : return rel;
1431 : }
1432 :
1433 : /* This optimization loop contains optimizations that can potentially use statistics */
1434 : static sql_rel *
1435 1269837 : rel_final_optimization_loop_(visitor *v, sql_rel *rel)
1436 : {
1437 : /* Run rel_push_select_up only once at the end to avoid an infinite optimization loop */
1438 1269837 : rel = rel_push_select_up(v, rel);
1439 1269837 : rel = rel_select_order(v, rel);
1440 :
1441 : /* TODO? Maybe later add rel_simplify_count, rel_join2semijoin, rel_simplify_fk_joins,
1442 : rel_distinct_project2groupby, rel_simplify_predicates, rel_simplify_math,
1443 : rel_distinct_aggregate_on_unique_values */
1444 :
1445 1269838 : rel = rel_groupby_order(v, rel);
1446 1269838 : return rel;
1447 : }
1448 :
1449 : static sql_rel *
1450 68274 : rel_final_optimization_loop(visitor *v, global_props *gp, sql_rel *rel)
1451 : {
1452 68274 : (void) gp;
1453 68274 : return rel_visitor_bottomup(v, rel, &rel_final_optimization_loop_);
1454 : }
1455 :
1456 : run_optimizer
1457 729811 : bind_final_optimization_loop(visitor *v, global_props *gp)
1458 : {
1459 729811 : int flag = v->sql->sql_optimizer;
1460 : /* At the moment, this optimizer has dependency on 3 flags */
1461 560923 : return gp->opt_level == 1 && (gp->cnt[op_groupby] || gp->cnt[op_select]) &&
1462 798086 : (flag & push_select_up) && (flag & optimize_select_and_joins_topdown) && (flag & optimize_projections) ? rel_final_optimization_loop : NULL;
1463 : }
|