Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "rel_optimizer_private.h"
15 : #include "rel_statistics.h"
16 : #include "rel_basetable.h"
17 : #include "rel_rewriter.h"
18 :
19 : static sql_exp *
20 3462878 : comparison_find_column(sql_exp *input, sql_exp *e)
21 : {
22 3462878 : switch (input->type) {
23 98271 : case e_convert: {
24 98271 : list *types = (list *)input->r;
25 98271 : sql_class from = ((sql_subtype*)types->h->data)->type->eclass, to = ((sql_subtype*)types->h->next->data)->type->eclass;
26 98271 : if (from == to)
27 189491 : return comparison_find_column(input->l, e) ? input : NULL;
28 : return NULL;
29 : }
30 3051660 : case e_column:
31 3051660 : return exp_match(e, input) ? input : NULL;
32 : default:
33 : return NULL;
34 : }
35 : }
36 :
37 : /* multi lo <= col <= hi, maybe still be false even if lo or hi are NULL, possibly similar for filter on multiple
38 : * columns */
39 : #define comp_single_column(c) (!c->f)
40 :
41 : static sql_exp *
42 8733084 : rel_propagate_column_ref_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
43 : {
44 8816529 : assert(e->type == e_column);
45 8816529 : if (rel) {
46 8816455 : switch(rel->op) {
47 4081101 : case op_left:
48 : case op_right:
49 : case op_full:
50 : case op_join:
51 : case op_select:
52 : case op_anti:
53 : case op_semi: {
54 4081101 : bool found_without_semantics = false, found_left = false, found_right = false, still_unique = false;
55 :
56 4081101 : if ((is_innerjoin(rel->op) || is_select(rel->op)) && list_length(rel->exps) == 1 && exp_is_false(rel->exps->h->data))
57 : return NULL; /* nothing will pass, skip */
58 :
59 : /* propagate from the bottom first */
60 4066985 : if (rel_propagate_column_ref_statistics(sql, rel->l, e))
61 : found_left = true;
62 2585975 : if (!found_left && is_join(rel->op) && rel_propagate_column_ref_statistics(sql, rel->r, e))
63 4066985 : found_right = true;
64 :
65 4066985 : if (!found_left && !found_right)
66 : return NULL;
67 1775507 : if (!list_empty(rel->exps) && rel->op != op_anti) { /* if there's an or, the MIN and MAX get difficult to propagate */
68 3494211 : for (node *n = rel->exps->h ; n ; n = n->next) {
69 1809383 : sql_exp *comp = n->data, *le = comp->l, *lne = NULL, *re = comp->r, *rne = NULL, *fe = comp->f, *fne = NULL;
70 :
71 1809383 : if (comp->type == e_cmp) {
72 1808381 : if (is_theta_exp(comp->flag) && ((lne = comparison_find_column(le, e)) || (rne = comparison_find_column(re, e)) || (fe && (fne = comparison_find_column(fe, e))))) {
73 125627 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
74 125627 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
75 :
76 : /* not semantics found or if explicitly filtering not null values from the column */
77 125627 : found_without_semantics |= (!is_semantics(comp) && comp_single_column(comp)) || (comp->flag == cmp_equal && lne && is_anti(comp) && exp_is_null(re));
78 125627 : still_unique |= comp->flag == cmp_equal && is_unique(le) && is_unique(re); /* unique if only equi-joins on unique columns are there */
79 125627 : if (is_full(rel->op) || (is_left(rel->op) && found_left) || (is_right(rel->op) && found_right)) /* on outer joins, min and max cannot be propagated on some cases */
80 19747 : continue;
81 : /* if (end2 >= start1 && start2 <= end1) then the 2 intervals are intersected */
82 105880 : if (fe && lval_min && lval_max) { /* range case, the middle expression must intersect the other two */
83 4622 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
84 4622 : int int1 = rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0,
85 4622 : int2 = fval_min && fval_max && atom_cmp(fval_max, lval_min) >= 0 && atom_cmp(fval_min, lval_max) <= 0,
86 4622 : symmetric = is_symmetric(comp);
87 :
88 4622 : if (is_anti(comp) || (!symmetric && fval_min && rval_max && atom_cmp(fval_min, rval_max) < 0)) /* for asymmetric case the re range must be after the fe range */
89 1817 : continue;
90 2805 : if (lne && int1 && int2) {
91 104 : if (symmetric) {
92 0 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
93 0 : atom *nmin = statistics_atom_min(sql, rval_min, fval_min), *nmax = statistics_atom_max(sql, rval_max, fval_max);
94 : /* max is min from le and (max from re and fe max) */
95 0 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, nmax, p2->value.pval) : nmax);
96 : /* min is max from le and (min from re and fe min) */
97 0 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, nmin, p1->value.pval) : nmin);
98 : } else {
99 104 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
100 : /* max is min from le and fe max */
101 104 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, fval_max, p2->value.pval) : fval_max);
102 : /* min is max from le and re min */
103 104 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, rval_min, p1->value.pval) : rval_min);
104 : }
105 2701 : } else if (rne) {
106 590 : if (symmetric && int1 && int2) { /* min is max from le and (min from re and fe min) */
107 0 : prop *p = find_prop(e->p, PROP_MIN);
108 0 : atom *nmin = p ? statistics_atom_min(sql, p->value.pval, fval_min) : fval_min;
109 0 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, nmin, lval_min));
110 590 : } else if (int1) { /* min is max from le and re min */
111 100 : prop *p = find_prop(e->p, PROP_MIN);
112 100 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
113 : }
114 2111 : } else if (fne) {
115 549 : if (symmetric && int1 && int2) { /* max is min from le and (max from re and fe max) */
116 0 : prop *p = find_prop(e->p, PROP_MAX);
117 0 : atom *nmax = p ? statistics_atom_max(sql, p->value.pval, rval_max) : rval_max;
118 0 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, nmax, lval_max) : nmax);
119 549 : } else if (int2) { /* max is min from le and fe max */
120 266 : prop *p = find_prop(e->p, PROP_MAX);
121 266 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
122 : }
123 : }
124 101258 : } else if (lval_min && lval_max && rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0) {
125 : /* both min and max must be set and the intervals must overlap */
126 43110 : switch (comp->flag) {
127 28773 : case cmp_equal: /* for equality reduce */
128 28773 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_max(sql, lval_max, rval_max) : statistics_atom_min(sql, lval_max, rval_max));
129 28773 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_min(sql, lval_min, rval_min) : statistics_atom_max(sql, lval_min, rval_min));
130 28773 : break;
131 4680 : case cmp_notequal: /* for inequality expand */
132 4680 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_min(sql, lval_max, rval_max) : statistics_atom_max(sql, lval_max, rval_max));
133 4680 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_max(sql, lval_min, rval_min) : statistics_atom_min(sql, lval_min, rval_min));
134 4680 : break;
135 5651 : case cmp_gt:
136 : case cmp_gte:
137 10364 : if (!is_anti(comp) && lne) { /* min is max from both min */
138 4713 : prop *p = find_prop(e->p, PROP_MIN);
139 4713 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, rval_min, p->value.pval) : rval_min);
140 938 : } else if (!is_anti(comp)) { /* max is min from both max */
141 938 : prop *p = find_prop(e->p, PROP_MAX);
142 938 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
143 : }
144 : break;
145 4006 : case cmp_lt:
146 : case cmp_lte:
147 7205 : if (!is_anti(comp) && lne) { /* max is min from both max */
148 3199 : prop *p = find_prop(e->p, PROP_MAX);
149 3199 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, rval_max, p->value.pval) : rval_max);
150 807 : } else if (!is_anti(comp)) { /* min is max from both min */
151 807 : prop *p = find_prop(e->p, PROP_MIN);
152 807 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
153 : }
154 : break;
155 : default: /* Maybe later I can do cmp_in and cmp_notin */
156 : break;
157 : }
158 : }
159 : }
160 : }
161 : }
162 : }
163 1775507 : if (is_full(rel->op) || (is_left(rel->op) && found_right) || (is_right(rel->op) && found_left))
164 37914 : set_has_nil(e);
165 1775507 : if (!is_outerjoin(rel->op) && found_without_semantics) /* at an outer join, null values pass */
166 94488 : set_has_no_nil(e);
167 1775507 : if (is_join(rel->op) && is_unique(e) && !still_unique)
168 118612 : set_not_unique(e);
169 1775507 : return e;
170 : }
171 4632562 : case op_table:
172 : case op_basetable:
173 : case op_union:
174 : case op_except:
175 : case op_inter:
176 : case op_munion:
177 : case op_project:
178 : case op_groupby: {
179 4632562 : sql_exp *found;
180 4632562 : atom *fval;
181 4632562 : prop *est;
182 4632562 : if ((found = rel_find_exp(rel, e))) {
183 2173579 : if (rel->op != op_table) { /* At the moment don't propagate statistics for table relations */
184 2129147 : if ((fval = find_prop_and_get(found->p, PROP_MAX)))
185 1130604 : set_minmax_property(sql, e, PROP_MAX, fval);
186 2129146 : if ((fval = find_prop_and_get(found->p, PROP_MIN)))
187 1137713 : set_minmax_property(sql, e, PROP_MIN, fval);
188 2129145 : if (!has_nil(found))
189 1351283 : set_has_no_nil(e);
190 2129145 : if (is_unique(found) || (need_distinct(rel) && list_length(rel->exps) == 1) ||
191 1718789 : (is_groupby(rel->op) && (list_empty(rel->r) || (list_length(rel->r) == 1 && exps_find_exp(rel->r, e)))))
192 421234 : set_unique(e);
193 : /* propagate unique estimation for known cases */
194 2129145 : if (is_groupby(rel->op) && list_empty(rel->r) && !find_prop(e->p, PROP_NUNIQUES)) { /* global aggregate case */
195 7769 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
196 7769 : p->value.dval = 1;
197 2121376 : } else if (((is_basetable(rel->op) || is_except(rel->op) || is_inter(rel->op) || is_simple_project(rel->op) ||
198 66651 : (is_groupby(rel->op) && exps_find_exp(rel->r, e))) &&
199 2063665 : (est = find_prop(found->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES))) {
200 188166 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
201 188166 : p->value.dval = est->value.dval;
202 : }
203 : }
204 2173577 : return e;
205 : }
206 : return NULL;
207 : }
208 83445 : case op_topn:
209 : case op_sample:
210 83445 : return rel_propagate_column_ref_statistics(sql, rel->l, e);
211 : default:
212 : break;
213 : }
214 : }
215 : return NULL;
216 : }
217 :
218 : static atom *
219 4522706 : atom_from_valptr( allocator *sa, sql_subtype *tpe, ValRecord *v)
220 : {
221 4522706 : atom *a = SA_NEW(sa, atom);
222 :
223 4522617 : assert(!VALisnil(v));
224 4522516 : *a = (atom) {.tpe = *tpe,};
225 4522516 : SA_VALcopy(sa, &a->data, v);
226 4522461 : return a;
227 : }
228 :
229 : void
230 4152904 : sql_column_get_statistics(mvc *sql, sql_column *c, sql_exp *e)
231 : {
232 4152904 : bool nonil = false, unique = false;
233 4152904 : double unique_est = 0.0;
234 4152904 : ValRecord min, max;
235 4152904 : int ok = mvc_col_stats(sql, c, &nonil, &unique, &unique_est, &min, &max);
236 :
237 4153016 : if (has_nil(e) && nonil)
238 2608556 : set_has_no_nil(e);
239 4153016 : if (!is_unique(e) && unique)
240 1099036 : set_unique(e);
241 4153016 : if (unique_est != 0.0) {
242 2916407 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
243 2916405 : p->value.dval = unique_est;
244 : }
245 4153014 : unsigned int digits = 0;
246 4153014 : sql_subtype *et = exp_subtype(e);
247 4153038 : if (et->type->eclass == EC_DEC || et->type->eclass == EC_NUM)
248 2697137 : digits = et->digits;
249 4153038 : if ((ok & 2) == 2) {
250 2258440 : if (!VALisnil(&max)) {
251 2258433 : prop *p = e->p = prop_create(sql->sa, PROP_MAX, e->p);
252 2258413 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &max);
253 2258360 : if (digits) {
254 1680842 : unsigned int nd = atom_digits(p->value.pval);
255 1680858 : if (nd < digits)
256 : digits = nd;
257 1680858 : if (!digits)
258 : digits = 1;
259 : }
260 : }
261 2258377 : VALclear(&max);
262 : }
263 4152959 : if ((ok & 1) == 1) {
264 2264485 : if (!VALisnil(&min)) {
265 2264467 : prop *p = e->p = prop_create(sql->sa, PROP_MIN, e->p);
266 2264450 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &min);
267 2264405 : if (digits) {
268 1687950 : unsigned int nd = atom_digits(p->value.pval);
269 1687948 : if (nd > digits) /* possibly set to low by max value */
270 : digits = nd;
271 : if (!digits)
272 : digits = 1;
273 : }
274 : }
275 2264403 : VALclear(&min);
276 : }
277 4152863 : if (digits)
278 2696965 : et->digits = digits;
279 4152863 : if (et->type->eclass == EC_DEC && et->digits <= et->scale)
280 216 : et->digits = et->scale + 1;
281 4152863 : }
282 :
283 : static void
284 935656 : rel_basetable_column_get_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
285 : {
286 935656 : if (e->p)
287 : return;
288 302403 : sql_column *c = NULL;
289 :
290 302403 : if ((c = rel_base_find_column(rel, e->nid))) {
291 204288 : sql_column_get_statistics(sql, c, e);
292 : }
293 : }
294 :
295 : static bool
296 8872 : rel_setop_get_statistics(mvc *sql, sql_rel *rel, list *lexps, list *rexps, sql_exp *e, int i)
297 : {
298 8872 : sql_exp *le = list_fetch(lexps, i), *re = list_fetch(rexps, i);
299 8872 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
300 8872 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
301 8872 : prop *est;
302 :
303 : /* for the intersection, if both expressions don't overlap, it can be pruned */
304 8872 : if (is_inter(rel->op) && !has_nil(le) && !has_nil(re) &&
305 511 : ((rval_max && lval_min && atom_cmp(rval_max, lval_min) < 0) || (rval_min && lval_max && atom_cmp(rval_min, lval_max) > 0)))
306 26 : return true;
307 :
308 8846 : if (lval_max && rval_max) {
309 2557 : if (is_union(rel->op))
310 3 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
311 2554 : else if (is_inter(rel->op))
312 399 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_min(sql, lval_max, rval_max)); /* for intersect the new max will be the min of the two */
313 : else /* except */
314 2155 : set_minmax_property(sql, e, PROP_MAX, lval_max);
315 : }
316 8846 : if (lval_min && rval_min) {
317 2557 : if (is_union(rel->op))
318 3 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
319 2554 : else if (is_inter(rel->op))
320 399 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, lval_min, rval_min)); /* for intersect the new min will be the max of the two */
321 : else /* except */
322 2155 : set_minmax_property(sql, e, PROP_MIN, lval_min);
323 : }
324 :
325 8846 : if (is_union(rel->op) || is_munion(rel->op)) {
326 5986 : if (!has_nil(le) && !has_nil(re))
327 879 : set_has_no_nil(e);
328 5986 : if (need_distinct(rel) && list_length(rel->exps) == 1)
329 0 : set_unique(e);
330 2860 : } else if (is_inter(rel->op)) {
331 564 : if (!has_nil(le) || !has_nil(re))
332 509 : set_has_no_nil(e);
333 564 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
334 345 : set_unique(e);
335 : } else {
336 2296 : assert(is_except(rel->op));
337 2296 : if (!has_nil(le))
338 2233 : set_has_no_nil(e);
339 2296 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
340 2009 : set_unique(e);
341 : }
342 : /* propagate unique estimation for known cases */
343 8846 : if (!is_union(rel->op) && (est = find_prop(le->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES)) {
344 205 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
345 205 : p->value.dval = est->value.dval;
346 : }
347 : return false;
348 : }
349 :
350 :
351 : static void
352 60942 : rel_munion_get_statistics(mvc *sql, sql_rel *rel, list *rels, sql_exp *e, int i)
353 : {
354 60942 : assert(is_munion(rel->op));
355 :
356 60942 : sql_rel *l = rels->h->data;
357 60942 : sql_exp *le = list_fetch(l->exps, i);
358 60942 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX);
359 60942 : bool has_nonil = !has_nil(le);
360 :
361 176411 : for(node *n = rels->h->next; n; n = n->next) {
362 115469 : sql_rel *r = n->data;
363 115469 : sql_exp *re = list_fetch(r->exps, i);
364 115469 : atom *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
365 :
366 115469 : if (lval_max && rval_max) {
367 42584 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
368 42584 : lval_max = find_prop_and_get(e->p, PROP_MAX);
369 : }
370 115469 : if (lval_min && rval_min) {
371 42059 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
372 42059 : lval_min = find_prop_and_get(e->p, PROP_MIN);
373 : }
374 115469 : has_nonil &= !has_nil(re);
375 :
376 : }
377 :
378 60942 : if (has_nonil)
379 41018 : set_has_no_nil(e);
380 :
381 60942 : if (need_distinct(rel) && list_length(rel->exps) == 1)
382 1594 : set_unique(e);
383 60942 : }
384 :
385 :
386 : static sql_exp *
387 3474704 : rel_propagate_statistics(visitor *v, sql_rel *rel, sql_exp *e, int depth)
388 : {
389 3474704 : mvc *sql = v->sql;
390 3474704 : atom *lval;
391 :
392 3474704 : (void) depth;
393 3474704 : switch(e->type) {
394 2179200 : case e_column:
395 2179200 : switch (rel->op) { /* set relations don't call rel_propagate_statistics */
396 278023 : case op_join:
397 : case op_left:
398 : case op_right:
399 : case op_full:
400 : case op_semi:
401 : case op_anti: {
402 278023 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e);
403 278023 : if (!found)
404 139456 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
405 : break;
406 : }
407 1901177 : case op_select:
408 : case op_project:
409 : case op_groupby: {
410 1901177 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e); /* labels may be found on the same projection, ugh */
411 1901174 : if (!found && is_simple_project(rel->op))
412 124035 : (void) rel_propagate_column_ref_statistics(sql, rel, e);
413 : break;
414 : }
415 0 : case op_insert:
416 : case op_update:
417 : case op_delete:
418 0 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
419 0 : break;
420 : default:
421 : break;
422 : }
423 : break;
424 98304 : case e_convert: {
425 98304 : sql_subtype *to = exp_totype(e), *from = exp_fromtype(e);
426 98304 : sql_exp *l = e->l;
427 98304 : sql_class fr = from->type->eclass, too = to->type->eclass;
428 98304 : prop *est;
429 :
430 98304 : if (fr == too) {
431 89279 : if ((lval = find_prop_and_get(l->p, PROP_MAX))) {
432 58658 : atom *res = atom_copy(sql->sa, lval);
433 58658 : if ((res = atom_cast(sql->sa, res, to)))
434 58635 : set_minmax_property(sql, e, PROP_MAX, res);
435 : }
436 89279 : if ((lval = find_prop_and_get(l->p, PROP_MIN))) {
437 59272 : atom *res = atom_copy(sql->sa, lval);
438 59272 : if ((res = atom_cast(sql->sa, res, to)))
439 59249 : set_minmax_property(sql, e, PROP_MIN, res);
440 : }
441 : }
442 : /* propagating nuniques through conversions is tricky. I am adding just the general cases */
443 98304 : if ((est = find_prop(l->p, PROP_NUNIQUES)) &&
444 60887 : (((fr == EC_TIME || fr == EC_TIME_TZ || fr == EC_DATE) && (too == EC_TIMESTAMP || too == EC_TIMESTAMP_TZ)) ||
445 60862 : ((fr == EC_NUM || fr == EC_BIT) && too == EC_NUM) || (fr == EC_DEC && too == EC_DEC) || (fr != EC_EXTERNAL && too == EC_STRING))) {
446 57466 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
447 57466 : p->value.dval = est->value.dval;
448 : }
449 98304 : if (!has_nil(l))
450 55238 : set_has_no_nil(e);
451 : break;
452 : }
453 341995 : case e_aggr:
454 : case e_func: {
455 341995 : BUN lv;
456 341995 : sql_subfunc *f = e->f;
457 :
458 341995 : if (!f->func->s) {
459 315558 : int key = hash_key(f->func->base.name); /* Using hash lookup */
460 315558 : sql_hash_e *he = sql_functions_lookup->buckets[key&(sql_functions_lookup->size-1)];
461 315558 : lookup_function look = NULL;
462 :
463 689091 : for (; he && !look; he = he->chain) {
464 373533 : struct function_properties* fp = (struct function_properties*) he->value;
465 :
466 373533 : if (!strcmp(f->func->base.name, fp->name))
467 107871 : look = fp->func;
468 : }
469 315558 : if (look)
470 107871 : look(sql, e);
471 : }
472 : /* for global aggregates with no semantics, if the left relation has values, then the output won't be NULL */
473 341995 : if (!is_semantics(e) && e->l && !have_nil(e->l) &&
474 88681 : (e->type != e_aggr || (is_groupby(rel->op) && list_length(rel->r)) || ((lv = get_rel_count(rel->l)) != BUN_NONE && lv > 0)))
475 88358 : set_has_no_nil(e);
476 : /* set properties for global aggregates */
477 341995 : if (e->type == e_aggr && is_groupby(rel->op) && list_empty(rel->r)) {
478 8091 : if (!find_prop(e->p, PROP_NUNIQUES)) {
479 8091 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
480 8091 : p->value.dval = 1;
481 : }
482 8091 : set_unique(e);
483 : }
484 : break;
485 : }
486 570598 : case e_atom:
487 570598 : if (e->l) {
488 552942 : atom *a = (atom*) e->l;
489 552942 : if (!a->isnull) {
490 490982 : set_minmax_property(sql, e, PROP_MAX, a);
491 490982 : set_minmax_property(sql, e, PROP_MIN, a);
492 : }
493 552942 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
494 552942 : p->value.dval = 1;
495 17656 : } else if (e->f) {
496 4131 : list *vals = (list *) e->f;
497 4131 : sql_exp *first = vals->h ? vals->h->data : NULL;
498 4131 : atom *max = NULL, *min = NULL; /* all child values must have a valid min/max */
499 4131 : int has_nil = 0;
500 :
501 4131 : if (first) {
502 4131 : max = ((lval = find_prop_and_get(first->p, PROP_MAX))) ? lval : NULL;
503 4131 : min = ((lval = find_prop_and_get(first->p, PROP_MIN))) ? lval : NULL;
504 4131 : has_nil |= has_nil(first);
505 : }
506 :
507 17775 : for (node *n = vals->h ? vals->h->next : NULL ; n ; n = n->next) {
508 9513 : sql_exp *ee = n->data;
509 :
510 9513 : if (min && max) {
511 9020 : if ((lval = find_prop_and_get(ee->p, PROP_MAX))) {
512 8974 : max = atom_cmp(lval, max) > 0 ? lval : max;
513 : } else {
514 : max = NULL;
515 : }
516 9020 : if ((lval = find_prop_and_get(ee->p, PROP_MIN))) {
517 8974 : min = atom_cmp(min, lval) > 0 ? lval : min;
518 : } else {
519 : min = NULL;
520 : }
521 : }
522 9513 : has_nil |= has_nil(ee);
523 : }
524 :
525 4131 : if (!has_nil)
526 3761 : set_has_no_nil(e);
527 4131 : if (min && max) {
528 3719 : set_minmax_property(sql, e, PROP_MAX, max);
529 3719 : set_minmax_property(sql, e, PROP_MIN, min);
530 : }
531 4131 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
532 4131 : p->value.dval = (dbl) list_length(vals);
533 : } else {
534 13525 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
535 13525 : p->value.dval = 1;
536 : }
537 : break;
538 284607 : case e_cmp:
539 : /* TODO? propagating min/max/unique of booleans is not very worth it */
540 284607 : if (e->flag == cmp_or || e->flag == cmp_filter) {
541 17386 : if (!have_nil(e->l) && !have_nil(e->r))
542 12931 : set_has_no_nil(e);
543 267221 : } else if (e->flag == cmp_in || e->flag == cmp_notin) {
544 21736 : sql_exp *le = e->l;
545 21736 : if (!has_nil(le) && !have_nil(e->r))
546 18524 : set_has_no_nil(e);
547 : } else {
548 245485 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
549 245485 : if (!has_nil(le) && !has_nil(re) && (!e->f || !has_nil(fe)))
550 170248 : set_has_no_nil(e);
551 : }
552 : break;
553 : case e_psm:
554 : break;
555 : }
556 :
557 : #ifndef NDEBUG
558 : {
559 : /* min and max cannot be NULL and min must be <= than max, if it happens the inner relation must be empty! */
560 3474701 : atom *min = find_prop_and_get(e->p, PROP_MIN), *max = find_prop_and_get(e->p, PROP_MAX);
561 :
562 3474701 : (void) min;
563 3474701 : (void) max;
564 3474701 : assert(!min || !min->isnull);
565 3474701 : assert(!max || !max->isnull);
566 3474701 : assert(!min || !max || (min && max)/* atom_cmp(min, max) <= 0*/);
567 : }
568 : #endif
569 3474701 : return e;
570 : }
571 :
572 : static list * /* Remove predicates always false from min/max values */
573 138284 : rel_prune_predicates(visitor *v, sql_rel *rel)
574 : {
575 138284 : if (rel->l) {
576 138284 : sql_rel *l = rel->l;
577 138284 : if (is_single(l))
578 3269 : return rel->exps;
579 : }
580 135015 : if (!list_empty(rel->attr))
581 2961 : return rel->exps;
582 280175 : for (node *n = rel->exps->h ; n ; n = n->next) {
583 148121 : sql_exp *e = n->data;
584 :
585 148121 : if (e->type == e_cmp && is_theta_exp(e->flag)) {
586 121980 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
587 121980 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
588 121980 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
589 121980 : bool always_false = false, always_true = false;
590 :
591 121980 : if (fe && !is_symmetric(e)) {
592 3058 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
593 3058 : comp_type lower = range2lcompare(e->flag), higher = range2rcompare(e->flag);
594 3660 : int not_int1 = rval_min && lval_max && /* the middle and left intervals don't overlap */
595 1663 : (!is_anti(e) && (lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0)),
596 4072 : not_int2 = lval_min && fval_max && /* the middle and right intervals don't overlap */
597 2485 : (!is_anti(e) && (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0)),
598 3058 : not_int3 = rval_min && fval_max && /* the left interval is after the right one */
599 1287 : (!is_anti(e) && (atom_cmp(rval_min, fval_max) > 0));
600 :
601 3058 : always_false |= not_int1 || not_int2 || not_int3;
602 : /* for anti the middle must be before the left or after the right or the right after the left, for the other the middle must be always between the left and right intervals */
603 4113 : always_true |= !has_nil(le) && !has_nil(re) && !has_nil(fe) &&
604 3957 : lval_min && lval_max && rval_min && rval_max && fval_min && fval_max &&
605 575 : (is_anti(e) ? ((lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0) || (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0) || atom_cmp(rval_min, fval_max) > 0) :
606 488 : ((lower == cmp_gte ? atom_cmp(lval_min, rval_max) >= 0 : atom_cmp(lval_min, rval_max) > 0) && (higher == cmp_lte ? atom_cmp(fval_min, lval_max) >= 0 : atom_cmp(fval_min, lval_max) > 0)));
607 : } else if (!fe) {
608 118904 : if (!is_semantics(e)) /* trivial not null cmp null case */
609 226036 : always_false |= !is_anti(e) && ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re)));
610 118904 : switch (e->flag) {
611 103388 : case cmp_equal:
612 103388 : if (lval_min && lval_max && rval_min && rval_max && (!is_semantics(e) || !has_nil(le) || !has_nil(re)))
613 131510 : always_false |= (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
614 103388 : if (is_semantics(e)) { /* prune *= NULL cases */
615 5857 : always_false |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
616 11714 : always_true |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
617 : }
618 : break;
619 7241 : case cmp_notequal:
620 7241 : if (lval_min && lval_max && rval_min && rval_max)
621 11378 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
622 7241 : if (is_semantics(e)) {
623 29 : always_true |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
624 58 : always_false |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
625 : }
626 : break;
627 5632 : case cmp_gt:
628 5632 : if (lval_max && rval_min)
629 1947 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
630 5632 : if (lval_min && rval_max)
631 3894 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
632 : break;
633 119 : case cmp_gte:
634 119 : if (lval_max && rval_min)
635 49 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
636 119 : if (lval_min && rval_max)
637 98 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
638 : break;
639 2440 : case cmp_lt:
640 2440 : if (lval_min && rval_max)
641 1382 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
642 2440 : if (lval_max && rval_min)
643 2812 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
644 : break;
645 84 : case cmp_lte:
646 84 : if (lval_min && rval_max)
647 11 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
648 84 : if (lval_max && rval_min)
649 22 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
650 : break;
651 : default: /* Maybe later I can do cmp_in and cmp_notin but don't forget to remove is_theta_exp check up there */
652 : break;
653 : }
654 : }
655 121980 : assert(!always_false || !always_true);
656 121980 : if (always_false || always_true) {
657 3557 : sql_exp *ne = exp_atom_bool(v->sql->sa, always_true /* if always_true set then true else false */);
658 3557 : if (exp_name(e))
659 0 : exp_prop_alias(v->sql->sa, ne, e);
660 3557 : n->data = ne;
661 3557 : v->changes++;
662 : }
663 : }
664 : }
665 132054 : return rel->exps;
666 : }
667 :
668 : static sql_rel *
669 14 : set_setop_side(visitor *v, sql_rel *rel, sql_rel *side)
670 : {
671 14 : if (side == rel->r)
672 0 : rel->r = NULL;
673 : else
674 14 : rel->l = NULL;
675 14 : if (!is_simple_project(side->op) || rel_is_ref(side) || !list_empty(side->r)) {
676 0 : side = rel_project(v->sql->sa, side, rel_projections(v->sql, side, NULL, 0, 1));
677 0 : set_count_prop(v->sql->sa, side, get_rel_count(side->l));
678 0 : side->exps = exps_exp_visitor_bottomup(v, side, side->exps, 0, &rel_propagate_statistics, false);
679 : }
680 35 : for (node *n = rel->exps->h, *m = side->exps->h ; n && m ; n = n->next, m = m->next) {
681 21 : sql_exp *e1 = n->data, *e2 = m->data;
682 21 : exp_prop_alias(v->sql->sa, e2, e1);
683 : }
684 14 : list_hash_clear(side->exps);
685 :
686 14 : if (need_distinct(rel))
687 0 : set_distinct(side);
688 14 : side->p = prop_copy(v->sql->sa, rel->p);
689 14 : rel_destroy(rel);
690 14 : return side;
691 : }
692 :
693 : static BUN
694 20341 : trivial_project_exp_card(sql_exp *e)
695 : {
696 20641 : if (e->type == e_convert)
697 300 : return trivial_project_exp_card(e->l);
698 20341 : return e->type == e_atom && e->f ? (BUN) list_length(e->f) : 1;
699 : }
700 :
701 : static BUN
702 24248 : rel_calc_nuniques(mvc *sql, sql_rel *l, list *exps)
703 : {
704 24248 : BUN lv = get_rel_count(l);
705 :
706 24248 : if (lv == 0)
707 : return 0;
708 21539 : if (!list_empty(exps)) {
709 21539 : BUN nuniques = 0;
710 : /* compute the highest number of unique values */
711 51131 : for (node *n = exps->h ; n && nuniques != BUN_NONE ; n = n->next) {
712 29592 : sql_exp *e = n->data;
713 29592 : sql_rel *bt = NULL;
714 29592 : prop *p = NULL;
715 29592 : BUN euniques = BUN_NONE;
716 29592 : atom *min, *max, *sub = NULL;
717 29592 : sql_subtype *tp = exp_subtype(e);
718 29592 : sql_class ec = tp ? tp->type->eclass : EC_STRING; /* if 'e' has no type (eg parameter), use a non-number type to fail condition */
719 :
720 29592 : if ((p = find_prop(e->p, PROP_NUNIQUES))) {
721 20956 : euniques = (BUN) p->value.dval;
722 8636 : } else if (e->type == e_column && e->nid && rel_find_exp_and_corresponding_rel(l, e, false, &bt, NULL) && bt && (p = find_prop(bt->p, PROP_COUNT))) {
723 6593 : euniques = (BUN) p->value.lval;
724 : }
725 : /* use min to max range to compute number of possible values in the domain for number types */
726 29592 : if ((EC_TEMP(ec)||ec==EC_NUM||ec==EC_MONTH||ec==EC_POS) &&
727 22363 : (min = find_prop_and_get(e->p, PROP_MIN)) && (max = find_prop_and_get(e->p, PROP_MAX))) {
728 : /* the range includes min and max, so the atom_inc call is needed */
729 : /* if 'euniques' has number of distinct values, compute min between both */
730 17534 : if ((sub = atom_sub(sql->sa, max, min)) && (sub = atom_inc(sql->sa, sub)) && (sub = atom_cast(sql->sa, sub, sql_bind_localtype("oid"))))
731 0 : euniques = MIN(euniques, (BUN) sub->data.val.oval);
732 : }
733 29592 : if (euniques != BUN_NONE)
734 27549 : nuniques = MAX(nuniques, euniques); /* the highest cardinality sets the estimation */
735 : else
736 : nuniques = BUN_NONE;
737 : }
738 21539 : if (nuniques != BUN_NONE)
739 : return nuniques;
740 : }
741 : return lv;
742 : }
743 :
744 : static sql_rel *
745 2073065 : rel_get_statistics_(visitor *v, sql_rel *rel)
746 : {
747 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
748 2073065 : uint8_t has_special_modify = *(uint8_t*) v->data;
749 2073065 : bool can_be_pruned = !has_special_modify && v->storage_based_opt;
750 :
751 : /* Don't look at the same relation twice */
752 2073065 : if (are_statistics_gathered(rel->used))
753 : return rel;
754 1331847 : rel->used |= statistics_gathered;
755 :
756 1331847 : switch (rel->op) {
757 312127 : case op_basetable: {
758 312127 : sql_table *t = (sql_table *) rel->l;
759 312127 : sqlstore *store = v->sql->session->tr->store;
760 :
761 312127 : if (!list_empty(rel->exps)) {
762 1247733 : for (node *n = rel->exps->h ; n ; n = n->next)
763 935656 : rel_basetable_column_get_statistics(v->sql, rel, n->data);
764 : }
765 : /* Set table row count. TODO? look for remote tables. Don't look at storage for declared tables, because it won't be cleaned */
766 312126 : if (isTable(t) && t->s && !isDeclaredTable(t)) /* count active rows only */
767 258602 : set_count_prop(v->sql->sa, rel, (BUN)store->storage_api.count_col(v->sql->session->tr, ol_first_node(t->columns)->data, 10));
768 : break;
769 : }
770 2790 : case op_union:
771 : case op_inter:
772 : case op_except: {
773 2790 : bool empty_cross = false;
774 2790 : int i = 0;
775 2790 : sql_rel *l = rel->l, *pl = rel->l, *r = rel->r, *pr = rel->r;
776 :
777 2790 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
778 0 : pl = pl->l;
779 2790 : while (is_sample(pr->op) || is_topn(pr->op))
780 0 : pr = pr->l;
781 : /* if it's not a projection, then project and propagate statistics */
782 2790 : if (!is_project(pl->op) && !is_base(pl->op)) {
783 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
784 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
785 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
786 : }
787 2790 : if (!is_project(pr->op) && !is_base(pr->op)) {
788 0 : pr = rel_project(v->sql->sa, pr, rel_projections(v->sql, pr, NULL, 0, 1));
789 0 : set_count_prop(v->sql->sa, pr, get_rel_count(pr->l));
790 0 : pr->exps = exps_exp_visitor_bottomup(v, pr, pr->exps, 0, &rel_propagate_statistics, false);
791 : }
792 :
793 11662 : for (node *n = rel->exps->h ; n ; n = n->next) {
794 8872 : empty_cross |= rel_setop_get_statistics(v->sql, rel, pl->exps, pr->exps, n->data, i);
795 8872 : i++;
796 : }
797 :
798 : /* propagate row count */
799 2790 : if (is_union(rel->op)) {
800 277 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
801 277 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
802 :
803 277 : if (lv == 0 && rv == 0) { /* both sides empty */
804 2 : if (can_be_pruned)
805 : empty_cross = true;
806 : else
807 2 : set_count_prop(v->sql->sa, rel, 0);
808 275 : } else if (can_be_pruned && lv == 0 && !rel_is_ref(rel)) { /* left side empty */
809 0 : rel = set_setop_side(v, rel, r);
810 0 : empty_cross = false; /* don't rewrite again */
811 275 : } else if (can_be_pruned && rv == 0 && !rel_is_ref(rel)) { /* right side empty */
812 0 : rel = set_setop_side(v, rel, l);
813 0 : empty_cross = false; /* don't rewrite again */
814 275 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
815 7 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
816 : }
817 2513 : } else if (is_inter(rel->op) || is_except(rel->op)) {
818 2513 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
819 2513 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
820 :
821 2513 : if (lv == 0) { /* left side empty */
822 62 : if (can_be_pruned)
823 : empty_cross = true;
824 : else
825 5 : set_count_prop(v->sql->sa, rel, 0);
826 2451 : } else if (rv == 0) { /* right side empty */
827 17 : if (is_inter(rel->op)) {
828 3 : if (can_be_pruned)
829 : empty_cross = true;
830 : else
831 0 : set_count_prop(v->sql->sa, rel, 0);
832 14 : } else if (can_be_pruned && !rel_is_ref(rel)) {
833 14 : rel = set_setop_side(v, rel, l);
834 14 : empty_cross = false; /* don't rewrite again */
835 : } else {
836 0 : set_count_prop(v->sql->sa, rel, lv);
837 : }
838 : } else {
839 2434 : set_count_prop(v->sql->sa, rel, lv);
840 : }
841 : }
842 2790 : if (can_be_pruned && empty_cross) {
843 80 : rel_destroy(rel->l);
844 80 : rel->l = NULL;
845 80 : rel_destroy(rel->r);
846 80 : rel->r = NULL;
847 244 : for (node *n = rel->exps->h ; n ; n = n->next) {
848 164 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
849 164 : exp_prop_alias(v->sql->sa, a, e);
850 164 : n->data = a;
851 : }
852 80 : list_hash_clear(rel->exps);
853 80 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
854 80 : set_count_prop(v->sql->sa, l, 1);
855 80 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
856 80 : set_count_prop(v->sql->sa, l, 0);
857 80 : rel->op = op_project;
858 80 : rel->l = l;
859 80 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
860 80 : set_count_prop(v->sql->sa, rel, 0);
861 80 : set_nodistinct(rel); /* set relations may have distinct flag set */
862 80 : v->changes++;
863 : }
864 : break;
865 : }
866 12894 : case op_munion: {
867 12894 : list *l = rel->l, *nrels = sa_list(v->sql->sa);
868 12894 : BUN cnt = 0;
869 12894 : bool needs_pruning = false;
870 :
871 49290 : for (node *n = l->h; n; n = n->next) {
872 36396 : sql_rel *r = n->data, *pl = r;
873 :
874 36396 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
875 0 : pl = pl->l;
876 : /* if it's not a projection, then project and propagate statistics */
877 36396 : if (!is_project(pl->op) && !is_base(pl->op)) {
878 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
879 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
880 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
881 : }
882 36396 : nrels = append(nrels, pl);
883 : /* we need new munion statistics */
884 : /* propagate row count */
885 36396 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
886 : /* if PROP_COUNT does not exist we assume at least a row (see get_rel_count def) */
887 36396 : if (rv == BUN_NONE) {
888 1156 : cnt++;
889 1156 : continue;
890 : }
891 35240 : if (!rv && can_be_pruned)
892 6633 : needs_pruning = true;
893 : /* overflow check */
894 35240 : if (rv > (BUN_MAX - cnt))
895 36396 : rv = BUN_MAX;
896 : else
897 35240 : cnt += rv;
898 : }
899 12894 : int i = 0;
900 73836 : for (node *n = rel->exps->h ; n ; n = n->next, i++)
901 60942 : rel_munion_get_statistics(v->sql, rel, nrels, n->data, i);
902 :
903 12894 : if (needs_pruning && !rel_is_ref(rel)) {
904 4472 : v->changes++;
905 4472 : list *nl = sa_list(l->sa);
906 :
907 16488 : for (node *n = nrels->h; n; n = n->next) {
908 12016 : sql_rel *r = n->data;
909 12016 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
910 :
911 12016 : if (!rv) { /* keep last for now */
912 6163 : rel_destroy(r);
913 6163 : continue;
914 : }
915 5853 : nl = append(nl, r);
916 : }
917 4472 : rel->l = nl;
918 4472 : if (list_length(nl) == 1) {
919 4147 : sql_rel *l = rel->l = nl->h->data; /* ugh */
920 4147 : rel->r = NULL;
921 4147 : rel->op = op_project;
922 :
923 20433 : for (node *n = rel->exps->h, *m = l->exps->h ; n && m ; n = n->next, m = m->next) {
924 16286 : sql_exp *pe = n->data, *ie = m->data;
925 16286 : sql_exp *ne = exp_ref(v->sql, ie);
926 16286 : exp_prop_alias(v->sql->sa, ne, pe);
927 16286 : n->data = ne;
928 : }
929 4147 : list_hash_clear(rel->exps);
930 325 : } else if (list_empty(nl)) {
931 : /* empty select (project [ nils ] ) */
932 454 : for (node *n = rel->exps->h ; n ; n = n->next) {
933 354 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
934 354 : exp_prop_alias(v->sql->sa, a, e);
935 354 : n->data = a;
936 : }
937 100 : list_hash_clear(rel->exps);
938 100 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
939 100 : set_count_prop(v->sql->sa, l, 1);
940 100 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
941 100 : set_count_prop(v->sql->sa, l, 0);
942 100 : rel->op = op_project;
943 100 : rel->r = NULL;
944 100 : rel->l = l;
945 100 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
946 100 : set_count_prop(v->sql->sa, rel, 0);
947 100 : set_nodistinct(rel); /* set relations may have distinct flag set */
948 : }
949 : } else {
950 8422 : set_count_prop(v->sql->sa, rel, cnt);
951 : }
952 : break;
953 : }
954 530783 : case op_join:
955 : case op_left:
956 : case op_right:
957 : case op_full:
958 : case op_semi:
959 : case op_anti:
960 : case op_select:
961 : case op_project:
962 : case op_groupby: {
963 530783 : if (is_groupby(rel->op) && !list_empty(rel->r))
964 15531 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
965 530783 : rel->exps = exps_exp_visitor_bottomup(v, rel, rel->exps, 0, &rel_propagate_statistics, false);
966 530782 : if ((is_simple_project(rel->op) || is_groupby(rel->op)) && !list_empty(rel->r))
967 39125 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
968 : /* The following optimizations can only be applied after propagating the statistics to rel->exps */
969 530782 : if (can_be_pruned && (is_join(rel->op) || is_select(rel->op)) && !list_empty(rel->exps)) {
970 138284 : int changes = v->changes;
971 138284 : rel->exps = rel_prune_predicates(v, rel);
972 138284 : if (v->changes > changes)
973 3524 : rel = rewrite_simplify(v, 0, v->value_based_opt, rel);
974 : }
975 :
976 : /* propagate row count */
977 530782 : sql_rel *l = rel->l, *r = rel->r;
978 530782 : switch (rel->op) {
979 133880 : case op_join:
980 : case op_left:
981 : case op_right:
982 : case op_full: {
983 133880 : BUN lv = get_rel_count(l), rv = get_rel_count(r), uniques_estimate = BUN_MAX, join_idx_estimate = BUN_MAX;
984 :
985 133880 : if (!list_empty(rel->exps) && !is_single(rel)) {
986 245020 : for (node *n = rel->exps->h ; n ; n = n->next) {
987 125082 : sql_exp *e = n->data, *el = e->l, *er = e->r;
988 :
989 125082 : if (find_prop(e->p, PROP_JOINIDX)) {
990 667 : join_idx_estimate = lv>rv?lv:rv;
991 124415 : } else if (e->type == e_cmp && e->flag == cmp_equal) {
992 : /* if one of the sides is unique, the cardinality will be that exact number, but look for nulls */
993 120531 : if (!is_semantics(e) || !has_nil(el) || !has_nil(er)) {
994 120135 : BUN lu = 0, ru = 0;
995 120135 : prop *p = NULL;
996 120135 : if ((p = find_prop(el->p, PROP_NUNIQUES)))
997 89002 : lu = (BUN) p->value.dval;
998 120135 : if ((p = find_prop(er->p, PROP_NUNIQUES)))
999 103445 : ru = (BUN) p->value.dval;
1000 120135 : if (is_unique(el) || lu > lv) {
1001 73181 : BUN ncount = (is_right(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : lv;
1002 73181 : uniques_estimate = MIN(uniques_estimate, ncount);
1003 46954 : } else if (is_unique(er) || ru > rv) {
1004 29351 : BUN ncount = (is_left(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : rv;
1005 29351 : uniques_estimate = MIN(uniques_estimate, ncount);
1006 : }
1007 : }
1008 : }
1009 : }
1010 : }
1011 133880 : if (is_single(rel)) {
1012 4609 : set_count_prop(v->sql->sa, rel, lv);
1013 129271 : } else if (join_idx_estimate != BUN_MAX) {
1014 665 : set_count_prop(v->sql->sa, rel, join_idx_estimate);
1015 128606 : } else if (uniques_estimate != BUN_MAX) {
1016 96107 : set_count_prop(v->sql->sa, rel, uniques_estimate);
1017 32499 : } else if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1018 : /* corner cases for outer joins */
1019 126 : if (is_left(rel->op)) {
1020 114 : set_count_prop(v->sql->sa, rel, lv);
1021 12 : } else if (is_right(rel->op)) {
1022 3 : set_count_prop(v->sql->sa, rel, rv);
1023 9 : } else if (is_full(rel->op) && lv != BUN_NONE && rv != BUN_NONE) {
1024 9 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
1025 0 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1026 0 : set_count_prop(v->sql->sa, rel, 0);
1027 : }
1028 32373 : } else if (lv == 0) {
1029 1166 : set_count_prop(v->sql->sa, rel, (is_right(rel->op) || is_full(rel->op)) ? rv : 0);
1030 31776 : } else if (rv == 0) {
1031 1568 : set_count_prop(v->sql->sa, rel, (is_left(rel->op) || is_full(rel->op)) ? lv : 0);
1032 30702 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1033 18252 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX / lv)) ? BUN_MAX : (lv * rv)); /* overflow check */
1034 : }
1035 : break;
1036 : }
1037 2630 : case op_anti:
1038 2630 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1039 2630 : break;
1040 108292 : case op_semi:
1041 : case op_select:
1042 : /* TODO calculate cardinalities using selectivities */
1043 108292 : if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1044 2590 : set_count_prop(v->sql->sa, rel, 0);
1045 : } else {
1046 208164 : if (!list_empty(rel->exps) && !is_single(rel)) {
1047 102462 : BUN cnt = get_rel_count(l), u = 1;
1048 171150 : for (node *n = rel->exps->h ; n ; n = n->next) {
1049 112327 : sql_exp *e = n->data, *el = e->l, *er = e->r;
1050 :
1051 : /* simple expressions first */
1052 112327 : if (e->type == e_cmp && e->flag == cmp_equal && exp_is_atom(er)) {
1053 : /* use selectivity */
1054 56843 : prop *p;
1055 56843 : if ((p = find_prop(el->p, PROP_NUNIQUES))) {
1056 43639 : u = (BUN) p->value.dval;
1057 43639 : break;
1058 : }
1059 : }
1060 : }
1061 : /* u is an *estimate*, so don't set count_prop to 0 unless cnt is 0 */
1062 102462 : set_count_prop(v->sql->sa, rel, cnt == 0 ? 0 : u == 0 || u > cnt ? 1 : cnt/u);
1063 : } else {
1064 3240 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1065 : }
1066 : }
1067 : break;
1068 262559 : case op_project:
1069 262559 : if (l) {
1070 252652 : if (need_distinct(rel)) {
1071 0 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->exps));
1072 : } else {
1073 252652 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1074 : }
1075 : } else {
1076 9907 : BUN card = 1;
1077 :
1078 9907 : if (!list_empty(rel->exps)) {
1079 28940 : for (node *n = rel->exps->h ; n ; n = n->next) {
1080 19033 : sql_exp *e = n->data;
1081 19033 : card = MAX(card, trivial_project_exp_card(e));
1082 : }
1083 : }
1084 9907 : set_count_prop(v->sql->sa, rel, card);
1085 : }
1086 : break;
1087 23057 : case op_groupby:
1088 23057 : if (list_empty(rel->r)) {
1089 7525 : set_count_prop(v->sql->sa, rel, 1);
1090 : } else {
1091 15532 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->r));
1092 : }
1093 : break;
1094 : default:
1095 : break;
1096 : }
1097 : break;
1098 : }
1099 16859 : case op_topn: {
1100 16859 : BUN lv = get_rel_count(rel->l);
1101 :
1102 16859 : if (lv != BUN_NONE) {
1103 16842 : sql_exp *le = rel->exps->h->data, *oe = list_length(rel->exps) > 1 ? rel->exps->h->next->data : NULL;
1104 96 : if (oe && oe->l && exp_is_not_null(oe)) { /* no parameters */
1105 96 : BUN offset = (BUN) ((atom*)oe->l)->data.val.lval;
1106 96 : lv = offset >= lv ? 0 : lv - offset;
1107 : }
1108 16842 : if (le->l && exp_is_not_null(le)) {
1109 16804 : BUN limit = (BUN) ((atom*)le->l)->data.val.lval;
1110 16804 : lv = MIN(lv, limit);
1111 : }
1112 16842 : set_count_prop(v->sql->sa, rel, lv);
1113 : }
1114 : break;
1115 : }
1116 22 : case op_sample: {
1117 22 : BUN lv = get_rel_count(rel->l);
1118 :
1119 22 : if (lv != BUN_NONE) {
1120 4 : sql_exp *se = rel->exps->h->data;
1121 4 : sql_subtype *tp = exp_subtype(se);
1122 :
1123 4 : if (se->l && tp->type->eclass == EC_NUM) { /* sample is a number of rows */
1124 4 : BUN sample = (BUN) ((atom*)se->l)->data.val.lval;
1125 4 : lv = MIN(lv, sample);
1126 0 : } else if (se->l) { /* sample is a percentage of rows */
1127 0 : dbl percent = ((atom*)se->l)->data.val.dval;
1128 0 : assert(tp->type->eclass == EC_FLT);
1129 0 : lv = (BUN) ceil((dbl)lv * percent);
1130 : }
1131 4 : set_count_prop(v->sql->sa, rel, lv);
1132 : }
1133 : break;
1134 : }
1135 6087 : case op_table: {
1136 6087 : sql_exp *op = rel->r;
1137 6087 : if (rel->flag != TRIGGER_WRAPPER && op) {
1138 5775 : sql_subfunc *f = op->f;
1139 5775 : if (f->func->lang == FUNC_LANG_SQL) {
1140 97 : set_count_prop(v->sql->sa, rel, 1000 /* just some fallback value */);
1141 5678 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "storage") == 0) {
1142 829 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of catalog */);
1143 4849 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "db_users") == 0) {
1144 0 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of users */);
1145 4849 : } else if (f->func->lang == FUNC_LANG_MAL && strncmp(f->func->base.name, "querylog", 8) == 0) {
1146 1085 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of querylog */);
1147 3764 : } else if (f->func->lang == FUNC_LANG_MAL &&
1148 3654 : (strcmp(f->func->base.name, "queue") == 0 ||
1149 3390 : strcmp(f->func->base.name, "optimizers") == 0 ||
1150 3078 : strcmp(f->func->base.name, "env") == 0 ||
1151 2786 : strcmp(f->func->base.name, "keywords") == 0 ||
1152 2786 : strcmp(f->func->base.name, "statistics") == 0 ||
1153 2119 : strcmp(f->func->base.name, "rejects") == 0 ||
1154 1867 : strcmp(f->func->base.name, "schemastorage") == 0 ||
1155 1867 : strncmp(f->func->base.name, "storage", 7) == 0 ||
1156 1867 : strcmp(f->func->base.name, "sessions") == 0) ) {
1157 2077 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of queue */);
1158 : }
1159 : /* else {
1160 : printf("%%func needs stats : %s\n", f->func->base.name);
1161 : } */
1162 : }
1163 : break;
1164 : }
1165 : /*These relations are less important for now
1166 : TODO later we can tune it */
1167 : case op_insert:
1168 : case op_update:
1169 : case op_delete:
1170 : case op_truncate:
1171 : case op_ddl:
1172 : default:
1173 : break;
1174 : }
1175 :
1176 : return rel;
1177 : }
1178 :
1179 : static sql_rel *
1180 541947 : rel_get_statistics(visitor *v, global_props *gp, sql_rel *rel)
1181 : {
1182 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
1183 541947 : uint8_t has_special_modify = (uint8_t) gp->has_special_modify;
1184 541947 : v->data = &has_special_modify;
1185 541947 : rel = rel_visitor_bottomup(v, rel, &rel_get_statistics_);
1186 541948 : v->data = gp;
1187 541948 : return rel;
1188 : }
1189 :
1190 : run_optimizer
1191 718453 : bind_get_statistics(visitor *v, global_props *gp)
1192 : {
1193 718453 : (void) v;
1194 718453 : return (gp->opt_level == 1 && !gp->cnt[op_insert]) ? rel_get_statistics : NULL;
1195 : }
1196 :
1197 :
1198 : static bool
1199 94500 : point_select_on_unique_column(sql_rel *rel)
1200 : {
1201 94500 : if (is_select(rel->op) && !list_empty(rel->exps)) {
1202 130811 : for (node *n = rel->exps->h; n ; n = n->next) {
1203 75368 : sql_exp *e = n->data, *el = e->l, *er = e->r, *found = NULL;
1204 :
1205 75368 : if (is_compare(e->type) && e->flag == cmp_equal) {
1206 33339 : if (is_numeric_upcast(el))
1207 0 : el = el->l;
1208 33339 : if (is_numeric_upcast(er))
1209 0 : er = er->l;
1210 33339 : if (is_alias(el->type) && exp_is_atom(er) && (found = rel_find_exp(rel->l, el)) &&
1211 749 : is_unique(found) && (!is_semantics(e) || !has_nil(found) || !has_nil(er)))
1212 : return true;
1213 32590 : if (is_alias(er->type) && exp_is_atom(el) && (found = rel_find_exp(rel->l, er)) &&
1214 0 : is_unique(found) && (!is_semantics(e) || !has_nil(el) || !has_nil(found)))
1215 : return true;
1216 : }
1217 : }
1218 : }
1219 : return false;
1220 : }
1221 :
1222 : /*
1223 : * A point select on an unique column reduces the number of rows to 1. If the same select is under a
1224 : * join, the opposite side's select can be pushed above the join.
1225 : */
1226 : static inline sql_rel *
1227 1243070 : rel_push_select_up(visitor *v, sql_rel *rel)
1228 : {
1229 1243070 : if ((is_innerjoin(rel->op) || is_left(rel->op) || is_right(rel->op) || is_semi(rel->op)) && !is_single(rel)) {
1230 253619 : sql_rel *l = rel->l, *r = rel->r;
1231 253619 : bool can_pushup_left = is_select(l->op) && !rel_is_ref(l) && !is_single(l) && (is_innerjoin(rel->op) || is_left(rel->op) || is_semi(rel->op)),
1232 253619 : can_pushup_right = is_select(r->op) && !rel_is_ref(r) && !is_single(r) && (is_innerjoin(rel->op) || is_right(rel->op));
1233 :
1234 253619 : if (can_pushup_left || can_pushup_right) {
1235 66641 : if (can_pushup_left)
1236 45254 : can_pushup_left = point_select_on_unique_column(r);
1237 66641 : if (can_pushup_right)
1238 49246 : can_pushup_right = point_select_on_unique_column(l);
1239 :
1240 : /* if both selects retrieve one row each, it's not worth it to push both up */
1241 66641 : if (can_pushup_left && !can_pushup_right) {
1242 683 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1243 683 : nrel->l = l->l;
1244 683 : rel = rel_inplace_select(rel, nrel, l->exps);
1245 683 : assert(is_select(rel->op));
1246 683 : v->changes++;
1247 65958 : } else if (!can_pushup_left && can_pushup_right) {
1248 14 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1249 14 : nrel->r = r->l;
1250 14 : rel = rel_inplace_select(rel, nrel, r->exps);
1251 14 : assert(is_select(rel->op));
1252 14 : v->changes++;
1253 : }
1254 : }
1255 : }
1256 1243070 : return rel;
1257 : }
1258 :
1259 : static int
1260 93400 : sql_class_base_score(visitor *v, sql_column *c, sql_subtype *t, bool equality_based)
1261 : {
1262 93400 : int de;
1263 :
1264 93400 : if (!t)
1265 : return 0;
1266 93400 : switch (ATOMstorage(t->type->localtype)) {
1267 : case TYPE_bte:
1268 : return 150 - 8;
1269 1672 : case TYPE_sht:
1270 1672 : return 150 - 16;
1271 37312 : case TYPE_int:
1272 37312 : return 150 - 32;
1273 516 : case TYPE_void:
1274 : case TYPE_lng:
1275 516 : return 150 - 64;
1276 : case TYPE_uuid:
1277 : #ifdef HAVE_HGE
1278 : case TYPE_hge:
1279 : #endif
1280 : return 150 - 128;
1281 2 : case TYPE_flt:
1282 2 : return 75 - 24;
1283 : case TYPE_dbl:
1284 : return 75 - 53;
1285 30566 : default:
1286 30566 : if (equality_based && c && v->storage_based_opt && (de = mvc_is_duplicate_eliminated(v->sql, c)))
1287 1554 : return 150 - de * 8;
1288 : /* strings and blobs not duplicate eliminated don't get any points here */
1289 : return 0;
1290 : }
1291 : }
1292 :
1293 : static int
1294 34501 : score_se_base(visitor *v, sql_rel *rel, sql_exp *e)
1295 : {
1296 34501 : int res = 0;
1297 34501 : sql_subtype *t = exp_subtype(e);
1298 34501 : sql_column *c = NULL;
1299 :
1300 34501 : if (e->type == e_convert) /* keep unsafes at the end (TODO improve) */
1301 : return -1000;
1302 : /* can we find out if the underlying table is sorted */
1303 33900 : if ((c = exp_find_column(rel, e, -2)) && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1304 33900 : res += 600;
1305 :
1306 : /* prefer the shorter var types over the longer ones */
1307 33900 : res += sql_class_base_score(v, c, t, is_equality_or_inequality_exp(e->flag)); /* smaller the type, better */
1308 33900 : return res;
1309 : }
1310 :
1311 : static int
1312 58490 : score_se(visitor *v, sql_rel *rel, sql_exp *e)
1313 : {
1314 58490 : int score = 0;
1315 58490 : if (e->type == e_cmp && !is_complex_exp(e->flag)) {
1316 34501 : sql_exp *l = e->l;
1317 :
1318 34501 : while (l->type == e_cmp) { /* go through nested comparisons */
1319 2 : sql_exp *ll;
1320 :
1321 2 : if (l->flag == cmp_filter || l->flag == cmp_or)
1322 0 : ll = ((list*)l->l)->h->data;
1323 : else
1324 2 : ll = l->l;
1325 2 : if (ll->type != e_cmp)
1326 : break;
1327 : l = ll;
1328 : }
1329 34501 : score += score_se_base(v, rel, l);
1330 : }
1331 58490 : score += exp_keyvalue(e);
1332 58490 : return score;
1333 : }
1334 :
1335 : static inline sql_rel *
1336 1243070 : rel_select_order(visitor *v, sql_rel *rel)
1337 : {
1338 1243070 : int *scores = NULL;
1339 1243070 : sql_exp **exps = NULL;
1340 :
1341 1243070 : if (is_select(rel->op) && list_length(rel->exps) > 1) {
1342 27368 : node *n;
1343 27368 : int i, nexps = list_length(rel->exps);
1344 27368 : scores = SA_NEW_ARRAY(v->sql->ta, int, nexps);
1345 27368 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, nexps);
1346 :
1347 85858 : for (i = 0, n = rel->exps->h; n; i++, n = n->next) {
1348 58520 : exps[i] = n->data;
1349 58520 : if (find_prop(exps[i]->p, PROP_HASHCOL))
1350 : return rel;
1351 58490 : scores[i] = score_se(v, rel, n->data);
1352 : }
1353 27338 : GDKqsort(scores, exps, NULL, nexps, sizeof(int), sizeof(void *), TYPE_int, true, true);
1354 :
1355 85828 : for (i = 0, n = rel->exps->h; n; i++, n = n->next)
1356 58490 : n->data = exps[i];
1357 : }
1358 :
1359 : return rel;
1360 : }
1361 :
1362 : /* Compute the efficiency of using this expression early in a group by list */
1363 : static int
1364 59500 : score_gbe(visitor *v, sql_rel *rel, sql_exp *e)
1365 : {
1366 59500 : int res = 0;
1367 59500 : sql_subtype *t = exp_subtype(e);
1368 59500 : sql_column *c = exp_find_column(rel, e, -2);
1369 :
1370 59500 : if (e->card == CARD_ATOM) /* constants are trivial to group */
1371 38 : res += 1000;
1372 : /* can we find out if the underlying table is sorted */
1373 59500 : if (is_unique(e) || find_prop(e->p, PROP_HASHCOL) || (c && v->storage_based_opt && mvc_is_unique(v->sql, c))) /* distinct columns */
1374 3585 : res += 700;
1375 38911 : if (c && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1376 3712 : res += 500;
1377 59500 : if (find_prop(e->p, PROP_HASHIDX)) /* has hash index */
1378 0 : res += 200;
1379 :
1380 : /* prefer the shorter var types over the longer ones */
1381 59500 : res += sql_class_base_score(v, c, t, true); /* smaller the type, better */
1382 59500 : return res;
1383 : }
1384 :
1385 : /* reorder group by expressions */
1386 : static inline sql_rel *
1387 1243070 : rel_groupby_order(visitor *v, sql_rel *rel)
1388 : {
1389 1243070 : int *scores = NULL;
1390 1243070 : sql_exp **exps = NULL;
1391 :
1392 1243070 : if (is_groupby(rel->op) && list_length(rel->r) > 1) {
1393 25587 : node *n;
1394 25587 : list *gbe = rel->r;
1395 25587 : int i, ngbe = list_length(gbe);
1396 25587 : scores = SA_NEW_ARRAY(v->sql->ta, int, ngbe);
1397 25587 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, ngbe);
1398 :
1399 : /* first sorting step, give priority for integers and sorted columns */
1400 85087 : for (i = 0, n = gbe->h; n; i++, n = n->next) {
1401 59500 : exps[i] = n->data;
1402 59500 : scores[i] = score_gbe(v, rel, exps[i]);
1403 : }
1404 25587 : GDKqsort(scores, exps, NULL, ngbe, sizeof(int), sizeof(void *), TYPE_int, true, true);
1405 :
1406 : /* second sorting step, give priority to strings with lower number of digits */
1407 50503 : for (i = ngbe - 1; i && !scores[i]; i--); /* find expressions with no score from the first round */
1408 25587 : if (scores[i])
1409 24589 : i++;
1410 25587 : if (ngbe - i > 1) {
1411 8604 : for (int j = i; j < ngbe; j++) {
1412 6597 : sql_subtype *t = exp_subtype(exps[j]);
1413 6597 : scores[j] = t ? t->digits : 0;
1414 : }
1415 : /* the less number of digits the better, order ascending */
1416 2007 : GDKqsort(scores + i, exps + i, NULL, ngbe - i, sizeof(int), sizeof(void *), TYPE_int, false, true);
1417 : }
1418 :
1419 85087 : for (i = 0, n = gbe->h; n; i++, n = n->next)
1420 59500 : n->data = exps[i];
1421 : }
1422 :
1423 1243071 : return rel;
1424 : }
1425 :
1426 : /* This optimization loop contains optimizations that can potentially use statistics */
1427 : static sql_rel *
1428 1243070 : rel_final_optimization_loop_(visitor *v, sql_rel *rel)
1429 : {
1430 : /* Run rel_push_select_up only once at the end to avoid an infinite optimization loop */
1431 1243070 : rel = rel_push_select_up(v, rel);
1432 1243070 : rel = rel_select_order(v, rel);
1433 :
1434 : /* TODO? Maybe later add rel_simplify_count, rel_join2semijoin, rel_simplify_fk_joins,
1435 : rel_distinct_project2groupby, rel_simplify_predicates, rel_simplify_math,
1436 : rel_distinct_aggregate_on_unique_values */
1437 :
1438 1243070 : rel = rel_groupby_order(v, rel);
1439 1243071 : return rel;
1440 : }
1441 :
1442 : static sql_rel *
1443 65961 : rel_final_optimization_loop(visitor *v, global_props *gp, sql_rel *rel)
1444 : {
1445 65961 : (void) gp;
1446 65961 : return rel_visitor_bottomup(v, rel, &rel_final_optimization_loop_);
1447 : }
1448 :
1449 : run_optimizer
1450 718455 : bind_final_optimization_loop(visitor *v, global_props *gp)
1451 : {
1452 718455 : int flag = v->sql->sql_optimizer;
1453 : /* At the moment, this optimizer has dependency on 3 flags */
1454 549537 : return gp->opt_level == 1 && (gp->cnt[op_groupby] || gp->cnt[op_select]) &&
1455 784418 : (flag & push_select_up) && (flag & optimize_select_and_joins_topdown) && (flag & optimize_projections) ? rel_final_optimization_loop : NULL;
1456 : }
|