Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "rel_optimizer_private.h"
15 : #include "rel_statistics.h"
16 : #include "rel_basetable.h"
17 : #include "rel_rewriter.h"
18 :
19 : static sql_exp *
20 3462441 : comparison_find_column(sql_exp *input, sql_exp *e)
21 : {
22 3462441 : switch (input->type) {
23 98657 : case e_convert: {
24 98657 : list *types = (list *)input->r;
25 98657 : sql_class from = ((sql_subtype*)types->h->data)->type->eclass, to = ((sql_subtype*)types->h->next->data)->type->eclass;
26 98657 : if (from == to)
27 190249 : return comparison_find_column(input->l, e) ? input : NULL;
28 : return NULL;
29 : }
30 3054183 : case e_column:
31 3054183 : return exp_match(e, input) ? input : NULL;
32 : default:
33 : return NULL;
34 : }
35 : }
36 :
37 : /* multi lo <= col <= hi, maybe still be false even if lo or hi are NULL, possibly similar for filter on multiple
38 : * columns */
39 : #define comp_single_column(c) (!c->f)
40 :
41 : static sql_exp *
42 8755041 : rel_propagate_column_ref_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
43 : {
44 8838479 : assert(e->type == e_column);
45 8838479 : if (rel) {
46 8838400 : switch(rel->op) {
47 4087909 : case op_left:
48 : case op_right:
49 : case op_full:
50 : case op_join:
51 : case op_select:
52 : case op_anti:
53 : case op_semi: {
54 4087909 : bool found_without_semantics = false, found_left = false, found_right = false, still_unique = false;
55 :
56 4087909 : if ((is_innerjoin(rel->op) || is_select(rel->op)) && list_length(rel->exps) == 1 && exp_is_false(rel->exps->h->data))
57 : return NULL; /* nothing will pass, skip */
58 :
59 : /* propagate from the bottom first */
60 4072354 : if (rel_propagate_column_ref_statistics(sql, rel->l, e))
61 : found_left = true;
62 2592158 : if (!found_left && is_join(rel->op) && rel_propagate_column_ref_statistics(sql, rel->r, e))
63 4072354 : found_right = true;
64 :
65 4072354 : if (!found_left && !found_right)
66 : return NULL;
67 1776344 : if (!list_empty(rel->exps) && rel->op != op_anti) { /* if there's an or, the MIN and MAX get difficult to propagate */
68 3494160 : for (node *n = rel->exps->h ; n ; n = n->next) {
69 1809020 : sql_exp *comp = n->data, *le = comp->l, *lne = NULL, *re = comp->r, *rne = NULL, *fe = comp->f, *fne = NULL;
70 :
71 1809020 : if (comp->type == e_cmp) {
72 1808018 : if (is_theta_exp(comp->flag) && ((lne = comparison_find_column(le, e)) || (rne = comparison_find_column(re, e)) || (fe && (fne = comparison_find_column(fe, e))))) {
73 124979 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
74 124979 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
75 :
76 : /* not semantics found or if explicitly filtering not null values from the column */
77 124979 : found_without_semantics |= (!is_semantics(comp) && comp_single_column(comp)) || (comp->flag == cmp_equal && lne && is_anti(comp) && exp_is_null(re));
78 124979 : still_unique |= comp->flag == cmp_equal && is_unique(le) && is_unique(re); /* unique if only equi-joins on unique columns are there */
79 124979 : if (is_full(rel->op) || (is_left(rel->op) && found_left) || (is_right(rel->op) && found_right)) /* on outer joins, min and max cannot be propagated on some cases */
80 19789 : continue;
81 : /* if (end2 >= start1 && start2 <= end1) then the 2 intervals are intersected */
82 105190 : if (fe && lval_min && lval_max) { /* range case, the middle expression must intersect the other two */
83 4622 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
84 4622 : int int1 = rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0,
85 4622 : int2 = fval_min && fval_max && atom_cmp(fval_max, lval_min) >= 0 && atom_cmp(fval_min, lval_max) <= 0,
86 4622 : symmetric = is_symmetric(comp);
87 :
88 4622 : if (is_anti(comp) || (!symmetric && fval_min && rval_max && atom_cmp(fval_min, rval_max) < 0)) /* for asymmetric case the re range must be after the fe range */
89 1817 : continue;
90 2805 : if (lne && int1 && int2) {
91 104 : if (symmetric) {
92 0 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
93 0 : atom *nmin = statistics_atom_min(sql, rval_min, fval_min), *nmax = statistics_atom_max(sql, rval_max, fval_max);
94 : /* max is min from le and (max from re and fe max) */
95 0 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, nmax, p2->value.pval) : nmax);
96 : /* min is max from le and (min from re and fe min) */
97 0 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, nmin, p1->value.pval) : nmin);
98 : } else {
99 104 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
100 : /* max is min from le and fe max */
101 104 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, fval_max, p2->value.pval) : fval_max);
102 : /* min is max from le and re min */
103 104 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, rval_min, p1->value.pval) : rval_min);
104 : }
105 2701 : } else if (rne) {
106 590 : if (symmetric && int1 && int2) { /* min is max from le and (min from re and fe min) */
107 0 : prop *p = find_prop(e->p, PROP_MIN);
108 0 : atom *nmin = p ? statistics_atom_min(sql, p->value.pval, fval_min) : fval_min;
109 0 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, nmin, lval_min));
110 590 : } else if (int1) { /* min is max from le and re min */
111 100 : prop *p = find_prop(e->p, PROP_MIN);
112 100 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
113 : }
114 2111 : } else if (fne) {
115 549 : if (symmetric && int1 && int2) { /* max is min from le and (max from re and fe max) */
116 0 : prop *p = find_prop(e->p, PROP_MAX);
117 0 : atom *nmax = p ? statistics_atom_max(sql, p->value.pval, rval_max) : rval_max;
118 0 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, nmax, lval_max) : nmax);
119 549 : } else if (int2) { /* max is min from le and fe max */
120 266 : prop *p = find_prop(e->p, PROP_MAX);
121 266 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
122 : }
123 : }
124 100568 : } else if (lval_min && lval_max && rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0) {
125 : /* both min and max must be set and the intervals must overlap */
126 42997 : switch (comp->flag) {
127 29001 : case cmp_equal: /* for equality reduce */
128 29001 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_max(sql, lval_max, rval_max) : statistics_atom_min(sql, lval_max, rval_max));
129 29001 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_min(sql, lval_min, rval_min) : statistics_atom_max(sql, lval_min, rval_min));
130 29001 : break;
131 4680 : case cmp_notequal: /* for inequality expand */
132 4680 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_min(sql, lval_max, rval_max) : statistics_atom_max(sql, lval_max, rval_max));
133 4680 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_max(sql, lval_min, rval_min) : statistics_atom_min(sql, lval_min, rval_min));
134 4680 : break;
135 5310 : case cmp_gt:
136 : case cmp_gte:
137 9682 : if (!is_anti(comp) && lne) { /* min is max from both min */
138 4372 : prop *p = find_prop(e->p, PROP_MIN);
139 4372 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, rval_min, p->value.pval) : rval_min);
140 938 : } else if (!is_anti(comp)) { /* max is min from both max */
141 938 : prop *p = find_prop(e->p, PROP_MAX);
142 938 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
143 : }
144 : break;
145 4006 : case cmp_lt:
146 : case cmp_lte:
147 7205 : if (!is_anti(comp) && lne) { /* max is min from both max */
148 3199 : prop *p = find_prop(e->p, PROP_MAX);
149 3199 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, rval_max, p->value.pval) : rval_max);
150 807 : } else if (!is_anti(comp)) { /* min is max from both min */
151 807 : prop *p = find_prop(e->p, PROP_MIN);
152 807 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
153 : }
154 : break;
155 : default: /* Maybe later I can do cmp_in and cmp_notin */
156 : break;
157 : }
158 : }
159 : }
160 : }
161 : }
162 : }
163 1776344 : if (is_full(rel->op) || (is_left(rel->op) && found_right) || (is_right(rel->op) && found_left))
164 38046 : set_has_nil(e);
165 1776344 : if (!is_outerjoin(rel->op) && found_without_semantics) /* at an outer join, null values pass */
166 94131 : set_has_no_nil(e);
167 1776344 : if (is_join(rel->op) && is_unique(e) && !still_unique)
168 119293 : set_not_unique(e);
169 1776344 : return e;
170 : }
171 4647692 : case op_table:
172 : case op_basetable:
173 : case op_union:
174 : case op_except:
175 : case op_inter:
176 : case op_munion:
177 : case op_project:
178 : case op_groupby: {
179 4647692 : sql_exp *found;
180 4647692 : atom *fval;
181 4647692 : prop *est;
182 4647692 : if ((found = rel_find_exp(rel, e))) {
183 2184376 : if (rel->op != op_table) { /* At the moment don't propagate statistics for table relations */
184 2140055 : if ((fval = find_prop_and_get(found->p, PROP_MAX)))
185 1135955 : set_minmax_property(sql, e, PROP_MAX, fval);
186 2140054 : if ((fval = find_prop_and_get(found->p, PROP_MIN)))
187 1142841 : set_minmax_property(sql, e, PROP_MIN, fval);
188 2140053 : if (!has_nil(found))
189 1381933 : set_has_no_nil(e);
190 2140053 : if (is_unique(found) || (need_distinct(rel) && list_length(rel->exps) == 1) ||
191 1728783 : (is_groupby(rel->op) && (list_empty(rel->r) || (list_length(rel->r) == 1 && exps_find_exp(rel->r, e)))))
192 422151 : set_unique(e);
193 : /* propagate unique estimation for known cases */
194 2140053 : if (is_groupby(rel->op) && list_empty(rel->r) && !find_prop(e->p, PROP_NUNIQUES)) { /* global aggregate case */
195 7543 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
196 7543 : p->value.dval = 1;
197 2132511 : } else if (((is_basetable(rel->op) || is_except(rel->op) || is_inter(rel->op) || is_simple_project(rel->op) ||
198 66651 : (is_groupby(rel->op) && exps_find_exp(rel->r, e))) &&
199 2074797 : (est = find_prop(found->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES))) {
200 191761 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
201 191761 : p->value.dval = est->value.dval;
202 : }
203 : }
204 2184374 : return e;
205 : }
206 : return NULL;
207 : }
208 83438 : case op_topn:
209 : case op_sample:
210 83438 : return rel_propagate_column_ref_statistics(sql, rel->l, e);
211 : default:
212 : break;
213 : }
214 : }
215 : return NULL;
216 : }
217 :
218 : static atom *
219 4541634 : atom_from_valptr( allocator *sa, sql_subtype *tpe, ValRecord *v)
220 : {
221 4541634 : atom *a = SA_NEW(sa, atom);
222 :
223 4541557 : assert(!VALisnil(v));
224 4541459 : *a = (atom) {.tpe = *tpe,};
225 4541459 : SA_VALcopy(sa, &a->data, v);
226 4541410 : return a;
227 : }
228 :
229 : void
230 4178676 : sql_column_get_statistics(mvc *sql, sql_column *c, sql_exp *e)
231 : {
232 4178676 : bool nonil = false, unique = false;
233 4178676 : double unique_est = 0.0;
234 4178676 : ValRecord min, max;
235 4178676 : int ok = mvc_col_stats(sql, c, &nonil, &unique, &unique_est, &min, &max);
236 :
237 4178713 : if (has_nil(e) && nonil)
238 2753738 : set_has_no_nil(e);
239 4178713 : if (!is_unique(e) && unique)
240 1106385 : set_unique(e);
241 4178713 : if (unique_est != 0.0) {
242 2934438 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
243 2934440 : p->value.dval = unique_est;
244 : }
245 4178715 : unsigned int digits = 0;
246 4178715 : sql_subtype *et = exp_subtype(e);
247 4178689 : if (et->type->eclass == EC_DEC || et->type->eclass == EC_NUM)
248 2712118 : digits = et->digits;
249 4178689 : if ((ok & 2) == 2) {
250 2268014 : if (!VALisnil(&max)) {
251 2268010 : prop *p = e->p = prop_create(sql->sa, PROP_MAX, e->p);
252 2267977 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &max);
253 2267922 : if (digits) {
254 1687120 : unsigned int nd = atom_digits(p->value.pval);
255 1687197 : if (nd < digits)
256 : digits = nd;
257 1687197 : if (!digits)
258 : digits = 1;
259 : }
260 : }
261 2268003 : VALclear(&max);
262 : }
263 4178660 : if ((ok & 1) == 1) {
264 2273835 : if (!VALisnil(&min)) {
265 2273829 : prop *p = e->p = prop_create(sql->sa, PROP_MIN, e->p);
266 2273819 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &min);
267 2273791 : if (digits) {
268 1694071 : unsigned int nd = atom_digits(p->value.pval);
269 1694070 : if (nd > digits) /* possibly set to low by max value */
270 : digits = nd;
271 : if (!digits)
272 : digits = 1;
273 : }
274 : }
275 2273790 : VALclear(&min);
276 : }
277 4178603 : if (digits)
278 2712043 : et->digits = digits;
279 4178603 : if (et->type->eclass == EC_DEC && et->digits <= et->scale)
280 216 : et->digits = et->scale + 1;
281 4178603 : }
282 :
283 : static void
284 938764 : rel_basetable_column_get_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
285 : {
286 938764 : if (e->p)
287 : return;
288 303111 : sql_column *c = NULL;
289 :
290 303111 : if ((c = rel_base_find_column(rel, e->nid))) {
291 204912 : sql_column_get_statistics(sql, c, e);
292 : }
293 : }
294 :
295 : static bool
296 8872 : rel_setop_get_statistics(mvc *sql, sql_rel *rel, list *lexps, list *rexps, sql_exp *e, int i)
297 : {
298 8872 : sql_exp *le = list_fetch(lexps, i), *re = list_fetch(rexps, i);
299 8872 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
300 8872 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
301 8872 : prop *est;
302 :
303 : /* for the intersection, if both expressions don't overlap, it can be pruned */
304 8872 : if (is_inter(rel->op) && !has_nil(le) && !has_nil(re) &&
305 511 : ((rval_max && lval_min && atom_cmp(rval_max, lval_min) < 0) || (rval_min && lval_max && atom_cmp(rval_min, lval_max) > 0)))
306 26 : return true;
307 :
308 8846 : if (lval_max && rval_max) {
309 2557 : if (is_union(rel->op))
310 3 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
311 2554 : else if (is_inter(rel->op))
312 399 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_min(sql, lval_max, rval_max)); /* for intersect the new max will be the min of the two */
313 : else /* except */
314 2155 : set_minmax_property(sql, e, PROP_MAX, lval_max);
315 : }
316 8846 : if (lval_min && rval_min) {
317 2557 : if (is_union(rel->op))
318 3 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
319 2554 : else if (is_inter(rel->op))
320 399 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, lval_min, rval_min)); /* for intersect the new min will be the max of the two */
321 : else /* except */
322 2155 : set_minmax_property(sql, e, PROP_MIN, lval_min);
323 : }
324 :
325 8846 : if (is_union(rel->op) || is_munion(rel->op)) {
326 5986 : if (!has_nil(le) && !has_nil(re))
327 879 : set_has_no_nil(e);
328 5986 : if (need_distinct(rel) && list_length(rel->exps) == 1)
329 0 : set_unique(e);
330 2860 : } else if (is_inter(rel->op)) {
331 564 : if (!has_nil(le) || !has_nil(re))
332 509 : set_has_no_nil(e);
333 564 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
334 345 : set_unique(e);
335 : } else {
336 2296 : assert(is_except(rel->op));
337 2296 : if (!has_nil(le))
338 2233 : set_has_no_nil(e);
339 2296 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
340 2009 : set_unique(e);
341 : }
342 : /* propagate unique estimation for known cases */
343 8846 : if (!is_union(rel->op) && (est = find_prop(le->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES)) {
344 205 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
345 205 : p->value.dval = est->value.dval;
346 : }
347 : return false;
348 : }
349 :
350 :
351 : static void
352 60852 : rel_munion_get_statistics(mvc *sql, sql_rel *rel, list *rels, sql_exp *e, int i)
353 : {
354 60852 : assert(is_munion(rel->op));
355 :
356 60852 : sql_rel *l = rels->h->data;
357 60852 : sql_exp *le = list_fetch(l->exps, i);
358 60852 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX);
359 60852 : bool has_nonil = !has_nil(le);
360 :
361 178024 : for(node *n = rels->h->next; n; n = n->next) {
362 117172 : sql_rel *r = n->data;
363 117172 : sql_exp *re = list_fetch(r->exps, i);
364 117172 : atom *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
365 :
366 117172 : if (lval_max && rval_max) {
367 43136 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
368 43136 : lval_max = find_prop_and_get(e->p, PROP_MAX);
369 : }
370 117172 : if (lval_min && rval_min) {
371 42591 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
372 42591 : lval_min = find_prop_and_get(e->p, PROP_MIN);
373 : }
374 117172 : has_nonil &= !has_nil(re);
375 :
376 : }
377 :
378 60852 : if (has_nonil)
379 41583 : set_has_no_nil(e);
380 :
381 60852 : if (need_distinct(rel) && list_length(rel->exps) == 1)
382 1594 : set_unique(e);
383 60852 : }
384 :
385 :
386 : static sql_exp *
387 3490965 : rel_propagate_statistics(visitor *v, sql_rel *rel, sql_exp *e, int depth)
388 : {
389 3490965 : mvc *sql = v->sql;
390 3490965 : atom *lval;
391 :
392 3490965 : (void) depth;
393 3490965 : switch(e->type) {
394 2190207 : case e_column:
395 2190207 : switch (rel->op) { /* set relations don't call rel_propagate_statistics */
396 279481 : case op_join:
397 : case op_left:
398 : case op_right:
399 : case op_full:
400 : case op_semi:
401 : case op_anti: {
402 279481 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e);
403 279481 : if (!found)
404 140185 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
405 : break;
406 : }
407 1910726 : case op_select:
408 : case op_project:
409 : case op_groupby: {
410 1910726 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e); /* labels may be found on the same projection, ugh */
411 1910728 : if (!found && is_simple_project(rel->op))
412 125491 : (void) rel_propagate_column_ref_statistics(sql, rel, e);
413 : break;
414 : }
415 0 : case op_insert:
416 : case op_update:
417 : case op_delete:
418 0 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
419 0 : break;
420 : default:
421 : break;
422 : }
423 : break;
424 99314 : case e_convert: {
425 99314 : sql_subtype *to = exp_totype(e), *from = exp_fromtype(e);
426 99314 : sql_exp *l = e->l;
427 99314 : sql_class fr = from->type->eclass, too = to->type->eclass;
428 99314 : prop *est;
429 :
430 99314 : if (fr == too) {
431 90242 : if ((lval = find_prop_and_get(l->p, PROP_MAX))) {
432 59460 : atom *res = atom_copy(sql->sa, lval);
433 59460 : if ((res = atom_cast(sql->sa, res, to)))
434 59437 : set_minmax_property(sql, e, PROP_MAX, res);
435 : }
436 90242 : if ((lval = find_prop_and_get(l->p, PROP_MIN))) {
437 60077 : atom *res = atom_copy(sql->sa, lval);
438 60077 : if ((res = atom_cast(sql->sa, res, to)))
439 60054 : set_minmax_property(sql, e, PROP_MIN, res);
440 : }
441 : }
442 : /* propagating nuniques through conversions is tricky. I am adding just the general cases */
443 99314 : if ((est = find_prop(l->p, PROP_NUNIQUES)) &&
444 61694 : (((fr == EC_TIME || fr == EC_TIME_TZ || fr == EC_DATE) && (too == EC_TIMESTAMP || too == EC_TIMESTAMP_TZ)) ||
445 61669 : ((fr == EC_NUM || fr == EC_BIT) && too == EC_NUM) || (fr == EC_DEC && too == EC_DEC) || (fr != EC_EXTERNAL && too == EC_STRING))) {
446 58242 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
447 58242 : p->value.dval = est->value.dval;
448 : }
449 99314 : if (!has_nil(l))
450 56039 : set_has_no_nil(e);
451 : break;
452 : }
453 342131 : case e_aggr:
454 : case e_func: {
455 342131 : BUN lv;
456 342131 : sql_subfunc *f = e->f;
457 :
458 342131 : if (!f->func->s) {
459 315658 : int key = hash_key(f->func->base.name); /* Using hash lookup */
460 315658 : sql_hash_e *he = sql_functions_lookup->buckets[key&(sql_functions_lookup->size-1)];
461 315658 : lookup_function look = NULL;
462 :
463 689297 : for (; he && !look; he = he->chain) {
464 373639 : struct function_properties* fp = (struct function_properties*) he->value;
465 :
466 373639 : if (!strcmp(f->func->base.name, fp->name))
467 107852 : look = fp->func;
468 : }
469 315658 : if (look)
470 107852 : look(sql, e);
471 : }
472 : /* for global aggregates with no semantics, if the left relation has values, then the output won't be NULL */
473 342131 : if (!is_semantics(e) && e->l && !have_nil(e->l) &&
474 89264 : (e->type != e_aggr || (is_groupby(rel->op) && list_length(rel->r)) || ((lv = get_rel_count(rel->l)) != BUN_NONE && lv > 0)))
475 88937 : set_has_no_nil(e);
476 : /* set properties for global aggregates */
477 342131 : if (e->type == e_aggr && is_groupby(rel->op) && list_empty(rel->r)) {
478 7876 : if (!find_prop(e->p, PROP_NUNIQUES)) {
479 7876 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
480 7876 : p->value.dval = 1;
481 : }
482 7876 : set_unique(e);
483 : }
484 : break;
485 : }
486 573626 : case e_atom:
487 573626 : if (e->l) {
488 555925 : atom *a = (atom*) e->l;
489 555925 : if (!a->isnull) {
490 492419 : set_minmax_property(sql, e, PROP_MAX, a);
491 492419 : set_minmax_property(sql, e, PROP_MIN, a);
492 : }
493 555925 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
494 555925 : p->value.dval = 1;
495 17701 : } else if (e->f) {
496 4176 : list *vals = (list *) e->f;
497 4176 : sql_exp *first = vals->h ? vals->h->data : NULL;
498 4176 : atom *max = NULL, *min = NULL; /* all child values must have a valid min/max */
499 4176 : int has_nil = 0;
500 :
501 4176 : if (first) {
502 4176 : max = ((lval = find_prop_and_get(first->p, PROP_MAX))) ? lval : NULL;
503 4176 : min = ((lval = find_prop_and_get(first->p, PROP_MIN))) ? lval : NULL;
504 4176 : has_nil |= has_nil(first);
505 : }
506 :
507 17949 : for (node *n = vals->h ? vals->h->next : NULL ; n ; n = n->next) {
508 9597 : sql_exp *ee = n->data;
509 :
510 9597 : if (min && max) {
511 9104 : if ((lval = find_prop_and_get(ee->p, PROP_MAX))) {
512 9058 : max = atom_cmp(lval, max) > 0 ? lval : max;
513 : } else {
514 : max = NULL;
515 : }
516 9104 : if ((lval = find_prop_and_get(ee->p, PROP_MIN))) {
517 9058 : min = atom_cmp(min, lval) > 0 ? lval : min;
518 : } else {
519 : min = NULL;
520 : }
521 : }
522 9597 : has_nil |= has_nil(ee);
523 : }
524 :
525 4176 : if (!has_nil)
526 3806 : set_has_no_nil(e);
527 4176 : if (min && max) {
528 3764 : set_minmax_property(sql, e, PROP_MAX, max);
529 3764 : set_minmax_property(sql, e, PROP_MIN, min);
530 : }
531 4176 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
532 4176 : p->value.dval = (dbl) list_length(vals);
533 : } else {
534 13525 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
535 13525 : p->value.dval = 1;
536 : }
537 : break;
538 285687 : case e_cmp:
539 : /* TODO? propagating min/max/unique of booleans is not very worth it */
540 285687 : if (e->flag == cmp_or || e->flag == cmp_filter) {
541 17436 : if (!have_nil(e->l) && !have_nil(e->r))
542 12966 : set_has_no_nil(e);
543 268251 : } else if (e->flag == cmp_in || e->flag == cmp_notin) {
544 21798 : sql_exp *le = e->l;
545 21798 : if (!has_nil(le) && !have_nil(e->r))
546 18734 : set_has_no_nil(e);
547 : } else {
548 246453 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
549 246453 : if (!has_nil(le) && !has_nil(re) && (!e->f || !has_nil(fe)))
550 176318 : set_has_no_nil(e);
551 : }
552 : break;
553 : case e_psm:
554 : break;
555 : }
556 :
557 : #ifndef NDEBUG
558 : {
559 : /* min and max cannot be NULL and min must be <= than max, if it happens the inner relation must be empty! */
560 3490967 : atom *min = find_prop_and_get(e->p, PROP_MIN), *max = find_prop_and_get(e->p, PROP_MAX);
561 :
562 3490966 : (void) min;
563 3490966 : (void) max;
564 3490966 : assert(!min || !min->isnull);
565 3490966 : assert(!max || !max->isnull);
566 3490966 : assert(!min || !max || (min && max)/* atom_cmp(min, max) <= 0*/);
567 : }
568 : #endif
569 3490966 : return e;
570 : }
571 :
572 : static list * /* Remove predicates always false from min/max values */
573 139300 : rel_prune_predicates(visitor *v, sql_rel *rel)
574 : {
575 139300 : if (rel->l) {
576 139300 : sql_rel *l = rel->l;
577 139300 : if (is_single(l))
578 3397 : return rel->exps;
579 : }
580 135903 : if (!list_empty(rel->attr))
581 2963 : return rel->exps;
582 282008 : for (node *n = rel->exps->h ; n ; n = n->next) {
583 149068 : sql_exp *e = n->data;
584 :
585 149068 : if (e->type == e_cmp && is_theta_exp(e->flag)) {
586 122668 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
587 122668 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
588 122668 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
589 122668 : bool always_false = false, always_true = false;
590 :
591 122668 : if (fe && !is_symmetric(e)) {
592 3058 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
593 3058 : comp_type lower = range2lcompare(e->flag), higher = range2rcompare(e->flag);
594 3660 : int not_int1 = rval_min && lval_max && /* the middle and left intervals don't overlap */
595 1663 : (!is_anti(e) && (lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0)),
596 4072 : not_int2 = lval_min && fval_max && /* the middle and right intervals don't overlap */
597 2485 : (!is_anti(e) && (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0)),
598 3058 : not_int3 = rval_min && fval_max && /* the left interval is after the right one */
599 1287 : (!is_anti(e) && (atom_cmp(rval_min, fval_max) > 0));
600 :
601 3058 : always_false |= not_int1 || not_int2 || not_int3;
602 : /* for anti the middle must be before the left or after the right or the right after the left, for the other the middle must be always between the left and right intervals */
603 4113 : always_true |= !has_nil(le) && !has_nil(re) && !has_nil(fe) &&
604 3957 : lval_min && lval_max && rval_min && rval_max && fval_min && fval_max &&
605 575 : (is_anti(e) ? ((lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0) || (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0) || atom_cmp(rval_min, fval_max) > 0) :
606 488 : ((lower == cmp_gte ? atom_cmp(lval_min, rval_max) >= 0 : atom_cmp(lval_min, rval_max) > 0) && (higher == cmp_lte ? atom_cmp(fval_min, lval_max) >= 0 : atom_cmp(fval_min, lval_max) > 0)));
607 : } else if (!fe) {
608 119592 : if (!is_semantics(e)) /* trivial not null cmp null case */
609 227754 : always_false |= !is_anti(e) && ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re)));
610 119592 : switch (e->flag) {
611 104211 : case cmp_equal:
612 104211 : if (lval_min && lval_max && rval_min && rval_max && (!is_semantics(e) || !has_nil(le) || !has_nil(re)))
613 133468 : always_false |= (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
614 104211 : if (is_semantics(e)) { /* prune *= NULL cases */
615 5686 : always_false |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
616 11372 : always_true |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
617 : }
618 : break;
619 7250 : case cmp_notequal:
620 7250 : if (lval_min && lval_max && rval_min && rval_max)
621 11354 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
622 7250 : if (is_semantics(e)) {
623 29 : always_true |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
624 58 : always_false |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
625 : }
626 : break;
627 5487 : case cmp_gt:
628 5487 : if (lval_max && rval_min)
629 1833 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
630 5487 : if (lval_min && rval_max)
631 3666 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
632 : break;
633 119 : case cmp_gte:
634 119 : if (lval_max && rval_min)
635 49 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
636 119 : if (lval_min && rval_max)
637 98 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
638 : break;
639 2441 : case cmp_lt:
640 2441 : if (lval_min && rval_max)
641 1383 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
642 2441 : if (lval_max && rval_min)
643 2814 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
644 : break;
645 84 : case cmp_lte:
646 84 : if (lval_min && rval_max)
647 11 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
648 84 : if (lval_max && rval_min)
649 22 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
650 : break;
651 : default: /* Maybe later I can do cmp_in and cmp_notin but don't forget to remove is_theta_exp check up there */
652 : break;
653 : }
654 : }
655 122668 : assert(!always_false || !always_true);
656 122668 : if (always_false || always_true) {
657 3656 : sql_exp *ne = exp_atom_bool(v->sql->sa, always_true /* if always_true set then true else false */);
658 3656 : if (exp_name(e))
659 0 : exp_prop_alias(v->sql->sa, ne, e);
660 3656 : n->data = ne;
661 3656 : v->changes++;
662 : }
663 : }
664 : }
665 132940 : return rel->exps;
666 : }
667 :
668 : static sql_rel *
669 14 : set_setop_side(visitor *v, sql_rel *rel, sql_rel *side)
670 : {
671 14 : if (side == rel->r)
672 0 : rel->r = NULL;
673 : else
674 14 : rel->l = NULL;
675 14 : if (!is_simple_project(side->op) || rel_is_ref(side) || !list_empty(side->r)) {
676 0 : side = rel_project(v->sql->sa, side, rel_projections(v->sql, side, NULL, 0, 1));
677 0 : set_count_prop(v->sql->sa, side, get_rel_count(side->l));
678 0 : side->exps = exps_exp_visitor_bottomup(v, side, side->exps, 0, &rel_propagate_statistics, false);
679 : }
680 35 : for (node *n = rel->exps->h, *m = side->exps->h ; n && m ; n = n->next, m = m->next) {
681 21 : sql_exp *e1 = n->data, *e2 = m->data;
682 21 : exp_prop_alias(v->sql->sa, e2, e1);
683 : }
684 14 : list_hash_clear(side->exps);
685 :
686 14 : if (need_distinct(rel))
687 0 : set_distinct(side);
688 14 : side->p = prop_copy(v->sql->sa, rel->p);
689 14 : rel_destroy(rel);
690 14 : return side;
691 : }
692 :
693 : static BUN
694 21885 : trivial_project_exp_card(sql_exp *e)
695 : {
696 22188 : if (e->type == e_convert)
697 303 : return trivial_project_exp_card(e->l);
698 21885 : return e->type == e_atom && e->f ? (BUN) list_length(e->f) : 1;
699 : }
700 :
701 : static BUN
702 24248 : rel_calc_nuniques(mvc *sql, sql_rel *l, list *exps)
703 : {
704 24248 : BUN lv = get_rel_count(l);
705 :
706 24248 : if (lv == 0)
707 : return 0;
708 21539 : if (!list_empty(exps)) {
709 21539 : BUN nuniques = 0;
710 : /* compute the highest number of unique values */
711 51131 : for (node *n = exps->h ; n && nuniques != BUN_NONE ; n = n->next) {
712 29592 : sql_exp *e = n->data;
713 29592 : sql_rel *bt = NULL;
714 29592 : prop *p = NULL;
715 29592 : BUN euniques = BUN_NONE;
716 29592 : atom *min, *max, *sub = NULL;
717 29592 : sql_subtype *tp = exp_subtype(e);
718 29592 : sql_class ec = tp ? tp->type->eclass : EC_STRING; /* if 'e' has no type (eg parameter), use a non-number type to fail condition */
719 :
720 29592 : if ((p = find_prop(e->p, PROP_NUNIQUES))) {
721 20956 : euniques = (BUN) p->value.dval;
722 8636 : } else if (e->type == e_column && e->nid && rel_find_exp_and_corresponding_rel(l, e, false, &bt, NULL) && bt && (p = find_prop(bt->p, PROP_COUNT))) {
723 6593 : euniques = (BUN) p->value.lval;
724 : }
725 : /* use min to max range to compute number of possible values in the domain for number types */
726 29592 : if ((EC_TEMP(ec)||ec==EC_NUM||ec==EC_MONTH||ec==EC_POS) &&
727 22363 : (min = find_prop_and_get(e->p, PROP_MIN)) && (max = find_prop_and_get(e->p, PROP_MAX))) {
728 : /* the range includes min and max, so the atom_inc call is needed */
729 : /* if 'euniques' has number of distinct values, compute min between both */
730 17528 : if ((sub = atom_sub(sql->sa, max, min)) && (sub = atom_inc(sql->sa, sub)) && (sub = atom_cast(sql->sa, sub, sql_bind_localtype("oid"))))
731 0 : euniques = MIN(euniques, (BUN) sub->data.val.oval);
732 : }
733 29592 : if (euniques != BUN_NONE)
734 27549 : nuniques = MAX(nuniques, euniques); /* the highest cardinality sets the estimation */
735 : else
736 : nuniques = BUN_NONE;
737 : }
738 21539 : if (nuniques != BUN_NONE)
739 : return nuniques;
740 : }
741 : return lv;
742 : }
743 :
744 : static sql_rel *
745 2077220 : rel_get_statistics_(visitor *v, sql_rel *rel)
746 : {
747 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
748 2077220 : uint8_t has_special_modify = *(uint8_t*) v->data;
749 2077220 : bool can_be_pruned = !has_special_modify && v->storage_based_opt;
750 :
751 : /* Don't look at the same relation twice */
752 2077220 : if (are_statistics_gathered(rel->used))
753 : return rel;
754 1336015 : rel->used |= statistics_gathered;
755 :
756 1336015 : switch (rel->op) {
757 313444 : case op_basetable: {
758 313444 : sql_table *t = (sql_table *) rel->l;
759 313444 : sqlstore *store = v->sql->session->tr->store;
760 :
761 313444 : if (!list_empty(rel->exps)) {
762 1252158 : for (node *n = rel->exps->h ; n ; n = n->next)
763 938765 : rel_basetable_column_get_statistics(v->sql, rel, n->data);
764 : }
765 : /* Set table row count. TODO? look for remote tables. Don't look at storage for declared tables, because it won't be cleaned */
766 313442 : if (isTable(t) && t->s && !isDeclaredTable(t)) /* count active rows only */
767 259814 : set_count_prop(v->sql->sa, rel, (BUN)store->storage_api.count_col(v->sql->session->tr, ol_first_node(t->columns)->data, 10));
768 : break;
769 : }
770 2790 : case op_union:
771 : case op_inter:
772 : case op_except: {
773 2790 : bool empty_cross = false;
774 2790 : int i = 0;
775 2790 : sql_rel *l = rel->l, *pl = rel->l, *r = rel->r, *pr = rel->r;
776 :
777 2790 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
778 0 : pl = pl->l;
779 2790 : while (is_sample(pr->op) || is_topn(pr->op))
780 0 : pr = pr->l;
781 : /* if it's not a projection, then project and propagate statistics */
782 2790 : if (!is_project(pl->op) && !is_base(pl->op)) {
783 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
784 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
785 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
786 : }
787 2790 : if (!is_project(pr->op) && !is_base(pr->op)) {
788 0 : pr = rel_project(v->sql->sa, pr, rel_projections(v->sql, pr, NULL, 0, 1));
789 0 : set_count_prop(v->sql->sa, pr, get_rel_count(pr->l));
790 0 : pr->exps = exps_exp_visitor_bottomup(v, pr, pr->exps, 0, &rel_propagate_statistics, false);
791 : }
792 :
793 11662 : for (node *n = rel->exps->h ; n ; n = n->next) {
794 8872 : empty_cross |= rel_setop_get_statistics(v->sql, rel, pl->exps, pr->exps, n->data, i);
795 8872 : i++;
796 : }
797 :
798 : /* propagate row count */
799 2790 : if (is_union(rel->op)) {
800 277 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
801 277 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
802 :
803 277 : if (lv == 0 && rv == 0) { /* both sides empty */
804 2 : if (can_be_pruned)
805 : empty_cross = true;
806 : else
807 2 : set_count_prop(v->sql->sa, rel, 0);
808 275 : } else if (can_be_pruned && lv == 0 && !rel_is_ref(rel)) { /* left side empty */
809 0 : rel = set_setop_side(v, rel, r);
810 0 : empty_cross = false; /* don't rewrite again */
811 275 : } else if (can_be_pruned && rv == 0 && !rel_is_ref(rel)) { /* right side empty */
812 0 : rel = set_setop_side(v, rel, l);
813 0 : empty_cross = false; /* don't rewrite again */
814 275 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
815 7 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
816 : }
817 2513 : } else if (is_inter(rel->op) || is_except(rel->op)) {
818 2513 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
819 2513 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
820 :
821 2513 : if (lv == 0) { /* left side empty */
822 62 : if (can_be_pruned)
823 : empty_cross = true;
824 : else
825 5 : set_count_prop(v->sql->sa, rel, 0);
826 2451 : } else if (rv == 0) { /* right side empty */
827 17 : if (is_inter(rel->op)) {
828 3 : if (can_be_pruned)
829 : empty_cross = true;
830 : else
831 0 : set_count_prop(v->sql->sa, rel, 0);
832 14 : } else if (can_be_pruned && !rel_is_ref(rel)) {
833 14 : rel = set_setop_side(v, rel, l);
834 14 : empty_cross = false; /* don't rewrite again */
835 : } else {
836 0 : set_count_prop(v->sql->sa, rel, lv);
837 : }
838 : } else {
839 2434 : set_count_prop(v->sql->sa, rel, lv);
840 : }
841 : }
842 2790 : if (can_be_pruned && empty_cross) {
843 80 : rel_destroy(rel->l);
844 80 : rel->l = NULL;
845 80 : rel_destroy(rel->r);
846 80 : rel->r = NULL;
847 244 : for (node *n = rel->exps->h ; n ; n = n->next) {
848 164 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
849 164 : exp_prop_alias(v->sql->sa, a, e);
850 164 : n->data = a;
851 : }
852 80 : list_hash_clear(rel->exps);
853 80 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
854 80 : set_count_prop(v->sql->sa, l, 1);
855 80 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
856 80 : set_count_prop(v->sql->sa, l, 0);
857 80 : rel->op = op_project;
858 80 : rel->l = l;
859 80 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
860 80 : set_count_prop(v->sql->sa, rel, 0);
861 80 : set_nodistinct(rel); /* set relations may have distinct flag set */
862 80 : v->changes++;
863 : }
864 : break;
865 : }
866 12888 : case op_munion: {
867 12888 : list *l = rel->l, *nrels = sa_list(v->sql->sa);
868 12888 : BUN cnt = 0;
869 12888 : bool needs_pruning = false;
870 :
871 49497 : for (node *n = l->h; n; n = n->next) {
872 36609 : sql_rel *r = n->data, *pl = r;
873 :
874 36609 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
875 0 : pl = pl->l;
876 : /* if it's not a projection, then project and propagate statistics */
877 36609 : if (!is_project(pl->op) && !is_base(pl->op)) {
878 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
879 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
880 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
881 : }
882 36609 : nrels = append(nrels, pl);
883 : /* we need new munion statistics */
884 : /* propagate row count */
885 36609 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
886 : /* if PROP_COUNT does not exist we assume at least a row (see get_rel_count def) */
887 36609 : if (rv == BUN_NONE) {
888 1172 : cnt++;
889 1172 : continue;
890 : }
891 35437 : if (!rv && can_be_pruned)
892 6625 : needs_pruning = true;
893 : /* overflow check */
894 35437 : if (rv > (BUN_MAX - cnt))
895 36609 : rv = BUN_MAX;
896 : else
897 35437 : cnt += rv;
898 : }
899 12888 : int i = 0;
900 73740 : for (node *n = rel->exps->h ; n ; n = n->next, i++)
901 60852 : rel_munion_get_statistics(v->sql, rel, nrels, n->data, i);
902 :
903 12888 : if (needs_pruning && !rel_is_ref(rel)) {
904 4464 : v->changes++;
905 4464 : list *nl = sa_list(l->sa);
906 :
907 16465 : for (node *n = nrels->h; n; n = n->next) {
908 12001 : sql_rel *r = n->data;
909 12001 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
910 :
911 12001 : if (!rv) { /* keep last for now */
912 6155 : rel_destroy(r);
913 6155 : continue;
914 : }
915 5846 : nl = append(nl, r);
916 : }
917 4464 : rel->l = nl;
918 4464 : if (list_length(nl) == 1) {
919 4138 : sql_rel *l = rel->l = nl->h->data; /* ugh */
920 4138 : rel->r = NULL;
921 4138 : rel->op = op_project;
922 :
923 20331 : for (node *n = rel->exps->h, *m = l->exps->h ; n && m ; n = n->next, m = m->next) {
924 16193 : sql_exp *pe = n->data, *ie = m->data;
925 16193 : sql_exp *ne = exp_ref(v->sql, ie);
926 16193 : exp_prop_alias(v->sql->sa, ne, pe);
927 16193 : n->data = ne;
928 : }
929 4138 : list_hash_clear(rel->exps);
930 326 : } else if (list_empty(nl)) {
931 : /* empty select (project [ nils ] ) */
932 454 : for (node *n = rel->exps->h ; n ; n = n->next) {
933 354 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
934 354 : exp_prop_alias(v->sql->sa, a, e);
935 354 : n->data = a;
936 : }
937 100 : list_hash_clear(rel->exps);
938 100 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
939 100 : set_count_prop(v->sql->sa, l, 1);
940 100 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
941 100 : set_count_prop(v->sql->sa, l, 0);
942 100 : rel->op = op_project;
943 100 : rel->r = NULL;
944 100 : rel->l = l;
945 100 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
946 100 : set_count_prop(v->sql->sa, rel, 0);
947 100 : set_nodistinct(rel); /* set relations may have distinct flag set */
948 : }
949 : } else {
950 8424 : set_count_prop(v->sql->sa, rel, cnt);
951 : }
952 : break;
953 : }
954 533575 : case op_join:
955 : case op_left:
956 : case op_right:
957 : case op_full:
958 : case op_semi:
959 : case op_anti:
960 : case op_select:
961 : case op_project:
962 : case op_groupby: {
963 533575 : if (is_groupby(rel->op) && !list_empty(rel->r))
964 15531 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
965 533575 : rel->exps = exps_exp_visitor_bottomup(v, rel, rel->exps, 0, &rel_propagate_statistics, false);
966 533576 : if ((is_simple_project(rel->op) || is_groupby(rel->op)) && !list_empty(rel->r))
967 39284 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
968 : /* The following optimizations can only be applied after propagating the statistics to rel->exps */
969 533576 : if (can_be_pruned && (is_join(rel->op) || is_select(rel->op)) && !list_empty(rel->exps)) {
970 139300 : int changes = v->changes;
971 139300 : rel->exps = rel_prune_predicates(v, rel);
972 139300 : if (v->changes > changes)
973 3623 : rel = rewrite_simplify(v, 0, v->value_based_opt, rel);
974 : }
975 :
976 : /* propagate row count */
977 533576 : sql_rel *l = rel->l, *r = rel->r;
978 533576 : switch (rel->op) {
979 134701 : case op_join:
980 : case op_left:
981 : case op_right:
982 : case op_full: {
983 134701 : BUN lv = get_rel_count(l), rv = get_rel_count(r), uniques_estimate = BUN_MAX, join_idx_estimate = BUN_MAX;
984 :
985 134701 : if (!list_empty(rel->exps) && !is_single(rel)) {
986 246427 : for (node *n = rel->exps->h ; n ; n = n->next) {
987 125796 : sql_exp *e = n->data, *el = e->l, *er = e->r;
988 :
989 125796 : if (find_prop(e->p, PROP_JOINIDX)) {
990 667 : join_idx_estimate = lv>rv?lv:rv;
991 125129 : } else if (e->type == e_cmp && e->flag == cmp_equal) {
992 : /* if one of the sides is unique, the cardinality will be that exact number, but look for nulls */
993 121245 : if (!is_semantics(e) || !has_nil(el) || !has_nil(er)) {
994 120849 : BUN lu = 0, ru = 0;
995 120849 : prop *p = NULL;
996 120849 : if ((p = find_prop(el->p, PROP_NUNIQUES)))
997 89685 : lu = (BUN) p->value.dval;
998 120849 : if ((p = find_prop(er->p, PROP_NUNIQUES)))
999 104063 : ru = (BUN) p->value.dval;
1000 120849 : if (is_unique(el) || lu > lv) {
1001 73801 : BUN ncount = (is_right(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : lv;
1002 73801 : uniques_estimate = MIN(uniques_estimate, ncount);
1003 47048 : } else if (is_unique(er) || ru > rv) {
1004 29410 : BUN ncount = (is_left(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : rv;
1005 29410 : uniques_estimate = MIN(uniques_estimate, ncount);
1006 : }
1007 : }
1008 : }
1009 : }
1010 : }
1011 134701 : if (is_single(rel)) {
1012 4737 : set_count_prop(v->sql->sa, rel, lv);
1013 129964 : } else if (join_idx_estimate != BUN_MAX) {
1014 665 : set_count_prop(v->sql->sa, rel, join_idx_estimate);
1015 129299 : } else if (uniques_estimate != BUN_MAX) {
1016 96747 : set_count_prop(v->sql->sa, rel, uniques_estimate);
1017 32552 : } else if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1018 : /* corner cases for outer joins */
1019 126 : if (is_left(rel->op)) {
1020 114 : set_count_prop(v->sql->sa, rel, lv);
1021 12 : } else if (is_right(rel->op)) {
1022 3 : set_count_prop(v->sql->sa, rel, rv);
1023 9 : } else if (is_full(rel->op) && lv != BUN_NONE && rv != BUN_NONE) {
1024 9 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
1025 0 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1026 0 : set_count_prop(v->sql->sa, rel, 0);
1027 : }
1028 32426 : } else if (lv == 0) {
1029 1166 : set_count_prop(v->sql->sa, rel, (is_right(rel->op) || is_full(rel->op)) ? rv : 0);
1030 31829 : } else if (rv == 0) {
1031 1568 : set_count_prop(v->sql->sa, rel, (is_left(rel->op) || is_full(rel->op)) ? lv : 0);
1032 30755 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1033 18287 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX / lv)) ? BUN_MAX : (lv * rv)); /* overflow check */
1034 : }
1035 : break;
1036 : }
1037 2639 : case op_anti:
1038 2639 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1039 2639 : break;
1040 108641 : case op_semi:
1041 : case op_select:
1042 : /* TODO calculate cardinalities using selectivities */
1043 108641 : if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1044 2745 : set_count_prop(v->sql->sa, rel, 0);
1045 : } else {
1046 208552 : if (!list_empty(rel->exps) && !is_single(rel)) {
1047 102656 : BUN cnt = get_rel_count(l), u = 1;
1048 171084 : for (node *n = rel->exps->h ; n ; n = n->next) {
1049 112474 : sql_exp *e = n->data, *el = e->l, *er = e->r;
1050 :
1051 : /* simple expressions first */
1052 112474 : if (e->type == e_cmp && e->flag == cmp_equal && exp_is_atom(er)) {
1053 : /* use selectivity */
1054 57247 : prop *p;
1055 57247 : if ((p = find_prop(el->p, PROP_NUNIQUES))) {
1056 44046 : u = (BUN) p->value.dval;
1057 44046 : break;
1058 : }
1059 : }
1060 : }
1061 : /* u is an *estimate*, so don't set count_prop to 0 unless cnt is 0 */
1062 102656 : set_count_prop(v->sql->sa, rel, cnt == 0 ? 0 : u == 0 || u > cnt ? 1 : cnt/u);
1063 : } else {
1064 3240 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1065 : }
1066 : }
1067 : break;
1068 264224 : case op_project:
1069 264224 : if (l) {
1070 254107 : if (need_distinct(rel)) {
1071 0 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->exps));
1072 : } else {
1073 254107 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1074 : }
1075 : } else {
1076 10117 : BUN card = 1;
1077 :
1078 10117 : if (!list_empty(rel->exps)) {
1079 30673 : for (node *n = rel->exps->h ; n ; n = n->next) {
1080 20556 : sql_exp *e = n->data;
1081 20556 : card = MAX(card, trivial_project_exp_card(e));
1082 : }
1083 : }
1084 10117 : set_count_prop(v->sql->sa, rel, card);
1085 : }
1086 : break;
1087 22957 : case op_groupby:
1088 22957 : if (list_empty(rel->r)) {
1089 7425 : set_count_prop(v->sql->sa, rel, 1);
1090 : } else {
1091 15532 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->r));
1092 : }
1093 : break;
1094 : default:
1095 : break;
1096 : }
1097 : break;
1098 : }
1099 16859 : case op_topn: {
1100 16859 : BUN lv = get_rel_count(rel->l);
1101 :
1102 16859 : if (lv != BUN_NONE) {
1103 16842 : sql_exp *le = rel->exps->h->data, *oe = list_length(rel->exps) > 1 ? rel->exps->h->next->data : NULL;
1104 96 : if (oe && oe->l && exp_is_not_null(oe)) { /* no parameters */
1105 96 : BUN offset = (BUN) ((atom*)oe->l)->data.val.lval;
1106 96 : lv = offset >= lv ? 0 : lv - offset;
1107 : }
1108 16842 : if (le->l && exp_is_not_null(le)) {
1109 16804 : BUN limit = (BUN) ((atom*)le->l)->data.val.lval;
1110 16804 : lv = MIN(lv, limit);
1111 : }
1112 16842 : set_count_prop(v->sql->sa, rel, lv);
1113 : }
1114 : break;
1115 : }
1116 22 : case op_sample: {
1117 22 : BUN lv = get_rel_count(rel->l);
1118 :
1119 22 : if (lv != BUN_NONE) {
1120 4 : sql_exp *se = rel->exps->h->data;
1121 4 : sql_subtype *tp = exp_subtype(se);
1122 :
1123 4 : if (se->l && tp->type->eclass == EC_NUM) { /* sample is a number of rows */
1124 4 : BUN sample = (BUN) ((atom*)se->l)->data.val.lval;
1125 4 : lv = MIN(lv, sample);
1126 0 : } else if (se->l) { /* sample is a percentage of rows */
1127 0 : dbl percent = ((atom*)se->l)->data.val.dval;
1128 0 : assert(tp->type->eclass == EC_FLT);
1129 0 : lv = (BUN) ceil((dbl)lv * percent);
1130 : }
1131 4 : set_count_prop(v->sql->sa, rel, lv);
1132 : }
1133 : break;
1134 : }
1135 5973 : case op_table: {
1136 5973 : sql_exp *op = rel->r;
1137 5973 : if (rel->flag != TRIGGER_WRAPPER && op) {
1138 5661 : sql_subfunc *f = op->f;
1139 5661 : if (f->func->lang == FUNC_LANG_SQL) {
1140 97 : set_count_prop(v->sql->sa, rel, 1000 /* just some fallback value */);
1141 5564 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "storage") == 0) {
1142 829 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of catalog */);
1143 4735 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "db_users") == 0) {
1144 0 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of users */);
1145 4735 : } else if (f->func->lang == FUNC_LANG_MAL && strncmp(f->func->base.name, "querylog", 8) == 0) {
1146 1085 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of querylog */);
1147 3650 : } else if (f->func->lang == FUNC_LANG_MAL &&
1148 3540 : (strcmp(f->func->base.name, "queue") == 0 ||
1149 3276 : strcmp(f->func->base.name, "optimizers") == 0 ||
1150 2964 : strcmp(f->func->base.name, "env") == 0 ||
1151 2672 : strcmp(f->func->base.name, "keywords") == 0 ||
1152 2672 : strcmp(f->func->base.name, "statistics") == 0 ||
1153 2005 : strcmp(f->func->base.name, "rejects") == 0 ||
1154 1751 : strcmp(f->func->base.name, "schemastorage") == 0 ||
1155 1751 : strncmp(f->func->base.name, "storage", 7) == 0 ||
1156 1751 : strcmp(f->func->base.name, "sessions") == 0) ) {
1157 2079 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of queue */);
1158 : }
1159 : /* else {
1160 : printf("%%func needs stats : %s\n", f->func->base.name);
1161 : } */
1162 : }
1163 : break;
1164 : }
1165 : /*These relations are less important for now
1166 : TODO later we can tune it */
1167 : case op_insert:
1168 : case op_update:
1169 : case op_delete:
1170 : case op_truncate:
1171 : case op_ddl:
1172 : default:
1173 : break;
1174 : }
1175 :
1176 : return rel;
1177 : }
1178 :
1179 : static sql_rel *
1180 542363 : rel_get_statistics(visitor *v, global_props *gp, sql_rel *rel)
1181 : {
1182 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
1183 542363 : uint8_t has_special_modify = (uint8_t) gp->has_special_modify;
1184 542363 : v->data = &has_special_modify;
1185 542363 : rel = rel_visitor_bottomup(v, rel, &rel_get_statistics_);
1186 542365 : v->data = gp;
1187 542365 : return rel;
1188 : }
1189 :
1190 : run_optimizer
1191 718870 : bind_get_statistics(visitor *v, global_props *gp)
1192 : {
1193 718870 : (void) v;
1194 718870 : return (gp->opt_level == 1 && !gp->cnt[op_insert]) ? rel_get_statistics : NULL;
1195 : }
1196 :
1197 :
1198 : static bool
1199 94932 : point_select_on_unique_column(sql_rel *rel)
1200 : {
1201 94932 : if (is_select(rel->op) && !list_empty(rel->exps)) {
1202 131381 : for (node *n = rel->exps->h; n ; n = n->next) {
1203 75654 : sql_exp *e = n->data, *el = e->l, *er = e->r, *found = NULL;
1204 :
1205 75654 : if (is_compare(e->type) && e->flag == cmp_equal) {
1206 33625 : if (is_numeric_upcast(el))
1207 0 : el = el->l;
1208 33625 : if (is_numeric_upcast(er))
1209 0 : er = er->l;
1210 33625 : if (is_alias(el->type) && exp_is_atom(er) && (found = rel_find_exp(rel->l, el)) &&
1211 749 : is_unique(found) && (!is_semantics(e) || !has_nil(found) || !has_nil(er)))
1212 : return true;
1213 32876 : if (is_alias(er->type) && exp_is_atom(el) && (found = rel_find_exp(rel->l, er)) &&
1214 0 : is_unique(found) && (!is_semantics(e) || !has_nil(el) || !has_nil(found)))
1215 : return true;
1216 : }
1217 : }
1218 : }
1219 : return false;
1220 : }
1221 :
1222 : /*
1223 : * A point select on an unique column reduces the number of rows to 1. If the same select is under a
1224 : * join, the opposite side's select can be pushed above the join.
1225 : */
1226 : static inline sql_rel *
1227 1246758 : rel_push_select_up(visitor *v, sql_rel *rel)
1228 : {
1229 1246758 : if ((is_innerjoin(rel->op) || is_left(rel->op) || is_right(rel->op) || is_semi(rel->op)) && !is_single(rel)) {
1230 254263 : sql_rel *l = rel->l, *r = rel->r;
1231 254263 : bool can_pushup_left = is_select(l->op) && !rel_is_ref(l) && !is_single(l) && (is_innerjoin(rel->op) || is_left(rel->op) || is_semi(rel->op)),
1232 254263 : can_pushup_right = is_select(r->op) && !rel_is_ref(r) && !is_single(r) && (is_innerjoin(rel->op) || is_right(rel->op));
1233 :
1234 254263 : if (can_pushup_left || can_pushup_right) {
1235 66931 : if (can_pushup_left)
1236 45442 : can_pushup_left = point_select_on_unique_column(r);
1237 66931 : if (can_pushup_right)
1238 49490 : can_pushup_right = point_select_on_unique_column(l);
1239 :
1240 : /* if both selects retrieve one row each, it's not worth it to push both up */
1241 66931 : if (can_pushup_left && !can_pushup_right) {
1242 683 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1243 683 : nrel->l = l->l;
1244 683 : rel = rel_inplace_select(rel, nrel, l->exps);
1245 683 : assert(is_select(rel->op));
1246 683 : v->changes++;
1247 66248 : } else if (!can_pushup_left && can_pushup_right) {
1248 14 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1249 14 : nrel->r = r->l;
1250 14 : rel = rel_inplace_select(rel, nrel, r->exps);
1251 14 : assert(is_select(rel->op));
1252 14 : v->changes++;
1253 : }
1254 : }
1255 : }
1256 1246758 : return rel;
1257 : }
1258 :
1259 : static int
1260 93332 : sql_class_base_score(visitor *v, sql_column *c, sql_subtype *t, bool equality_based)
1261 : {
1262 93332 : int de;
1263 :
1264 93332 : if (!t)
1265 : return 0;
1266 93332 : switch (ATOMstorage(t->type->localtype)) {
1267 : case TYPE_bte:
1268 : return 150 - 8;
1269 1599 : case TYPE_sht:
1270 1599 : return 150 - 16;
1271 37333 : case TYPE_int:
1272 37333 : return 150 - 32;
1273 518 : case TYPE_void:
1274 : case TYPE_lng:
1275 518 : return 150 - 64;
1276 : case TYPE_uuid:
1277 : #ifdef HAVE_HGE
1278 : case TYPE_hge:
1279 : #endif
1280 : return 150 - 128;
1281 2 : case TYPE_flt:
1282 2 : return 75 - 24;
1283 : case TYPE_dbl:
1284 : return 75 - 53;
1285 30582 : default:
1286 30582 : if (equality_based && c && v->storage_based_opt && (de = mvc_is_duplicate_eliminated(v->sql, c)))
1287 1554 : return 150 - de * 8;
1288 : /* strings and blobs not duplicate eliminated don't get any points here */
1289 : return 0;
1290 : }
1291 : }
1292 :
1293 : static int
1294 34433 : score_se_base(visitor *v, sql_rel *rel, sql_exp *e)
1295 : {
1296 34433 : int res = 0;
1297 34433 : sql_subtype *t = exp_subtype(e);
1298 34433 : sql_column *c = NULL;
1299 :
1300 34433 : if (e->type == e_convert) /* keep unsafes at the end (TODO improve) */
1301 : return -1000;
1302 : /* can we find out if the underlying table is sorted */
1303 33832 : if ((c = exp_find_column(rel, e, -2)) && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1304 33832 : res += 600;
1305 :
1306 : /* prefer the shorter var types over the longer ones */
1307 33832 : res += sql_class_base_score(v, c, t, is_equality_or_inequality_exp(e->flag)); /* smaller the type, better */
1308 33832 : return res;
1309 : }
1310 :
1311 : static int
1312 58443 : score_se(visitor *v, sql_rel *rel, sql_exp *e)
1313 : {
1314 58443 : int score = 0;
1315 58443 : if (e->type == e_cmp && !is_complex_exp(e->flag)) {
1316 34433 : sql_exp *l = e->l;
1317 :
1318 34433 : while (l->type == e_cmp) { /* go through nested comparisons */
1319 2 : sql_exp *ll;
1320 :
1321 2 : if (l->flag == cmp_filter || l->flag == cmp_or)
1322 0 : ll = ((list*)l->l)->h->data;
1323 : else
1324 2 : ll = l->l;
1325 2 : if (ll->type != e_cmp)
1326 : break;
1327 : l = ll;
1328 : }
1329 34433 : score += score_se_base(v, rel, l);
1330 : }
1331 58443 : score += exp_keyvalue(e);
1332 58443 : return score;
1333 : }
1334 :
1335 : static inline sql_rel *
1336 1246758 : rel_select_order(visitor *v, sql_rel *rel)
1337 : {
1338 1246758 : int *scores = NULL;
1339 1246758 : sql_exp **exps = NULL;
1340 :
1341 1246758 : if (is_select(rel->op) && list_length(rel->exps) > 1) {
1342 27337 : node *n;
1343 27337 : int i, nexps = list_length(rel->exps);
1344 27337 : scores = SA_NEW_ARRAY(v->sql->ta, int, nexps);
1345 27337 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, nexps);
1346 :
1347 85780 : for (i = 0, n = rel->exps->h; n; i++, n = n->next) {
1348 58473 : exps[i] = n->data;
1349 58473 : if (find_prop(exps[i]->p, PROP_HASHCOL))
1350 : return rel;
1351 58443 : scores[i] = score_se(v, rel, n->data);
1352 : }
1353 27307 : GDKqsort(scores, exps, NULL, nexps, sizeof(int), sizeof(void *), TYPE_int, true, true);
1354 :
1355 85750 : for (i = 0, n = rel->exps->h; n; i++, n = n->next)
1356 58443 : n->data = exps[i];
1357 : }
1358 :
1359 : return rel;
1360 : }
1361 :
1362 : /* Compute the efficiency of using this expression early in a group by list */
1363 : static int
1364 59500 : score_gbe(visitor *v, sql_rel *rel, sql_exp *e)
1365 : {
1366 59500 : int res = 0;
1367 59500 : sql_subtype *t = exp_subtype(e);
1368 59500 : sql_column *c = exp_find_column(rel, e, -2);
1369 :
1370 59500 : if (e->card == CARD_ATOM) /* constants are trivial to group */
1371 38 : res += 1000;
1372 : /* can we find out if the underlying table is sorted */
1373 59500 : if (is_unique(e) || find_prop(e->p, PROP_HASHCOL) || (c && v->storage_based_opt && mvc_is_unique(v->sql, c))) /* distinct columns */
1374 3582 : res += 700;
1375 38911 : if (c && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1376 3712 : res += 500;
1377 59500 : if (find_prop(e->p, PROP_HASHIDX)) /* has hash index */
1378 0 : res += 200;
1379 :
1380 : /* prefer the shorter var types over the longer ones */
1381 59500 : res += sql_class_base_score(v, c, t, true); /* smaller the type, better */
1382 59500 : return res;
1383 : }
1384 :
1385 : /* reorder group by expressions */
1386 : static inline sql_rel *
1387 1246758 : rel_groupby_order(visitor *v, sql_rel *rel)
1388 : {
1389 1246758 : int *scores = NULL;
1390 1246758 : sql_exp **exps = NULL;
1391 :
1392 1246758 : if (is_groupby(rel->op) && list_length(rel->r) > 1) {
1393 25587 : node *n;
1394 25587 : list *gbe = rel->r;
1395 25587 : int i, ngbe = list_length(gbe);
1396 25587 : scores = SA_NEW_ARRAY(v->sql->ta, int, ngbe);
1397 25587 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, ngbe);
1398 :
1399 : /* first sorting step, give priority for integers and sorted columns */
1400 85087 : for (i = 0, n = gbe->h; n; i++, n = n->next) {
1401 59500 : exps[i] = n->data;
1402 59500 : scores[i] = score_gbe(v, rel, exps[i]);
1403 : }
1404 25587 : GDKqsort(scores, exps, NULL, ngbe, sizeof(int), sizeof(void *), TYPE_int, true, true);
1405 :
1406 : /* second sorting step, give priority to strings with lower number of digits */
1407 50503 : for (i = ngbe - 1; i && !scores[i]; i--); /* find expressions with no score from the first round */
1408 25587 : if (scores[i])
1409 24589 : i++;
1410 25587 : if (ngbe - i > 1) {
1411 8604 : for (int j = i; j < ngbe; j++) {
1412 6597 : sql_subtype *t = exp_subtype(exps[j]);
1413 6597 : scores[j] = t ? t->digits : 0;
1414 : }
1415 : /* the less number of digits the better, order ascending */
1416 2007 : GDKqsort(scores + i, exps + i, NULL, ngbe - i, sizeof(int), sizeof(void *), TYPE_int, false, true);
1417 : }
1418 :
1419 85087 : for (i = 0, n = gbe->h; n; i++, n = n->next)
1420 59500 : n->data = exps[i];
1421 : }
1422 :
1423 1246758 : return rel;
1424 : }
1425 :
1426 : /* This optimization loop contains optimizations that can potentially use statistics */
1427 : static sql_rel *
1428 1246758 : rel_final_optimization_loop_(visitor *v, sql_rel *rel)
1429 : {
1430 : /* Run rel_push_select_up only once at the end to avoid an infinite optimization loop */
1431 1246758 : rel = rel_push_select_up(v, rel);
1432 1246758 : rel = rel_select_order(v, rel);
1433 :
1434 : /* TODO? Maybe later add rel_simplify_count, rel_join2semijoin, rel_simplify_fk_joins,
1435 : rel_distinct_project2groupby, rel_simplify_predicates, rel_simplify_math,
1436 : rel_distinct_aggregate_on_unique_values */
1437 :
1438 1246758 : rel = rel_groupby_order(v, rel);
1439 1246758 : return rel;
1440 : }
1441 :
1442 : static sql_rel *
1443 66182 : rel_final_optimization_loop(visitor *v, global_props *gp, sql_rel *rel)
1444 : {
1445 66182 : (void) gp;
1446 66182 : return rel_visitor_bottomup(v, rel, &rel_final_optimization_loop_);
1447 : }
1448 :
1449 : run_optimizer
1450 718871 : bind_final_optimization_loop(visitor *v, global_props *gp)
1451 : {
1452 718871 : int flag = v->sql->sql_optimizer;
1453 : /* At the moment, this optimizer has dependency on 3 flags */
1454 549959 : return gp->opt_level == 1 && (gp->cnt[op_groupby] || gp->cnt[op_select]) &&
1455 785055 : (flag & push_select_up) && (flag & optimize_select_and_joins_topdown) && (flag & optimize_projections) ? rel_final_optimization_loop : NULL;
1456 : }
|