Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "rel_optimizer_private.h"
15 : #include "rel_statistics.h"
16 : #include "rel_basetable.h"
17 : #include "rel_rewriter.h"
18 :
19 : static sql_exp *
20 3465738 : comparison_find_column(sql_exp *input, sql_exp *e)
21 : {
22 3465738 : switch (input->type) {
23 98521 : case e_convert: {
24 98521 : list *types = (list *)input->r;
25 98521 : sql_class from = ((sql_subtype*)types->h->data)->type->eclass, to = ((sql_subtype*)types->h->next->data)->type->eclass;
26 98521 : if (from == to)
27 189974 : return comparison_find_column(input->l, e) ? input : NULL;
28 : return NULL;
29 : }
30 3055214 : case e_column:
31 3055214 : return exp_match(e, input) ? input : NULL;
32 : default:
33 : return NULL;
34 : }
35 : }
36 :
37 : /* multi lo <= col <= hi, maybe still be false even if lo or hi are NULL, possibly similar for filter on multiple
38 : * columns */
39 : #define comp_single_column(c) (!c->f)
40 :
41 : static sql_exp *
42 8749172 : rel_propagate_column_ref_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
43 : {
44 8832460 : assert(e->type == e_column);
45 8832460 : if (rel) {
46 8832381 : switch(rel->op) {
47 4090337 : case op_left:
48 : case op_right:
49 : case op_full:
50 : case op_join:
51 : case op_select:
52 : case op_anti:
53 : case op_semi: {
54 4090337 : bool found_without_semantics = false, found_left = false, found_right = false, still_unique = false;
55 :
56 4090337 : if ((is_innerjoin(rel->op) || is_select(rel->op)) && list_length(rel->exps) == 1 && exp_is_false(rel->exps->h->data))
57 : return NULL; /* nothing will pass, skip */
58 :
59 : /* propagate from the bottom first */
60 4074769 : if (rel_propagate_column_ref_statistics(sql, rel->l, e))
61 : found_left = true;
62 2592009 : if (!found_left && is_join(rel->op) && rel_propagate_column_ref_statistics(sql, rel->r, e))
63 4074769 : found_right = true;
64 :
65 4074769 : if (!found_left && !found_right)
66 : return NULL;
67 1778450 : if (!list_empty(rel->exps) && rel->op != op_anti) { /* if there's an or, the MIN and MAX get difficult to propagate */
68 3498022 : for (node *n = rel->exps->h ; n ; n = n->next) {
69 1810956 : sql_exp *comp = n->data, *le = comp->l, *lne = NULL, *re = comp->r, *rne = NULL, *fe = comp->f, *fne = NULL;
70 :
71 1810956 : if (comp->type == e_cmp) {
72 1809954 : if (is_theta_exp(comp->flag) && ((lne = comparison_find_column(le, e)) || (rne = comparison_find_column(re, e)) || (fe && (fne = comparison_find_column(fe, e))))) {
73 125227 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
74 125227 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
75 :
76 : /* not semantics found or if explicitly filtering not null values from the column */
77 125227 : found_without_semantics |= (!is_semantics(comp) && comp_single_column(comp)) || (comp->flag == cmp_equal && lne && is_anti(comp) && exp_is_null(re));
78 125227 : still_unique |= comp->flag == cmp_equal && is_unique(le) && is_unique(re); /* unique if only equi-joins on unique columns are there */
79 125227 : if (is_full(rel->op) || (is_left(rel->op) && found_left) || (is_right(rel->op) && found_right)) /* on outer joins, min and max cannot be propagated on some cases */
80 19800 : continue;
81 : /* if (end2 >= start1 && start2 <= end1) then the 2 intervals are intersected */
82 105427 : if (fe && lval_min && lval_max) { /* range case, the middle expression must intersect the other two */
83 4622 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
84 4622 : int int1 = rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0,
85 4622 : int2 = fval_min && fval_max && atom_cmp(fval_max, lval_min) >= 0 && atom_cmp(fval_min, lval_max) <= 0,
86 4622 : symmetric = is_symmetric(comp);
87 :
88 4622 : if (is_anti(comp) || (!symmetric && fval_min && rval_max && atom_cmp(fval_min, rval_max) < 0)) /* for asymmetric case the re range must be after the fe range */
89 1817 : continue;
90 2805 : if (lne && int1 && int2) {
91 104 : if (symmetric) {
92 0 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
93 0 : atom *nmin = statistics_atom_min(sql, rval_min, fval_min), *nmax = statistics_atom_max(sql, rval_max, fval_max);
94 : /* max is min from le and (max from re and fe max) */
95 0 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, nmax, p2->value.pval) : nmax);
96 : /* min is max from le and (min from re and fe min) */
97 0 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, nmin, p1->value.pval) : nmin);
98 : } else {
99 104 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
100 : /* max is min from le and fe max */
101 104 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, fval_max, p2->value.pval) : fval_max);
102 : /* min is max from le and re min */
103 104 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, rval_min, p1->value.pval) : rval_min);
104 : }
105 2701 : } else if (rne) {
106 590 : if (symmetric && int1 && int2) { /* min is max from le and (min from re and fe min) */
107 0 : prop *p = find_prop(e->p, PROP_MIN);
108 0 : atom *nmin = p ? statistics_atom_min(sql, p->value.pval, fval_min) : fval_min;
109 0 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, nmin, lval_min));
110 590 : } else if (int1) { /* min is max from le and re min */
111 100 : prop *p = find_prop(e->p, PROP_MIN);
112 100 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
113 : }
114 2111 : } else if (fne) {
115 549 : if (symmetric && int1 && int2) { /* max is min from le and (max from re and fe max) */
116 0 : prop *p = find_prop(e->p, PROP_MAX);
117 0 : atom *nmax = p ? statistics_atom_max(sql, p->value.pval, rval_max) : rval_max;
118 0 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, nmax, lval_max) : nmax);
119 549 : } else if (int2) { /* max is min from le and fe max */
120 266 : prop *p = find_prop(e->p, PROP_MAX);
121 266 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
122 : }
123 : }
124 100805 : } else if (lval_min && lval_max && rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0) {
125 : /* both min and max must be set and the intervals must overlap */
126 43365 : switch (comp->flag) {
127 29025 : case cmp_equal: /* for equality reduce */
128 29025 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_max(sql, lval_max, rval_max) : statistics_atom_min(sql, lval_max, rval_max));
129 29025 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_min(sql, lval_min, rval_min) : statistics_atom_max(sql, lval_min, rval_min));
130 29025 : break;
131 4683 : case cmp_notequal: /* for inequality expand */
132 4683 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_min(sql, lval_max, rval_max) : statistics_atom_max(sql, lval_max, rval_max));
133 4683 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_max(sql, lval_min, rval_min) : statistics_atom_min(sql, lval_min, rval_min));
134 4683 : break;
135 5651 : case cmp_gt:
136 : case cmp_gte:
137 10364 : if (!is_anti(comp) && lne) { /* min is max from both min */
138 4713 : prop *p = find_prop(e->p, PROP_MIN);
139 4713 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, rval_min, p->value.pval) : rval_min);
140 938 : } else if (!is_anti(comp)) { /* max is min from both max */
141 938 : prop *p = find_prop(e->p, PROP_MAX);
142 938 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
143 : }
144 : break;
145 4006 : case cmp_lt:
146 : case cmp_lte:
147 7205 : if (!is_anti(comp) && lne) { /* max is min from both max */
148 3199 : prop *p = find_prop(e->p, PROP_MAX);
149 3199 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, rval_max, p->value.pval) : rval_max);
150 807 : } else if (!is_anti(comp)) { /* min is max from both min */
151 807 : prop *p = find_prop(e->p, PROP_MIN);
152 807 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
153 : }
154 : break;
155 : default: /* Maybe later I can do cmp_in and cmp_notin */
156 : break;
157 : }
158 : }
159 : }
160 : }
161 : }
162 : }
163 1778450 : if (is_full(rel->op) || (is_left(rel->op) && found_right) || (is_right(rel->op) && found_left))
164 38039 : set_has_nil(e);
165 1778450 : if (!is_outerjoin(rel->op) && found_without_semantics) /* at an outer join, null values pass */
166 94368 : set_has_no_nil(e);
167 1778450 : if (is_join(rel->op) && is_unique(e) && !still_unique)
168 119007 : set_not_unique(e);
169 1778450 : return e;
170 : }
171 4639406 : case op_table:
172 : case op_basetable:
173 : case op_union:
174 : case op_except:
175 : case op_inter:
176 : case op_munion:
177 : case op_project:
178 : case op_groupby: {
179 4639406 : sql_exp *found;
180 4639406 : atom *fval;
181 4639406 : prop *est;
182 4639406 : if ((found = rel_find_exp(rel, e))) {
183 2176727 : if (rel->op != op_table) { /* At the moment don't propagate statistics for table relations */
184 2132415 : if ((fval = find_prop_and_get(found->p, PROP_MAX)))
185 1134144 : set_minmax_property(sql, e, PROP_MAX, fval);
186 2132410 : if ((fval = find_prop_and_get(found->p, PROP_MIN)))
187 1141590 : set_minmax_property(sql, e, PROP_MIN, fval);
188 2132412 : if (!has_nil(found))
189 1378788 : set_has_no_nil(e);
190 2132412 : if (is_unique(found) || (need_distinct(rel) && list_length(rel->exps) == 1) ||
191 1721761 : (is_groupby(rel->op) && (list_empty(rel->r) || (list_length(rel->r) == 1 && exps_find_exp(rel->r, e)))))
192 421526 : set_unique(e);
193 : /* propagate unique estimation for known cases */
194 2132411 : if (is_groupby(rel->op) && list_empty(rel->r) && !find_prop(e->p, PROP_NUNIQUES)) { /* global aggregate case */
195 7538 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
196 7538 : p->value.dval = 1;
197 2124873 : } else if (((is_basetable(rel->op) || is_except(rel->op) || is_inter(rel->op) || is_simple_project(rel->op) ||
198 66659 : (is_groupby(rel->op) && exps_find_exp(rel->r, e))) &&
199 2067282 : (est = find_prop(found->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES))) {
200 188052 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
201 188052 : p->value.dval = est->value.dval;
202 : }
203 : }
204 2176725 : return e;
205 : }
206 : return NULL;
207 : }
208 83288 : case op_topn:
209 : case op_sample:
210 83288 : return rel_propagate_column_ref_statistics(sql, rel->l, e);
211 : default:
212 : break;
213 : }
214 : }
215 : return NULL;
216 : }
217 :
218 : static atom *
219 4454183 : atom_from_valptr( allocator *sa, sql_subtype *tpe, ValRecord *v)
220 : {
221 4454183 : atom *a = SA_NEW(sa, atom);
222 :
223 4454173 : assert(!VALisnil(v));
224 4454208 : *a = (atom) {.tpe = *tpe,};
225 4454208 : SA_VALcopy(sa, &a->data, v);
226 4454245 : return a;
227 : }
228 :
229 : void
230 4060803 : sql_column_get_statistics(mvc *sql, sql_column *c, sql_exp *e)
231 : {
232 4060803 : bool nonil = false, unique = false;
233 4060803 : double unique_est = 0.0;
234 4060803 : ValRecord min, max;
235 4060803 : int ok = mvc_col_stats(sql, c, &nonil, &unique, &unique_est, &min, &max);
236 :
237 4061909 : if (has_nil(e) && nonil)
238 2656651 : set_has_no_nil(e);
239 4061909 : if (!is_unique(e) && unique)
240 1101113 : set_unique(e);
241 4061909 : if (unique_est != 0.0) {
242 2825317 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
243 2825182 : p->value.dval = unique_est;
244 : }
245 4061774 : unsigned int digits = 0;
246 4061774 : sql_subtype *et = exp_subtype(e);
247 4061936 : if (et->type->eclass == EC_DEC || et->type->eclass == EC_NUM)
248 2642404 : digits = et->digits;
249 4061936 : if ((ok & 2) == 2) {
250 2224087 : if (!VALisnil(&max)) {
251 2224065 : prop *p = e->p = prop_create(sql->sa, PROP_MAX, e->p);
252 2224042 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &max);
253 2223966 : if (digits) {
254 1656498 : unsigned int nd = atom_digits(p->value.pval);
255 1656420 : if (nd < digits)
256 : digits = nd;
257 1656420 : if (!digits)
258 : digits = 1;
259 : }
260 : }
261 2223838 : VALclear(&max);
262 : }
263 4061690 : if ((ok & 1) == 1) {
264 2230451 : if (!VALisnil(&min)) {
265 2230453 : prop *p = e->p = prop_create(sql->sa, PROP_MIN, e->p);
266 2230549 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &min);
267 2230660 : if (digits) {
268 1664245 : unsigned int nd = atom_digits(p->value.pval);
269 1664243 : if (nd > digits) /* possibly set to low by max value */
270 : digits = nd;
271 : if (!digits)
272 : digits = 1;
273 : }
274 : }
275 2230651 : VALclear(&min);
276 : }
277 4061903 : if (digits)
278 2642333 : et->digits = digits;
279 4061903 : if (et->type->eclass == EC_DEC && et->digits <= et->scale)
280 216 : et->digits = et->scale + 1;
281 4061903 : }
282 :
283 : static void
284 936304 : rel_basetable_column_get_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
285 : {
286 936304 : if (e->p)
287 : return;
288 302165 : sql_column *c = NULL;
289 :
290 302165 : if ((c = rel_base_find_column(rel, e->nid))) {
291 204279 : sql_column_get_statistics(sql, c, e);
292 : }
293 : }
294 :
295 : static bool
296 8876 : rel_setop_get_statistics(mvc *sql, sql_rel *rel, list *lexps, list *rexps, sql_exp *e, int i)
297 : {
298 8876 : sql_exp *le = list_fetch(lexps, i), *re = list_fetch(rexps, i);
299 8876 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
300 8876 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
301 8876 : prop *est;
302 :
303 : /* for the intersection, if both expressions don't overlap, it can be pruned */
304 8876 : if (is_inter(rel->op) && !has_nil(le) && !has_nil(re) &&
305 511 : ((rval_max && lval_min && atom_cmp(rval_max, lval_min) < 0) || (rval_min && lval_max && atom_cmp(rval_min, lval_max) > 0)))
306 26 : return true;
307 :
308 8850 : if (lval_max && rval_max) {
309 2557 : if (is_union(rel->op))
310 3 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
311 2554 : else if (is_inter(rel->op))
312 399 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_min(sql, lval_max, rval_max)); /* for intersect the new max will be the min of the two */
313 : else /* except */
314 2155 : set_minmax_property(sql, e, PROP_MAX, lval_max);
315 : }
316 8850 : if (lval_min && rval_min) {
317 2557 : if (is_union(rel->op))
318 3 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
319 2554 : else if (is_inter(rel->op))
320 399 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, lval_min, rval_min)); /* for intersect the new min will be the max of the two */
321 : else /* except */
322 2155 : set_minmax_property(sql, e, PROP_MIN, lval_min);
323 : }
324 :
325 8850 : if (is_union(rel->op) || is_munion(rel->op)) {
326 5990 : if (!has_nil(le) && !has_nil(re))
327 880 : set_has_no_nil(e);
328 5990 : if (need_distinct(rel) && list_length(rel->exps) == 1)
329 0 : set_unique(e);
330 2860 : } else if (is_inter(rel->op)) {
331 564 : if (!has_nil(le) || !has_nil(re))
332 509 : set_has_no_nil(e);
333 564 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
334 345 : set_unique(e);
335 : } else {
336 2296 : assert(is_except(rel->op));
337 2296 : if (!has_nil(le))
338 2233 : set_has_no_nil(e);
339 2296 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
340 2009 : set_unique(e);
341 : }
342 : /* propagate unique estimation for known cases */
343 8850 : if (!is_union(rel->op) && (est = find_prop(le->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES)) {
344 205 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
345 205 : p->value.dval = est->value.dval;
346 : }
347 : return false;
348 : }
349 :
350 :
351 : static void
352 60730 : rel_munion_get_statistics(mvc *sql, sql_rel *rel, list *rels, sql_exp *e, int i)
353 : {
354 60730 : assert(is_munion(rel->op));
355 :
356 60730 : sql_rel *l = rels->h->data;
357 60730 : sql_exp *le = list_fetch(l->exps, i);
358 60730 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX);
359 60730 : bool has_nonil = !has_nil(le);
360 :
361 175993 : for(node *n = rels->h->next; n; n = n->next) {
362 115263 : sql_rel *r = n->data;
363 115263 : sql_exp *re = list_fetch(r->exps, i);
364 115263 : atom *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
365 :
366 115263 : if (lval_max && rval_max) {
367 42591 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
368 42591 : lval_max = find_prop_and_get(e->p, PROP_MAX);
369 : }
370 115263 : if (lval_min && rval_min) {
371 42092 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
372 42092 : lval_min = find_prop_and_get(e->p, PROP_MIN);
373 : }
374 115263 : has_nonil &= !has_nil(re);
375 :
376 : }
377 :
378 60730 : if (has_nonil)
379 41579 : set_has_no_nil(e);
380 :
381 60730 : if (need_distinct(rel) && list_length(rel->exps) == 1)
382 1594 : set_unique(e);
383 60730 : }
384 :
385 :
386 : static sql_exp *
387 3480807 : rel_propagate_statistics(visitor *v, sql_rel *rel, sql_exp *e, int depth)
388 : {
389 3480807 : mvc *sql = v->sql;
390 3480807 : atom *lval;
391 :
392 3480807 : (void) depth;
393 3480807 : switch(e->type) {
394 2182554 : case e_column:
395 2182554 : switch (rel->op) { /* set relations don't call rel_propagate_statistics */
396 279093 : case op_join:
397 : case op_left:
398 : case op_right:
399 : case op_full:
400 : case op_semi:
401 : case op_anti: {
402 279093 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e);
403 279093 : if (!found)
404 139995 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
405 : break;
406 : }
407 1903461 : case op_select:
408 : case op_project:
409 : case op_groupby: {
410 1903461 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e); /* labels may be found on the same projection, ugh */
411 1903455 : if (!found && is_simple_project(rel->op))
412 125481 : (void) rel_propagate_column_ref_statistics(sql, rel, e);
413 : break;
414 : }
415 0 : case op_insert:
416 : case op_update:
417 : case op_delete:
418 0 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
419 0 : break;
420 : default:
421 : break;
422 : }
423 : break;
424 98487 : case e_convert: {
425 98487 : sql_subtype *to = exp_totype(e), *from = exp_fromtype(e);
426 98487 : sql_exp *l = e->l;
427 98487 : sql_class fr = from->type->eclass, too = to->type->eclass;
428 98487 : prop *est;
429 :
430 98487 : if (fr == too) {
431 89412 : if ((lval = find_prop_and_get(l->p, PROP_MAX))) {
432 58844 : atom *res = atom_copy(sql->sa, lval);
433 58844 : if ((res = atom_cast(sql->sa, res, to)))
434 58821 : set_minmax_property(sql, e, PROP_MAX, res);
435 : }
436 89412 : if ((lval = find_prop_and_get(l->p, PROP_MIN))) {
437 59465 : atom *res = atom_copy(sql->sa, lval);
438 59465 : if ((res = atom_cast(sql->sa, res, to)))
439 59442 : set_minmax_property(sql, e, PROP_MIN, res);
440 : }
441 : }
442 : /* propagating nuniques through conversions is tricky. I am adding just the general cases */
443 98487 : if ((est = find_prop(l->p, PROP_NUNIQUES)) &&
444 61117 : (((fr == EC_TIME || fr == EC_TIME_TZ || fr == EC_DATE) && (too == EC_TIMESTAMP || too == EC_TIMESTAMP_TZ)) ||
445 61092 : ((fr == EC_NUM || fr == EC_BIT) && too == EC_NUM) || (fr == EC_DEC && too == EC_DEC) || (fr != EC_EXTERNAL && too == EC_STRING))) {
446 57663 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
447 57663 : p->value.dval = est->value.dval;
448 : }
449 98487 : if (!has_nil(l))
450 55541 : set_has_no_nil(e);
451 : break;
452 : }
453 341722 : case e_aggr:
454 : case e_func: {
455 341722 : BUN lv;
456 341722 : sql_subfunc *f = e->f;
457 :
458 341722 : if (!f->func->s) {
459 315244 : int key = hash_key(f->func->base.name); /* Using hash lookup */
460 315244 : sql_hash_e *he = sql_functions_lookup->buckets[key&(sql_functions_lookup->size-1)];
461 315244 : lookup_function look = NULL;
462 :
463 688567 : for (; he && !look; he = he->chain) {
464 373323 : struct function_properties* fp = (struct function_properties*) he->value;
465 :
466 373323 : if (!strcmp(f->func->base.name, fp->name))
467 107728 : look = fp->func;
468 : }
469 315244 : if (look)
470 107728 : look(sql, e);
471 : }
472 : /* for global aggregates with no semantics, if the left relation has values, then the output won't be NULL */
473 341722 : if (!is_semantics(e) && e->l && !have_nil(e->l) &&
474 89177 : (e->type != e_aggr || (is_groupby(rel->op) && list_length(rel->r)) || ((lv = get_rel_count(rel->l)) != BUN_NONE && lv > 0)))
475 88854 : set_has_no_nil(e);
476 : /* set properties for global aggregates */
477 341722 : if (e->type == e_aggr && is_groupby(rel->op) && list_empty(rel->r)) {
478 7871 : if (!find_prop(e->p, PROP_NUNIQUES)) {
479 7871 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
480 7871 : p->value.dval = 1;
481 : }
482 7871 : set_unique(e);
483 : }
484 : break;
485 : }
486 570583 : case e_atom:
487 570583 : if (e->l) {
488 553848 : atom *a = (atom*) e->l;
489 553848 : if (!a->isnull) {
490 490512 : set_minmax_property(sql, e, PROP_MAX, a);
491 490512 : set_minmax_property(sql, e, PROP_MIN, a);
492 : }
493 553848 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
494 553848 : p->value.dval = 1;
495 16735 : } else if (e->f) {
496 3210 : list *vals = (list *) e->f;
497 3210 : sql_exp *first = vals->h ? vals->h->data : NULL;
498 3210 : atom *max = NULL, *min = NULL; /* all child values must have a valid min/max */
499 3210 : int has_nil = 0;
500 :
501 3210 : if (first) {
502 3210 : max = ((lval = find_prop_and_get(first->p, PROP_MAX))) ? lval : NULL;
503 3210 : min = ((lval = find_prop_and_get(first->p, PROP_MIN))) ? lval : NULL;
504 3210 : has_nil |= has_nil(first);
505 : }
506 :
507 13988 : for (node *n = vals->h ? vals->h->next : NULL ; n ; n = n->next) {
508 7568 : sql_exp *ee = n->data;
509 :
510 7568 : if (min && max) {
511 7115 : if ((lval = find_prop_and_get(ee->p, PROP_MAX))) {
512 7069 : max = atom_cmp(lval, max) > 0 ? lval : max;
513 : } else {
514 : max = NULL;
515 : }
516 7115 : if ((lval = find_prop_and_get(ee->p, PROP_MIN))) {
517 7069 : min = atom_cmp(min, lval) > 0 ? lval : min;
518 : } else {
519 : min = NULL;
520 : }
521 : }
522 7568 : has_nil |= has_nil(ee);
523 : }
524 :
525 3210 : if (!has_nil)
526 2844 : set_has_no_nil(e);
527 3210 : if (min && max) {
528 2821 : set_minmax_property(sql, e, PROP_MAX, max);
529 2821 : set_minmax_property(sql, e, PROP_MIN, min);
530 : }
531 3210 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
532 3210 : p->value.dval = (dbl) list_length(vals);
533 : } else {
534 13525 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
535 13525 : p->value.dval = 1;
536 : }
537 : break;
538 287461 : case e_cmp:
539 : /* TODO? propagating min/max/unique of booleans is not very worth it */
540 287461 : if (e->flag == cmp_or || e->flag == cmp_filter) {
541 18064 : if (!have_nil(e->l) && !have_nil(e->r))
542 13591 : set_has_no_nil(e);
543 269397 : } else if (e->flag == cmp_in || e->flag == cmp_notin) {
544 21252 : sql_exp *le = e->l;
545 21252 : if (!has_nil(le) && !have_nil(e->r))
546 18209 : set_has_no_nil(e);
547 : } else {
548 248145 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
549 248145 : if (!has_nil(le) && !has_nil(re) && (!e->f || !has_nil(fe)))
550 178192 : set_has_no_nil(e);
551 : }
552 : break;
553 : case e_psm:
554 : break;
555 : }
556 :
557 : #ifndef NDEBUG
558 : {
559 : /* min and max cannot be NULL and min must be <= than max, if it happens the inner relation must be empty! */
560 3480801 : atom *min = find_prop_and_get(e->p, PROP_MIN), *max = find_prop_and_get(e->p, PROP_MAX);
561 :
562 3480793 : (void) min;
563 3480793 : (void) max;
564 3480793 : assert(!min || !min->isnull);
565 3480793 : assert(!max || !max->isnull);
566 3480793 : assert(!min || !max || (min && max)/* atom_cmp(min, max) <= 0*/);
567 : }
568 : #endif
569 3480793 : return e;
570 : }
571 :
572 : static list * /* Remove predicates always false from min/max values */
573 139547 : rel_prune_predicates(visitor *v, sql_rel *rel)
574 : {
575 139547 : if (rel->l) {
576 139547 : sql_rel *l = rel->l;
577 139547 : if (is_single(l))
578 3414 : return rel->exps;
579 : }
580 136133 : if (!list_empty(rel->attr))
581 2962 : return rel->exps;
582 282461 : for (node *n = rel->exps->h ; n ; n = n->next) {
583 149290 : sql_exp *e = n->data;
584 :
585 149290 : if (e->type == e_cmp && is_theta_exp(e->flag)) {
586 122879 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
587 122879 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
588 122879 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
589 122879 : bool always_false = false, always_true = false;
590 :
591 122879 : if (fe && !is_symmetric(e)) {
592 3058 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
593 3058 : comp_type lower = range2lcompare(e->flag), higher = range2rcompare(e->flag);
594 3660 : int not_int1 = rval_min && lval_max && /* the middle and left intervals don't overlap */
595 1663 : (!is_anti(e) && (lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0)),
596 4072 : not_int2 = lval_min && fval_max && /* the middle and right intervals don't overlap */
597 2485 : (!is_anti(e) && (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0)),
598 3058 : not_int3 = rval_min && fval_max && /* the left interval is after the right one */
599 1288 : (!is_anti(e) && (atom_cmp(rval_min, fval_max) > 0));
600 :
601 3058 : always_false |= not_int1 || not_int2 || not_int3;
602 : /* for anti the middle must be before the left or after the right or the right after the left, for the other the middle must be always between the left and right intervals */
603 4113 : always_true |= !has_nil(le) && !has_nil(re) && !has_nil(fe) &&
604 3957 : lval_min && lval_max && rval_min && rval_max && fval_min && fval_max &&
605 575 : (is_anti(e) ? ((lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0) || (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0) || atom_cmp(rval_min, fval_max) > 0) :
606 488 : ((lower == cmp_gte ? atom_cmp(lval_min, rval_max) >= 0 : atom_cmp(lval_min, rval_max) > 0) && (higher == cmp_lte ? atom_cmp(fval_min, lval_max) >= 0 : atom_cmp(fval_min, lval_max) > 0)));
607 : } else if (!fe) {
608 119803 : if (!is_semantics(e)) /* trivial not null cmp null case */
609 228174 : always_false |= !is_anti(e) && ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re)));
610 119803 : switch (e->flag) {
611 104324 : case cmp_equal:
612 104324 : if (lval_min && lval_max && rval_min && rval_max && (!is_semantics(e) || !has_nil(le) || !has_nil(re)))
613 133738 : always_false |= (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
614 104324 : if (is_semantics(e)) { /* prune *= NULL cases */
615 5690 : always_false |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
616 11380 : always_true |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
617 : }
618 : break;
619 7216 : case cmp_notequal:
620 7216 : if (lval_min && lval_max && rval_min && rval_max)
621 11358 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
622 7216 : if (is_semantics(e)) {
623 26 : always_true |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
624 52 : always_false |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
625 : }
626 : break;
627 5632 : case cmp_gt:
628 5632 : if (lval_max && rval_min)
629 1947 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
630 5632 : if (lval_min && rval_max)
631 3894 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
632 : break;
633 119 : case cmp_gte:
634 119 : if (lval_max && rval_min)
635 49 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
636 119 : if (lval_min && rval_max)
637 98 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
638 : break;
639 2428 : case cmp_lt:
640 2428 : if (lval_min && rval_max)
641 1376 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
642 2428 : if (lval_max && rval_min)
643 2800 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
644 : break;
645 84 : case cmp_lte:
646 84 : if (lval_min && rval_max)
647 11 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
648 84 : if (lval_max && rval_min)
649 22 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
650 : break;
651 : default: /* Maybe later I can do cmp_in and cmp_notin but don't forget to remove is_theta_exp check up there */
652 : break;
653 : }
654 : }
655 122879 : assert(!always_false || !always_true);
656 122879 : if (always_false || always_true) {
657 3652 : sql_exp *ne = exp_atom_bool(v->sql->sa, always_true /* if always_true set then true else false */);
658 3652 : if (exp_name(e))
659 0 : exp_prop_alias(v->sql->sa, ne, e);
660 3652 : n->data = ne;
661 3652 : v->changes++;
662 : }
663 : }
664 : }
665 133171 : return rel->exps;
666 : }
667 :
668 : static sql_rel *
669 14 : set_setop_side(visitor *v, sql_rel *rel, sql_rel *side)
670 : {
671 14 : if (side == rel->r)
672 0 : rel->r = NULL;
673 : else
674 14 : rel->l = NULL;
675 14 : if (!is_simple_project(side->op) || rel_is_ref(side) || !list_empty(side->r)) {
676 0 : side = rel_project(v->sql->sa, side, rel_projections(v->sql, side, NULL, 0, 1));
677 0 : set_count_prop(v->sql->sa, side, get_rel_count(side->l));
678 0 : side->exps = exps_exp_visitor_bottomup(v, side, side->exps, 0, &rel_propagate_statistics, false);
679 : }
680 35 : for (node *n = rel->exps->h, *m = side->exps->h ; n && m ; n = n->next, m = m->next) {
681 21 : sql_exp *e1 = n->data, *e2 = m->data;
682 21 : exp_prop_alias(v->sql->sa, e2, e1);
683 : }
684 14 : list_hash_clear(side->exps);
685 :
686 14 : if (need_distinct(rel))
687 0 : set_distinct(side);
688 14 : side->p = prop_copy(v->sql->sa, rel->p);
689 14 : rel_destroy(rel);
690 14 : return side;
691 : }
692 :
693 : static BUN
694 21900 : trivial_project_exp_card(sql_exp *e)
695 : {
696 22194 : if (e->type == e_convert)
697 294 : return trivial_project_exp_card(e->l);
698 21900 : return e->type == e_atom && e->f ? (BUN) list_length(e->f) : 1;
699 : }
700 :
701 : static BUN
702 24248 : rel_calc_nuniques(mvc *sql, sql_rel *l, list *exps)
703 : {
704 24248 : BUN lv = get_rel_count(l);
705 :
706 24248 : if (lv == 0)
707 : return 0;
708 21539 : if (!list_empty(exps)) {
709 21539 : BUN nuniques = 0;
710 : /* compute the highest number of unique values */
711 51135 : for (node *n = exps->h ; n && nuniques != BUN_NONE ; n = n->next) {
712 29596 : sql_exp *e = n->data;
713 29596 : sql_rel *bt = NULL;
714 29596 : prop *p = NULL;
715 29596 : BUN euniques = BUN_NONE;
716 29596 : atom *min, *max, *sub = NULL;
717 29596 : sql_subtype *tp = exp_subtype(e);
718 29597 : sql_class ec = tp ? tp->type->eclass : EC_STRING; /* if 'e' has no type (eg parameter), use a non-number type to fail condition */
719 :
720 29597 : if ((p = find_prop(e->p, PROP_NUNIQUES))) {
721 20959 : euniques = (BUN) p->value.dval;
722 8637 : } else if (e->type == e_column && e->nid && rel_find_exp_and_corresponding_rel(l, e, false, &bt, NULL) && bt && (p = find_prop(bt->p, PROP_COUNT))) {
723 6594 : euniques = (BUN) p->value.lval;
724 : }
725 : /* use min to max range to compute number of possible values in the domain for number types */
726 29596 : if ((EC_TEMP(ec)||ec==EC_NUM||ec==EC_MONTH||ec==EC_POS) &&
727 22362 : (min = find_prop_and_get(e->p, PROP_MIN)) && (max = find_prop_and_get(e->p, PROP_MAX))) {
728 : /* the range includes min and max, so the atom_inc call is needed */
729 : /* if 'euniques' has number of distinct values, compute min between both */
730 17543 : if ((sub = atom_sub(sql->sa, max, min)) && (sub = atom_inc(sql->sa, sub)) && (sub = atom_cast(sql->sa, sub, sql_bind_localtype("oid"))))
731 0 : euniques = MIN(euniques, (BUN) sub->data.val.oval);
732 : }
733 29596 : if (euniques != BUN_NONE)
734 27553 : nuniques = MAX(nuniques, euniques); /* the highest cardinality sets the estimation */
735 : else
736 : nuniques = BUN_NONE;
737 : }
738 21539 : if (nuniques != BUN_NONE)
739 : return nuniques;
740 : }
741 : return lv;
742 : }
743 :
744 : static sql_rel *
745 2072669 : rel_get_statistics_(visitor *v, sql_rel *rel)
746 : {
747 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
748 2072669 : uint8_t has_special_modify = *(uint8_t*) v->data;
749 2072669 : bool can_be_pruned = !has_special_modify && v->storage_based_opt;
750 :
751 : /* Don't look at the same relation twice */
752 2072669 : if (are_statistics_gathered(rel->used))
753 : return rel;
754 1331406 : rel->used |= statistics_gathered;
755 :
756 1331406 : switch (rel->op) {
757 312348 : case op_basetable: {
758 312348 : sql_table *t = (sql_table *) rel->l;
759 312348 : sqlstore *store = v->sql->session->tr->store;
760 :
761 312348 : if (!list_empty(rel->exps)) {
762 1248722 : for (node *n = rel->exps->h ; n ; n = n->next)
763 936232 : rel_basetable_column_get_statistics(v->sql, rel, n->data);
764 : }
765 : /* Set table row count. TODO? look for remote tables. Don't look at storage for declared tables, because it won't be cleaned */
766 312504 : if (isTable(t) && t->s && !isDeclaredTable(t)) /* count active rows only */
767 259085 : set_count_prop(v->sql->sa, rel, (BUN)store->storage_api.count_col(v->sql->session->tr, ol_first_node(t->columns)->data, 10));
768 : break;
769 : }
770 2791 : case op_union:
771 : case op_inter:
772 : case op_except: {
773 2791 : bool empty_cross = false;
774 2791 : int i = 0;
775 2791 : sql_rel *l = rel->l, *pl = rel->l, *r = rel->r, *pr = rel->r;
776 :
777 2791 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
778 0 : pl = pl->l;
779 2791 : while (is_sample(pr->op) || is_topn(pr->op))
780 0 : pr = pr->l;
781 : /* if it's not a projection, then project and propagate statistics */
782 2791 : if (!is_project(pl->op) && !is_base(pl->op)) {
783 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
784 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
785 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
786 : }
787 2791 : if (!is_project(pr->op) && !is_base(pr->op)) {
788 0 : pr = rel_project(v->sql->sa, pr, rel_projections(v->sql, pr, NULL, 0, 1));
789 0 : set_count_prop(v->sql->sa, pr, get_rel_count(pr->l));
790 0 : pr->exps = exps_exp_visitor_bottomup(v, pr, pr->exps, 0, &rel_propagate_statistics, false);
791 : }
792 :
793 11667 : for (node *n = rel->exps->h ; n ; n = n->next) {
794 8876 : empty_cross |= rel_setop_get_statistics(v->sql, rel, pl->exps, pr->exps, n->data, i);
795 8876 : i++;
796 : }
797 :
798 : /* propagate row count */
799 2791 : if (is_union(rel->op)) {
800 278 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
801 278 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
802 :
803 278 : if (lv == 0 && rv == 0) { /* both sides empty */
804 2 : if (can_be_pruned)
805 : empty_cross = true;
806 : else
807 2 : set_count_prop(v->sql->sa, rel, 0);
808 276 : } else if (can_be_pruned && lv == 0 && !rel_is_ref(rel)) { /* left side empty */
809 0 : rel = set_setop_side(v, rel, r);
810 0 : empty_cross = false; /* don't rewrite again */
811 276 : } else if (can_be_pruned && rv == 0 && !rel_is_ref(rel)) { /* right side empty */
812 0 : rel = set_setop_side(v, rel, l);
813 0 : empty_cross = false; /* don't rewrite again */
814 276 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
815 7 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
816 : }
817 2513 : } else if (is_inter(rel->op) || is_except(rel->op)) {
818 2513 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
819 2513 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
820 :
821 2513 : if (lv == 0) { /* left side empty */
822 62 : if (can_be_pruned)
823 : empty_cross = true;
824 : else
825 5 : set_count_prop(v->sql->sa, rel, 0);
826 2451 : } else if (rv == 0) { /* right side empty */
827 17 : if (is_inter(rel->op)) {
828 3 : if (can_be_pruned)
829 : empty_cross = true;
830 : else
831 0 : set_count_prop(v->sql->sa, rel, 0);
832 14 : } else if (can_be_pruned && !rel_is_ref(rel)) {
833 14 : rel = set_setop_side(v, rel, l);
834 14 : empty_cross = false; /* don't rewrite again */
835 : } else {
836 0 : set_count_prop(v->sql->sa, rel, lv);
837 : }
838 : } else {
839 2434 : set_count_prop(v->sql->sa, rel, lv);
840 : }
841 : }
842 2791 : if (can_be_pruned && empty_cross) {
843 80 : rel_destroy(rel->l);
844 80 : rel->l = NULL;
845 80 : rel_destroy(rel->r);
846 80 : rel->r = NULL;
847 244 : for (node *n = rel->exps->h ; n ; n = n->next) {
848 164 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
849 164 : exp_prop_alias(v->sql->sa, a, e);
850 164 : n->data = a;
851 : }
852 80 : list_hash_clear(rel->exps);
853 80 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
854 80 : set_count_prop(v->sql->sa, l, 1);
855 80 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
856 80 : set_count_prop(v->sql->sa, l, 0);
857 80 : rel->op = op_project;
858 80 : rel->l = l;
859 80 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
860 80 : set_count_prop(v->sql->sa, rel, 0);
861 80 : set_nodistinct(rel); /* set relations may have distinct flag set */
862 80 : v->changes++;
863 : }
864 : break;
865 : }
866 12869 : case op_munion: {
867 12869 : list *l = rel->l, *nrels = sa_list(v->sql->sa);
868 12869 : BUN cnt = 0;
869 12869 : bool needs_pruning = false;
870 :
871 49217 : for (node *n = l->h; n; n = n->next) {
872 36348 : sql_rel *r = n->data, *pl = r;
873 :
874 36348 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
875 0 : pl = pl->l;
876 : /* if it's not a projection, then project and propagate statistics */
877 36348 : if (!is_project(pl->op) && !is_base(pl->op)) {
878 0 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
879 0 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
880 0 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
881 : }
882 36348 : nrels = append(nrels, pl);
883 : /* we need new munion statistics */
884 : /* propagate row count */
885 36348 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
886 : /* if PROP_COUNT does not exist we assume at least a row (see get_rel_count def) */
887 36348 : if (rv == BUN_NONE) {
888 1156 : cnt++;
889 1156 : continue;
890 : }
891 35192 : if (!rv && can_be_pruned)
892 6619 : needs_pruning = true;
893 : /* overflow check */
894 35192 : if (rv > (BUN_MAX - cnt))
895 36348 : rv = BUN_MAX;
896 : else
897 35192 : cnt += rv;
898 : }
899 12869 : int i = 0;
900 73599 : for (node *n = rel->exps->h ; n ; n = n->next, i++)
901 60730 : rel_munion_get_statistics(v->sql, rel, nrels, n->data, i);
902 :
903 12869 : if (needs_pruning && !rel_is_ref(rel)) {
904 4457 : v->changes++;
905 4457 : list *nl = sa_list(l->sa);
906 :
907 16444 : for (node *n = nrels->h; n; n = n->next) {
908 11987 : sql_rel *r = n->data;
909 11987 : BUN rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
910 :
911 11987 : if (!rv) { /* keep last for now */
912 6148 : rel_destroy(r);
913 6148 : continue;
914 : }
915 5839 : nl = append(nl, r);
916 : }
917 4457 : rel->l = nl;
918 4457 : if (list_length(nl) == 1) {
919 4131 : sql_rel *l = rel->l = nl->h->data; /* ugh */
920 4131 : rel->r = NULL;
921 4131 : rel->op = op_project;
922 :
923 20291 : for (node *n = rel->exps->h, *m = l->exps->h ; n && m ; n = n->next, m = m->next) {
924 16160 : sql_exp *pe = n->data, *ie = m->data;
925 16160 : sql_exp *ne = exp_ref(v->sql, ie);
926 16160 : exp_prop_alias(v->sql->sa, ne, pe);
927 16160 : n->data = ne;
928 : }
929 4131 : list_hash_clear(rel->exps);
930 326 : } else if (list_empty(nl)) {
931 : /* empty select (project [ nils ] ) */
932 454 : for (node *n = rel->exps->h ; n ; n = n->next) {
933 354 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL, 0));
934 354 : exp_prop_alias(v->sql->sa, a, e);
935 354 : n->data = a;
936 : }
937 100 : list_hash_clear(rel->exps);
938 100 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
939 100 : set_count_prop(v->sql->sa, l, 1);
940 100 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
941 100 : set_count_prop(v->sql->sa, l, 0);
942 100 : rel->op = op_project;
943 100 : rel->r = NULL;
944 100 : rel->l = l;
945 100 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
946 100 : set_count_prop(v->sql->sa, rel, 0);
947 100 : set_nodistinct(rel); /* set relations may have distinct flag set */
948 : }
949 : } else {
950 8412 : set_count_prop(v->sql->sa, rel, cnt);
951 : }
952 : break;
953 : }
954 532553 : case op_join:
955 : case op_left:
956 : case op_right:
957 : case op_full:
958 : case op_semi:
959 : case op_anti:
960 : case op_select:
961 : case op_project:
962 : case op_groupby: {
963 532553 : if (is_groupby(rel->op) && !list_empty(rel->r))
964 15532 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
965 532553 : rel->exps = exps_exp_visitor_bottomup(v, rel, rel->exps, 0, &rel_propagate_statistics, false);
966 532546 : if ((is_simple_project(rel->op) || is_groupby(rel->op)) && !list_empty(rel->r))
967 39288 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
968 : /* The following optimizations can only be applied after propagating the statistics to rel->exps */
969 532549 : if (can_be_pruned && (is_join(rel->op) || is_select(rel->op)) && !list_empty(rel->exps)) {
970 139547 : int changes = v->changes;
971 139547 : rel->exps = rel_prune_predicates(v, rel);
972 139547 : if (v->changes > changes)
973 3619 : rel = rewrite_simplify(v, 0, v->value_based_opt, rel);
974 : }
975 :
976 : /* propagate row count */
977 532549 : sql_rel *l = rel->l, *r = rel->r;
978 532549 : switch (rel->op) {
979 134539 : case op_join:
980 : case op_left:
981 : case op_right:
982 : case op_full: {
983 134539 : BUN lv = get_rel_count(l), rv = get_rel_count(r), uniques_estimate = BUN_MAX, join_idx_estimate = BUN_MAX;
984 :
985 134539 : if (!list_empty(rel->exps) && !is_single(rel)) {
986 246075 : for (node *n = rel->exps->h ; n ; n = n->next) {
987 125623 : sql_exp *e = n->data, *el = e->l, *er = e->r;
988 :
989 125623 : if (find_prop(e->p, PROP_JOINIDX)) {
990 667 : join_idx_estimate = lv>rv?lv:rv;
991 124956 : } else if (e->type == e_cmp && e->flag == cmp_equal) {
992 : /* if one of the sides is unique, the cardinality will be that exact number, but look for nulls */
993 121071 : if (!is_semantics(e) || !has_nil(el) || !has_nil(er)) {
994 120675 : BUN lu = 0, ru = 0;
995 120675 : prop *p = NULL;
996 120675 : if ((p = find_prop(el->p, PROP_NUNIQUES)))
997 89586 : lu = (BUN) p->value.dval;
998 120675 : if ((p = find_prop(er->p, PROP_NUNIQUES)))
999 103979 : ru = (BUN) p->value.dval;
1000 120675 : if (is_unique(el) || lu > lv) {
1001 73715 : BUN ncount = (is_right(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : lv;
1002 73715 : uniques_estimate = MIN(uniques_estimate, ncount);
1003 46960 : } else if (is_unique(er) || ru > rv) {
1004 29351 : BUN ncount = (is_left(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : rv;
1005 29351 : uniques_estimate = MIN(uniques_estimate, ncount);
1006 : }
1007 : }
1008 : }
1009 : }
1010 : }
1011 134539 : if (is_single(rel)) {
1012 4754 : set_count_prop(v->sql->sa, rel, lv);
1013 129785 : } else if (join_idx_estimate != BUN_MAX) {
1014 665 : set_count_prop(v->sql->sa, rel, join_idx_estimate);
1015 129120 : } else if (uniques_estimate != BUN_MAX) {
1016 96613 : set_count_prop(v->sql->sa, rel, uniques_estimate);
1017 32507 : } else if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1018 : /* corner cases for outer joins */
1019 126 : if (is_left(rel->op)) {
1020 114 : set_count_prop(v->sql->sa, rel, lv);
1021 12 : } else if (is_right(rel->op)) {
1022 3 : set_count_prop(v->sql->sa, rel, rv);
1023 9 : } else if (is_full(rel->op) && lv != BUN_NONE && rv != BUN_NONE) {
1024 9 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
1025 0 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1026 0 : set_count_prop(v->sql->sa, rel, 0);
1027 : }
1028 32381 : } else if (lv == 0) {
1029 1166 : set_count_prop(v->sql->sa, rel, (is_right(rel->op) || is_full(rel->op)) ? rv : 0);
1030 31784 : } else if (rv == 0) {
1031 1568 : set_count_prop(v->sql->sa, rel, (is_left(rel->op) || is_full(rel->op)) ? lv : 0);
1032 30710 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
1033 18258 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX / lv)) ? BUN_MAX : (lv * rv)); /* overflow check */
1034 : }
1035 : break;
1036 : }
1037 2636 : case op_anti:
1038 2636 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1039 2636 : break;
1040 109015 : case op_semi:
1041 : case op_select:
1042 : /* TODO calculate cardinalities using selectivities */
1043 109015 : if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
1044 2744 : set_count_prop(v->sql->sa, rel, 0);
1045 : } else {
1046 209294 : if (!list_empty(rel->exps) && !is_single(rel)) {
1047 103023 : BUN cnt = get_rel_count(l), u = 1;
1048 171832 : for (node *n = rel->exps->h ; n ; n = n->next) {
1049 112870 : sql_exp *e = n->data, *el = e->l, *er = e->r;
1050 :
1051 : /* simple expressions first */
1052 112870 : if (e->type == e_cmp && e->flag == cmp_equal && exp_is_atom(er)) {
1053 : /* use selectivity */
1054 57256 : prop *p;
1055 57256 : if ((p = find_prop(el->p, PROP_NUNIQUES))) {
1056 44061 : u = (BUN) p->value.dval;
1057 44061 : break;
1058 : }
1059 : }
1060 : }
1061 : /* u is an *estimate*, so don't set count_prop to 0 unless cnt is 0 */
1062 103023 : set_count_prop(v->sql->sa, rel, cnt == 0 ? 0 : u == 0 || u > cnt ? 1 : cnt/u);
1063 : } else {
1064 3248 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1065 : }
1066 : }
1067 : break;
1068 263001 : case op_project:
1069 263001 : if (l) {
1070 252879 : if (need_distinct(rel)) {
1071 0 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->exps));
1072 : } else {
1073 252879 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
1074 : }
1075 : } else {
1076 10122 : BUN card = 1;
1077 :
1078 10122 : if (!list_empty(rel->exps)) {
1079 30685 : for (node *n = rel->exps->h ; n ; n = n->next) {
1080 20563 : sql_exp *e = n->data;
1081 20563 : card = MAX(card, trivial_project_exp_card(e));
1082 : }
1083 : }
1084 10122 : set_count_prop(v->sql->sa, rel, card);
1085 : }
1086 : break;
1087 22953 : case op_groupby:
1088 22953 : if (list_empty(rel->r)) {
1089 7421 : set_count_prop(v->sql->sa, rel, 1);
1090 : } else {
1091 15532 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->r));
1092 : }
1093 : break;
1094 : default:
1095 : break;
1096 : }
1097 : break;
1098 : }
1099 16830 : case op_topn: {
1100 16830 : BUN lv = get_rel_count(rel->l);
1101 :
1102 16830 : if (lv != BUN_NONE) {
1103 16813 : sql_exp *le = rel->exps->h->data, *oe = list_length(rel->exps) > 1 ? rel->exps->h->next->data : NULL;
1104 96 : if (oe && oe->l && exp_is_not_null(oe)) { /* no parameters */
1105 96 : BUN offset = (BUN) ((atom*)oe->l)->data.val.lval;
1106 96 : lv = offset >= lv ? 0 : lv - offset;
1107 : }
1108 16813 : if (le->l && exp_is_not_null(le)) {
1109 16775 : BUN limit = (BUN) ((atom*)le->l)->data.val.lval;
1110 16775 : lv = MIN(lv, limit);
1111 : }
1112 16813 : set_count_prop(v->sql->sa, rel, lv);
1113 : }
1114 : break;
1115 : }
1116 22 : case op_sample: {
1117 22 : BUN lv = get_rel_count(rel->l);
1118 :
1119 22 : if (lv != BUN_NONE) {
1120 4 : sql_exp *se = rel->exps->h->data;
1121 4 : sql_subtype *tp = exp_subtype(se);
1122 :
1123 4 : if (se->l && tp->type->eclass == EC_NUM) { /* sample is a number of rows */
1124 4 : BUN sample = (BUN) ((atom*)se->l)->data.val.lval;
1125 4 : lv = MIN(lv, sample);
1126 0 : } else if (se->l) { /* sample is a percentage of rows */
1127 0 : dbl percent = ((atom*)se->l)->data.val.dval;
1128 0 : assert(tp->type->eclass == EC_FLT);
1129 0 : lv = (BUN) ceil((dbl)lv * percent);
1130 : }
1131 4 : set_count_prop(v->sql->sa, rel, lv);
1132 : }
1133 : break;
1134 : }
1135 5972 : case op_table: {
1136 5972 : sql_exp *op = rel->r;
1137 5972 : if (rel->flag != TRIGGER_WRAPPER && op) {
1138 5660 : sql_subfunc *f = op->f;
1139 5660 : if (f->func->lang == FUNC_LANG_SQL) {
1140 97 : set_count_prop(v->sql->sa, rel, 1000 /* just some fallback value */);
1141 5563 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "storage") == 0) {
1142 829 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of catalog */);
1143 4734 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "db_users") == 0) {
1144 0 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of users */);
1145 4734 : } else if (f->func->lang == FUNC_LANG_MAL && strncmp(f->func->base.name, "querylog", 8) == 0) {
1146 1085 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of querylog */);
1147 3649 : } else if (f->func->lang == FUNC_LANG_MAL &&
1148 3539 : (strcmp(f->func->base.name, "queue") == 0 ||
1149 3275 : strcmp(f->func->base.name, "optimizers") == 0 ||
1150 2963 : strcmp(f->func->base.name, "env") == 0 ||
1151 2672 : strcmp(f->func->base.name, "keywords") == 0 ||
1152 2672 : strcmp(f->func->base.name, "statistics") == 0 ||
1153 2005 : strcmp(f->func->base.name, "rejects") == 0 ||
1154 1751 : strcmp(f->func->base.name, "schemastorage") == 0 ||
1155 1751 : strncmp(f->func->base.name, "storage", 7) == 0 ||
1156 1751 : strcmp(f->func->base.name, "sessions") == 0) ) {
1157 2078 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of queue */);
1158 : }
1159 : /* else {
1160 : printf("%%func needs stats : %s\n", f->func->base.name);
1161 : } */
1162 : }
1163 : break;
1164 : }
1165 : /*These relations are less important for now
1166 : TODO later we can tune it */
1167 : case op_insert:
1168 : case op_update:
1169 : case op_delete:
1170 : case op_truncate:
1171 : case op_ddl:
1172 : default:
1173 : break;
1174 : }
1175 :
1176 : return rel;
1177 : }
1178 :
1179 : static sql_rel *
1180 539480 : rel_get_statistics(visitor *v, global_props *gp, sql_rel *rel)
1181 : {
1182 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
1183 539480 : uint8_t has_special_modify = (uint8_t) gp->has_special_modify;
1184 539480 : v->data = &has_special_modify;
1185 539480 : rel = rel_visitor_bottomup(v, rel, &rel_get_statistics_);
1186 539892 : v->data = gp;
1187 539892 : return rel;
1188 : }
1189 :
1190 : run_optimizer
1191 704679 : bind_get_statistics(visitor *v, global_props *gp)
1192 : {
1193 704679 : (void) v;
1194 704679 : return (gp->opt_level == 1 && !gp->cnt[op_insert]) ? rel_get_statistics : NULL;
1195 : }
1196 :
1197 :
1198 : static bool
1199 94972 : point_select_on_unique_column(sql_rel *rel)
1200 : {
1201 94972 : if (is_select(rel->op) && !list_empty(rel->exps)) {
1202 131461 : for (node *n = rel->exps->h; n ; n = n->next) {
1203 75696 : sql_exp *e = n->data, *el = e->l, *er = e->r, *found = NULL;
1204 :
1205 75696 : if (is_compare(e->type) && e->flag == cmp_equal) {
1206 33664 : if (is_numeric_upcast(el))
1207 0 : el = el->l;
1208 33664 : if (is_numeric_upcast(er))
1209 0 : er = er->l;
1210 33664 : if (is_alias(el->type) && exp_is_atom(er) && (found = rel_find_exp(rel->l, el)) &&
1211 749 : is_unique(found) && (!is_semantics(e) || !has_nil(found) || !has_nil(er)))
1212 : return true;
1213 32915 : if (is_alias(er->type) && exp_is_atom(el) && (found = rel_find_exp(rel->l, er)) &&
1214 0 : is_unique(found) && (!is_semantics(e) || !has_nil(el) || !has_nil(found)))
1215 : return true;
1216 : }
1217 : }
1218 : }
1219 : return false;
1220 : }
1221 :
1222 : /*
1223 : * A point select on an unique column reduces the number of rows to 1. If the same select is under a
1224 : * join, the opposite side's select can be pushed above the join.
1225 : */
1226 : static inline sql_rel *
1227 1245427 : rel_push_select_up(visitor *v, sql_rel *rel)
1228 : {
1229 1245427 : if ((is_innerjoin(rel->op) || is_left(rel->op) || is_right(rel->op) || is_semi(rel->op)) && !is_single(rel)) {
1230 254131 : sql_rel *l = rel->l, *r = rel->r;
1231 254131 : bool can_pushup_left = is_select(l->op) && !rel_is_ref(l) && !is_single(l) && (is_innerjoin(rel->op) || is_left(rel->op) || is_semi(rel->op)),
1232 254131 : can_pushup_right = is_select(r->op) && !rel_is_ref(r) && !is_single(r) && (is_innerjoin(rel->op) || is_right(rel->op));
1233 :
1234 254131 : if (can_pushup_left || can_pushup_right) {
1235 66952 : if (can_pushup_left)
1236 45469 : can_pushup_left = point_select_on_unique_column(r);
1237 66952 : if (can_pushup_right)
1238 49503 : can_pushup_right = point_select_on_unique_column(l);
1239 :
1240 : /* if both selects retrieve one row each, it's not worth it to push both up */
1241 66952 : if (can_pushup_left && !can_pushup_right) {
1242 683 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1243 683 : nrel->l = l->l;
1244 683 : rel = rel_inplace_select(rel, nrel, l->exps);
1245 683 : assert(is_select(rel->op));
1246 683 : v->changes++;
1247 66269 : } else if (!can_pushup_left && can_pushup_right) {
1248 14 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1249 14 : nrel->r = r->l;
1250 14 : rel = rel_inplace_select(rel, nrel, r->exps);
1251 14 : assert(is_select(rel->op));
1252 14 : v->changes++;
1253 : }
1254 : }
1255 : }
1256 1245427 : return rel;
1257 : }
1258 :
1259 : static int
1260 93255 : sql_class_base_score(visitor *v, sql_column *c, sql_subtype *t, bool equality_based)
1261 : {
1262 93255 : int de;
1263 :
1264 93255 : if (!t)
1265 : return 0;
1266 93255 : switch (ATOMstorage(t->type->localtype)) {
1267 : case TYPE_bte:
1268 : return 150 - 8;
1269 1597 : case TYPE_sht:
1270 1597 : return 150 - 16;
1271 37292 : case TYPE_int:
1272 37292 : return 150 - 32;
1273 518 : case TYPE_void:
1274 : case TYPE_lng:
1275 518 : return 150 - 64;
1276 : case TYPE_uuid:
1277 : #ifdef HAVE_HGE
1278 : case TYPE_hge:
1279 : #endif
1280 : return 150 - 128;
1281 2 : case TYPE_flt:
1282 2 : return 75 - 24;
1283 : case TYPE_dbl:
1284 : return 75 - 53;
1285 30573 : default:
1286 30573 : if (equality_based && c && v->storage_based_opt && (de = mvc_is_duplicate_eliminated(v->sql, c)))
1287 1558 : return 150 - de * 8;
1288 : /* strings and blobs not duplicate eliminated don't get any points here */
1289 : return 0;
1290 : }
1291 : }
1292 :
1293 : static int
1294 34351 : score_se_base(visitor *v, sql_rel *rel, sql_exp *e)
1295 : {
1296 34351 : int res = 0;
1297 34351 : sql_subtype *t = exp_subtype(e);
1298 34351 : sql_column *c = NULL;
1299 :
1300 34351 : if (e->type == e_convert) /* keep unsafes at the end (TODO improve) */
1301 : return -1000;
1302 : /* can we find out if the underlying table is sorted */
1303 33750 : if ((c = exp_find_column(rel, e, -2)) && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1304 33750 : res += 600;
1305 :
1306 : /* prefer the shorter var types over the longer ones */
1307 33750 : res += sql_class_base_score(v, c, t, is_equality_or_inequality_exp(e->flag)); /* smaller the type, better */
1308 33750 : return res;
1309 : }
1310 :
1311 : static int
1312 58433 : score_se(visitor *v, sql_rel *rel, sql_exp *e)
1313 : {
1314 58433 : int score = 0;
1315 58433 : if (e->type == e_cmp && !is_complex_exp(e->flag)) {
1316 34351 : sql_exp *l = e->l;
1317 :
1318 34351 : while (l->type == e_cmp) { /* go through nested comparisons */
1319 2 : sql_exp *ll;
1320 :
1321 2 : if (l->flag == cmp_filter || l->flag == cmp_or)
1322 0 : ll = ((list*)l->l)->h->data;
1323 : else
1324 2 : ll = l->l;
1325 2 : if (ll->type != e_cmp)
1326 : break;
1327 : l = ll;
1328 : }
1329 34351 : score += score_se_base(v, rel, l);
1330 : }
1331 58433 : score += exp_keyvalue(e);
1332 58433 : return score;
1333 : }
1334 :
1335 : static inline sql_rel *
1336 1245427 : rel_select_order(visitor *v, sql_rel *rel)
1337 : {
1338 1245427 : int *scores = NULL;
1339 1245427 : sql_exp **exps = NULL;
1340 :
1341 1245427 : if (is_select(rel->op) && list_length(rel->exps) > 1) {
1342 27334 : node *n;
1343 27334 : int i, nexps = list_length(rel->exps);
1344 27334 : scores = SA_NEW_ARRAY(v->sql->ta, int, nexps);
1345 27334 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, nexps);
1346 :
1347 85767 : for (i = 0, n = rel->exps->h; n; i++, n = n->next) {
1348 58463 : exps[i] = n->data;
1349 58463 : if (find_prop(exps[i]->p, PROP_HASHCOL))
1350 : return rel;
1351 58433 : scores[i] = score_se(v, rel, n->data);
1352 : }
1353 27304 : GDKqsort(scores, exps, NULL, nexps, sizeof(int), sizeof(void *), TYPE_int, true, true);
1354 :
1355 85737 : for (i = 0, n = rel->exps->h; n; i++, n = n->next)
1356 58433 : n->data = exps[i];
1357 : }
1358 :
1359 : return rel;
1360 : }
1361 :
1362 : /* Compute the efficiency of using this expression early in a group by list */
1363 : static int
1364 59505 : score_gbe(visitor *v, sql_rel *rel, sql_exp *e)
1365 : {
1366 59505 : int res = 0;
1367 59505 : sql_subtype *t = exp_subtype(e);
1368 59505 : sql_column *c = exp_find_column(rel, e, -2);
1369 :
1370 59505 : if (e->card == CARD_ATOM) /* constants are trivial to group */
1371 38 : res += 1000;
1372 : /* can we find out if the underlying table is sorted */
1373 59505 : if (is_unique(e) || find_prop(e->p, PROP_HASHCOL) || (c && v->storage_based_opt && mvc_is_unique(v->sql, c))) /* distinct columns */
1374 3584 : res += 700;
1375 38914 : if (c && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1376 3710 : res += 500;
1377 59505 : if (find_prop(e->p, PROP_HASHIDX)) /* has hash index */
1378 0 : res += 200;
1379 :
1380 : /* prefer the shorter var types over the longer ones */
1381 59505 : res += sql_class_base_score(v, c, t, true); /* smaller the type, better */
1382 59505 : return res;
1383 : }
1384 :
1385 : /* reorder group by expressions */
1386 : static inline sql_rel *
1387 1245428 : rel_groupby_order(visitor *v, sql_rel *rel)
1388 : {
1389 1245428 : int *scores = NULL;
1390 1245428 : sql_exp **exps = NULL;
1391 :
1392 1245428 : if (is_groupby(rel->op) && list_length(rel->r) > 1) {
1393 25588 : node *n;
1394 25588 : list *gbe = rel->r;
1395 25588 : int i, ngbe = list_length(gbe);
1396 25588 : scores = SA_NEW_ARRAY(v->sql->ta, int, ngbe);
1397 25588 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, ngbe);
1398 :
1399 : /* first sorting step, give priority for integers and sorted columns */
1400 85093 : for (i = 0, n = gbe->h; n; i++, n = n->next) {
1401 59505 : exps[i] = n->data;
1402 59505 : scores[i] = score_gbe(v, rel, exps[i]);
1403 : }
1404 25588 : GDKqsort(scores, exps, NULL, ngbe, sizeof(int), sizeof(void *), TYPE_int, true, true);
1405 :
1406 : /* second sorting step, give priority to strings with lower number of digits */
1407 50506 : for (i = ngbe - 1; i && !scores[i]; i--); /* find expressions with no score from the first round */
1408 25588 : if (scores[i])
1409 24590 : i++;
1410 25588 : if (ngbe - i > 1) {
1411 8604 : for (int j = i; j < ngbe; j++) {
1412 6597 : sql_subtype *t = exp_subtype(exps[j]);
1413 6597 : scores[j] = t ? t->digits : 0;
1414 : }
1415 : /* the less number of digits the better, order ascending */
1416 2007 : GDKqsort(scores + i, exps + i, NULL, ngbe - i, sizeof(int), sizeof(void *), TYPE_int, false, true);
1417 : }
1418 :
1419 85093 : for (i = 0, n = gbe->h; n; i++, n = n->next)
1420 59505 : n->data = exps[i];
1421 : }
1422 :
1423 1245428 : return rel;
1424 : }
1425 :
1426 : /* This optimization loop contains optimizations that can potentially use statistics */
1427 : static sql_rel *
1428 1245427 : rel_final_optimization_loop_(visitor *v, sql_rel *rel)
1429 : {
1430 : /* Run rel_push_select_up only once at the end to avoid an infinite optimization loop */
1431 1245427 : rel = rel_push_select_up(v, rel);
1432 1245428 : rel = rel_select_order(v, rel);
1433 :
1434 : /* TODO? Maybe later add rel_simplify_count, rel_join2semijoin, rel_simplify_fk_joins,
1435 : rel_distinct_project2groupby, rel_simplify_predicates, rel_simplify_math,
1436 : rel_distinct_aggregate_on_unique_values */
1437 :
1438 1245428 : rel = rel_groupby_order(v, rel);
1439 1245426 : return rel;
1440 : }
1441 :
1442 : static sql_rel *
1443 66104 : rel_final_optimization_loop(visitor *v, global_props *gp, sql_rel *rel)
1444 : {
1445 66104 : (void) gp;
1446 66104 : return rel_visitor_bottomup(v, rel, &rel_final_optimization_loop_);
1447 : }
1448 :
1449 : run_optimizer
1450 705238 : bind_final_optimization_loop(visitor *v, global_props *gp)
1451 : {
1452 705238 : int flag = v->sql->sql_optimizer;
1453 : /* At the moment, this optimizer has dependency on 3 flags */
1454 547412 : return gp->opt_level == 1 && (gp->cnt[op_groupby] || gp->cnt[op_select]) &&
1455 771345 : (flag & push_select_up) && (flag & optimize_select_and_joins_topdown) && (flag & optimize_projections) ? rel_final_optimization_loop : NULL;
1456 : }
|