Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "rel_optimizer_private.h"
15 : #include "rel_statistics.h"
16 : #include "rel_rewriter.h"
17 :
18 : static sql_exp *
19 3883423 : comparison_find_column(sql_exp *input, sql_exp *e)
20 : {
21 3883423 : switch (input->type) {
22 154123 : case e_convert: {
23 154123 : list *types = (list *)input->r;
24 154123 : sql_class from = ((sql_subtype*)types->h->data)->type->eclass, to = ((sql_subtype*)types->h->next->data)->type->eclass;
25 154123 : if (from == to)
26 183595 : return comparison_find_column(input->l, e) ? input : NULL;
27 : return NULL;
28 : }
29 3175472 : case e_column:
30 3175472 : return exp_match(e, input) ? input : NULL;
31 : default:
32 : return NULL;
33 : }
34 : }
35 :
36 : /* multi lo <= col <= hi, maybe still be false even if lo or hi are NULL, possibly similar for filter on multiple
37 : * columns */
38 : #define comp_single_column(c) (!c->f)
39 :
40 : static sql_exp *
41 8627628 : rel_propagate_column_ref_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
42 : {
43 8711345 : assert(e->type == e_column);
44 8711345 : if (rel) {
45 8711303 : switch(rel->op) {
46 4036292 : case op_left:
47 : case op_right:
48 : case op_full:
49 : case op_join:
50 : case op_select:
51 : case op_anti:
52 : case op_semi: {
53 4036292 : bool found_without_semantics = false, found_left = false, found_right = false, still_unique = false;
54 :
55 4036292 : if ((is_innerjoin(rel->op) || is_select(rel->op)) && list_length(rel->exps) == 1 && exp_is_false(rel->exps->h->data))
56 : return NULL; /* nothing will pass, skip */
57 :
58 : /* propagate from the bottom first */
59 4028852 : if (rel_propagate_column_ref_statistics(sql, rel->l, e))
60 : found_left = true;
61 2578825 : if (!found_left && is_join(rel->op) && rel_propagate_column_ref_statistics(sql, rel->r, e))
62 4028852 : found_right = true;
63 :
64 4028852 : if (!found_left && !found_right)
65 : return NULL;
66 1723140 : if (!list_empty(rel->exps) && rel->op != op_anti) { /* if there's an or, the MIN and MAX get difficult to propagate */
67 3658945 : for (node *n = rel->exps->h ; n ; n = n->next) {
68 2020790 : sql_exp *comp = n->data, *le = comp->l, *lne = NULL, *re = comp->r, *rne = NULL, *fe = comp->f, *fne = NULL;
69 :
70 2020790 : if (comp->type == e_cmp) {
71 2020044 : if (is_theta_exp(comp->flag) && ((lne = comparison_find_column(le, e)) || (rne = comparison_find_column(re, e)) || (fe && (fne = comparison_find_column(fe, e))))) {
72 135782 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
73 135782 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
74 :
75 : /* not semantics found or if explicitly filtering not null values from the column */
76 135782 : found_without_semantics |= (!is_semantics(comp) && comp_single_column(comp)) || (comp->flag == cmp_equal && lne && is_anti(comp) && exp_is_null(re));
77 135782 : still_unique |= comp->flag == cmp_equal && is_unique(le) && is_unique(re); /* unique if only equi-joins on unique columns are there */
78 135782 : if (is_full(rel->op) || (is_left(rel->op) && found_left) || (is_right(rel->op) && found_right)) /* on outer joins, min and max cannot be propagated on some cases */
79 21084 : continue;
80 : /* if (end2 >= start1 && start2 <= end1) then the 2 intervals are intersected */
81 114698 : if (fe && lval_min && lval_max) { /* range case, the middle expression must intersect the other two */
82 4467 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
83 4467 : int int1 = rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0,
84 4467 : int2 = fval_min && fval_max && atom_cmp(fval_max, lval_min) >= 0 && atom_cmp(fval_min, lval_max) <= 0,
85 4467 : symmetric = is_symmetric(comp);
86 :
87 4467 : if (is_anti(comp) || (!symmetric && fval_min && rval_max && atom_cmp(fval_min, rval_max) < 0)) /* for asymmetric case the re range must be after the fe range */
88 1700 : continue;
89 2767 : if (lne && int1 && int2) {
90 104 : if (symmetric) {
91 0 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
92 0 : atom *nmin = statistics_atom_min(sql, rval_min, fval_min), *nmax = statistics_atom_max(sql, rval_max, fval_max);
93 : /* max is min from le and (max from re and fe max) */
94 0 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, nmax, p2->value.pval) : nmax);
95 : /* min is max from le and (min from re and fe min) */
96 0 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, nmin, p1->value.pval) : nmin);
97 : } else {
98 104 : prop *p1 = find_prop(e->p, PROP_MIN), *p2 = find_prop(e->p, PROP_MAX);
99 : /* max is min from le and fe max */
100 104 : set_minmax_property(sql, e, PROP_MAX, p2 ? statistics_atom_min(sql, fval_max, p2->value.pval) : fval_max);
101 : /* min is max from le and re min */
102 104 : set_minmax_property(sql, e, PROP_MIN, p1 ? statistics_atom_max(sql, rval_min, p1->value.pval) : rval_min);
103 : }
104 2663 : } else if (rne) {
105 591 : if (symmetric && int1 && int2) { /* min is max from le and (min from re and fe min) */
106 0 : prop *p = find_prop(e->p, PROP_MIN);
107 0 : atom *nmin = p ? statistics_atom_min(sql, p->value.pval, fval_min) : fval_min;
108 0 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, nmin, lval_min));
109 591 : } else if (int1) { /* min is max from le and re min */
110 98 : prop *p = find_prop(e->p, PROP_MIN);
111 98 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
112 : }
113 2072 : } else if (fne) {
114 550 : if (symmetric && int1 && int2) { /* max is min from le and (max from re and fe max) */
115 0 : prop *p = find_prop(e->p, PROP_MAX);
116 0 : atom *nmax = p ? statistics_atom_max(sql, p->value.pval, rval_max) : rval_max;
117 0 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, nmax, lval_max) : nmax);
118 550 : } else if (int2) { /* max is min from le and fe max */
119 268 : prop *p = find_prop(e->p, PROP_MAX);
120 268 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
121 : }
122 : }
123 110231 : } else if (lval_min && lval_max && rval_min && rval_max && atom_cmp(rval_max, lval_min) >= 0 && atom_cmp(rval_min, lval_max) <= 0) {
124 : /* both min and max must be set and the intervals must overlap */
125 42391 : switch (comp->flag) {
126 30231 : case cmp_equal: /* for equality reduce */
127 30231 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_max(sql, lval_max, rval_max) : statistics_atom_min(sql, lval_max, rval_max));
128 30231 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_min(sql, lval_min, rval_min) : statistics_atom_max(sql, lval_min, rval_min));
129 30231 : break;
130 2695 : case cmp_notequal: /* for inequality expand */
131 2695 : set_minmax_property(sql, e, PROP_MAX, is_anti(comp) ? statistics_atom_min(sql, lval_max, rval_max) : statistics_atom_max(sql, lval_max, rval_max));
132 2695 : set_minmax_property(sql, e, PROP_MIN, is_anti(comp) ? statistics_atom_max(sql, lval_min, rval_min) : statistics_atom_min(sql, lval_min, rval_min));
133 2695 : break;
134 5501 : case cmp_gt:
135 : case cmp_gte:
136 10129 : if (!is_anti(comp) && lne) { /* min is max from both min */
137 4628 : prop *p = find_prop(e->p, PROP_MIN);
138 4628 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, rval_min, p->value.pval) : rval_min);
139 873 : } else if (!is_anti(comp)) { /* max is min from both max */
140 873 : prop *p = find_prop(e->p, PROP_MAX);
141 873 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, lval_max, p->value.pval) : lval_max);
142 : }
143 : break;
144 3964 : case cmp_lt:
145 : case cmp_lte:
146 7163 : if (!is_anti(comp) && lne) { /* max is min from both max */
147 3199 : prop *p = find_prop(e->p, PROP_MAX);
148 3199 : set_minmax_property(sql, e, PROP_MAX, p ? statistics_atom_min(sql, rval_max, p->value.pval) : rval_max);
149 765 : } else if (!is_anti(comp)) { /* min is max from both min */
150 765 : prop *p = find_prop(e->p, PROP_MIN);
151 765 : set_minmax_property(sql, e, PROP_MIN, p ? statistics_atom_max(sql, lval_min, p->value.pval) : lval_min);
152 : }
153 : break;
154 : default: /* Maybe later I can do cmp_in and cmp_notin */
155 : break;
156 : }
157 : }
158 : }
159 : }
160 : }
161 : }
162 1723140 : if (is_full(rel->op) || (is_left(rel->op) && found_right) || (is_right(rel->op) && found_left))
163 31653 : set_has_nil(e);
164 1723140 : if (!is_outerjoin(rel->op) && found_without_semantics) /* at an outer join, null values pass */
165 103699 : set_has_no_nil(e);
166 1723140 : if (is_join(rel->op) && is_unique(e) && !still_unique)
167 110709 : set_not_unique(e);
168 1723140 : return e;
169 : }
170 4572014 : case op_table:
171 : case op_basetable:
172 : case op_union:
173 : case op_except:
174 : case op_inter:
175 : case op_project:
176 : case op_groupby: {
177 4572014 : sql_exp *found;
178 4572014 : atom *fval;
179 4572014 : prop *est;
180 4572014 : if ((found = rel_find_exp(rel, e))) {
181 2091598 : if (rel->op != op_table) { /* At the moment don't propagate statistics for table relations */
182 2053114 : if ((fval = find_prop_and_get(found->p, PROP_MAX)))
183 1082920 : set_minmax_property(sql, e, PROP_MAX, fval);
184 2053112 : if ((fval = find_prop_and_get(found->p, PROP_MIN)))
185 1089572 : set_minmax_property(sql, e, PROP_MIN, fval);
186 2053110 : if (!has_nil(found))
187 538218 : set_has_no_nil(e);
188 2053110 : if (is_unique(found) || (need_distinct(rel) && list_length(rel->exps) == 1) ||
189 1675772 : (is_groupby(rel->op) && (list_empty(rel->r) || (list_length(rel->r) == 1 && exps_find_exp(rel->r, e)))))
190 388504 : set_unique(e);
191 : /* propagate unique estimation for known cases */
192 2053110 : if (is_groupby(rel->op) && list_empty(rel->r) && !find_prop(e->p, PROP_NUNIQUES)) { /* global aggregate case */
193 7027 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
194 7027 : p->value.dval = 1;
195 2046083 : } else if (((is_basetable(rel->op) || is_except(rel->op) || is_inter(rel->op) || is_simple_project(rel->op) ||
196 66218 : (is_groupby(rel->op) && exps_find_exp(rel->r, e))) &&
197 1990482 : (est = find_prop(found->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES))) {
198 1222575 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
199 1222576 : p->value.dval = est->value.dval;
200 : }
201 : }
202 2091593 : return e;
203 : }
204 : return NULL;
205 : }
206 83717 : case op_topn:
207 : case op_sample:
208 83717 : return rel_propagate_column_ref_statistics(sql, rel->l, e);
209 : default:
210 : break;
211 : }
212 : }
213 : return NULL;
214 : }
215 :
216 : static atom *
217 1024552 : atom_from_valptr( sql_allocator *sa, sql_subtype *tpe, ValRecord *v)
218 : {
219 1024552 : atom *a = SA_NEW(sa, atom);
220 :
221 1024547 : assert(!VALisnil(v));
222 1024541 : *a = (atom) {.tpe = *tpe,};
223 1024541 : SA_VALcopy(sa, &a->data, v);
224 1024540 : return a;
225 : }
226 :
227 : static inline void
228 883591 : rel_basetable_column_get_statistics(mvc *sql, sql_rel *rel, sql_exp *e)
229 : {
230 883591 : sql_column *c = NULL;
231 :
232 883591 : if ((c = name_find_column(rel, exp_relname(e), exp_name(e), -2, NULL))) {
233 776819 : bool nonil = false, unique = false;
234 776819 : double unique_est = 0.0;
235 776819 : ValRecord min, max;
236 776819 : int ok = mvc_col_stats(sql, c, &nonil, &unique, &unique_est, &min, &max);
237 :
238 776821 : if (has_nil(e) && nonil)
239 92477 : set_has_no_nil(e);
240 776821 : if (!is_unique(e) && unique)
241 159833 : set_unique(e);
242 776821 : if (unique_est != 0.0) {
243 597766 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
244 597768 : p->value.dval = unique_est;
245 : }
246 776823 : if ((ok & 2) == 2) {
247 511447 : if (!VALisnil(&max)) {
248 511443 : prop *p = e->p = prop_create(sql->sa, PROP_MAX, e->p);
249 511441 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &max);
250 : }
251 511436 : VALclear(&max);
252 : }
253 776814 : if ((ok & 1) == 1) {
254 513139 : if (!VALisnil(&min)) {
255 513138 : prop *p = e->p = prop_create(sql->sa, PROP_MIN, e->p);
256 513138 : p->value.pval = atom_from_valptr(sql->sa, &c->type, &min);
257 : }
258 513140 : VALclear(&min);
259 : }
260 : }
261 883593 : }
262 :
263 : static bool
264 112552 : rel_setop_get_statistics(mvc *sql, sql_rel *rel, list *lexps, list *rexps, sql_exp *e, int i)
265 : {
266 112552 : sql_exp *le = list_fetch(lexps, i), *re = list_fetch(rexps, i);
267 112552 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
268 112552 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
269 112552 : prop *est;
270 :
271 : /* for the intersection, if both expresssions don't overlap, it can be pruned */
272 112552 : if (is_inter(rel->op) && !has_nil(le) && !has_nil(re) &&
273 185 : ((rval_max && lval_min && atom_cmp(rval_max, lval_min) < 0) || (rval_min && lval_max && atom_cmp(rval_min, lval_max) > 0)))
274 23 : return true;
275 :
276 112529 : if (lval_max && rval_max) {
277 30605 : if (is_union(rel->op))
278 28071 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_max(sql, lval_max, rval_max)); /* for union the new max will be the max of the two */
279 2534 : else if (is_inter(rel->op))
280 396 : set_minmax_property(sql, e, PROP_MAX, statistics_atom_min(sql, lval_max, rval_max)); /* for intersect the new max will be the min of the two */
281 : else /* except */
282 2138 : set_minmax_property(sql, e, PROP_MAX, lval_max);
283 : }
284 112529 : if (lval_min && rval_min) {
285 30133 : if (is_union(rel->op))
286 27599 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_min(sql, lval_min, rval_min)); /* for union the new min will be the min of the two */
287 2534 : else if (is_inter(rel->op))
288 396 : set_minmax_property(sql, e, PROP_MIN, statistics_atom_max(sql, lval_min, rval_min)); /* for intersect the new min will be the max of the two */
289 : else /* except */
290 2138 : set_minmax_property(sql, e, PROP_MIN, lval_min);
291 : }
292 :
293 112529 : if (is_union(rel->op)) {
294 109687 : if (!has_nil(le) && !has_nil(re))
295 23746 : set_has_no_nil(e);
296 109687 : if (need_distinct(rel) && list_length(rel->exps) == 1)
297 1506 : set_unique(e);
298 2842 : } else if (is_inter(rel->op)) {
299 562 : if (!has_nil(le) || !has_nil(re))
300 275 : set_has_no_nil(e);
301 562 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
302 342 : set_unique(e);
303 : } else {
304 2280 : assert(is_except(rel->op));
305 2280 : if (!has_nil(le))
306 315 : set_has_no_nil(e);
307 2280 : if (is_unique(le) || (need_distinct(rel) && list_length(rel->exps) == 1))
308 2001 : set_unique(e);
309 : }
310 : /* propagate unique estimation for known cases */
311 112529 : if (!is_union(rel->op) && (est = find_prop(le->p, PROP_NUNIQUES)) && !find_prop(e->p, PROP_NUNIQUES)) {
312 1207 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
313 1207 : p->value.dval = est->value.dval;
314 : }
315 : return false;
316 : }
317 :
318 : static sql_exp *
319 3442611 : rel_propagate_statistics(visitor *v, sql_rel *rel, sql_exp *e, int depth)
320 : {
321 3442611 : mvc *sql = v->sql;
322 3442611 : atom *lval;
323 :
324 3442611 : (void) depth;
325 3442611 : switch(e->type) {
326 2096387 : case e_column:
327 2096387 : switch (rel->op) { /* set relations don't call rel_propagate_statistics */
328 260625 : case op_join:
329 : case op_left:
330 : case op_right:
331 : case op_full:
332 : case op_semi:
333 : case op_anti: {
334 260625 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e);
335 260625 : if (!found)
336 130666 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
337 : break;
338 : }
339 1835762 : case op_select:
340 : case op_project:
341 : case op_groupby: {
342 1835762 : sql_exp *found = rel_propagate_column_ref_statistics(sql, rel->l, e); /* labels may be found on the same projection, ugh */
343 1835762 : if (!found && is_simple_project(rel->op))
344 151612 : (void) rel_propagate_column_ref_statistics(sql, rel, e);
345 : break;
346 : }
347 0 : case op_insert:
348 : case op_update:
349 : case op_delete:
350 0 : (void) rel_propagate_column_ref_statistics(sql, rel->r, e);
351 0 : break;
352 : default:
353 : break;
354 : }
355 : break;
356 199124 : case e_convert: {
357 199124 : sql_subtype *to = exp_totype(e), *from = exp_fromtype(e);
358 199124 : sql_exp *l = e->l;
359 199124 : sql_class fr = from->type->eclass, too = to->type->eclass;
360 199124 : prop *est;
361 :
362 199124 : if (fr == too) {
363 154021 : if ((lval = find_prop_and_get(l->p, PROP_MAX))) {
364 97517 : atom *res = atom_copy(sql->sa, lval);
365 97517 : if ((res = atom_cast(sql->sa, res, to)))
366 96805 : set_minmax_property(sql, e, PROP_MAX, res);
367 : }
368 154021 : if ((lval = find_prop_and_get(l->p, PROP_MIN))) {
369 99142 : atom *res = atom_copy(sql->sa, lval);
370 99142 : if ((res = atom_cast(sql->sa, res, to)))
371 98430 : set_minmax_property(sql, e, PROP_MIN, res);
372 : }
373 : }
374 : /* propagating nuniques through conversions is tricky. I am adding just the general cases */
375 199124 : if ((est = find_prop(l->p, PROP_NUNIQUES)) &&
376 134135 : (((fr == EC_TIME || fr == EC_TIME_TZ || fr == EC_DATE) && (too == EC_TIMESTAMP || too == EC_TIMESTAMP_TZ)) ||
377 134110 : ((fr == EC_NUM || fr == EC_BIT) && too == EC_NUM) || (fr == EC_DEC && too == EC_DEC) || (fr != EC_EXTERNAL && too == EC_STRING))) {
378 89631 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
379 89631 : p->value.dval = est->value.dval;
380 : }
381 199124 : if (!has_nil(l))
382 67474 : set_has_no_nil(e);
383 : break;
384 : }
385 338845 : case e_aggr:
386 : case e_func: {
387 338845 : BUN lv;
388 338845 : sql_subfunc *f = e->f;
389 :
390 338845 : if (!f->func->s) {
391 312538 : int key = hash_key(f->func->base.name); /* Using hash lookup */
392 312538 : sql_hash_e *he = sql_functions_lookup->buckets[key&(sql_functions_lookup->size-1)];
393 312538 : lookup_function look = NULL;
394 :
395 680550 : for (; he && !look; he = he->chain) {
396 368012 : struct function_properties* fp = (struct function_properties*) he->value;
397 :
398 368012 : if (!strcmp(f->func->base.name, fp->name))
399 105530 : look = fp->func;
400 : }
401 312538 : if (look)
402 105530 : look(sql, e);
403 : }
404 : /* for global aggregates with no semantics, if the left relation has values, then the output won't be NULL */
405 338845 : if (!is_semantics(e) && e->l && !have_nil(e->l) &&
406 41367 : (e->type != e_aggr || (is_groupby(rel->op) && list_length(rel->r)) || ((lv = get_rel_count(rel->l)) != BUN_NONE && lv > 0)))
407 41303 : set_has_no_nil(e);
408 : /* set properties for global aggregates */
409 338845 : if (e->type == e_aggr && is_groupby(rel->op) && list_empty(rel->r)) {
410 7401 : if (!find_prop(e->p, PROP_NUNIQUES)) {
411 7401 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
412 7401 : p->value.dval = 1;
413 : }
414 7401 : set_unique(e);
415 : }
416 : break;
417 : }
418 535416 : case e_atom:
419 535416 : if (e->l) {
420 521489 : atom *a = (atom*) e->l;
421 521489 : if (!a->isnull) {
422 464549 : set_minmax_property(sql, e, PROP_MAX, a);
423 464549 : set_minmax_property(sql, e, PROP_MIN, a);
424 : }
425 521489 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
426 521489 : p->value.dval = 1;
427 13927 : } else if (e->f) {
428 2669 : list *vals = (list *) e->f;
429 2669 : sql_exp *first = vals->h ? vals->h->data : NULL;
430 2669 : atom *max = NULL, *min = NULL; /* all child values must have a valid min/max */
431 2669 : int has_nil = 0;
432 :
433 2669 : if (first) {
434 2669 : max = ((lval = find_prop_and_get(first->p, PROP_MAX))) ? lval : NULL;
435 2669 : min = ((lval = find_prop_and_get(first->p, PROP_MIN))) ? lval : NULL;
436 2669 : has_nil |= has_nil(first);
437 : }
438 :
439 12262 : for (node *n = vals->h ? vals->h->next : NULL ; n ; n = n->next) {
440 6924 : sql_exp *ee = n->data;
441 :
442 6924 : if (min && max) {
443 6474 : if ((lval = find_prop_and_get(ee->p, PROP_MAX))) {
444 6424 : max = atom_cmp(lval, max) > 0 ? lval : max;
445 : } else {
446 : max = NULL;
447 : }
448 6474 : if ((lval = find_prop_and_get(ee->p, PROP_MIN))) {
449 6424 : min = atom_cmp(min, lval) > 0 ? lval : min;
450 : } else {
451 : min = NULL;
452 : }
453 : }
454 6924 : has_nil |= has_nil(ee);
455 : }
456 :
457 2669 : if (!has_nil)
458 2305 : set_has_no_nil(e);
459 2669 : if (min && max) {
460 2282 : set_minmax_property(sql, e, PROP_MAX, max);
461 2282 : set_minmax_property(sql, e, PROP_MIN, min);
462 : }
463 2669 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
464 2669 : p->value.dval = (dbl) list_length(vals);
465 : } else {
466 11258 : prop *p = e->p = prop_create(sql->sa, PROP_NUNIQUES, e->p);
467 11258 : p->value.dval = 1;
468 : }
469 : break;
470 272839 : case e_cmp:
471 : /* TODO? propagating min/max/unique of booleans is not very worth it */
472 272839 : if (e->flag == cmp_or || e->flag == cmp_filter) {
473 17661 : if (!have_nil(e->l) && !have_nil(e->r))
474 2995 : set_has_no_nil(e);
475 255178 : } else if (e->flag == cmp_in || e->flag == cmp_notin) {
476 19429 : sql_exp *le = e->l;
477 19429 : if (!has_nil(le) && !have_nil(e->r))
478 836 : set_has_no_nil(e);
479 : } else {
480 235749 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
481 235749 : if (!has_nil(le) && !has_nil(re) && (!e->f || !has_nil(fe)))
482 24703 : set_has_no_nil(e);
483 : }
484 : break;
485 : case e_psm:
486 : break;
487 : }
488 :
489 : #ifndef NDEBUG
490 : {
491 : /* min and max cannot be NULL and min must be <= than max, if it happens the inner relation must be empty! */
492 3442611 : atom *min = find_prop_and_get(e->p, PROP_MIN), *max = find_prop_and_get(e->p, PROP_MAX);
493 :
494 3442609 : (void) min;
495 3442609 : (void) max;
496 3442609 : assert(!min || !min->isnull);
497 3442609 : assert(!max || !max->isnull);
498 3442609 : assert(!min || !max || (min && max)/* atom_cmp(min, max) <= 0*/);
499 : }
500 : #endif
501 3442609 : return e;
502 : }
503 :
504 : static list * /* Remove predicates always false from min/max values */
505 131364 : rel_prune_predicates(visitor *v, sql_rel *rel)
506 : {
507 131364 : if (rel->l) {
508 131364 : sql_rel *l = rel->l;
509 131364 : if (is_single(l))
510 3142 : return rel->exps;
511 : }
512 128222 : if (!list_empty(rel->attr))
513 2927 : return rel->exps;
514 267320 : for (node *n = rel->exps->h ; n ; n = n->next) {
515 142025 : sql_exp *e = n->data;
516 :
517 142025 : if (e->type == e_cmp && is_theta_exp(e->flag)) {
518 117873 : sql_exp *le = e->l, *re = e->r, *fe = e->f;
519 117873 : atom *lval_min = find_prop_and_get(le->p, PROP_MIN), *lval_max = find_prop_and_get(le->p, PROP_MAX),
520 117873 : *rval_min = find_prop_and_get(re->p, PROP_MIN), *rval_max = find_prop_and_get(re->p, PROP_MAX);
521 117873 : bool always_false = false, always_true = false;
522 :
523 117873 : if (fe && !is_symmetric(e)) {
524 3018 : atom *fval_min = find_prop_and_get(fe->p, PROP_MIN), *fval_max = find_prop_and_get(fe->p, PROP_MAX);
525 3018 : comp_type lower = range2lcompare(e->flag), higher = range2rcompare(e->flag);
526 3584 : int not_int1 = rval_min && lval_max && /* the middle and left intervals don't overlap */
527 1591 : (!is_anti(e) && (lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0)),
528 3997 : not_int2 = lval_min && fval_max && /* the middle and right intervals don't overlap */
529 2416 : (!is_anti(e) && (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0)),
530 3018 : not_int3 = rval_min && fval_max && /* the left interval is after the right one */
531 1254 : (!is_anti(e) && (atom_cmp(rval_min, fval_max) > 0));
532 :
533 3018 : always_false |= not_int1 || not_int2 || not_int3;
534 : /* for anti the middle must be before the left or after the right or the right after the left, for the other the middle must be always between the left and right intervals */
535 3528 : always_true |= !has_nil(le) && !has_nil(re) && !has_nil(fe) &&
536 3372 : lval_min && lval_max && rval_min && rval_max && fval_min && fval_max &&
537 75 : (is_anti(e) ? ((lower == cmp_gte ? atom_cmp(rval_min, lval_max) > 0 : atom_cmp(rval_min, lval_max) >= 0) || (higher == cmp_lte ? atom_cmp(lval_min, fval_max) > 0 : atom_cmp(lval_min, fval_max) >= 0) || atom_cmp(rval_min, fval_max) > 0) :
538 61 : ((lower == cmp_gte ? atom_cmp(lval_min, rval_max) >= 0 : atom_cmp(lval_min, rval_max) > 0) && (higher == cmp_lte ? atom_cmp(fval_min, lval_max) >= 0 : atom_cmp(fval_min, lval_max) > 0)));
539 : } else if (!fe) {
540 114837 : if (!is_semantics(e)) /* trival not null cmp null case */
541 215368 : always_false |= !is_anti(e) && ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re)));
542 114837 : switch (e->flag) {
543 100065 : case cmp_equal:
544 100065 : if (lval_min && lval_max && rval_min && rval_max && (!is_semantics(e) || !has_nil(le) || !has_nil(re)))
545 102802 : always_false |= (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
546 100065 : if (is_semantics(e)) { /* prune *= NULL cases */
547 7130 : always_false |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
548 14260 : always_true |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
549 : }
550 : break;
551 6997 : case cmp_notequal:
552 6997 : if (lval_min && lval_max && rval_min && rval_max)
553 11050 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? (atom_cmp(lval_min, rval_min) == 0 && atom_cmp(lval_max, rval_max) <= 0) : (atom_cmp(rval_max, lval_min) < 0 || atom_cmp(rval_min, lval_max) > 0));
554 6997 : if (is_semantics(e)) {
555 23 : always_true |= (is_anti(e) ? (exp_is_null(le) && exp_is_null(re)) : ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))));
556 46 : always_false |= (is_anti(e) ? ((exp_is_not_null(le) && exp_is_null(re)) || (exp_is_null(le) && exp_is_not_null(re))) : (exp_is_null(le) && exp_is_null(re)));
557 : }
558 : break;
559 5158 : case cmp_gt:
560 5158 : if (lval_max && rval_min)
561 1909 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
562 5158 : if (lval_min && rval_max)
563 3818 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
564 : break;
565 111 : case cmp_gte:
566 111 : if (lval_max && rval_min)
567 49 : always_false |= (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
568 111 : if (lval_min && rval_max)
569 98 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
570 : break;
571 2423 : case cmp_lt:
572 2423 : if (lval_min && rval_max)
573 1372 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) < 0 : atom_cmp(lval_min, rval_max) >= 0);
574 2423 : if (lval_max && rval_min)
575 2792 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) >= 0 : atom_cmp(lval_max, rval_min) < 0);
576 : break;
577 83 : case cmp_lte:
578 83 : if (lval_min && rval_max)
579 11 : always_false |= (is_anti(e) ? atom_cmp(lval_min, rval_max) <= 0 : atom_cmp(lval_min, rval_max) > 0);
580 83 : if (lval_max && rval_min)
581 22 : always_true |= !has_nil(le) && !has_nil(re) && (is_anti(e) ? atom_cmp(lval_max, rval_min) > 0 : atom_cmp(lval_max, rval_min) <= 0);
582 : break;
583 : default: /* Maybe later I can do cmp_in and cmp_notin but don't forget to remove is_theta_exp check up there */
584 : break;
585 : }
586 : }
587 117873 : assert(!always_false || !always_true);
588 117873 : if (always_false || always_true) {
589 1864 : sql_exp *ne = exp_atom_bool(v->sql->sa, always_true /* if always_true set then true else false */);
590 1864 : if (exp_name(e))
591 0 : exp_prop_alias(v->sql->sa, ne, e);
592 1864 : n->data = ne;
593 1864 : v->changes++;
594 : }
595 : }
596 : }
597 125295 : return rel->exps;
598 : }
599 :
600 : static sql_rel *
601 4409 : set_setop_side(visitor *v, sql_rel *rel, sql_rel *side)
602 : {
603 4409 : if (side == rel->r)
604 84 : rel->r = NULL;
605 : else
606 4325 : rel->l = NULL;
607 4409 : if (!is_simple_project(side->op) || rel_is_ref(side) || !list_empty(side->r)) {
608 658 : side = rel_project(v->sql->sa, side, rel_projections(v->sql, side, NULL, 0, 1));
609 658 : set_count_prop(v->sql->sa, side, get_rel_count(side->l));
610 658 : side->exps = exps_exp_visitor_bottomup(v, side, side->exps, 0, &rel_propagate_statistics, false);
611 : }
612 24492 : for (node *n = rel->exps->h, *m = side->exps->h ; n && m ; n = n->next, m = m->next) {
613 20083 : sql_exp *e1 = n->data, *e2 = m->data;
614 20083 : exp_prop_alias(v->sql->sa, e2, e1);
615 : }
616 4409 : list_hash_clear(side->exps);
617 :
618 4409 : if (need_distinct(rel))
619 4 : set_distinct(side);
620 4409 : rel_destroy(rel);
621 4409 : return side;
622 : }
623 :
624 : static BUN
625 13244 : trivial_project_exp_card(sql_exp *e)
626 : {
627 13261 : if (e->type == e_convert)
628 17 : return trivial_project_exp_card(e->l);
629 13244 : return e->type == e_atom && e->f ? (BUN) list_length(e->f) : 1;
630 : }
631 :
632 : static BUN
633 23907 : rel_calc_nuniques(mvc *sql, sql_rel *l, list *exps)
634 : {
635 23907 : BUN lv = get_rel_count(l);
636 :
637 23907 : if (lv == 0)
638 : return 0;
639 21321 : if (!list_empty(exps)) {
640 21321 : BUN nuniques = 0;
641 : /* compute the highest number of unique values */
642 50430 : for (node *n = exps->h ; n && nuniques != BUN_NONE ; n = n->next) {
643 29109 : sql_exp *e = n->data;
644 29109 : sql_rel *bt = NULL;
645 29109 : prop *p = NULL;
646 29109 : BUN euniques = BUN_NONE;
647 29109 : atom *min, *max, *sub = NULL;
648 29109 : sql_subtype *tp = exp_subtype(e);
649 29109 : sql_class ec = tp ? tp->type->eclass : EC_STRING; /* if 'e' has no type (eg parameter), use a non-number type to fail condition */
650 :
651 29109 : if ((p = find_prop(e->p, PROP_NUNIQUES))) {
652 20261 : euniques = (BUN) p->value.dval;
653 8848 : } else if (e->type == e_column && rel_find_exp_and_corresponding_rel(l, e, false, &bt, NULL) && bt && (p = find_prop(bt->p, PROP_COUNT))) {
654 5040 : euniques = (BUN) p->value.lval;
655 : }
656 : /* use min to max range to compute number of possible values in the domain for number types */
657 29109 : if ((EC_TEMP(ec)||ec==EC_NUM||ec==EC_MONTH||ec==EC_POS) &&
658 22418 : (min = find_prop_and_get(e->p, PROP_MIN)) && (max = find_prop_and_get(e->p, PROP_MAX))) {
659 : /* the range includes min and max, so the atom_inc call is needed */
660 : /* if 'euniques' has number of distinct values, compute min between both */
661 17912 : if ((sub = atom_sub(sql->sa, max, min)) && (sub = atom_inc(sql->sa, sub)) && (sub = atom_cast(sql->sa, sub, sql_bind_localtype("oid"))))
662 0 : euniques = MIN(euniques, (BUN) sub->data.val.oval);
663 : }
664 29109 : if (euniques != BUN_NONE)
665 25301 : nuniques = MAX(nuniques, euniques); /* the highest cardinality sets the estimation */
666 : else
667 : nuniques = BUN_NONE;
668 : }
669 21321 : if (nuniques != BUN_NONE)
670 : return nuniques;
671 : }
672 : return lv;
673 : }
674 :
675 : static sql_rel *
676 1907646 : rel_get_statistics_(visitor *v, sql_rel *rel)
677 : {
678 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
679 1907646 : uint8_t has_special_modify = *(uint8_t*) v->data;
680 1907646 : bool can_be_pruned = !has_special_modify && v->storage_based_opt;
681 :
682 : /* Don't look at the same relation twice */
683 1907646 : if (are_statistics_gathered(rel->used))
684 : return rel;
685 1288280 : rel->used |= statistics_gathered;
686 :
687 1288280 : switch (rel->op) {
688 301221 : case op_basetable: {
689 301221 : sql_table *t = (sql_table *) rel->l;
690 301221 : sqlstore *store = v->sql->session->tr->store;
691 :
692 301221 : if (!list_empty(rel->exps)) {
693 1184764 : for (node *n = rel->exps->h ; n ; n = n->next)
694 883590 : rel_basetable_column_get_statistics(v->sql, rel, n->data);
695 : }
696 : /* Set table row count. TODO? look for remote tables. Don't look at storage for declared tables, because it won't be cleaned */
697 301223 : if (isTable(t) && t->s && !isDeclaredTable(t)) /* count active rows only */
698 251039 : set_count_prop(v->sql->sa, rel, (BUN)store->storage_api.count_col(v->sql->session->tr, ol_first_node(t->columns)->data, 10));
699 : break;
700 : }
701 24084 : case op_union:
702 : case op_inter:
703 : case op_except: {
704 24084 : bool empty_cross = false;
705 24084 : int i = 0;
706 24084 : sql_rel *l = rel->l, *pl = rel->l, *r = rel->r, *pr = rel->r;
707 :
708 24157 : while (is_sample(pl->op) || is_topn(pl->op)) /* skip topN and sample relations in the middle */
709 73 : pl = pl->l;
710 24154 : while (is_sample(pr->op) || is_topn(pr->op))
711 70 : pr = pr->l;
712 : /* if it's not a projection, then project and propagate statistics */
713 24084 : if (!is_project(pl->op) && !is_base(pl->op)) {
714 2 : pl = rel_project(v->sql->sa, pl, rel_projections(v->sql, pl, NULL, 0, 1));
715 2 : set_count_prop(v->sql->sa, pl, get_rel_count(pl->l));
716 2 : pl->exps = exps_exp_visitor_bottomup(v, pl, pl->exps, 0, &rel_propagate_statistics, false);
717 : }
718 24084 : if (!is_project(pr->op) && !is_base(pr->op)) {
719 46 : pr = rel_project(v->sql->sa, pr, rel_projections(v->sql, pr, NULL, 0, 1));
720 46 : set_count_prop(v->sql->sa, pr, get_rel_count(pr->l));
721 46 : pr->exps = exps_exp_visitor_bottomup(v, pr, pr->exps, 0, &rel_propagate_statistics, false);
722 : }
723 :
724 136636 : for (node *n = rel->exps->h ; n ; n = n->next) {
725 112552 : empty_cross |= rel_setop_get_statistics(v->sql, rel, pl->exps, pr->exps, n->data, i);
726 112552 : i++;
727 : }
728 :
729 : /* propagate row count */
730 24084 : if (is_union(rel->op)) {
731 21582 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
732 21582 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
733 :
734 21582 : if (lv == 0 && rv == 0) { /* both sides empty */
735 1560 : if (can_be_pruned)
736 : empty_cross = true;
737 : else
738 482 : set_count_prop(v->sql->sa, rel, 0);
739 20022 : } else if (can_be_pruned && lv == 0 && !rel_is_ref(rel)) { /* left side empty */
740 84 : rel = set_setop_side(v, rel, r);
741 84 : empty_cross = false; /* don't rewrite again */
742 19938 : } else if (can_be_pruned && rv == 0 && !rel_is_ref(rel)) { /* right side empty */
743 4312 : rel = set_setop_side(v, rel, l);
744 4312 : empty_cross = false; /* don't rewrite again */
745 15626 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
746 10620 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
747 : }
748 2502 : } else if (is_inter(rel->op) || is_except(rel->op)) {
749 2502 : BUN lv = need_distinct(rel) ? rel_calc_nuniques(v->sql, l, l->exps) : get_rel_count(l),
750 2502 : rv = need_distinct(rel) ? rel_calc_nuniques(v->sql, r, r->exps) : get_rel_count(r);
751 :
752 2502 : if (lv == 0) { /* left side empty */
753 56 : if (can_be_pruned)
754 : empty_cross = true;
755 : else
756 6 : set_count_prop(v->sql->sa, rel, 0);
757 2446 : } else if (rv == 0) { /* right side empty */
758 16 : if (is_inter(rel->op)) {
759 3 : if (can_be_pruned)
760 : empty_cross = true;
761 : else
762 0 : set_count_prop(v->sql->sa, rel, 0);
763 13 : } else if (can_be_pruned && !rel_is_ref(rel)) {
764 13 : rel = set_setop_side(v, rel, l);
765 13 : empty_cross = false; /* don't rewrite again */
766 : } else {
767 0 : set_count_prop(v->sql->sa, rel, lv);
768 : }
769 : } else {
770 2430 : set_count_prop(v->sql->sa, rel, lv);
771 : }
772 : }
773 24084 : if (can_be_pruned && empty_cross) {
774 1148 : rel_destroy(rel->l);
775 1148 : rel->l = NULL;
776 1148 : rel_destroy(rel->r);
777 1148 : rel->r = NULL;
778 3977 : for (node *n = rel->exps->h ; n ; n = n->next) {
779 2829 : sql_exp *e = n->data, *a = exp_atom(v->sql->sa, atom_general(v->sql->sa, exp_subtype(e), NULL));
780 2829 : exp_prop_alias(v->sql->sa, a, e);
781 2829 : n->data = a;
782 : }
783 1148 : list_hash_clear(rel->exps);
784 1148 : sql_rel *l = rel_project(v->sql->sa, NULL, rel->exps);
785 1148 : set_count_prop(v->sql->sa, l, 1);
786 1148 : l = rel_select(v->sql->sa, l, exp_atom_bool(v->sql->sa, 0));
787 1148 : set_count_prop(v->sql->sa, l, 0);
788 1148 : rel->op = op_project;
789 1148 : rel->l = l;
790 1148 : rel->exps = rel_projections(v->sql, l, NULL, 1, 1);
791 1148 : set_count_prop(v->sql->sa, rel, 0);
792 1148 : set_nodistinct(rel); /* set relations may have distinct flag set */
793 1148 : v->changes++;
794 : }
795 : break;
796 : }
797 499457 : case op_join:
798 : case op_left:
799 : case op_right:
800 : case op_full:
801 : case op_semi:
802 : case op_anti:
803 : case op_select:
804 : case op_project:
805 : case op_groupby: {
806 499457 : if (is_groupby(rel->op) && !list_empty(rel->r))
807 15519 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
808 499457 : rel->exps = exps_exp_visitor_bottomup(v, rel, rel->exps, 0, &rel_propagate_statistics, false);
809 499458 : if ((is_simple_project(rel->op) || is_groupby(rel->op)) && !list_empty(rel->r))
810 38010 : rel->r = exps_exp_visitor_bottomup(v, rel, rel->r, 0, &rel_propagate_statistics, false);
811 : /* The following optimizations can only be applied after propagating the statistics to rel->exps */
812 499458 : if (can_be_pruned && (is_join(rel->op) || is_select(rel->op)) && !list_empty(rel->exps)) {
813 131364 : int changes = v->changes;
814 131364 : rel->exps = rel_prune_predicates(v, rel);
815 131364 : if (v->changes > changes)
816 1817 : rel = rewrite_simplify(v, 0, v->value_based_opt, rel);
817 : }
818 :
819 : /* propagate row count */
820 499458 : sql_rel *l = rel->l, *r = rel->r;
821 499458 : switch (rel->op) {
822 128253 : case op_join:
823 : case op_left:
824 : case op_right:
825 : case op_full: {
826 128253 : BUN lv = get_rel_count(l), rv = get_rel_count(r), uniques_estimate = BUN_MAX, join_idx_estimate = BUN_MAX;
827 :
828 128253 : if (!list_empty(rel->exps) && !is_single(rel)) {
829 234525 : for (node *n = rel->exps->h ; n ; n = n->next) {
830 119878 : sql_exp *e = n->data, *el = e->l, *er = e->r;
831 :
832 119878 : if (find_prop(e->p, PROP_JOINIDX)) {
833 656 : join_idx_estimate = lv>rv?lv:rv;
834 119222 : } else if (e->type == e_cmp && e->flag == cmp_equal) {
835 : /* if one of the sides is unique, the cardinality will be that exact number, but look for nulls */
836 115366 : if (!is_semantics(e) || !has_nil(el) || !has_nil(er)) {
837 114840 : BUN lu = 0, ru = 0;
838 114840 : prop *p = NULL;
839 114840 : if ((p = find_prop(el->p, PROP_NUNIQUES)))
840 84112 : lu = (BUN) p->value.dval;
841 114840 : if ((p = find_prop(er->p, PROP_NUNIQUES)))
842 98678 : ru = (BUN) p->value.dval;
843 114840 : if (is_unique(el) || lu > lv) {
844 66585 : BUN ncount = (is_right(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : lv;
845 66585 : uniques_estimate = MIN(uniques_estimate, ncount);
846 48255 : } else if (is_unique(er) || ru > rv) {
847 29054 : BUN ncount = (is_left(rel->op) || is_full(rel->op)) ? MAX(lv, rv) : rv;
848 29054 : uniques_estimate = MIN(uniques_estimate, ncount);
849 : }
850 : }
851 : }
852 : }
853 : }
854 128253 : if (is_single(rel)) {
855 4469 : set_count_prop(v->sql->sa, rel, lv);
856 123784 : } else if (join_idx_estimate != BUN_MAX) {
857 655 : set_count_prop(v->sql->sa, rel, join_idx_estimate);
858 123129 : } else if (uniques_estimate != BUN_MAX) {
859 91735 : set_count_prop(v->sql->sa, rel, uniques_estimate);
860 31394 : } else if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
861 : /* corner cases for outer joins */
862 94 : if (is_left(rel->op)) {
863 81 : set_count_prop(v->sql->sa, rel, lv);
864 13 : } else if (is_right(rel->op)) {
865 4 : set_count_prop(v->sql->sa, rel, rv);
866 9 : } else if (is_full(rel->op) && lv != BUN_NONE && rv != BUN_NONE) {
867 9 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX - lv)) ? BUN_MAX : (lv + rv)); /* overflow check */
868 0 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
869 0 : set_count_prop(v->sql->sa, rel, 0);
870 : }
871 31300 : } else if (lv == 0) {
872 1170 : set_count_prop(v->sql->sa, rel, (is_right(rel->op) || is_full(rel->op)) ? rv : 0);
873 30701 : } else if (rv == 0) {
874 1466 : set_count_prop(v->sql->sa, rel, (is_left(rel->op) || is_full(rel->op)) ? lv : 0);
875 29796 : } else if (lv != BUN_NONE && rv != BUN_NONE) {
876 19729 : set_count_prop(v->sql->sa, rel, (rv > (BUN_MAX / lv)) ? BUN_MAX : (lv * rv)); /* overflow check */
877 : }
878 : break;
879 : }
880 2722 : case op_anti:
881 2722 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
882 2722 : break;
883 99414 : case op_semi:
884 : case op_select:
885 : /* TODO calculate cardinalities using selectivities */
886 99414 : if (list_length(rel->exps) == 1 && (exp_is_false(rel->exps->h->data) || exp_is_null(rel->exps->h->data))) {
887 1700 : set_count_prop(v->sql->sa, rel, 0);
888 : } else {
889 192189 : if (!list_empty(rel->exps) && !is_single(rel)) {
890 94475 : BUN cnt = get_rel_count(l), u = 1;
891 171996 : for (node *n = rel->exps->h ; n ; n = n->next) {
892 107460 : sql_exp *e = n->data, *el = e->l, *er = e->r;
893 :
894 : /* simple expressions first */
895 107460 : if (e->type == e_cmp && e->flag == cmp_equal && exp_is_atom(er)) {
896 : /* use selectivity */
897 54324 : prop *p;
898 54324 : if ((p = find_prop(el->p, PROP_NUNIQUES))) {
899 29939 : u = (BUN) p->value.dval;
900 29939 : break;
901 : }
902 : }
903 : }
904 : /* u is an *estimate*, so don't set count_prop to 0 unless cnt is 0 */
905 94475 : set_count_prop(v->sql->sa, rel, cnt == 0 ? 0 : u == 0 || u > cnt ? 1 : cnt/u);
906 : } else {
907 3239 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
908 : }
909 : }
910 : break;
911 246255 : case op_project:
912 246255 : if (l) {
913 236932 : if (need_distinct(rel)) {
914 0 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->exps));
915 : } else {
916 236932 : set_count_prop(v->sql->sa, rel, get_rel_count(l));
917 : }
918 : } else {
919 9323 : BUN card = 1;
920 :
921 9323 : if (!list_empty(rel->exps)) {
922 21355 : for (node *n = rel->exps->h ; n ; n = n->next) {
923 12032 : sql_exp *e = n->data;
924 12032 : card = MAX(card, trivial_project_exp_card(e));
925 : }
926 : }
927 9323 : set_count_prop(v->sql->sa, rel, card);
928 : }
929 : break;
930 22307 : case op_groupby:
931 22307 : if (list_empty(rel->r)) {
932 6788 : set_count_prop(v->sql->sa, rel, 1);
933 : } else {
934 15519 : set_count_prop(v->sql->sa, rel, rel_calc_nuniques(v->sql, l, rel->r));
935 : }
936 : break;
937 : default:
938 : break;
939 : }
940 : break;
941 : }
942 16939 : case op_topn: {
943 16939 : BUN lv = get_rel_count(rel->l);
944 :
945 16939 : if (lv != BUN_NONE) {
946 16930 : sql_exp *le = rel->exps->h->data, *oe = list_length(rel->exps) > 1 ? rel->exps->h->next->data : NULL;
947 96 : if (oe && oe->l && !exp_is_null(oe)) { /* no parameters */
948 96 : BUN offset = (BUN) ((atom*)oe->l)->data.val.lval;
949 96 : lv = offset >= lv ? 0 : lv - offset;
950 : }
951 16930 : if (le->l && !exp_is_null(le)) {
952 16897 : BUN limit = (BUN) ((atom*)le->l)->data.val.lval;
953 16897 : lv = MIN(lv, limit);
954 : }
955 16930 : set_count_prop(v->sql->sa, rel, lv);
956 : }
957 : break;
958 : }
959 22 : case op_sample: {
960 22 : BUN lv = get_rel_count(rel->l);
961 :
962 22 : if (lv != BUN_NONE) {
963 4 : sql_exp *se = rel->exps->h->data;
964 4 : sql_subtype *tp = exp_subtype(se);
965 :
966 4 : if (se->l && tp->type->eclass == EC_NUM) { /* sample is a number of rows */
967 4 : BUN sample = (BUN) ((atom*)se->l)->data.val.lval;
968 4 : lv = MIN(lv, sample);
969 0 : } else if (se->l) { /* sample is a percentage of rows */
970 0 : dbl percent = ((atom*)se->l)->data.val.dval;
971 0 : assert(tp->type->eclass == EC_FLT);
972 0 : lv = (BUN) ceil((dbl)lv * percent);
973 : }
974 4 : set_count_prop(v->sql->sa, rel, lv);
975 : }
976 : break;
977 : }
978 5533 : case op_table: {
979 5533 : sql_exp *op = rel->r;
980 5533 : if (rel->flag != TRIGGER_WRAPPER && op) {
981 5229 : sql_subfunc *f = op->f;
982 5229 : if (f->func->lang == FUNC_LANG_SQL) {
983 216 : set_count_prop(v->sql->sa, rel, 1000 /* just some fallback value */);
984 5013 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "storage") == 0) {
985 483 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of catalog */);
986 4530 : } else if (f->func->lang == FUNC_LANG_MAL && strcmp(f->func->base.name, "db_users") == 0) {
987 0 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of users */);
988 4530 : } else if (f->func->lang == FUNC_LANG_MAL && strncmp(f->func->base.name, "querylog", 8) == 0) {
989 1017 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of querylog */);
990 3513 : } else if (f->func->lang == FUNC_LANG_MAL &&
991 3410 : (strcmp(f->func->base.name, "queue") == 0 ||
992 3152 : strcmp(f->func->base.name, "optimizers") == 0 ||
993 2874 : strcmp(f->func->base.name, "env") == 0 ||
994 2616 : strcmp(f->func->base.name, "keywords") == 0 ||
995 2616 : strcmp(f->func->base.name, "statistics") == 0 ||
996 2007 : strcmp(f->func->base.name, "rejects") == 0 ||
997 1761 : strcmp(f->func->base.name, "schemastorage") == 0 ||
998 1761 : strncmp(f->func->base.name, "storage", 7) == 0 ||
999 1761 : strcmp(f->func->base.name, "sessions") == 0) ) {
1000 1898 : set_count_prop(v->sql->sa, rel, 1000 /* TODO get size of queue */);
1001 : }
1002 : /* else {
1003 : printf("%%func needs stats : %s\n", f->func->base.name);
1004 : } */
1005 : }
1006 : break;
1007 : }
1008 : /*These relations are less important for now
1009 : TODO later we can tune it */
1010 : case op_insert:
1011 : case op_update:
1012 : case op_delete:
1013 : case op_truncate:
1014 : case op_ddl:
1015 : default:
1016 : break;
1017 : }
1018 :
1019 : return rel;
1020 : }
1021 :
1022 : static sql_rel *
1023 522758 : rel_get_statistics(visitor *v, global_props *gp, sql_rel *rel)
1024 : {
1025 : /* Don't prune updates as pruning will possibly result in removing the joins which therefore cannot be used for constraint checking */
1026 522758 : uint8_t has_special_modify = (uint8_t) gp->has_special_modify;
1027 522758 : v->data = &has_special_modify;
1028 522758 : rel = rel_visitor_bottomup(v, rel, &rel_get_statistics_);
1029 522761 : v->data = gp;
1030 522761 : return rel;
1031 : }
1032 :
1033 : run_optimizer
1034 686774 : bind_get_statistics(visitor *v, global_props *gp)
1035 : {
1036 686774 : (void) v;
1037 686774 : return (gp->opt_level == 1 && !gp->cnt[op_insert]) ? rel_get_statistics : NULL;
1038 : }
1039 :
1040 :
1041 : static bool
1042 97657 : point_select_on_unique_column(sql_rel *rel)
1043 : {
1044 97657 : if (is_select(rel->op) && !list_empty(rel->exps)) {
1045 133094 : for (node *n = rel->exps->h; n ; n = n->next) {
1046 76390 : sql_exp *e = n->data, *el = e->l, *er = e->r, *found = NULL;
1047 :
1048 76390 : if (is_compare(e->type) && e->flag == cmp_equal) {
1049 33947 : if (is_numeric_upcast(el))
1050 0 : el = el->l;
1051 33947 : if (is_numeric_upcast(er))
1052 0 : er = er->l;
1053 33947 : if (is_alias(el->type) && exp_is_atom(er) && (found = rel_find_exp(rel->l, el)) &&
1054 27 : is_unique(found) && (!is_semantics(e) || !has_nil(found) || !has_nil(er)))
1055 : return true;
1056 33921 : if (is_alias(er->type) && exp_is_atom(el) && (found = rel_find_exp(rel->l, er)) &&
1057 0 : is_unique(found) && (!is_semantics(e) || !has_nil(el) || !has_nil(found)))
1058 : return true;
1059 : }
1060 : }
1061 : }
1062 : return false;
1063 : }
1064 :
1065 : /*
1066 : * A point select on an unique column reduces the number of rows to 1. If the same select is under a
1067 : * join, the opposite side's select can be pushed above the join.
1068 : */
1069 : static inline sql_rel *
1070 1326921 : rel_push_select_up(visitor *v, sql_rel *rel)
1071 : {
1072 1326921 : if ((is_innerjoin(rel->op) || is_left(rel->op) || is_right(rel->op) || is_semi(rel->op)) && !is_single(rel)) {
1073 261250 : sql_rel *l = rel->l, *r = rel->r;
1074 261250 : bool can_pushup_left = is_select(l->op) && !rel_is_ref(l) && !is_single(l) && (is_innerjoin(rel->op) || is_left(rel->op) || is_semi(rel->op)),
1075 261250 : can_pushup_right = is_select(r->op) && !rel_is_ref(r) && !is_single(r) && (is_innerjoin(rel->op) || is_right(rel->op));
1076 :
1077 261250 : if (can_pushup_left || can_pushup_right) {
1078 69410 : if (can_pushup_left)
1079 48429 : can_pushup_left = point_select_on_unique_column(r);
1080 69410 : if (can_pushup_right)
1081 49228 : can_pushup_right = point_select_on_unique_column(l);
1082 :
1083 : /* if both selects retrieve one row each, it's not worth it to push both up */
1084 69410 : if (can_pushup_left && !can_pushup_right) {
1085 8 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1086 8 : nrel->l = l->l;
1087 8 : rel = rel_inplace_select(rel, nrel, l->exps);
1088 8 : assert(is_select(rel->op));
1089 8 : v->changes++;
1090 69402 : } else if (!can_pushup_left && can_pushup_right) {
1091 10 : sql_rel *nrel = rel_dup_copy(v->sql->sa, rel);
1092 10 : nrel->r = r->l;
1093 10 : rel = rel_inplace_select(rel, nrel, r->exps);
1094 10 : assert(is_select(rel->op));
1095 10 : v->changes++;
1096 : }
1097 : }
1098 : }
1099 1326921 : return rel;
1100 : }
1101 :
1102 : static int
1103 104019 : sql_class_base_score(visitor *v, sql_column *c, sql_subtype *t, bool equality_based)
1104 : {
1105 104019 : int de;
1106 :
1107 104019 : if (!t)
1108 : return 0;
1109 104019 : switch (ATOMstorage(t->type->localtype)) {
1110 : case TYPE_bte:
1111 : return 150 - 8;
1112 3846 : case TYPE_sht:
1113 3846 : return 150 - 16;
1114 39314 : case TYPE_int:
1115 39314 : return 150 - 32;
1116 947 : case TYPE_void:
1117 : case TYPE_lng:
1118 947 : return 150 - 64;
1119 : case TYPE_uuid:
1120 : #ifdef HAVE_HGE
1121 : case TYPE_hge:
1122 : #endif
1123 : return 150 - 128;
1124 5 : case TYPE_flt:
1125 5 : return 75 - 24;
1126 : case TYPE_dbl:
1127 : return 75 - 53;
1128 33047 : default:
1129 33047 : if (equality_based && c && v->storage_based_opt && (de = mvc_is_duplicate_eliminated(v->sql, c)))
1130 1397 : return 150 - de * 8;
1131 : /* strings and blobs not duplicate eliminated don't get any points here */
1132 : return 0;
1133 : }
1134 : }
1135 :
1136 : static int
1137 41904 : score_se_base(visitor *v, sql_rel *rel, sql_exp *e)
1138 : {
1139 41904 : int res = 0;
1140 41904 : sql_subtype *t = exp_subtype(e);
1141 41904 : sql_column *c = NULL;
1142 :
1143 : /* can we find out if the underlying table is sorted */
1144 41904 : if ((c = exp_find_column(rel, e, -2)) && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1145 41904 : res += 600;
1146 :
1147 : /* prefer the shorter var types over the longer ones */
1148 41904 : res += sql_class_base_score(v, c, t, is_equality_or_inequality_exp(e->flag)); /* smaller the type, better */
1149 41904 : return res;
1150 : }
1151 :
1152 : static int
1153 66329 : score_se(visitor *v, sql_rel *rel, sql_exp *e)
1154 : {
1155 66329 : int score = 0;
1156 66329 : if (e->type == e_cmp && !is_complex_exp(e->flag)) {
1157 41904 : sql_exp *l = e->l;
1158 :
1159 41904 : while (l->type == e_cmp) { /* go through nested comparisons */
1160 2 : sql_exp *ll;
1161 :
1162 2 : if (l->flag == cmp_filter || l->flag == cmp_or)
1163 0 : ll = ((list*)l->l)->h->data;
1164 : else
1165 2 : ll = l->l;
1166 2 : if (ll->type != e_cmp)
1167 : break;
1168 : l = ll;
1169 : }
1170 41904 : score += score_se_base(v, rel, l);
1171 : }
1172 66329 : score += exp_keyvalue(e);
1173 66329 : return score;
1174 : }
1175 :
1176 : static inline sql_rel *
1177 1326921 : rel_select_order(visitor *v, sql_rel *rel)
1178 : {
1179 1326921 : int *scores = NULL;
1180 1326921 : sql_exp **exps = NULL;
1181 :
1182 1326921 : if (is_select(rel->op) && list_length(rel->exps) > 1) {
1183 31071 : node *n;
1184 31071 : int i, nexps = list_length(rel->exps);
1185 31071 : scores = SA_NEW_ARRAY(v->sql->ta, int, nexps);
1186 31071 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, nexps);
1187 :
1188 97400 : for (i = 0, n = rel->exps->h; n; i++, n = n->next) {
1189 66378 : exps[i] = n->data;
1190 66378 : if (find_prop(exps[i]->p, PROP_HASHCOL))
1191 : return rel;
1192 66329 : scores[i] = score_se(v, rel, n->data);
1193 : }
1194 31022 : GDKqsort(scores, exps, NULL, nexps, sizeof(int), sizeof(void *), TYPE_int, true, true);
1195 :
1196 97351 : for (i = 0, n = rel->exps->h; n; i++, n = n->next)
1197 66329 : n->data = exps[i];
1198 : }
1199 :
1200 : return rel;
1201 : }
1202 :
1203 : /* Compute the efficiency of using this expression early in a group by list */
1204 : static int
1205 62115 : score_gbe(visitor *v, sql_rel *rel, sql_exp *e)
1206 : {
1207 62115 : int res = 0;
1208 62115 : sql_subtype *t = exp_subtype(e);
1209 62115 : sql_column *c = exp_find_column(rel, e, -2);
1210 :
1211 62115 : if (e->card == CARD_ATOM) /* constants are trivial to group */
1212 67 : res += 1000;
1213 : /* can we find out if the underlying table is sorted */
1214 62115 : if (is_unique(e) || find_prop(e->p, PROP_HASHCOL) || (c && v->storage_based_opt && mvc_is_unique(v->sql, c))) /* distinct columns */
1215 3920 : res += 700;
1216 41519 : if (c && v->storage_based_opt && mvc_is_sorted(v->sql, c))
1217 4000 : res += 500;
1218 62115 : if (find_prop(e->p, PROP_HASHIDX)) /* has hash index */
1219 0 : res += 200;
1220 :
1221 : /* prefer the shorter var types over the longer ones */
1222 62115 : res += sql_class_base_score(v, c, t, true); /* smaller the type, better */
1223 62115 : return res;
1224 : }
1225 :
1226 : /* reorder group by expressions */
1227 : static inline sql_rel *
1228 1326922 : rel_groupby_order(visitor *v, sql_rel *rel)
1229 : {
1230 1326922 : int *scores = NULL;
1231 1326922 : sql_exp **exps = NULL;
1232 :
1233 1326922 : if (is_groupby(rel->op) && list_length(rel->r) > 1) {
1234 26233 : node *n;
1235 26233 : list *gbe = rel->r;
1236 26233 : int i, ngbe = list_length(gbe);
1237 26233 : scores = SA_NEW_ARRAY(v->sql->ta, int, ngbe);
1238 26233 : exps = SA_NEW_ARRAY(v->sql->ta, sql_exp*, ngbe);
1239 :
1240 : /* first sorting step, give priority for integers and sorted columns */
1241 88348 : for (i = 0, n = gbe->h; n; i++, n = n->next) {
1242 62115 : exps[i] = n->data;
1243 62115 : scores[i] = score_gbe(v, rel, exps[i]);
1244 : }
1245 26233 : GDKqsort(scores, exps, NULL, ngbe, sizeof(int), sizeof(void *), TYPE_int, true, true);
1246 :
1247 : /* second sorting step, give priority to strings with lower number of digits */
1248 51847 : for (i = ngbe - 1; i && !scores[i]; i--); /* find expressions with no score from the first round */
1249 26233 : if (scores[i])
1250 25249 : i++;
1251 26233 : if (ngbe - i > 1) {
1252 8599 : for (int j = i; j < ngbe; j++) {
1253 6595 : sql_subtype *t = exp_subtype(exps[j]);
1254 6595 : scores[j] = t ? t->digits : 0;
1255 : }
1256 : /* the less number of digits the better, order ascending */
1257 2004 : GDKqsort(scores + i, exps + i, NULL, ngbe - i, sizeof(int), sizeof(void *), TYPE_int, false, true);
1258 : }
1259 :
1260 88348 : for (i = 0, n = gbe->h; n; i++, n = n->next)
1261 62115 : n->data = exps[i];
1262 : }
1263 :
1264 1326922 : return rel;
1265 : }
1266 :
1267 : /* This optimization loop contains optimizations that can potentially use statistics */
1268 : static sql_rel *
1269 1326921 : rel_final_optimization_loop_(visitor *v, sql_rel *rel)
1270 : {
1271 : /* Run rel_push_select_up only once at the end to avoid an infinite optimization loop */
1272 1326921 : rel = rel_push_select_up(v, rel);
1273 1326921 : rel = rel_select_order(v, rel);
1274 :
1275 : /* TODO? Maybe later add rel_simplify_count, rel_join2semijoin, rel_simplify_fk_joins,
1276 : rel_distinct_project2groupby, rel_simplify_predicates, rel_simplify_math,
1277 : rel_distinct_aggregate_on_unique_values */
1278 :
1279 1326922 : rel = rel_groupby_order(v, rel);
1280 1326922 : return rel;
1281 : }
1282 :
1283 : static sql_rel *
1284 55364 : rel_final_optimization_loop(visitor *v, global_props *gp, sql_rel *rel)
1285 : {
1286 55364 : (void) gp;
1287 55364 : return rel_visitor_bottomup(v, rel, &rel_final_optimization_loop_);
1288 : }
1289 :
1290 : run_optimizer
1291 686778 : bind_final_optimization_loop(visitor *v, global_props *gp)
1292 : {
1293 686778 : int flag = v->sql->sql_optimizer;
1294 : /* At the moment, this optimizer has dependency on 3 flags */
1295 530588 : return gp->opt_level == 1 && (gp->cnt[op_groupby] || gp->cnt[op_select]) &&
1296 742144 : (flag & push_select_up) && (flag & optimize_select_and_joins_topdown) && (flag & optimize_projections) ? rel_final_optimization_loop : NULL;
1297 : }
|