Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * In this file we have a number of functions that search a column
15 : * using binary search. The column must be sorted or reverse sorted
16 : * (for the SORTfnd* functions), or there must be an order index (for
17 : * the ORDERfnd* functions).
18 : *
19 : * All functions return a BUN, i.e. an offset from the start of the
20 : * column. The SORTfnd and ORDERfnd functions return BUN_NONE if the
21 : * value does not occur in the column.
22 : *
23 : * The ORDERfnd* functions return an offset in the order index, so to
24 : * get the actual position, (the OID, not the BUN), read the order
25 : * index at that offset.
26 : *
27 : * The *fndfirst functions return the BUN of the *first* value in the
28 : * column that is greater (less) than or equal to the value being
29 : * searched (when the column is sorted in ascending (descending)
30 : * order).
31 : *
32 : * The *fndlast functions return the BUN of the *first* value in the
33 : * column that is greater (less) than the value being searched (when
34 : * the column is sorted in ascending (descending) order).
35 : *
36 : * If the value to be found occurs, the following relationship holds:
37 : *
38 : * SORTfndfirst(b, v) <= SORTfnd(b, v) < SORTfndlast(b, v)
39 : * ORDERfndfirst(b, oidxh, v) <= ORDERfnd(b, oidxh, v) < ORDERfndlast(b, oidxh, v)
40 : *
41 : * and the range from *fndfirst (included) up to *fndlast (not
42 : * included) are all values in the column that are equal to the value.
43 : *
44 : * If the value to be found does not occur, SORTfnd and ORDERfnd
45 : * return BUN_NONE, and the other functions return the location of the
46 : * next larger value, or BATcount if the value being searched for is
47 : * larger (smaller if reverse sorted) than any in the column.
48 : *
49 : * Note that the NIL value is considered smaller than all other values.
50 : */
51 :
52 : #include "monetdb_config.h"
53 : #include "gdk.h"
54 : #include "gdk_private.h"
55 :
56 : #define VALUE(x) (vars ? \
57 : vars + VarHeapVal(vals, (x), width) : \
58 : (const char *) vals + ((x) * width))
59 :
60 : #define bte_LT(a, b) ((a) < (b))
61 : #define bte_LE(a, b) ((a) <= (b))
62 : #define bte_GT(a, b) ((a) > (b))
63 : #define bte_GE(a, b) ((a) >= (b))
64 : #define bte_EQ(a, b) ((a) == (b))
65 : #define sht_LT(a, b) ((a) < (b))
66 : #define sht_LE(a, b) ((a) <= (b))
67 : #define sht_GT(a, b) ((a) > (b))
68 : #define sht_GE(a, b) ((a) >= (b))
69 : #define sht_EQ(a, b) ((a) == (b))
70 : #define int_LT(a, b) ((a) < (b))
71 : #define int_LE(a, b) ((a) <= (b))
72 : #define int_GT(a, b) ((a) > (b))
73 : #define int_GE(a, b) ((a) >= (b))
74 : #define int_EQ(a, b) ((a) == (b))
75 : #define lng_LT(a, b) ((a) < (b))
76 : #define lng_LE(a, b) ((a) <= (b))
77 : #define lng_GT(a, b) ((a) > (b))
78 : #define lng_GE(a, b) ((a) >= (b))
79 : #define lng_EQ(a, b) ((a) == (b))
80 : #ifdef HAVE_HGE
81 : #define hge_LT(a, b) ((a) < (b))
82 : #define hge_LE(a, b) ((a) <= (b))
83 : #define hge_GT(a, b) ((a) > (b))
84 : #define hge_GE(a, b) ((a) >= (b))
85 : #define hge_EQ(a, b) ((a) == (b))
86 : #endif
87 : #define flt_LT(a, b) (!is_flt_nil(b) && (is_flt_nil(a) || (a) < (b)))
88 : #define flt_LE(a, b) (is_flt_nil(a) || (!is_flt_nil(b) && (a) <= (b)))
89 : #define flt_GT(a, b) (!is_flt_nil(a) && (is_flt_nil(b) || (a) > (b)))
90 : #define flt_GE(a, b) (is_flt_nil(b) || (!is_flt_nil(a) && (a) >= (b)))
91 : #define flt_EQ(a, b) (is_flt_nil(a) ? is_flt_nil(b) : !is_flt_nil(b) && (a) == (b))
92 : #define dbl_LT(a, b) (!is_dbl_nil(b) && (is_dbl_nil(a) || (a) < (b)))
93 : #define dbl_LE(a, b) (is_dbl_nil(a) || (!is_dbl_nil(b) && (a) <= (b)))
94 : #define dbl_GT(a, b) (!is_dbl_nil(a) && (is_dbl_nil(b) || (a) > (b)))
95 : #define dbl_GE(a, b) (is_dbl_nil(b) || (!is_dbl_nil(a) && (a) >= (b)))
96 : #define dbl_EQ(a, b) (is_dbl_nil(a) ? is_dbl_nil(b) : !is_dbl_nil(b) && (a) == (b))
97 :
98 : #define BINSEARCHFUNC(TYPE) \
99 : BUN \
100 : binsearch_##TYPE(const oid *restrict indir, oid offset, \
101 : const TYPE *restrict vals, BUN lo, BUN hi, TYPE v, \
102 : int ordering, int last) \
103 : { \
104 : BUN mid; \
105 : TYPE x; \
106 : \
107 : assert(ordering == 1 || ordering == -1); \
108 : assert(lo <= hi); \
109 : \
110 : if (ordering > 0) { \
111 : if (indir) { \
112 : if (last > 0) { \
113 : x = vals[indir[lo] - offset]; \
114 : if (TYPE##_GT(x, v)) \
115 : return lo; \
116 : x = vals[indir[hi] - offset]; \
117 : if (TYPE##_LE(x, v)) \
118 : return hi + 1; \
119 : \
120 : /* loop invariant: */ \
121 : /* value@lo <= v < value@hi */ \
122 : while (hi - lo > 1) { \
123 : mid = (hi + lo) / 2; \
124 : x = vals[indir[mid] - offset]; \
125 : if (TYPE##_GT(x, v)) \
126 : hi = mid; \
127 : else \
128 : lo = mid; \
129 : } \
130 : } else { \
131 : x = vals[indir[lo] - offset]; \
132 : if (TYPE##_GE(x, v)) \
133 : return last == 0 || TYPE##_EQ(x, v) ? lo : BUN_NONE; \
134 : x = vals[indir[hi] - offset]; \
135 : if (TYPE##_LT(x, v)) \
136 : return last == 0 ? hi + 1 : BUN_NONE; \
137 : \
138 : /* loop invariant: */ \
139 : /* value@lo < v <= value@hi */ \
140 : while (hi - lo > 1) { \
141 : mid = (hi + lo) / 2; \
142 : x = vals[indir[mid] - offset]; \
143 : if (TYPE##_GE(x, v)) \
144 : hi = mid; \
145 : else \
146 : lo = mid; \
147 : } \
148 : } \
149 : } else { \
150 : if (last > 0) { \
151 : x = vals[lo]; \
152 : if (TYPE##_GT(x, v)) \
153 : return lo; \
154 : x = vals[hi]; \
155 : if (TYPE##_LE(x, v)) \
156 : return hi + 1; \
157 : \
158 : /* loop invariant: */ \
159 : /* value@lo <= v < value@hi */ \
160 : while (hi - lo > 1) { \
161 : mid = (hi + lo) / 2; \
162 : x = vals[mid]; \
163 : if (TYPE##_GT(x, v)) \
164 : hi = mid; \
165 : else \
166 : lo = mid; \
167 : } \
168 : } else { \
169 : x = vals[lo]; \
170 : if (TYPE##_GE(x, v)) \
171 : return last == 0 || TYPE##_EQ(x, v) ? lo : BUN_NONE; \
172 : x = vals[hi]; \
173 : if (TYPE##_LT(x, v)) \
174 : return last == 0 ? hi + 1 : BUN_NONE; \
175 : \
176 : /* loop invariant: */ \
177 : /* value@lo < v <= value@hi */ \
178 : while (hi - lo > 1) { \
179 : mid = (hi + lo) / 2; \
180 : x = vals[mid]; \
181 : if (TYPE##_GE(x, v)) \
182 : hi = mid; \
183 : else \
184 : lo = mid; \
185 : } \
186 : } \
187 : } \
188 : } else { \
189 : if (indir) { \
190 : if (last > 0) { \
191 : x = vals[indir[lo] - offset]; \
192 : if (TYPE##_LT(x, v)) \
193 : return lo; \
194 : x = vals[indir[hi] - offset]; \
195 : if (TYPE##_GE(x, v)) \
196 : return hi + 1; \
197 : \
198 : /* loop invariant: */ \
199 : /* value@lo >= v > value@hi */ \
200 : while (hi - lo > 1) { \
201 : mid = (hi + lo) / 2; \
202 : x = vals[indir[mid] - offset]; \
203 : if (TYPE##_LT(x, v)) \
204 : hi = mid; \
205 : else \
206 : lo = mid; \
207 : } \
208 : } else { \
209 : x = vals[indir[lo] - offset]; \
210 : if (TYPE##_LE(x, v)) \
211 : return last == 0 || TYPE##_EQ(x, v) ? lo : BUN_NONE; \
212 : x = vals[indir[hi] - offset]; \
213 : if (TYPE##_GT(x, v)) \
214 : return last == 0 ? hi + 1 : BUN_NONE; \
215 : \
216 : /* loop invariant: */ \
217 : /* value@lo > v >= value@hi */ \
218 : while (hi - lo > 1) { \
219 : mid = (hi + lo) / 2; \
220 : x = vals[indir[mid] - offset]; \
221 : if (TYPE##_LE(x, v)) \
222 : hi = mid; \
223 : else \
224 : lo = mid; \
225 : } \
226 : } \
227 : } else { \
228 : if (last > 0) { \
229 : x = vals[lo]; \
230 : if (TYPE##_LT(x, v)) \
231 : return lo; \
232 : x = vals[hi]; \
233 : if (TYPE##_GE(x, v)) \
234 : return hi + 1; \
235 : \
236 : /* loop invariant: */ \
237 : /* value@lo >= v > value@hi */ \
238 : while (hi - lo > 1) { \
239 : mid = (hi + lo) / 2; \
240 : x = vals[mid]; \
241 : if (TYPE##_LT(x, v)) \
242 : hi = mid; \
243 : else \
244 : lo = mid; \
245 : } \
246 : } else { \
247 : x = vals[lo]; \
248 : if (TYPE##_LE(x, v)) \
249 : return last == 0 || TYPE##_EQ(x, v) ? lo : BUN_NONE; \
250 : x = vals[hi]; \
251 : if (TYPE##_GT(x, v)) \
252 : return last == 0 ? hi + 1 : BUN_NONE; \
253 : \
254 : /* loop invariant: */ \
255 : /* value@lo > v >= value@hi */ \
256 : while (hi - lo > 1) { \
257 : mid = (hi + lo) / 2; \
258 : x = vals[mid]; \
259 : if (TYPE##_LE(x, v)) \
260 : hi = mid; \
261 : else \
262 : lo = mid; \
263 : } \
264 : } \
265 : } \
266 : } \
267 : return last >= 0 || (x = (indir ? vals[indir[hi] - offset] : vals[hi]), TYPE##_EQ(x, v)) ? hi : BUN_NONE; \
268 : }
269 :
270 177273 : BINSEARCHFUNC(bte)
271 6039 : BINSEARCHFUNC(sht)
272 5039388 : BINSEARCHFUNC(int)
273 402129 : BINSEARCHFUNC(lng)
274 : #ifdef HAVE_HGE
275 455 : BINSEARCHFUNC(hge)
276 : #endif
277 60 : BINSEARCHFUNC(flt)
278 637 : BINSEARCHFUNC(dbl)
279 :
280 : #if 0
281 : /* some programs that produce editor tags files don't recognize the
282 : * binsearch function because before it are the BINSEARCHFUNC macro
283 : * calls that don't look like function definitions or variable
284 : * declarations, hence we have this hidden away function to realign the
285 : * tags program */
286 : void
287 : realign_tags(void)
288 : {
289 : }
290 : #endif
291 :
292 : /* Do a binary search for the first/last occurrence of v between lo and hi
293 : * (lo inclusive, hi not inclusive) in vals/vars.
294 : * If last > 0, return the index of the first value > v; if last == 0,
295 : * return the index of the first value >= v; if last < 0, return
296 : * BUN_NONE if v does not occur, any BUN where v occurs otherwise.
297 : * If ordering is -1, the values are sorted in reverse order and hence
298 : * all comparisons are reversed.
299 : */
300 : BUN
301 1021123 : binsearch(const oid *restrict indir, oid offset,
302 : int type, const void *restrict vals, const char * restrict vars,
303 : int width, BUN lo, BUN hi, const void *restrict v,
304 : int ordering, int last)
305 : {
306 1021123 : BUN mid;
307 1021123 : int c;
308 1021123 : int (*cmp)(const void *, const void *);
309 :
310 1021123 : assert(ordering == 1 || ordering == -1);
311 1021123 : assert(lo < hi);
312 :
313 1021123 : --hi; /* now hi is inclusive */
314 :
315 1801313 : switch (ATOMbasetype(type)) {
316 : /* TYPE_oid is covered by TYPE_int/TYPE_lng */
317 149644 : case TYPE_bte:
318 149644 : return binsearch_bte(indir, offset, (const bte *) vals,
319 149644 : lo, hi, *(const bte *) v, ordering, last);
320 3382 : case TYPE_sht:
321 3382 : return binsearch_sht(indir, offset, (const sht *) vals,
322 3382 : lo, hi, *(const sht *) v, ordering, last);
323 506624 : case TYPE_int:
324 506624 : return binsearch_int(indir, offset, (const int *) vals,
325 : lo, hi, *(const int *) v, ordering, last);
326 95739 : case TYPE_lng:
327 95739 : return binsearch_lng(indir, offset, (const lng *) vals,
328 : lo, hi, *(const lng *) v, ordering, last);
329 : #ifdef HAVE_HGE
330 367 : case TYPE_hge:
331 367 : return binsearch_hge(indir, offset, (const hge *) vals,
332 : lo, hi, *(const hge *) v, ordering, last);
333 : #endif
334 32 : case TYPE_flt:
335 32 : return binsearch_flt(indir, offset, (const flt *) vals,
336 : lo, hi, *(const flt *) v, ordering, last);
337 363 : case TYPE_dbl:
338 363 : return binsearch_dbl(indir, offset, (const dbl *) vals,
339 : lo, hi, *(const dbl *) v, ordering, last);
340 : }
341 :
342 264972 : cmp = ATOMcompare(type);
343 :
344 264972 : if (last > 0) {
345 136555 : if ((c = ordering * cmp(VALUE(indir ? indir[lo] - offset : lo), v)) > 0)
346 : return lo;
347 126572 : if ((c = ordering * cmp(VALUE(indir ? indir[hi] - offset : hi), v)) <= 0)
348 : return hi + 1;
349 128417 : } else if (last == 0) {
350 127358 : if ((c = ordering * cmp(VALUE(indir ? indir[lo] - offset : lo), v)) >= 0)
351 : return lo;
352 2237 : if ((c = ordering * cmp(VALUE(indir ? indir[hi] - offset : hi), v)) < 0)
353 : return hi + 1;
354 : } else {
355 1059 : if ((c = ordering * cmp(VALUE(indir ? indir[lo] - offset : lo), v)) > 0)
356 : return BUN_NONE;
357 1059 : if (c == 0)
358 : return lo;
359 1058 : if ((c = ordering * cmp(VALUE(indir ? indir[hi] - offset : hi), v)) < 0)
360 : return BUN_NONE;
361 354 : if (c == 0)
362 : return hi;
363 353 : if (hi - lo <= 1) {
364 : /* not the first, not the last, and nothing in
365 : * between */
366 : return BUN_NONE;
367 : }
368 : }
369 :
370 : /* loop invariant:
371 : * last ? value@lo <= v < value@hi : value@lo < v <= value@hi
372 : *
373 : * This version does some more work in the inner loop than the
374 : * type-expanded versions (ordering and indir checks) but is
375 : * slow due to the function call and the needed check for
376 : * vars (in VALUE()) already, so we're beyond caring. */
377 120509 : while (hi - lo > 1) {
378 7027 : mid = (hi + lo) / 2;
379 7027 : if ((c = ordering * cmp(VALUE(indir ? indir[mid] - offset : mid), v)) > 0 ||
380 5870 : (last <= 0 && c == 0))
381 : hi = mid;
382 : else
383 6962 : lo = mid;
384 : }
385 113482 : return last >= 0 || cmp(VALUE(indir ? indir[hi] - offset : hi), v) == 0 ? hi : BUN_NONE;
386 : }
387 :
388 : /* Return the BUN of any tail value in b that is equal to v; if no
389 : * match is found, return BUN_NONE. b must be sorted (reverse or
390 : * forward). */
391 : BUN
392 76040 : SORTfnd(BAT *b, const void *v)
393 : {
394 76040 : if (BATcount(b) == 0)
395 : return BUN_NONE;
396 76040 : if (BATtdense(b)) {
397 0 : if (is_oid_nil(*(oid*)v) ||
398 0 : *(oid*)v < b->tseqbase ||
399 0 : *(oid*)v >= b->tseqbase + BATcount(b))
400 : return BUN_NONE;
401 0 : return *(oid*)v - b->tseqbase;
402 : }
403 76040 : if (b->ttype == TYPE_void) {
404 0 : if (b->tvheap) {
405 0 : struct canditer ci;
406 0 : canditer_init(&ci, NULL, b);
407 0 : return canditer_search(&ci, *(const oid*)v, false);
408 : }
409 0 : assert(is_oid_nil(b->tseqbase));
410 0 : if (is_oid_nil(*(const oid *) v))
411 : return 0;
412 0 : return BUN_NONE;
413 : }
414 76040 : BATiter bi = bat_iterator(b);
415 76041 : BUN p = binsearch(NULL, 0, bi.type, bi.base,
416 76041 : bi.vh ? bi.vh->base : NULL, bi.width, 0,
417 76041 : bi.count, v, bi.sorted ? 1 : -1, -1);
418 76041 : bat_iterator_end(&bi);
419 76041 : return p;
420 : }
421 :
422 : /* use orderidx, returns BUN on order index */
423 : BUN
424 0 : ORDERfnd(BAT *b, Heap *oidxh, const void *v)
425 : {
426 0 : if (BATcount(b) == 0)
427 : return BUN_NONE;
428 0 : BATiter bi = bat_iterator(b);
429 0 : BUN p = binsearch((oid *) oidxh->base + ORDERIDXOFF, 0, bi.type,
430 0 : bi.base, bi.vh ? bi.vh->base : NULL,
431 0 : bi.width, 0, bi.count, v, 1, -1);
432 0 : bat_iterator_end(&bi);
433 0 : return p;
434 : }
435 :
436 : /* Return the BUN of the first (lowest numbered) tail value that is
437 : * equal to v; if no match is found, return the BUN of the next higher
438 : * value in b. b must be sorted (reverse or forward). */
439 : BUN
440 406365 : SORTfndfirst(BAT *b, const void *v)
441 : {
442 406365 : if (BATcount(b) == 0)
443 : return 0;
444 406365 : if (BATtdense(b)) {
445 604 : if (is_oid_nil(*(oid*)v) || *(oid*)v <= b->tseqbase)
446 : return 0;
447 302 : if (*(oid*)v >= b->tseqbase + BATcount(b))
448 : return BATcount(b);
449 0 : return *(oid*)v - b->tseqbase;
450 : }
451 405761 : if (b->ttype == TYPE_void) {
452 0 : if (b->tvheap) {
453 0 : struct canditer ci;
454 0 : canditer_init(&ci, NULL, b);
455 0 : return canditer_search(&ci, *(const oid*)v, true);
456 : }
457 0 : assert(is_oid_nil(b->tseqbase));
458 : return 0;
459 : }
460 405761 : BATiter bi = bat_iterator(b);
461 406225 : BUN p = binsearch(NULL, 0, bi.type, bi.base,
462 406225 : bi.vh ? bi.vh->base : NULL, bi.width, 0,
463 406225 : bi.count, v, bi.sorted ? 1 : -1, 0);
464 406168 : bat_iterator_end(&bi);
465 406168 : return p;
466 : }
467 :
468 : /* use orderidx, returns BUN on order index */
469 : BUN
470 36 : ORDERfndfirst(BAT *b, Heap *oidxh, const void *v)
471 : {
472 36 : if (BATcount(b) == 0)
473 : return 0;
474 36 : BATiter bi = bat_iterator(b);
475 0 : BUN p = binsearch((oid *) oidxh->base + ORDERIDXOFF, 0, bi.type,
476 36 : bi.base, bi.vh ? bi.vh->base : NULL,
477 36 : bi.width, 0, bi.count, v, 1, 0);
478 36 : bat_iterator_end(&bi);
479 36 : return p;
480 : }
481 :
482 : /* Return the BUN of the first (lowest numbered) tail value beyond v.
483 : * b must be sorted (reverse or forward). */
484 : BUN
485 444587 : SORTfndlast(BAT *b, const void *v)
486 : {
487 444587 : if (BATcount(b) == 0)
488 : return 0;
489 444587 : if (BATtdense(b)) {
490 23459 : if (is_oid_nil(*(oid*)v) || *(oid*)v <= b->tseqbase)
491 : return 0;
492 0 : if (*(oid*)v >= b->tseqbase + BATcount(b))
493 : return BATcount(b);
494 0 : return *(oid*)v - b->tseqbase;
495 : }
496 421128 : if (b->ttype == TYPE_void) {
497 0 : if (b->tvheap) {
498 0 : struct canditer ci;
499 0 : if (is_oid_nil(*(const oid*)v))
500 : return 0;
501 0 : canditer_init(&ci, NULL, b);
502 0 : return canditer_search(&ci, *(const oid*)v + 1, true);
503 : }
504 0 : assert(is_oid_nil(b->tseqbase));
505 : return BATcount(b);
506 : }
507 421128 : BATiter bi = bat_iterator(b);
508 421602 : BUN p = binsearch(NULL, 0, bi.type, bi.base,
509 421602 : bi.vh ? bi.vh->base : NULL, bi.width, 0,
510 421602 : bi.count, v, bi.sorted ? 1 : -1, 1);
511 421597 : bat_iterator_end(&bi);
512 421597 : return p;
513 : }
514 :
515 : /* use orderidx, returns BUN on order index */
516 : BUN
517 46 : ORDERfndlast(BAT *b, Heap *oidxh, const void *v)
518 : {
519 46 : if (BATcount(b) == 0)
520 : return 0;
521 46 : BATiter bi = bat_iterator(b);
522 0 : BUN p = binsearch((oid *) oidxh->base + ORDERIDXOFF, 0, bi.type,
523 46 : bi.base, bi.vh ? bi.vh->base : NULL,
524 46 : bi.width, 0, bi.count, v, 1, 1);
525 46 : bat_iterator_end(&bi);
526 46 : return p;
527 : }
|