Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "gdk.h"
15 : #include "gdk_private.h"
16 : #include "gdk_calc_private.h"
17 :
18 : /* BATfirstn select the smallest n elements from the input bat b (if
19 : * asc(ending) is set, else the largest n elements). Conceptually, b
20 : * is sorted in ascending or descending order (depending on the asc
21 : * argument) and then the OIDs of the first n elements are returned.
22 : * If there are NILs in the BAT, their relative ordering is set by
23 : * using the nilslast argument: if set, NILs come last (largest value
24 : * when ascending, smallest value when descending), so if there are
25 : * enough non-NIL values, no NILs will be returned. If unset (false),
26 : * NILs come first and will be returned.
27 : *
28 : * In addition to the input BAT b, there can be a standard candidate
29 : * list s. If s is specified (non-NULL), only elements in b that are
30 : * referred to in s are considered.
31 : *
32 : * If the third input bat g is non-NULL, then s must also be non-NULL
33 : * and must be aligned with g. G then specifies groups to which the
34 : * elements referred to in s belong. Conceptually, the group values
35 : * are sorted in ascending order together with the elements in b that
36 : * are referred to in s (in ascending or descending order depending on
37 : * asc), and the first n elements are then returned.
38 : *
39 : * If the output argument gids is NULL, only n elements are returned.
40 : * If the output argument gids is non-NULL, more than n elements may
41 : * be returned. If there are duplicate values (if g is given, the
42 : * group value counts in determining duplication), all duplicates are
43 : * returned.
44 : *
45 : * If distinct is set, the result contains n complete groups of values
46 : * instead of just n values (or slightly more than n if gids is set
47 : * since then the "last" group is returned completely).
48 : *
49 : * Note that BATfirstn can be called in cascading fashion to calculate
50 : * the first n values of a table of multiple columns:
51 : * BATfirstn(&s1, &g1, b1, NULL, NULL, n, asc, nilslast, distinct);
52 : * BATfirstn(&s2, &g2, b2, s1, g1, n, asc, nilslast, distinct);
53 : * BATfirstn(&s3, NULL, b3, s2, g2, n, asc, nilslast, distinct);
54 : * If the input BATs b1, b2, b3 are large enough, s3 will contain the
55 : * OIDs of the smallest (largest) n elements in the table consisting
56 : * of the columns b1, b2, b3 when ordered in ascending order with b1
57 : * the major key.
58 : */
59 :
60 : /* We use a binary heap for the implementation of the simplest form of
61 : * first-N. During processing, the oids list forms a heap with the
62 : * root at position 0 and the children of a node at position i at
63 : * positions 2*i+1 and 2*i+2. The parent node is always
64 : * smaller/larger (depending on the value of asc) than its children
65 : * (recursively). The heapify macro creates the heap from the input
66 : * in-place. We start off with a heap containing the first N elements
67 : * of the input, and then go over the rest of the input, replacing the
68 : * root of the heap with a new value if appropriate (if the new value
69 : * is among the first-N seen so far). The siftdown macro then
70 : * restores the heap property. */
71 : #define siftdown(OPER, START, SWAP) \
72 : do { \
73 : pos = (START); \
74 : childpos = (pos << 1) + 1; \
75 : while (childpos < n) { \
76 : /* find most extreme child */ \
77 : if (childpos + 1 < n && \
78 : !(OPER(childpos + 1, childpos))) \
79 : childpos++; \
80 : /* compare parent with most extreme child */ \
81 : if (!OPER(pos, childpos)) { \
82 : /* already correctly ordered */ \
83 : break; \
84 : } \
85 : /* exchange parent with child and sift child */ \
86 : /* further */ \
87 : SWAP(pos, childpos); \
88 : pos = childpos; \
89 : childpos = (pos << 1) + 1; \
90 : } \
91 : } while (0)
92 :
93 : #define heapify(OPER, SWAP) \
94 : do { \
95 : for (i = n / 2; i > 0; i--) \
96 : siftdown(OPER, i - 1, SWAP); \
97 : } while (0)
98 :
99 : /* we inherit LT and GT from gdk_calc_private.h */
100 :
101 : #define nLTbte(a, b) (!is_bte_nil(a) && (is_bte_nil(b) || (a) < (b)))
102 : #define nLTsht(a, b) (!is_sht_nil(a) && (is_sht_nil(b) || (a) < (b)))
103 : #define nLTint(a, b) (!is_int_nil(a) && (is_int_nil(b) || (a) < (b)))
104 : #define nLTlng(a, b) (!is_lng_nil(a) && (is_lng_nil(b) || (a) < (b)))
105 : #define nLThge(a, b) (!is_hge_nil(a) && (is_hge_nil(b) || (a) < (b)))
106 :
107 : #define nGTbte(a, b) (!is_bte_nil(b) && (is_bte_nil(a) || (a) > (b)))
108 : #define nGTsht(a, b) (!is_sht_nil(b) && (is_sht_nil(a) || (a) > (b)))
109 : #define nGTint(a, b) (!is_int_nil(b) && (is_int_nil(a) || (a) > (b)))
110 : #define nGTlng(a, b) (!is_lng_nil(b) && (is_lng_nil(a) || (a) > (b)))
111 : #define nGThge(a, b) (!is_hge_nil(b) && (is_hge_nil(a) || (a) > (b)))
112 :
113 : #define LTany(p1, p2) (cmp(BUNtail(*bi, oids[p1] - hseq), \
114 : BUNtail(*bi, oids[p2] - hseq)) < 0)
115 : #define GTany(p1, p2) (cmp(BUNtail(*bi, oids[p1] - hseq), \
116 : BUNtail(*bi, oids[p2] - hseq)) > 0)
117 :
118 : #define nLTany(p1, p2) (cmp(BUNtail(*bi, oids[p1] - hseq), nil) != 0 \
119 : && (cmp(BUNtail(*bi, oids[p2] - hseq), nil) == 0 \
120 : || cmp(BUNtail(*bi, oids[p1] - hseq), \
121 : BUNtail(*bi, oids[p2] - hseq)) < 0))
122 : #define nGTany(p1, p2) (cmp(BUNtail(*bi, oids[p2] - hseq), nil) != 0 \
123 : && (cmp(BUNtail(*bi, oids[p1] - hseq), nil) == 0 \
124 : || cmp(BUNtail(*bi, oids[p1] - hseq), \
125 : BUNtail(*bi, oids[p2] - hseq)) > 0))
126 :
127 : #define LTflt(a, b) (!is_flt_nil(b) && (is_flt_nil(a) || (a) < (b)))
128 : #define LTdbl(a, b) (!is_dbl_nil(b) && (is_dbl_nil(a) || (a) < (b)))
129 : #define GTflt(a, b) (!is_flt_nil(a) && (is_flt_nil(b) || (a) > (b)))
130 : #define GTdbl(a, b) (!is_dbl_nil(a) && (is_dbl_nil(b) || (a) > (b)))
131 :
132 : #define nLTflt(a, b) (!is_flt_nil(a) && (is_flt_nil(b) || (a) < (b)))
133 : #define nLTdbl(a, b) (!is_dbl_nil(a) && (is_dbl_nil(b) || (a) < (b)))
134 : #define nGTflt(a, b) (!is_flt_nil(b) && (is_flt_nil(a) || (a) > (b)))
135 : #define nGTdbl(a, b) (!is_dbl_nil(b) && (is_dbl_nil(a) || (a) > (b)))
136 :
137 : #define LTfltfix(p1, p2) LTflt(vals[oids[p1] - hseq], \
138 : vals[oids[p2] - hseq])
139 : #define GTfltfix(p1, p2) GTflt(vals[oids[p1] - hseq], \
140 : vals[oids[p2] - hseq])
141 : #define LTdblfix(p1, p2) LTdbl(vals[oids[p1] - hseq], \
142 : vals[oids[p2] - hseq])
143 : #define GTdblfix(p1, p2) GTdbl(vals[oids[p1] - hseq], \
144 : vals[oids[p2] - hseq])
145 : #define LTfix(p1, p2) LT(vals[oids[p1] - hseq], \
146 : vals[oids[p2] - hseq])
147 : #define GTfix(p1, p2) GT(vals[oids[p1] - hseq], \
148 : vals[oids[p2] - hseq])
149 :
150 : #define nLTfltfix(p1, p2) nLTflt(vals[oids[p1] - hseq], \
151 : vals[oids[p2] - hseq])
152 : #define nGTfltfix(p1, p2) nGTflt(vals[oids[p1] - hseq], \
153 : vals[oids[p2] - hseq])
154 : #define nLTdblfix(p1, p2) nLTdbl(vals[oids[p1] - hseq], \
155 : vals[oids[p2] - hseq])
156 : #define nGTdblfix(p1, p2) nGTdbl(vals[oids[p1] - hseq], \
157 : vals[oids[p2] - hseq])
158 : #define nLTbtefix(p1, p2) nLTbte(vals[oids[p1] - hseq], \
159 : vals[oids[p2] - hseq])
160 : #define nGTbtefix(p1, p2) nGTbte(vals[oids[p1] - hseq], \
161 : vals[oids[p2] - hseq])
162 : #define nLTshtfix(p1, p2) nLTsht(vals[oids[p1] - hseq], \
163 : vals[oids[p2] - hseq])
164 : #define nGTshtfix(p1, p2) nGTsht(vals[oids[p1] - hseq], \
165 : vals[oids[p2] - hseq])
166 : #define nLTintfix(p1, p2) nLTint(vals[oids[p1] - hseq], \
167 : vals[oids[p2] - hseq])
168 : #define nGTintfix(p1, p2) nGTint(vals[oids[p1] - hseq], \
169 : vals[oids[p2] - hseq])
170 : #define nLTlngfix(p1, p2) nLTlng(vals[oids[p1] - hseq], \
171 : vals[oids[p2] - hseq])
172 : #define nGTlngfix(p1, p2) nGTlng(vals[oids[p1] - hseq], \
173 : vals[oids[p2] - hseq])
174 : #define nLThgefix(p1, p2) nLThge(vals[oids[p1] - hseq], \
175 : vals[oids[p2] - hseq])
176 : #define nGThgefix(p1, p2) nGThge(vals[oids[p1] - hseq], \
177 : vals[oids[p2] - hseq])
178 :
179 : #define SWAP1(p1, p2) \
180 : do { \
181 : item = oids[p1]; \
182 : oids[p1] = oids[p2]; \
183 : oids[p2] = item; \
184 : } while (0)
185 :
186 : #define shuffle_unique(TYPE, OP) \
187 : do { \
188 : const TYPE *restrict vals = (const TYPE *) bi->base; \
189 : heapify(OP##fix, SWAP1); \
190 : while (cnt > 0) { \
191 : cnt--; \
192 : i = canditer_next(&ci); \
193 : if (OP(vals[i - hseq], \
194 : vals[oids[0] - hseq])) { \
195 : oids[0] = i; \
196 : siftdown(OP##fix, 0, SWAP1); \
197 : } \
198 : } \
199 : } while (0)
200 :
201 : /* This version of BATfirstn returns a list of N oids (where N is the
202 : * smallest among BATcount(b), BATcount(s), and n). The oids returned
203 : * refer to the N smallest/largest (depending on asc) tail values of b
204 : * (taking the optional candidate list s into account). If there are
205 : * multiple equal values to take us past N, we return a subset of those.
206 : *
207 : * If lastp is non-NULL, it is filled in with the oid of the "last"
208 : * value, i.e. the value of which there may be multiple occurrences
209 : * that are not all included in the first N.
210 : */
211 : static BAT *
212 858 : BATfirstn_unique(BATiter *bi, BAT *s, BUN n, bool asc, bool nilslast, oid *lastp, lng t0)
213 : {
214 858 : BAT *bn;
215 858 : oid *restrict oids;
216 858 : oid hseq = bi->b->hseqbase;
217 858 : BUN i, cnt;
218 858 : struct canditer ci;
219 858 : int tpe = bi->type;
220 858 : int (*cmp)(const void *, const void *);
221 858 : const void *nil;
222 : /* variables used in heapify/siftdown macros */
223 858 : oid item;
224 858 : BUN pos, childpos;
225 :
226 858 : MT_thread_setalgorithm(__func__);
227 858 : canditer_init(&ci, bi->b, s);
228 858 : cnt = ci.ncand;
229 :
230 858 : if (n >= cnt) {
231 : /* trivial: return all candidates */
232 443 : bn = canditer_slice(&ci, 0, ci.ncand);
233 443 : if (bn && lastp)
234 138 : *lastp = oid_nil;
235 443 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
236 : ",n=" BUNFMT " -> " ALGOOPTBATFMT
237 : " (trivial -- " LLFMT " usec)\n",
238 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), n,
239 : ALGOOPTBATPAR(bn), GDKusec() - t0);
240 443 : return bn;
241 : }
242 :
243 415 : if (BATtvoid(bi->b)) {
244 : /* nilslast doesn't make a difference: either all are
245 : * nil, or none are */
246 0 : if (asc || is_oid_nil(bi->tseq)) {
247 : /* return the first part of the candidate list
248 : * or of the BAT itself */
249 0 : bn = canditer_slice(&ci, 0, n);
250 0 : if (bn && lastp)
251 0 : *lastp = BUNtoid(bn, n - 1);
252 0 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
253 : ",n=" BUNFMT " -> " ALGOOPTBATFMT
254 : " (initial slice -- " LLFMT " usec)\n",
255 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), n,
256 : ALGOOPTBATPAR(bn), GDKusec() - t0);
257 0 : return bn;
258 : }
259 : /* return the last part of the candidate list or of
260 : * the BAT itself */
261 0 : bn = canditer_slice(&ci, cnt - n, cnt);
262 0 : if (bn && lastp)
263 0 : *lastp = BUNtoid(bn, 0);
264 0 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
265 : ",n=" BUNFMT " -> " ALGOOPTBATFMT
266 : " (final slice -- " LLFMT " usec)\n",
267 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), n,
268 : ALGOOPTBATPAR(bn), GDKusec() - t0);
269 0 : return bn;
270 : }
271 415 : if (bi->sorted || bi->revsorted) {
272 : /* trivial: b is sorted so we just need to return the
273 : * initial or final part of it (or of the candidate
274 : * list); however, if nilslast == asc, then the nil
275 : * values (if any) are in the wrong place, so we need
276 : * to do a little more work */
277 :
278 : /* after we create the to-be-returned BAT, we set pos
279 : * to the BUN in the new BAT whose value we should
280 : * return through *lastp */
281 215 : if (nilslast == asc && !bi->nonil) {
282 0 : pos = SORTfndlast(bi->b, ATOMnilptr(tpe));
283 0 : pos = canditer_search(&ci, hseq + pos, true);
284 : /* 0 <= pos <= cnt
285 : * 0 < n < cnt
286 : */
287 0 : if (bi->sorted) {
288 : /* [0..pos) -- nil
289 : * [pos..cnt) -- non-nil <<<
290 : */
291 0 : if (asc) { /* i.e. nilslast */
292 : /* prefer non-nil and
293 : * smallest */
294 0 : if (cnt - pos < n) {
295 0 : bn = canditer_slice(&ci, cnt - n, cnt);
296 0 : pos = 0;
297 : } else {
298 0 : bn = canditer_slice(&ci, pos, pos + n);
299 0 : pos = n - 1;
300 : }
301 : } else { /* i.e. !asc, !nilslast */
302 : /* prefer nil and largest */
303 0 : if (pos < n) {
304 0 : bn = canditer_slice2(&ci, 0, pos, cnt - (n - pos), cnt);
305 : /* pos = pos; */
306 : } else {
307 0 : bn = canditer_slice(&ci, 0, n);
308 0 : pos = 0;
309 : }
310 : }
311 : } else { /* i.e. trevsorted */
312 : /* [0..pos) -- non-nil >>>
313 : * [pos..cnt) -- nil
314 : */
315 0 : if (asc) { /* i.e. nilslast */
316 : /* prefer non-nil and
317 : * smallest */
318 0 : if (pos < n) {
319 0 : bn = canditer_slice(&ci, 0, n);
320 : /* pos = pos; */
321 : } else {
322 0 : bn = canditer_slice(&ci, pos - n, pos);
323 0 : pos = 0;
324 : }
325 : } else { /* i.e. !asc, !nilslast */
326 : /* prefer nil and largest */
327 0 : if (cnt - pos < n) {
328 0 : bn = canditer_slice2(&ci, 0, n - (cnt - pos), pos, cnt);
329 0 : pos = n - (cnt - pos) - 1;
330 : } else {
331 0 : bn = canditer_slice(&ci, pos, pos + n);
332 0 : pos = 0;
333 : }
334 : }
335 : }
336 : } else {
337 : /* either there are no nils, or they are in
338 : * the appropriate position already, so we can
339 : * just slice */
340 215 : if (asc ? bi->sorted : bi->revsorted) {
341 : /* return copy of first part of
342 : * candidate list */
343 193 : bn = canditer_slice(&ci, 0, n);
344 193 : pos = n - 1;
345 : } else {
346 : /* return copy of last part of
347 : * candidate list */
348 22 : bn = canditer_slice(&ci, cnt - n, cnt);
349 22 : pos = 0;
350 : }
351 : }
352 215 : if (bn && lastp)
353 56 : *lastp = BUNtoid(bn, pos);
354 215 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
355 : ",n=" BUNFMT " -> " ALGOOPTBATFMT
356 : " (ordered -- " LLFMT " usec)\n",
357 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), n,
358 : ALGOOPTBATPAR(bn), GDKusec() - t0);
359 215 : return bn;
360 : }
361 :
362 200 : bn = COLnew(0, TYPE_oid, n, TRANSIENT);
363 200 : if (bn == NULL)
364 : return NULL;
365 200 : BATsetcount(bn, n);
366 200 : oids = (oid *) Tloc(bn, 0);
367 200 : cmp = ATOMcompare(tpe);
368 200 : nil = ATOMnilptr(tpe);
369 : /* if base type has same comparison function as type itself, we
370 : * can use the base type */
371 200 : tpe = ATOMbasetype(tpe); /* takes care of oid */
372 : /* if the input happens to be almost sorted in ascending order
373 : * (likely a common use case), it is more efficient to start
374 : * off with the first n elements when doing a firstn-ascending
375 : * and to start off with the last n elements when doing a
376 : * firstn-descending so that most values that we look at after
377 : * this will be skipped. However, when using a bitmask
378 : * candidate list, the manipulation of the canditer structure
379 : * doesn't work like this, so we still work from the
380 : * beginning. */
381 200 : if (asc || ci.tpe == cand_mask) {
382 308699 : for (i = 0; i < n; i++)
383 308582 : oids[i] = canditer_next(&ci);
384 : } else {
385 83 : item = canditer_idx(&ci, cnt - n);
386 83 : ci.next = cnt - n;
387 83 : ci.add = item - ci.seq - (cnt - n);
388 684 : for (i = n; i > 0; i--)
389 601 : oids[i - 1] = canditer_next(&ci);
390 83 : canditer_reset(&ci);
391 : }
392 200 : cnt -= n;
393 :
394 200 : if (asc) {
395 117 : if (nilslast && !bi->nonil) {
396 14 : switch (tpe) {
397 2 : case TYPE_bte:
398 22 : shuffle_unique(bte, nLTbte);
399 : break;
400 0 : case TYPE_sht:
401 0 : shuffle_unique(sht, nLTsht);
402 : break;
403 2 : case TYPE_int:
404 20 : shuffle_unique(int, nLTint);
405 : break;
406 4 : case TYPE_lng:
407 43 : shuffle_unique(lng, nLTlng);
408 : break;
409 : #ifdef HAVE_HGE
410 0 : case TYPE_hge:
411 0 : shuffle_unique(hge, nLThge);
412 : break;
413 : #endif
414 2 : case TYPE_flt:
415 23 : shuffle_unique(flt, nLTflt);
416 : break;
417 0 : case TYPE_dbl:
418 0 : shuffle_unique(dbl, nLTdbl);
419 : break;
420 4 : default:
421 29 : heapify(nLTany, SWAP1);
422 10 : while (cnt > 0) {
423 6 : cnt--;
424 6 : i = canditer_next(&ci);
425 6 : if (cmp(BUNtail(*bi, i - hseq), nil) != 0
426 5 : && (cmp(BUNtail(*bi, oids[0] - hseq), nil) == 0
427 1 : || cmp(BUNtail(*bi, i - hseq),
428 1 : BUNtail(*bi, oids[0] - hseq)) < 0)) {
429 5 : oids[0] = i;
430 24 : siftdown(nLTany, 0, SWAP1);
431 : }
432 : }
433 : break;
434 : }
435 : } else {
436 103 : switch (tpe) {
437 0 : case TYPE_bte:
438 0 : shuffle_unique(bte, LT);
439 : break;
440 0 : case TYPE_sht:
441 0 : shuffle_unique(sht, LT);
442 : break;
443 25 : case TYPE_int:
444 7203 : shuffle_unique(int, LT);
445 : break;
446 5 : case TYPE_lng:
447 7652679 : shuffle_unique(lng, LT);
448 : break;
449 : #ifdef HAVE_HGE
450 8 : case TYPE_hge:
451 101658 : shuffle_unique(hge, LT);
452 : break;
453 : #endif
454 0 : case TYPE_flt:
455 0 : shuffle_unique(flt, LTflt);
456 : break;
457 15 : case TYPE_dbl:
458 94136 : shuffle_unique(dbl, LTdbl);
459 : break;
460 50 : default:
461 5519 : heapify(LTany, SWAP1);
462 140939 : while (cnt > 0) {
463 140889 : cnt--;
464 140889 : i = canditer_next(&ci);
465 140889 : if (cmp(BUNtail(*bi, i - hseq),
466 140889 : BUNtail(*bi, oids[0] - hseq)) < 0) {
467 6567 : oids[0] = i;
468 181744 : siftdown(LTany, 0, SWAP1);
469 : }
470 : }
471 : break;
472 : }
473 : }
474 : } else {
475 83 : if (nilslast || bi->nonil) {
476 62 : switch (tpe) {
477 0 : case TYPE_bte:
478 0 : shuffle_unique(bte, GT);
479 : break;
480 0 : case TYPE_sht:
481 0 : shuffle_unique(sht, GT);
482 : break;
483 24 : case TYPE_int:
484 200 : shuffle_unique(int, GT);
485 : break;
486 1 : case TYPE_lng:
487 2091 : shuffle_unique(lng, GT);
488 : break;
489 : #ifdef HAVE_HGE
490 26 : case TYPE_hge:
491 1493 : shuffle_unique(hge, GT);
492 : break;
493 : #endif
494 0 : case TYPE_flt:
495 0 : shuffle_unique(flt, GTflt);
496 : break;
497 2 : case TYPE_dbl:
498 22 : shuffle_unique(dbl, GTdbl);
499 : break;
500 9 : default:
501 51 : heapify(GTany, SWAP1);
502 40 : while (cnt > 0) {
503 31 : cnt--;
504 31 : i = canditer_next(&ci);
505 31 : if (cmp(BUNtail(*bi, i - hseq),
506 31 : BUNtail(*bi, oids[0] - hseq)) > 0) {
507 19 : oids[0] = i;
508 88 : siftdown(GTany, 0, SWAP1);
509 : }
510 : }
511 : break;
512 : }
513 : } else {
514 21 : switch (tpe) {
515 1 : case TYPE_bte:
516 9 : shuffle_unique(bte, nGTbte);
517 : break;
518 0 : case TYPE_sht:
519 0 : shuffle_unique(sht, nGTsht);
520 : break;
521 2 : case TYPE_int:
522 19 : shuffle_unique(int, nGTint);
523 : break;
524 8 : case TYPE_lng:
525 78 : shuffle_unique(lng, nGTlng);
526 : break;
527 : #ifdef HAVE_HGE
528 0 : case TYPE_hge:
529 0 : shuffle_unique(hge, nGThge);
530 : break;
531 : #endif
532 2 : case TYPE_flt:
533 18 : shuffle_unique(flt, nGTflt);
534 : break;
535 2 : case TYPE_dbl:
536 19 : shuffle_unique(dbl, nGTdbl);
537 : break;
538 6 : default:
539 38 : heapify(nGTany, SWAP1);
540 19 : while (cnt > 0) {
541 13 : cnt--;
542 13 : i = canditer_next(&ci);
543 13 : if (cmp(BUNtail(*bi, oids[0] - hseq), nil) != 0
544 13 : && (cmp(BUNtail(*bi, i - hseq), nil) == 0
545 11 : || cmp(BUNtail(*bi, i - hseq),
546 11 : BUNtail(*bi, oids[0] - hseq)) > 0)) {
547 13 : oids[0] = i;
548 53 : siftdown(nGTany, 0, SWAP1);
549 : }
550 : }
551 : break;
552 : }
553 : }
554 : }
555 200 : if (lastp)
556 87 : *lastp = oids[0]; /* store id of largest value */
557 : /* output must be sorted since it's a candidate list */
558 200 : GDKqsort(oids, NULL, NULL, (size_t) n, sizeof(oid), 0, TYPE_oid, false, false);
559 200 : bn->tsorted = true;
560 200 : bn->trevsorted = n <= 1;
561 200 : bn->tkey = true;
562 200 : bn->tseqbase = n <= 1 ? oids[0] : oid_nil;
563 200 : bn->tnil = false;
564 200 : bn->tnonil = true;
565 200 : bn = virtualize(bn);
566 200 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
567 : ",n=" BUNFMT " -> " ALGOOPTBATFMT
568 : " (" LLFMT " usec)\n",
569 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), n,
570 : ALGOOPTBATPAR(bn), GDKusec() - t0);
571 : return bn;
572 : }
573 :
574 : #define LTfixgrp(p1, p2) \
575 : (goids[p1] < goids[p2] || \
576 : (goids[p1] == goids[p2] && \
577 : LTfix(p1, p2)))
578 : #define nLTbtefixgrp(p1, p2) \
579 : (goids[p1] < goids[p2] || \
580 : (goids[p1] == goids[p2] && \
581 : nLTbtefix(p1, p2)))
582 : #define nLTshtfixgrp(p1, p2) \
583 : (goids[p1] < goids[p2] || \
584 : (goids[p1] == goids[p2] && \
585 : nLTshtfix(p1, p2)))
586 : #define nLTintfixgrp(p1, p2) \
587 : (goids[p1] < goids[p2] || \
588 : (goids[p1] == goids[p2] && \
589 : nLTintfix(p1, p2)))
590 : #define nLTlngfixgrp(p1, p2) \
591 : (goids[p1] < goids[p2] || \
592 : (goids[p1] == goids[p2] && \
593 : nLTlngfix(p1, p2)))
594 : #define nLThgefixgrp(p1, p2) \
595 : (goids[p1] < goids[p2] || \
596 : (goids[p1] == goids[p2] && \
597 : nLThgefix(p1, p2)))
598 : #define LTfltfixgrp(p1, p2) \
599 : (goids[p1] < goids[p2] || \
600 : (goids[p1] == goids[p2] && \
601 : LTfltfix(p1, p2)))
602 : #define LTdblfixgrp(p1, p2) \
603 : (goids[p1] < goids[p2] || \
604 : (goids[p1] == goids[p2] && \
605 : LTdblfix(p1, p2)))
606 : #define nLTfltfixgrp(p1, p2) \
607 : (goids[p1] < goids[p2] || \
608 : (goids[p1] == goids[p2] && \
609 : nLTfltfix(p1, p2)))
610 : #define nLTdblfixgrp(p1, p2) \
611 : (goids[p1] < goids[p2] || \
612 : (goids[p1] == goids[p2] && \
613 : nLTdblfix(p1, p2)))
614 : #define GTfixgrp(p1, p2) \
615 : (goids[p1] < goids[p2] || \
616 : (goids[p1] == goids[p2] && \
617 : GTfix(p1, p2)))
618 : #define nGTbtefixgrp(p1, p2) \
619 : (goids[p1] < goids[p2] || \
620 : (goids[p1] == goids[p2] && \
621 : nGTbtefix(p1, p2)))
622 : #define nGTshtfixgrp(p1, p2) \
623 : (goids[p1] < goids[p2] || \
624 : (goids[p1] == goids[p2] && \
625 : nGTshtfix(p1, p2)))
626 : #define nGTintfixgrp(p1, p2) \
627 : (goids[p1] < goids[p2] || \
628 : (goids[p1] == goids[p2] && \
629 : nGTintfix(p1, p2)))
630 : #define nGTlngfixgrp(p1, p2) \
631 : (goids[p1] < goids[p2] || \
632 : (goids[p1] == goids[p2] && \
633 : nGTlngfix(p1, p2)))
634 : #define nGThgefixgrp(p1, p2) \
635 : (goids[p1] < goids[p2] || \
636 : (goids[p1] == goids[p2] && \
637 : nGThgefix(p1, p2)))
638 : #define GTfltfixgrp(p1, p2) \
639 : (goids[p1] < goids[p2] || \
640 : (goids[p1] == goids[p2] && \
641 : GTfltfix(p1, p2)))
642 : #define GTdblfixgrp(p1, p2) \
643 : (goids[p1] < goids[p2] || \
644 : (goids[p1] == goids[p2] && \
645 : GTdblfix(p1, p2)))
646 : #define nGTfltfixgrp(p1, p2) \
647 : (goids[p1] < goids[p2] || \
648 : (goids[p1] == goids[p2] && \
649 : nGTfltfix(p1, p2)))
650 : #define nGTdblfixgrp(p1, p2) \
651 : (goids[p1] < goids[p2] || \
652 : (goids[p1] == goids[p2] && \
653 : nGTdblfix(p1, p2)))
654 : #define LTvoidgrp(p1, p2) \
655 : (goids[p1] < goids[p2] || \
656 : (goids[p1] == goids[p2] && oids[p1] < oids[p2]))
657 : #define GTvoidgrp(p1, p2) \
658 : (goids[p1] < goids[p2] || \
659 : (goids[p1] == goids[p2] && oids[p1] > oids[p2]))
660 : #define LTanygrp(p1, p2) \
661 : (goids[p1] < goids[p2] || \
662 : (goids[p1] == goids[p2] && \
663 : LTany(p1, p2)))
664 : #define GTanygrp(p1, p2) \
665 : (goids[p1] < goids[p2] || \
666 : (goids[p1] == goids[p2] && \
667 : GTany(p1, p2)))
668 : #define nLTanygrp(p1, p2) \
669 : (goids[p1] < goids[p2] || \
670 : (goids[p1] == goids[p2] && \
671 : nLTany(p1, p2)))
672 : #define nGTanygrp(p1, p2) \
673 : (goids[p1] < goids[p2] || \
674 : (goids[p1] == goids[p2] && \
675 : nGTany(p1, p2)))
676 : #define SWAP2(p1, p2) \
677 : do { \
678 : item = oids[p1]; \
679 : oids[p1] = oids[p2]; \
680 : oids[p2] = item; \
681 : item = goids[p1]; \
682 : goids[p1] = goids[p2]; \
683 : goids[p2] = item; \
684 : } while (0)
685 :
686 : #define shuffle_unique_with_groups(TYPE, OP) \
687 : do { \
688 : const TYPE *restrict vals = (const TYPE *) bi->base; \
689 : heapify(OP##fixgrp, SWAP2); \
690 : while (cnt > 0) { \
691 : cnt--; \
692 : i = canditer_next(&ci); \
693 : if (gv[j] < goids[0] || \
694 : (gv[j] == goids[0] && \
695 : OP(vals[i - hseq], \
696 : vals[oids[0] - hseq]))) { \
697 : oids[0] = i; \
698 : goids[0] = gv[j]; \
699 : siftdown(OP##fixgrp, 0, SWAP2); \
700 : } \
701 : j++; \
702 : } \
703 : } while (0)
704 :
705 : /* This version of BATfirstn is like the one above, except that it
706 : * also looks at groups. The values of the group IDs are important:
707 : * we return only the smallest N (i.e., not dependent on asc which
708 : * refers only to the values in the BAT b).
709 : *
710 : * If lastp is non-NULL, it is filled in with the oid of the "last"
711 : * value, i.e. the value of which there may be multiple occurrences
712 : * that are not all included in the first N. If lastgp is non-NULL,
713 : * it is filled with the group ID (not the oid of the group ID) for
714 : * that same value.
715 : */
716 : static BAT *
717 604 : BATfirstn_unique_with_groups(BATiter *bi, BAT *s, BAT *g, BUN n, bool asc, bool nilslast, oid *lastp, oid *lastgp, lng t0)
718 : {
719 604 : BAT *bn;
720 604 : oid *restrict oids, *restrict goids;
721 604 : oid hseq = bi->b->hseqbase;
722 604 : const oid *restrict gv;
723 604 : BUN i, j, cnt;
724 604 : struct canditer ci;
725 604 : int tpe = bi->type;
726 604 : int (*cmp)(const void *, const void *);
727 604 : const void *nil;
728 : /* variables used in heapify/siftdown macros */
729 604 : oid item;
730 604 : BUN pos, childpos;
731 :
732 604 : MT_thread_setalgorithm(__func__);
733 604 : canditer_init(&ci, bi->b, s);
734 603 : cnt = ci.ncand;
735 :
736 603 : if (n > cnt)
737 : n = cnt;
738 :
739 603 : if (n == 0) {
740 : /* candidate list might refer only to values outside
741 : * of the bat and hence be effectively empty */
742 0 : if (lastp)
743 0 : *lastp = 0;
744 0 : if (lastgp)
745 0 : *lastgp = 0;
746 0 : bn = BATdense(0, 0, 0);
747 0 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
748 : ",g=" ALGOBATFMT ",n=" BUNFMT " -> " ALGOOPTBATFMT
749 : " (empty -- " LLFMT " usec)\n",
750 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), ALGOBATPAR(g), n,
751 : ALGOOPTBATPAR(bn), GDKusec() - t0);
752 0 : return bn;
753 : }
754 :
755 603 : if (BATtdense(g)) {
756 : /* trivial: g determines ordering, return reference to
757 : * initial part of b (or slice of s) */
758 69 : if (lastgp)
759 31 : *lastgp = g->tseqbase + n - 1;
760 69 : bn = canditer_slice(&ci, 0, n);
761 69 : if (bn && lastp)
762 31 : *lastp = BUNtoid(bn, n - 1);
763 69 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
764 : ",g=" ALGOBATFMT ",n=" BUNFMT " -> " ALGOOPTBATFMT
765 : " (dense group -- " LLFMT " usec)\n",
766 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), ALGOBATPAR(g), n,
767 : ALGOOPTBATPAR(bn), GDKusec() - t0);
768 69 : return bn;
769 : }
770 :
771 534 : bn = COLnew(0, TYPE_oid, n, TRANSIENT);
772 535 : if (bn == NULL)
773 : return NULL;
774 535 : BATsetcount(bn, n);
775 535 : oids = (oid *) Tloc(bn, 0);
776 535 : gv = (const oid *) Tloc(g, 0);
777 535 : goids = GDKmalloc(n * sizeof(oid));
778 535 : if (goids == NULL) {
779 0 : BBPreclaim(bn);
780 0 : return NULL;
781 : }
782 :
783 535 : cmp = ATOMcompare(tpe);
784 535 : nil = ATOMnilptr(tpe);
785 : /* if base type has same comparison function as type itself, we
786 : * can use the base type */
787 535 : tpe = ATOMbasetype(tpe); /* takes care of oid */
788 535 : j = 0;
789 10821319 : for (i = 0; i < n; i++) {
790 10820784 : oids[i] = canditer_next(&ci);
791 10820784 : goids[i] = gv[j++];
792 : }
793 535 : cnt -= n;
794 :
795 535 : if (BATtvoid(bi->b)) {
796 : /* nilslast doesn't make a difference (all nil, or no nil) */
797 0 : if (asc) {
798 0 : heapify(LTvoidgrp, SWAP2);
799 0 : while (cnt > 0) {
800 0 : cnt--;
801 0 : i = canditer_next(&ci);
802 0 : if (gv[j] < goids[0]
803 : /* || (gv[j] == goids[0]
804 : && i < oids[0]) -- always false */) {
805 0 : oids[0] = i;
806 0 : goids[0] = gv[j];
807 0 : siftdown(LTvoidgrp, 0, SWAP2);
808 : }
809 0 : j++;
810 : }
811 : } else {
812 0 : heapify(GTvoidgrp, SWAP2);
813 0 : while (cnt > 0) {
814 0 : cnt--;
815 0 : i = canditer_next(&ci);
816 0 : if (gv[j] < goids[0]
817 0 : || (gv[j] == goids[0]
818 : /* && i > oids[0] -- always true */)) {
819 0 : oids[0] = i;
820 0 : goids[0] = gv[j];
821 0 : siftdown(GTvoidgrp, 0, SWAP2);
822 : }
823 0 : j++;
824 : }
825 : }
826 535 : } else if (asc) {
827 474 : if (nilslast && !bi->nonil) {
828 0 : switch (tpe) {
829 0 : case TYPE_bte:
830 0 : shuffle_unique_with_groups(bte, nLTbte);
831 : break;
832 0 : case TYPE_sht:
833 0 : shuffle_unique_with_groups(sht, nLTsht);
834 : break;
835 0 : case TYPE_int:
836 0 : shuffle_unique_with_groups(int, nLTint);
837 : break;
838 0 : case TYPE_lng:
839 0 : shuffle_unique_with_groups(lng, nLTlng);
840 : break;
841 : #ifdef HAVE_HGE
842 0 : case TYPE_hge:
843 0 : shuffle_unique_with_groups(hge, nLThge);
844 : break;
845 : #endif
846 0 : case TYPE_flt:
847 0 : shuffle_unique_with_groups(flt, nLTflt);
848 : break;
849 0 : case TYPE_dbl:
850 0 : shuffle_unique_with_groups(dbl, nLTdbl);
851 : break;
852 0 : default:
853 0 : heapify(nLTanygrp, SWAP2);
854 0 : while (cnt > 0) {
855 0 : cnt--;
856 0 : i = canditer_next(&ci);
857 0 : if (gv[j] < goids[0]
858 0 : || (gv[j] == goids[0]
859 0 : && cmp(BUNtail(*bi, i - hseq), nil) != 0
860 0 : && (cmp(BUNtail(*bi, oids[0] - hseq), nil) == 0
861 0 : || cmp(BUNtail(*bi, i - hseq),
862 0 : BUNtail(*bi, oids[0] - hseq)) < 0))) {
863 0 : oids[0] = i;
864 0 : goids[0] = gv[j];
865 0 : siftdown(nLTanygrp, 0, SWAP2);
866 : }
867 0 : j++;
868 : }
869 : break;
870 : }
871 : } else {
872 474 : switch (tpe) {
873 20 : case TYPE_bte:
874 1548534 : shuffle_unique_with_groups(bte, LT);
875 : break;
876 14 : case TYPE_sht:
877 1000063 : shuffle_unique_with_groups(sht, LT);
878 : break;
879 144 : case TYPE_int:
880 39396 : shuffle_unique_with_groups(int, LT);
881 : break;
882 10 : case TYPE_lng:
883 502825 : shuffle_unique_with_groups(lng, LT);
884 : break;
885 : #ifdef HAVE_HGE
886 27 : case TYPE_hge:
887 2491 : shuffle_unique_with_groups(hge, LT);
888 : break;
889 : #endif
890 0 : case TYPE_flt:
891 0 : shuffle_unique_with_groups(flt, LTflt);
892 : break;
893 4 : case TYPE_dbl:
894 15 : shuffle_unique_with_groups(dbl, LTdbl);
895 : break;
896 255 : default:
897 274054 : heapify(LTanygrp, SWAP2);
898 26893 : while (cnt > 0) {
899 26638 : cnt--;
900 26638 : i = canditer_next(&ci);
901 26627 : if (gv[j] < goids[0] ||
902 26282 : (gv[j] == goids[0] &&
903 26276 : cmp(BUNtail(*bi, i - hseq),
904 26281 : BUNtail(*bi, oids[0] - hseq)) < 0)) {
905 4385 : oids[0] = i;
906 4385 : goids[0] = gv[j];
907 26912 : siftdown(LTanygrp, 0, SWAP2);
908 : }
909 26638 : j++;
910 : }
911 : break;
912 : }
913 : }
914 61 : } else if (nilslast || bi->nonil) {
915 42 : switch (tpe) {
916 5 : case TYPE_bte:
917 227245 : shuffle_unique_with_groups(bte, GT);
918 : break;
919 0 : case TYPE_sht:
920 0 : shuffle_unique_with_groups(sht, GT);
921 : break;
922 15 : case TYPE_int:
923 5301732 : shuffle_unique_with_groups(int, GT);
924 : break;
925 0 : case TYPE_lng:
926 0 : shuffle_unique_with_groups(lng, GT);
927 : break;
928 : #ifdef HAVE_HGE
929 10 : case TYPE_hge:
930 246936 : shuffle_unique_with_groups(hge, GT);
931 : break;
932 : #endif
933 0 : case TYPE_flt:
934 0 : shuffle_unique_with_groups(flt, GTflt);
935 : break;
936 1 : case TYPE_dbl:
937 12 : shuffle_unique_with_groups(dbl, GTdbl);
938 : break;
939 11 : default:
940 71 : heapify(GTanygrp, SWAP2);
941 14 : while (cnt > 0) {
942 3 : cnt--;
943 3 : i = canditer_next(&ci);
944 3 : if (gv[j] < goids[0] ||
945 2 : (gv[j] == goids[0] &&
946 2 : cmp(BUNtail(*bi, i - hseq),
947 2 : BUNtail(*bi, oids[0] - hseq)) > 0)) {
948 1 : oids[0] = i;
949 1 : goids[0] = gv[j];
950 3 : siftdown(GTanygrp, 0, SWAP2);
951 : }
952 3 : j++;
953 : }
954 : break;
955 : }
956 : } else {
957 19 : switch (tpe) {
958 2 : case TYPE_bte:
959 441468 : shuffle_unique_with_groups(bte, nGTbte);
960 : break;
961 0 : case TYPE_sht:
962 0 : shuffle_unique_with_groups(sht, nGTsht);
963 : break;
964 0 : case TYPE_int:
965 0 : shuffle_unique_with_groups(int, nGTint);
966 : break;
967 0 : case TYPE_lng:
968 0 : shuffle_unique_with_groups(lng, nGTlng);
969 : break;
970 : #ifdef HAVE_HGE
971 17 : case TYPE_hge:
972 1619099 : shuffle_unique_with_groups(hge, nGThge);
973 : break;
974 : #endif
975 0 : case TYPE_flt:
976 0 : shuffle_unique_with_groups(flt, nGTflt);
977 : break;
978 0 : case TYPE_dbl:
979 0 : shuffle_unique_with_groups(dbl, nGTdbl);
980 : break;
981 0 : default:
982 0 : heapify(nGTanygrp, SWAP2);
983 0 : while (cnt > 0) {
984 0 : cnt--;
985 0 : i = canditer_next(&ci);
986 0 : if (gv[j] < goids[0]
987 0 : || (gv[j] == goids[0]
988 0 : && cmp(BUNtail(*bi, oids[0] - hseq), nil) != 0
989 0 : && (cmp(BUNtail(*bi, i - hseq), nil) == 0
990 0 : || cmp(BUNtail(*bi, i - hseq),
991 0 : BUNtail(*bi, oids[0] - hseq)) > 0))) {
992 0 : oids[0] = i;
993 0 : goids[0] = gv[j];
994 0 : siftdown(nGTanygrp, 0, SWAP2);
995 : }
996 0 : j++;
997 : }
998 : break;
999 : }
1000 : }
1001 535 : if (lastp)
1002 308 : *lastp = oids[0];
1003 535 : if (lastgp)
1004 308 : *lastgp = goids[0];
1005 535 : GDKfree(goids);
1006 : /* output must be sorted since it's a candidate list */
1007 535 : GDKqsort(oids, NULL, NULL, (size_t) n, sizeof(oid), 0, TYPE_oid, false, false);
1008 535 : bn->tsorted = true;
1009 535 : bn->trevsorted = n <= 1;
1010 535 : bn->tkey = true;
1011 535 : bn->tseqbase = n <= 1 ? oids[0] : oid_nil;
1012 535 : bn->tnil = false;
1013 535 : bn->tnonil = true;
1014 535 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
1015 : ",g=" ALGOBATFMT ",n=" BUNFMT " -> " ALGOOPTBATFMT
1016 : " (" LLFMT " usec)\n",
1017 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), ALGOBATPAR(g), n,
1018 : ALGOOPTBATPAR(bn), GDKusec() - t0);
1019 : return bn;
1020 : }
1021 :
1022 : static gdk_return
1023 281 : BATfirstn_grouped(BAT **topn, BAT **gids, BATiter *bi, BAT *s, BUN n, bool asc, bool nilslast, bool distinct, lng t0)
1024 : {
1025 281 : BAT *bn, *gn = NULL, *su = NULL;
1026 281 : oid last;
1027 281 : gdk_return rc;
1028 :
1029 281 : MT_thread_setalgorithm(__func__);
1030 281 : if (distinct && !bi->key) {
1031 0 : su = s;
1032 0 : s = BATunique(bi->b, s);
1033 0 : if (s == NULL)
1034 : return GDK_FAIL;
1035 : }
1036 281 : bn = BATfirstn_unique(bi, s, n, asc, nilslast, &last, t0);
1037 281 : if (bn == NULL)
1038 : return GDK_FAIL;
1039 281 : if (BATcount(bn) == 0) {
1040 0 : if (gids) {
1041 0 : gn = BATdense(0, 0, 0);
1042 0 : if (gn == NULL) {
1043 0 : BBPunfix(bn->batCacheid);
1044 0 : return GDK_FAIL;
1045 : }
1046 0 : *gids = gn;
1047 : }
1048 0 : *topn = bn;
1049 0 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
1050 : ",n=" BUNFMT " -> " ALGOOPTBATFMT "," ALGOOPTBATFMT
1051 : " (empty -- " LLFMT " usec)\n",
1052 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), n,
1053 : ALGOOPTBATPAR(bn), ALGOOPTBATPAR(gn), GDKusec() - t0);
1054 0 : return GDK_SUCCEED;
1055 : }
1056 281 : if (!bi->key) {
1057 236 : if (distinct) {
1058 0 : BAT *bn1;
1059 :
1060 0 : bn1 = bn;
1061 0 : BBPunfix(s->batCacheid);
1062 0 : bn = BATintersect(bi->b, bi->b, su, bn1, true, false, BUN_NONE);
1063 0 : BBPunfix(bn1->batCacheid);
1064 0 : if (bn == NULL)
1065 : return GDK_FAIL;
1066 236 : } else if (last != oid_nil) {
1067 142 : BAT *bn1, *bn2;
1068 :
1069 142 : bn1 = bn;
1070 142 : bn2 = BATselect(bi->b, s, BUNtail(*bi, last - bi->b->hseqbase), NULL, true, false, false);
1071 142 : if (bn2 == NULL) {
1072 0 : BBPunfix(bn1->batCacheid);
1073 0 : return GDK_FAIL;
1074 : }
1075 142 : bn = BATmergecand(bn1, bn2);
1076 142 : BBPunfix(bn1->batCacheid);
1077 142 : BBPunfix(bn2->batCacheid);
1078 142 : if (bn == NULL)
1079 : return GDK_FAIL;
1080 : }
1081 : }
1082 281 : if (gids) {
1083 281 : BAT *bn1, *bn2, *bn3, *bn4;
1084 281 : bn1 = BATproject(bn, bi->b);
1085 281 : if (bn1 == NULL) {
1086 0 : BBPunfix(bn->batCacheid);
1087 0 : return GDK_FAIL;
1088 : }
1089 281 : rc = BATsort(NULL, &bn2, &bn3, bn1, NULL, NULL, !asc, !asc, false);
1090 281 : BBPunfix(bn1->batCacheid);
1091 281 : if (rc != GDK_SUCCEED) {
1092 0 : BBPunfix(bn->batCacheid);
1093 0 : return GDK_FAIL;
1094 : }
1095 281 : rc = BATsort(NULL, &bn4, NULL, bn2, NULL, NULL, false, false, false);
1096 281 : BBPunfix(bn2->batCacheid);
1097 281 : if (rc != GDK_SUCCEED) {
1098 0 : BBPunfix(bn->batCacheid);
1099 0 : BBPunfix(bn3->batCacheid);
1100 0 : return GDK_FAIL;
1101 : }
1102 281 : gn = BATproject(bn4, bn3);
1103 281 : BBPunfix(bn3->batCacheid);
1104 281 : BBPunfix(bn4->batCacheid);
1105 281 : if (gn == NULL) {
1106 0 : BBPunfix(bn->batCacheid);
1107 0 : return GDK_FAIL;
1108 : }
1109 281 : *gids = gn;
1110 281 : assert(BATcount(gn) == BATcount(bn));
1111 : }
1112 281 : *topn = bn;
1113 281 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
1114 : ",n=" BUNFMT " -> " ALGOOPTBATFMT "," ALGOOPTBATFMT
1115 : " (" LLFMT " usec)\n",
1116 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), n,
1117 : ALGOOPTBATPAR(bn), ALGOOPTBATPAR(gn), GDKusec() - t0);
1118 : return GDK_SUCCEED;
1119 : }
1120 :
1121 : static gdk_return
1122 339 : BATfirstn_grouped_with_groups(BAT **topn, BAT **gids, BATiter *bi, BAT *s, BAT *g, BUN n, bool asc, bool nilslast, bool distinct, lng t0)
1123 : {
1124 339 : BAT *bn, *gn = NULL;
1125 339 : oid hseq = bi->b->hseqbase;
1126 339 : oid last, lastg;
1127 339 : gdk_return rc;
1128 :
1129 339 : MT_thread_setalgorithm(__func__);
1130 339 : if (distinct) {
1131 0 : BAT *bn1, *bn2, *bn3, *bn4, *bn5, *bn6, *bn7;
1132 0 : if (BATgroup(&bn1, &bn2, NULL, bi->b, s, g, NULL, NULL) != GDK_SUCCEED)
1133 0 : return GDK_FAIL;
1134 0 : bn3 = BATproject(bn2, bi->b);
1135 0 : if (bn3 == NULL) {
1136 0 : BBPunfix(bn1->batCacheid);
1137 0 : BBPunfix(bn2->batCacheid);
1138 0 : return GDK_FAIL;
1139 : }
1140 0 : bn4 = BATintersect(s, bn2, NULL, NULL, false, false, BUN_NONE);
1141 0 : BBPunfix(bn2->batCacheid);
1142 0 : if (bn4 == NULL) {
1143 0 : BBPunfix(bn1->batCacheid);
1144 0 : return GDK_FAIL;
1145 : }
1146 0 : bn5 = BATproject(bn4, g);
1147 0 : BBPunfix(bn4->batCacheid);
1148 0 : if (bn5 == NULL) {
1149 0 : BBPunfix(bn1->batCacheid);
1150 0 : return GDK_FAIL;
1151 : }
1152 0 : BATiter bn3i = bat_iterator(bn3);
1153 0 : bn6 = BATfirstn_unique_with_groups(&bn3i, NULL, bn5, n, asc, nilslast, NULL, NULL, t0);
1154 0 : bat_iterator_end(&bn3i);
1155 0 : BBPunfix(bn3->batCacheid);
1156 0 : BBPunfix(bn5->batCacheid);
1157 0 : if (bn6 == NULL) {
1158 0 : BBPunfix(bn1->batCacheid);
1159 0 : return GDK_FAIL;
1160 : }
1161 0 : rc = BATleftjoin(&bn7, NULL, bn1, bn6, NULL, NULL, false, BUN_NONE);
1162 0 : BBPunfix(bn6->batCacheid);
1163 0 : if (rc != GDK_SUCCEED)
1164 : return GDK_FAIL;
1165 0 : bn = BATproject(bn7, s);
1166 0 : BBPunfix(bn7->batCacheid);
1167 0 : if (bn == NULL)
1168 : return GDK_FAIL;
1169 : } else {
1170 339 : bn = BATfirstn_unique_with_groups(bi, s, g, n, asc, nilslast, &last, &lastg, t0);
1171 339 : if (bn == NULL)
1172 : return GDK_FAIL;
1173 : }
1174 339 : if (BATcount(bn) == 0) {
1175 0 : if (gids) {
1176 0 : gn = BATdense(0, 0, 0);
1177 0 : if (gn == NULL) {
1178 0 : BBPunfix(bn->batCacheid);
1179 0 : return GDK_FAIL;
1180 : }
1181 0 : *gids = gn;
1182 : }
1183 0 : *topn = bn;
1184 0 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
1185 : ",g=" ALGOBATFMT ",n=" BUNFMT
1186 : " -> " ALGOOPTBATFMT "," ALGOOPTBATFMT
1187 : " (empty -- " LLFMT " usec)\n",
1188 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), ALGOBATPAR(g), n,
1189 : ALGOOPTBATPAR(bn), ALGOOPTBATPAR(gn), GDKusec() - t0);
1190 0 : return GDK_SUCCEED;
1191 : }
1192 339 : if (!distinct && !bi->key) {
1193 304 : BAT *bn1, *bn2, *bn3, *bn4;
1194 :
1195 304 : bn1 = bn;
1196 304 : bn2 = BATselect(g, NULL, &lastg, NULL, true, false, false);
1197 304 : if (bn2 == NULL) {
1198 0 : BBPunfix(bn1->batCacheid);
1199 0 : return GDK_FAIL;
1200 : }
1201 304 : bn3 = BATproject(bn2, s);
1202 304 : BBPunfix(bn2->batCacheid);
1203 304 : if (bn3 == NULL) {
1204 0 : BBPunfix(bn1->batCacheid);
1205 0 : return GDK_FAIL;
1206 : }
1207 304 : bn4 = BATselect(bi->b, bn3, BUNtail(*bi, last - hseq), NULL, true, false, false);
1208 304 : BBPunfix(bn3->batCacheid);
1209 304 : if (bn4 == NULL) {
1210 0 : BBPunfix(bn1->batCacheid);
1211 0 : return GDK_FAIL;
1212 : }
1213 304 : bn = BATmergecand(bn1, bn4);
1214 304 : BBPunfix(bn1->batCacheid);
1215 304 : BBPunfix(bn4->batCacheid);
1216 304 : if (bn == NULL)
1217 : return GDK_FAIL;
1218 : }
1219 339 : if (gids) {
1220 339 : BAT *bn1, *bn2, *bn3, *bn4, *bn5, *bn6, *bn7, *bn8;
1221 :
1222 339 : if ((bn1 = BATintersect(s, bn, NULL, NULL, false, false, BUN_NONE)) == NULL) {
1223 0 : BBPunfix(bn->batCacheid);
1224 0 : return GDK_FAIL;
1225 : }
1226 339 : bn2 = BATproject(bn1, g);
1227 339 : BBPunfix(bn1->batCacheid);
1228 339 : if (bn2 == NULL) {
1229 0 : BBPunfix(bn->batCacheid);
1230 0 : return GDK_FAIL;
1231 : }
1232 339 : bn3 = BATproject(bn, bi->b);
1233 339 : if (bn3 == NULL) {
1234 0 : BBPunfix(bn2->batCacheid);
1235 0 : return GDK_FAIL;
1236 : }
1237 339 : rc = BATsort(NULL, &bn4, &bn5, bn2, NULL, NULL, false, false, false);
1238 339 : BBPunfix(bn2->batCacheid);
1239 339 : if (rc != GDK_SUCCEED) {
1240 0 : BBPunfix(bn->batCacheid);
1241 0 : BBPunfix(bn3->batCacheid);
1242 0 : return GDK_FAIL;
1243 : }
1244 339 : rc = BATsort(NULL, &bn6, &bn7, bn3, bn4, bn5, !asc, !asc, false);
1245 339 : BBPunfix(bn3->batCacheid);
1246 339 : BBPunfix(bn4->batCacheid);
1247 339 : BBPunfix(bn5->batCacheid);
1248 339 : if (rc != GDK_SUCCEED) {
1249 0 : BBPunfix(bn->batCacheid);
1250 0 : return GDK_FAIL;
1251 : }
1252 339 : rc = BATsort(NULL, &bn8, NULL, bn6, NULL, NULL, false, false, false);
1253 339 : BBPunfix(bn6->batCacheid);
1254 339 : if (rc != GDK_SUCCEED) {
1255 0 : BBPunfix(bn->batCacheid);
1256 0 : BBPunfix(bn7->batCacheid);
1257 0 : return GDK_FAIL;
1258 : }
1259 339 : gn = BATproject(bn8, bn7);
1260 339 : BBPunfix(bn7->batCacheid);
1261 339 : BBPunfix(bn8->batCacheid);
1262 339 : if (gn == NULL) {
1263 0 : BBPunfix(bn->batCacheid);
1264 0 : return GDK_FAIL;
1265 : }
1266 339 : *gids = gn;
1267 339 : assert(BATcount(gn) == BATcount(bn));
1268 : }
1269 339 : *topn = bn;
1270 339 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
1271 : ",g=" ALGOBATFMT ",n=" BUNFMT
1272 : " -> " ALGOOPTBATFMT "," ALGOOPTBATFMT
1273 : " (" LLFMT " usec)\n",
1274 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), ALGOBATPAR(g), n,
1275 : ALGOOPTBATPAR(bn), ALGOOPTBATPAR(gn), GDKusec() - t0);
1276 : return GDK_SUCCEED;
1277 : }
1278 :
1279 : gdk_return
1280 1523 : BATfirstn(BAT **topn, BAT **gids, BAT *b, BAT *s, BAT *g, BUN n, bool asc, bool nilslast, bool distinct)
1281 : {
1282 1523 : lng t0 = 0;
1283 1523 : gdk_return rc = GDK_SUCCEED;
1284 :
1285 1523 : assert(topn != NULL);
1286 1523 : if (b == NULL) {
1287 0 : *topn = NULL;
1288 0 : return GDK_SUCCEED;
1289 : }
1290 :
1291 1523 : TRC_DEBUG_IF(ALGO) t0 = GDKusec();
1292 :
1293 1523 : (void) BATordered(b);
1294 1523 : (void) BATordered_rev(b);
1295 :
1296 1523 : BATiter bi = bat_iterator(b);
1297 :
1298 : /* if g specified, then so must s */
1299 1523 : assert(g == NULL || s != NULL);
1300 : /* g and s must be aligned (same size, same hseqbase) */
1301 1523 : assert(g == NULL || BATcount(s) == BATcount(g));
1302 625 : assert(g == NULL || BATcount(g) == 0 || s->hseqbase == g->hseqbase);
1303 :
1304 1523 : if (n == 0 || BATcount(b) == 0 || (s != NULL && BATcount(s) == 0)) {
1305 : /* trivial: empty result */
1306 61 : *topn = BATdense(0, 0, 0);
1307 61 : if (*topn == NULL) {
1308 : rc = GDK_FAIL;
1309 61 : } else if (gids) {
1310 23 : *gids = BATdense(0, 0, 0);
1311 23 : if (*gids == NULL) {
1312 0 : BBPreclaim(*topn);
1313 : rc = GDK_FAIL;
1314 : }
1315 : }
1316 1462 : } else if (g == NULL) {
1317 858 : if (gids == NULL && !distinct) {
1318 577 : *topn = BATfirstn_unique(&bi, s, n, asc, nilslast, NULL, t0);
1319 577 : rc = *topn ? GDK_SUCCEED : GDK_FAIL;
1320 : } else {
1321 281 : rc = BATfirstn_grouped(topn, gids, &bi, s, n, asc, nilslast, distinct, t0);
1322 : }
1323 604 : } else if (gids == NULL && !distinct) {
1324 265 : *topn = BATfirstn_unique_with_groups(&bi, s, g, n, asc, nilslast, NULL, NULL, t0);
1325 265 : rc = *topn ? GDK_SUCCEED : GDK_FAIL;
1326 : } else {
1327 339 : rc = BATfirstn_grouped_with_groups(topn, gids, &bi, s, g, n, asc, nilslast, distinct, t0);
1328 : }
1329 1523 : bat_iterator_end(&bi);
1330 1523 : return rc;
1331 : }
|