Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "gdk.h"
15 : #include "gdk_private.h"
16 : #include "gdk_calc_private.h"
17 :
18 : /* BATfirstn select the smallest n elements from the input bat b (if
19 : * asc(ending) is set, else the largest n elements). Conceptually, b
20 : * is sorted in ascending or descending order (depending on the asc
21 : * argument) and then the OIDs of the first n elements are returned.
22 : * If there are NILs in the BAT, their relative ordering is set by
23 : * using the nilslast argument: if set, NILs come last (largest value
24 : * when ascending, smallest value when descending), so if there are
25 : * enough non-NIL values, no NILs will be returned. If unset (false),
26 : * NILs come first and will be returned.
27 : *
28 : * In addition to the input BAT b, there can be a standard candidate
29 : * list s. If s is specified (non-NULL), only elements in b that are
30 : * referred to in s are considered.
31 : *
32 : * If the third input bat g is non-NULL, then s must also be non-NULL
33 : * and must be aligned with g. G then specifies groups to which the
34 : * elements referred to in s belong. Conceptually, the group values
35 : * are sorted in ascending order together with the elements in b that
36 : * are referred to in s (in ascending or descending order depending on
37 : * asc), and the first n elements are then returned.
38 : *
39 : * If the output argument gids is NULL, only n elements are returned.
40 : * If the output argument gids is non-NULL, more than n elements may
41 : * be returned. If there are duplicate values (if g is given, the
42 : * group value counts in determining duplication), all duplicates are
43 : * returned.
44 : *
45 : * If distinct is set, the result contains n complete groups of values
46 : * instead of just n values (or slightly more than n if gids is set
47 : * since then the "last" group is returned completely).
48 : *
49 : * Note that BATfirstn can be called in cascading fashion to calculate
50 : * the first n values of a table of multiple columns:
51 : * BATfirstn(&s1, &g1, b1, NULL, NULL, n, asc, nilslast, distinct);
52 : * BATfirstn(&s2, &g2, b2, s1, g1, n, asc, nilslast, distinct);
53 : * BATfirstn(&s3, NULL, b3, s2, g2, n, asc, nilslast, distinct);
54 : * If the input BATs b1, b2, b3 are large enough, s3 will contain the
55 : * OIDs of the smallest (largest) n elements in the table consisting
56 : * of the columns b1, b2, b3 when ordered in ascending order with b1
57 : * the major key.
58 : */
59 :
60 : /* We use a binary heap for the implementation of the simplest form of
61 : * first-N. During processing, the oids list forms a heap with the
62 : * root at position 0 and the children of a node at position i at
63 : * positions 2*i+1 and 2*i+2. The parent node is always
64 : * smaller/larger (depending on the value of asc) than its children
65 : * (recursively). The heapify macro creates the heap from the input
66 : * in-place. We start off with a heap containing the first N elements
67 : * of the input, and then go over the rest of the input, replacing the
68 : * root of the heap with a new value if appropriate (if the new value
69 : * is among the first-N seen so far). The siftdown macro then
70 : * restores the heap property. */
71 : #define siftdown(OPER, START, SWAP) \
72 : do { \
73 : pos = (START); \
74 : childpos = (pos << 1) + 1; \
75 : while (childpos < n) { \
76 : /* find most extreme child */ \
77 : if (childpos + 1 < n && \
78 : !(OPER(childpos + 1, childpos))) \
79 : childpos++; \
80 : /* compare parent with most extreme child */ \
81 : if (!OPER(pos, childpos)) { \
82 : /* already correctly ordered */ \
83 : break; \
84 : } \
85 : /* exchange parent with child and sift child */ \
86 : /* further */ \
87 : SWAP(pos, childpos); \
88 : pos = childpos; \
89 : childpos = (pos << 1) + 1; \
90 : } \
91 : } while (0)
92 :
93 : #define heapify(OPER, SWAP) \
94 : do { \
95 : for (i = n / 2; i > 0; i--) \
96 : siftdown(OPER, i - 1, SWAP); \
97 : } while (0)
98 :
99 : /* we inherit LT and GT from gdk_calc_private.h */
100 :
101 : #define nLTbte(a, b) (!is_bte_nil(a) && (is_bte_nil(b) || (a) < (b)))
102 : #define nLTsht(a, b) (!is_sht_nil(a) && (is_sht_nil(b) || (a) < (b)))
103 : #define nLTint(a, b) (!is_int_nil(a) && (is_int_nil(b) || (a) < (b)))
104 : #define nLTlng(a, b) (!is_lng_nil(a) && (is_lng_nil(b) || (a) < (b)))
105 : #define nLThge(a, b) (!is_hge_nil(a) && (is_hge_nil(b) || (a) < (b)))
106 :
107 : #define nGTbte(a, b) (!is_bte_nil(b) && (is_bte_nil(a) || (a) > (b)))
108 : #define nGTsht(a, b) (!is_sht_nil(b) && (is_sht_nil(a) || (a) > (b)))
109 : #define nGTint(a, b) (!is_int_nil(b) && (is_int_nil(a) || (a) > (b)))
110 : #define nGTlng(a, b) (!is_lng_nil(b) && (is_lng_nil(a) || (a) > (b)))
111 : #define nGThge(a, b) (!is_hge_nil(b) && (is_hge_nil(a) || (a) > (b)))
112 :
113 : #define LTany(p1, p2) (cmp(BUNtail(*bi, oids[p1] - hseq), \
114 : BUNtail(*bi, oids[p2] - hseq)) < 0)
115 : #define GTany(p1, p2) (cmp(BUNtail(*bi, oids[p1] - hseq), \
116 : BUNtail(*bi, oids[p2] - hseq)) > 0)
117 :
118 : #define nLTany(p1, p2) (cmp(BUNtail(*bi, oids[p1] - hseq), nil) != 0 \
119 : && (cmp(BUNtail(*bi, oids[p2] - hseq), nil) == 0 \
120 : || cmp(BUNtail(*bi, oids[p1] - hseq), \
121 : BUNtail(*bi, oids[p2] - hseq)) < 0))
122 : #define nGTany(p1, p2) (cmp(BUNtail(*bi, oids[p2] - hseq), nil) != 0 \
123 : && (cmp(BUNtail(*bi, oids[p1] - hseq), nil) == 0 \
124 : || cmp(BUNtail(*bi, oids[p1] - hseq), \
125 : BUNtail(*bi, oids[p2] - hseq)) > 0))
126 :
127 : #define LTflt(a, b) (!is_flt_nil(b) && (is_flt_nil(a) || (a) < (b)))
128 : #define LTdbl(a, b) (!is_dbl_nil(b) && (is_dbl_nil(a) || (a) < (b)))
129 : #define GTflt(a, b) (!is_flt_nil(a) && (is_flt_nil(b) || (a) > (b)))
130 : #define GTdbl(a, b) (!is_dbl_nil(a) && (is_dbl_nil(b) || (a) > (b)))
131 :
132 : #define nLTflt(a, b) (!is_flt_nil(a) && (is_flt_nil(b) || (a) < (b)))
133 : #define nLTdbl(a, b) (!is_dbl_nil(a) && (is_dbl_nil(b) || (a) < (b)))
134 : #define nGTflt(a, b) (!is_flt_nil(b) && (is_flt_nil(a) || (a) > (b)))
135 : #define nGTdbl(a, b) (!is_dbl_nil(b) && (is_dbl_nil(a) || (a) > (b)))
136 :
137 : #define LTfltfix(p1, p2) LTflt(vals[oids[p1] - hseq], \
138 : vals[oids[p2] - hseq])
139 : #define GTfltfix(p1, p2) GTflt(vals[oids[p1] - hseq], \
140 : vals[oids[p2] - hseq])
141 : #define LTdblfix(p1, p2) LTdbl(vals[oids[p1] - hseq], \
142 : vals[oids[p2] - hseq])
143 : #define GTdblfix(p1, p2) GTdbl(vals[oids[p1] - hseq], \
144 : vals[oids[p2] - hseq])
145 : #define LTfix(p1, p2) LT(vals[oids[p1] - hseq], \
146 : vals[oids[p2] - hseq])
147 : #define GTfix(p1, p2) GT(vals[oids[p1] - hseq], \
148 : vals[oids[p2] - hseq])
149 :
150 : #define nLTfltfix(p1, p2) nLTflt(vals[oids[p1] - hseq], \
151 : vals[oids[p2] - hseq])
152 : #define nGTfltfix(p1, p2) nGTflt(vals[oids[p1] - hseq], \
153 : vals[oids[p2] - hseq])
154 : #define nLTdblfix(p1, p2) nLTdbl(vals[oids[p1] - hseq], \
155 : vals[oids[p2] - hseq])
156 : #define nGTdblfix(p1, p2) nGTdbl(vals[oids[p1] - hseq], \
157 : vals[oids[p2] - hseq])
158 : #define nLTbtefix(p1, p2) nLTbte(vals[oids[p1] - hseq], \
159 : vals[oids[p2] - hseq])
160 : #define nGTbtefix(p1, p2) nGTbte(vals[oids[p1] - hseq], \
161 : vals[oids[p2] - hseq])
162 : #define nLTshtfix(p1, p2) nLTsht(vals[oids[p1] - hseq], \
163 : vals[oids[p2] - hseq])
164 : #define nGTshtfix(p1, p2) nGTsht(vals[oids[p1] - hseq], \
165 : vals[oids[p2] - hseq])
166 : #define nLTintfix(p1, p2) nLTint(vals[oids[p1] - hseq], \
167 : vals[oids[p2] - hseq])
168 : #define nGTintfix(p1, p2) nGTint(vals[oids[p1] - hseq], \
169 : vals[oids[p2] - hseq])
170 : #define nLTlngfix(p1, p2) nLTlng(vals[oids[p1] - hseq], \
171 : vals[oids[p2] - hseq])
172 : #define nGTlngfix(p1, p2) nGTlng(vals[oids[p1] - hseq], \
173 : vals[oids[p2] - hseq])
174 : #define nLThgefix(p1, p2) nLThge(vals[oids[p1] - hseq], \
175 : vals[oids[p2] - hseq])
176 : #define nGThgefix(p1, p2) nGThge(vals[oids[p1] - hseq], \
177 : vals[oids[p2] - hseq])
178 :
179 : #define SWAP1(p1, p2) \
180 : do { \
181 : item = oids[p1]; \
182 : oids[p1] = oids[p2]; \
183 : oids[p2] = item; \
184 : } while (0)
185 :
186 : #define shuffle_unique(TYPE, OP) \
187 : do { \
188 : const TYPE *restrict vals = (const TYPE *) bi->base; \
189 : heapify(OP##fix, SWAP1); \
190 : while (cnt > 0) { \
191 : cnt--; \
192 : i = canditer_next(&ci); \
193 : if (OP(vals[i - hseq], \
194 : vals[oids[0] - hseq])) { \
195 : oids[0] = i; \
196 : siftdown(OP##fix, 0, SWAP1); \
197 : } \
198 : } \
199 : } while (0)
200 :
201 : /* This version of BATfirstn returns a list of N oids (where N is the
202 : * smallest among BATcount(b), BATcount(s), and n). The oids returned
203 : * refer to the N smallest/largest (depending on asc) tail values of b
204 : * (taking the optional candidate list s into account). If there are
205 : * multiple equal values to take us past N, we return a subset of those.
206 : *
207 : * If lastp is non-NULL, it is filled in with the oid of the "last"
208 : * value, i.e. the value of which there may be multiple occurrences
209 : * that are not all included in the first N.
210 : */
211 : static BAT *
212 737 : BATfirstn_unique(BATiter *bi, BAT *s, BUN n, bool asc, bool nilslast, oid *lastp, lng t0)
213 : {
214 737 : BAT *bn;
215 737 : oid *restrict oids;
216 737 : oid hseq = bi->b->hseqbase;
217 737 : BUN i, cnt;
218 737 : struct canditer ci;
219 737 : int tpe = bi->type;
220 737 : int (*cmp)(const void *, const void *);
221 737 : const void *nil;
222 : /* variables used in heapify/siftdown macros */
223 737 : oid item;
224 737 : BUN pos, childpos;
225 :
226 737 : MT_thread_setalgorithm(__func__);
227 737 : canditer_init(&ci, bi->b, s);
228 737 : cnt = ci.ncand;
229 :
230 737 : if (n >= cnt) {
231 : /* trivial: return all candidates */
232 410 : bn = canditer_slice(&ci, 0, ci.ncand);
233 410 : if (bn && lastp)
234 111 : *lastp = oid_nil;
235 410 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
236 : ",n=" BUNFMT " -> " ALGOOPTBATFMT
237 : " (trivial -- " LLFMT " usec)\n",
238 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), n,
239 : ALGOOPTBATPAR(bn), GDKusec() - t0);
240 410 : return bn;
241 : }
242 :
243 327 : if (BATtvoid(bi->b)) {
244 : /* nilslast doesn't make a difference: either all are
245 : * nil, or none are */
246 0 : if (asc || is_oid_nil(bi->tseq)) {
247 : /* return the first part of the candidate list
248 : * or of the BAT itself */
249 0 : bn = canditer_slice(&ci, 0, n);
250 0 : if (bn && lastp)
251 0 : *lastp = BUNtoid(bn, n - 1);
252 0 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
253 : ",n=" BUNFMT " -> " ALGOOPTBATFMT
254 : " (initial slice -- " LLFMT " usec)\n",
255 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), n,
256 : ALGOOPTBATPAR(bn), GDKusec() - t0);
257 0 : return bn;
258 : }
259 : /* return the last part of the candidate list or of
260 : * the BAT itself */
261 0 : bn = canditer_slice(&ci, cnt - n, cnt);
262 0 : if (bn && lastp)
263 0 : *lastp = BUNtoid(bn, 0);
264 0 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
265 : ",n=" BUNFMT " -> " ALGOOPTBATFMT
266 : " (final slice -- " LLFMT " usec)\n",
267 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), n,
268 : ALGOOPTBATPAR(bn), GDKusec() - t0);
269 0 : return bn;
270 : }
271 327 : if (bi->sorted || bi->revsorted) {
272 : /* trivial: b is sorted so we just need to return the
273 : * initial or final part of it (or of the candidate
274 : * list); however, if nilslast == asc, then the nil
275 : * values (if any) are in the wrong place, so we need
276 : * to do a little more work */
277 :
278 : /* after we create the to-be-returned BAT, we set pos
279 : * to the BUN in the new BAT whose value we should
280 : * return through *lastp */
281 179 : if (nilslast == asc && !bi->nonil) {
282 0 : pos = SORTfndlast(bi->b, ATOMnilptr(tpe));
283 0 : pos = canditer_search(&ci, hseq + pos, true);
284 : /* 0 <= pos <= cnt
285 : * 0 < n < cnt
286 : */
287 0 : if (bi->sorted) {
288 : /* [0..pos) -- nil
289 : * [pos..cnt) -- non-nil <<<
290 : */
291 0 : if (asc) { /* i.e. nilslast */
292 : /* prefer non-nil and
293 : * smallest */
294 0 : if (cnt - pos < n) {
295 0 : bn = canditer_slice(&ci, cnt - n, cnt);
296 0 : pos = 0;
297 : } else {
298 0 : bn = canditer_slice(&ci, pos, pos + n);
299 0 : pos = n - 1;
300 : }
301 : } else { /* i.e. !asc, !nilslast */
302 : /* prefer nil and largest */
303 0 : if (pos < n) {
304 0 : bn = canditer_slice2(&ci, 0, pos, cnt - (n - pos), cnt);
305 : /* pos = pos; */
306 : } else {
307 0 : bn = canditer_slice(&ci, 0, n);
308 0 : pos = 0;
309 : }
310 : }
311 : } else { /* i.e. trevsorted */
312 : /* [0..pos) -- non-nil >>>
313 : * [pos..cnt) -- nil
314 : */
315 0 : if (asc) { /* i.e. nilslast */
316 : /* prefer non-nil and
317 : * smallest */
318 0 : if (pos < n) {
319 0 : bn = canditer_slice(&ci, 0, n);
320 : /* pos = pos; */
321 : } else {
322 0 : bn = canditer_slice(&ci, pos - n, pos);
323 0 : pos = 0;
324 : }
325 : } else { /* i.e. !asc, !nilslast */
326 : /* prefer nil and largest */
327 0 : if (cnt - pos < n) {
328 0 : bn = canditer_slice2(&ci, 0, n - (cnt - pos), pos, cnt);
329 0 : pos = n - (cnt - pos) - 1;
330 : } else {
331 0 : bn = canditer_slice(&ci, pos, pos + n);
332 0 : pos = 0;
333 : }
334 : }
335 : }
336 : } else {
337 : /* either there are no nils, or they are in
338 : * the appropriate position already, so we can
339 : * just slice */
340 179 : if (asc ? bi->sorted : bi->revsorted) {
341 : /* return copy of first part of
342 : * candidate list */
343 162 : bn = canditer_slice(&ci, 0, n);
344 162 : pos = n - 1;
345 : } else {
346 : /* return copy of last part of
347 : * candidate list */
348 17 : bn = canditer_slice(&ci, cnt - n, cnt);
349 17 : pos = 0;
350 : }
351 : }
352 179 : if (bn && lastp)
353 39 : *lastp = BUNtoid(bn, pos);
354 179 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
355 : ",n=" BUNFMT " -> " ALGOOPTBATFMT
356 : " (ordered -- " LLFMT " usec)\n",
357 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), n,
358 : ALGOOPTBATPAR(bn), GDKusec() - t0);
359 179 : return bn;
360 : }
361 :
362 148 : bn = COLnew(0, TYPE_oid, n, TRANSIENT);
363 148 : if (bn == NULL)
364 : return NULL;
365 148 : BATsetcount(bn, n);
366 148 : oids = (oid *) Tloc(bn, 0);
367 148 : cmp = ATOMcompare(tpe);
368 148 : nil = ATOMnilptr(tpe);
369 : /* if base type has same comparison function as type itself, we
370 : * can use the base type */
371 148 : tpe = ATOMbasetype(tpe); /* takes care of oid */
372 : /* if the input happens to be almost sorted in ascending order
373 : * (likely a common use case), it is more efficient to start
374 : * off with the first n elements when doing a firstn-ascending
375 : * and to start off with the last n elements when doing a
376 : * firstn-descending so that most values that we look at after
377 : * this will be skipped. However, when using a bitmask
378 : * candidate list, the manipulation of the canditer structure
379 : * doesn't work like this, so we still work from the
380 : * beginning. */
381 148 : if (asc || ci.tpe == cand_mask) {
382 305568 : for (i = 0; i < n; i++)
383 305482 : oids[i] = canditer_next(&ci);
384 : } else {
385 62 : item = canditer_idx(&ci, cnt - n);
386 62 : ci.next = cnt - n;
387 62 : ci.add = item - ci.seq - (cnt - n);
388 642 : for (i = n; i > 0; i--)
389 580 : oids[i - 1] = canditer_next(&ci);
390 62 : canditer_reset(&ci);
391 : }
392 148 : cnt -= n;
393 :
394 148 : if (asc) {
395 86 : if (nilslast && !bi->nonil) {
396 14 : switch (tpe) {
397 2 : case TYPE_bte:
398 22 : shuffle_unique(bte, nLTbte);
399 : break;
400 0 : case TYPE_sht:
401 0 : shuffle_unique(sht, nLTsht);
402 : break;
403 2 : case TYPE_int:
404 20 : shuffle_unique(int, nLTint);
405 : break;
406 4 : case TYPE_lng:
407 43 : shuffle_unique(lng, nLTlng);
408 : break;
409 : #ifdef HAVE_HGE
410 0 : case TYPE_hge:
411 0 : shuffle_unique(hge, nLThge);
412 : break;
413 : #endif
414 2 : case TYPE_flt:
415 23 : shuffle_unique(flt, nLTflt);
416 : break;
417 0 : case TYPE_dbl:
418 0 : shuffle_unique(dbl, nLTdbl);
419 : break;
420 4 : default:
421 29 : heapify(nLTany, SWAP1);
422 10 : while (cnt > 0) {
423 6 : cnt--;
424 6 : i = canditer_next(&ci);
425 6 : if (cmp(BUNtail(*bi, i - hseq), nil) != 0
426 5 : && (cmp(BUNtail(*bi, oids[0] - hseq), nil) == 0
427 1 : || cmp(BUNtail(*bi, i - hseq),
428 1 : BUNtail(*bi, oids[0] - hseq)) < 0)) {
429 5 : oids[0] = i;
430 24 : siftdown(nLTany, 0, SWAP1);
431 : }
432 : }
433 : break;
434 : }
435 : } else {
436 72 : switch (tpe) {
437 0 : case TYPE_bte:
438 0 : shuffle_unique(bte, LT);
439 : break;
440 0 : case TYPE_sht:
441 0 : shuffle_unique(sht, LT);
442 : break;
443 25 : case TYPE_int:
444 7218 : shuffle_unique(int, LT);
445 : break;
446 5 : case TYPE_lng:
447 7652679 : shuffle_unique(lng, LT);
448 : break;
449 : #ifdef HAVE_HGE
450 8 : case TYPE_hge:
451 101654 : shuffle_unique(hge, LT);
452 : break;
453 : #endif
454 0 : case TYPE_flt:
455 0 : shuffle_unique(flt, LTflt);
456 : break;
457 3 : case TYPE_dbl:
458 78261 : shuffle_unique(dbl, LTdbl);
459 : break;
460 31 : default:
461 3520 : heapify(LTany, SWAP1);
462 142924 : while (cnt > 0) {
463 142893 : cnt--;
464 142893 : i = canditer_next(&ci);
465 142893 : if (cmp(BUNtail(*bi, i - hseq),
466 142893 : BUNtail(*bi, oids[0] - hseq)) < 0) {
467 5668 : oids[0] = i;
468 178116 : siftdown(LTany, 0, SWAP1);
469 : }
470 : }
471 : break;
472 : }
473 : }
474 : } else {
475 62 : if (nilslast || bi->nonil) {
476 41 : switch (tpe) {
477 0 : case TYPE_bte:
478 0 : shuffle_unique(bte, GT);
479 : break;
480 0 : case TYPE_sht:
481 0 : shuffle_unique(sht, GT);
482 : break;
483 24 : case TYPE_int:
484 200 : shuffle_unique(int, GT);
485 : break;
486 1 : case TYPE_lng:
487 2091 : shuffle_unique(lng, GT);
488 : break;
489 : #ifdef HAVE_HGE
490 5 : case TYPE_hge:
491 1496 : shuffle_unique(hge, GT);
492 : break;
493 : #endif
494 0 : case TYPE_flt:
495 0 : shuffle_unique(flt, GTflt);
496 : break;
497 2 : case TYPE_dbl:
498 22 : shuffle_unique(dbl, GTdbl);
499 : break;
500 9 : default:
501 51 : heapify(GTany, SWAP1);
502 40 : while (cnt > 0) {
503 31 : cnt--;
504 31 : i = canditer_next(&ci);
505 31 : if (cmp(BUNtail(*bi, i - hseq),
506 31 : BUNtail(*bi, oids[0] - hseq)) > 0) {
507 19 : oids[0] = i;
508 88 : siftdown(GTany, 0, SWAP1);
509 : }
510 : }
511 : break;
512 : }
513 : } else {
514 21 : switch (tpe) {
515 1 : case TYPE_bte:
516 9 : shuffle_unique(bte, nGTbte);
517 : break;
518 0 : case TYPE_sht:
519 0 : shuffle_unique(sht, nGTsht);
520 : break;
521 2 : case TYPE_int:
522 19 : shuffle_unique(int, nGTint);
523 : break;
524 8 : case TYPE_lng:
525 78 : shuffle_unique(lng, nGTlng);
526 : break;
527 : #ifdef HAVE_HGE
528 0 : case TYPE_hge:
529 0 : shuffle_unique(hge, nGThge);
530 : break;
531 : #endif
532 2 : case TYPE_flt:
533 18 : shuffle_unique(flt, nGTflt);
534 : break;
535 2 : case TYPE_dbl:
536 19 : shuffle_unique(dbl, nGTdbl);
537 : break;
538 6 : default:
539 38 : heapify(nGTany, SWAP1);
540 19 : while (cnt > 0) {
541 13 : cnt--;
542 13 : i = canditer_next(&ci);
543 13 : if (cmp(BUNtail(*bi, oids[0] - hseq), nil) != 0
544 13 : && (cmp(BUNtail(*bi, i - hseq), nil) == 0
545 11 : || cmp(BUNtail(*bi, i - hseq),
546 11 : BUNtail(*bi, oids[0] - hseq)) > 0)) {
547 13 : oids[0] = i;
548 53 : siftdown(nGTany, 0, SWAP1);
549 : }
550 : }
551 : break;
552 : }
553 : }
554 : }
555 148 : if (lastp)
556 56 : *lastp = oids[0]; /* store id of largest value */
557 : /* output must be sorted since it's a candidate list */
558 148 : GDKqsort(oids, NULL, NULL, (size_t) n, sizeof(oid), 0, TYPE_oid, false, false);
559 148 : bn->tsorted = true;
560 148 : bn->trevsorted = n <= 1;
561 148 : bn->tkey = true;
562 148 : bn->tseqbase = n <= 1 ? oids[0] : oid_nil;
563 148 : bn->tnil = false;
564 148 : bn->tnonil = true;
565 148 : bn = virtualize(bn);
566 148 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
567 : ",n=" BUNFMT " -> " ALGOOPTBATFMT
568 : " (" LLFMT " usec)\n",
569 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), n,
570 : ALGOOPTBATPAR(bn), GDKusec() - t0);
571 : return bn;
572 : }
573 :
574 : #define LTfixgrp(p1, p2) \
575 : (goids[p1] < goids[p2] || \
576 : (goids[p1] == goids[p2] && \
577 : LTfix(p1, p2)))
578 : #define nLTbtefixgrp(p1, p2) \
579 : (goids[p1] < goids[p2] || \
580 : (goids[p1] == goids[p2] && \
581 : nLTbtefix(p1, p2)))
582 : #define nLTshtfixgrp(p1, p2) \
583 : (goids[p1] < goids[p2] || \
584 : (goids[p1] == goids[p2] && \
585 : nLTshtfix(p1, p2)))
586 : #define nLTintfixgrp(p1, p2) \
587 : (goids[p1] < goids[p2] || \
588 : (goids[p1] == goids[p2] && \
589 : nLTintfix(p1, p2)))
590 : #define nLTlngfixgrp(p1, p2) \
591 : (goids[p1] < goids[p2] || \
592 : (goids[p1] == goids[p2] && \
593 : nLTlngfix(p1, p2)))
594 : #define nLThgefixgrp(p1, p2) \
595 : (goids[p1] < goids[p2] || \
596 : (goids[p1] == goids[p2] && \
597 : nLThgefix(p1, p2)))
598 : #define LTfltfixgrp(p1, p2) \
599 : (goids[p1] < goids[p2] || \
600 : (goids[p1] == goids[p2] && \
601 : LTfltfix(p1, p2)))
602 : #define LTdblfixgrp(p1, p2) \
603 : (goids[p1] < goids[p2] || \
604 : (goids[p1] == goids[p2] && \
605 : LTdblfix(p1, p2)))
606 : #define nLTfltfixgrp(p1, p2) \
607 : (goids[p1] < goids[p2] || \
608 : (goids[p1] == goids[p2] && \
609 : nLTfltfix(p1, p2)))
610 : #define nLTdblfixgrp(p1, p2) \
611 : (goids[p1] < goids[p2] || \
612 : (goids[p1] == goids[p2] && \
613 : nLTdblfix(p1, p2)))
614 : #define GTfixgrp(p1, p2) \
615 : (goids[p1] < goids[p2] || \
616 : (goids[p1] == goids[p2] && \
617 : GTfix(p1, p2)))
618 : #define nGTbtefixgrp(p1, p2) \
619 : (goids[p1] < goids[p2] || \
620 : (goids[p1] == goids[p2] && \
621 : nGTbtefix(p1, p2)))
622 : #define nGTshtfixgrp(p1, p2) \
623 : (goids[p1] < goids[p2] || \
624 : (goids[p1] == goids[p2] && \
625 : nGTshtfix(p1, p2)))
626 : #define nGTintfixgrp(p1, p2) \
627 : (goids[p1] < goids[p2] || \
628 : (goids[p1] == goids[p2] && \
629 : nGTintfix(p1, p2)))
630 : #define nGTlngfixgrp(p1, p2) \
631 : (goids[p1] < goids[p2] || \
632 : (goids[p1] == goids[p2] && \
633 : nGTlngfix(p1, p2)))
634 : #define nGThgefixgrp(p1, p2) \
635 : (goids[p1] < goids[p2] || \
636 : (goids[p1] == goids[p2] && \
637 : nGThgefix(p1, p2)))
638 : #define GTfltfixgrp(p1, p2) \
639 : (goids[p1] < goids[p2] || \
640 : (goids[p1] == goids[p2] && \
641 : GTfltfix(p1, p2)))
642 : #define GTdblfixgrp(p1, p2) \
643 : (goids[p1] < goids[p2] || \
644 : (goids[p1] == goids[p2] && \
645 : GTdblfix(p1, p2)))
646 : #define nGTfltfixgrp(p1, p2) \
647 : (goids[p1] < goids[p2] || \
648 : (goids[p1] == goids[p2] && \
649 : nGTfltfix(p1, p2)))
650 : #define nGTdblfixgrp(p1, p2) \
651 : (goids[p1] < goids[p2] || \
652 : (goids[p1] == goids[p2] && \
653 : nGTdblfix(p1, p2)))
654 : #define LTvoidgrp(p1, p2) \
655 : (goids[p1] < goids[p2] || \
656 : (goids[p1] == goids[p2] && oids[p1] < oids[p2]))
657 : #define GTvoidgrp(p1, p2) \
658 : (goids[p1] < goids[p2] || \
659 : (goids[p1] == goids[p2] && oids[p1] > oids[p2]))
660 : #define LTanygrp(p1, p2) \
661 : (goids[p1] < goids[p2] || \
662 : (goids[p1] == goids[p2] && \
663 : LTany(p1, p2)))
664 : #define GTanygrp(p1, p2) \
665 : (goids[p1] < goids[p2] || \
666 : (goids[p1] == goids[p2] && \
667 : GTany(p1, p2)))
668 : #define nLTanygrp(p1, p2) \
669 : (goids[p1] < goids[p2] || \
670 : (goids[p1] == goids[p2] && \
671 : nLTany(p1, p2)))
672 : #define nGTanygrp(p1, p2) \
673 : (goids[p1] < goids[p2] || \
674 : (goids[p1] == goids[p2] && \
675 : nGTany(p1, p2)))
676 : #define SWAP2(p1, p2) \
677 : do { \
678 : item = oids[p1]; \
679 : oids[p1] = oids[p2]; \
680 : oids[p2] = item; \
681 : item = goids[p1]; \
682 : goids[p1] = goids[p2]; \
683 : goids[p2] = item; \
684 : } while (0)
685 :
686 : #define shuffle_unique_with_groups(TYPE, OP) \
687 : do { \
688 : const TYPE *restrict vals = (const TYPE *) bi->base; \
689 : heapify(OP##fixgrp, SWAP2); \
690 : while (cnt > 0) { \
691 : cnt--; \
692 : i = canditer_next(&ci); \
693 : if (gv[j] < goids[0] || \
694 : (gv[j] == goids[0] && \
695 : OP(vals[i - hseq], \
696 : vals[oids[0] - hseq]))) { \
697 : oids[0] = i; \
698 : goids[0] = gv[j]; \
699 : siftdown(OP##fixgrp, 0, SWAP2); \
700 : } \
701 : j++; \
702 : } \
703 : } while (0)
704 :
705 : /* This version of BATfirstn is like the one above, except that it
706 : * also looks at groups. The values of the group IDs are important:
707 : * we return only the smallest N (i.e., not dependent on asc which
708 : * refers only to the values in the BAT b).
709 : *
710 : * If lastp is non-NULL, it is filled in with the oid of the "last"
711 : * value, i.e. the value of which there may be multiple occurrences
712 : * that are not all included in the first N. If lastgp is non-NULL,
713 : * it is filled with the group ID (not the oid of the group ID) for
714 : * that same value.
715 : */
716 : static BAT *
717 421 : BATfirstn_unique_with_groups(BATiter *bi, BAT *s, BAT *g, BUN n, bool asc, bool nilslast, oid *lastp, oid *lastgp, lng t0)
718 : {
719 421 : BAT *bn;
720 421 : oid *restrict oids, *restrict goids;
721 421 : oid hseq = bi->b->hseqbase;
722 421 : const oid *restrict gv;
723 421 : BUN i, j, cnt;
724 421 : struct canditer ci;
725 421 : int tpe = bi->type;
726 421 : int (*cmp)(const void *, const void *);
727 421 : const void *nil;
728 : /* variables used in heapify/siftdown macros */
729 421 : oid item;
730 421 : BUN pos, childpos;
731 :
732 421 : MT_thread_setalgorithm(__func__);
733 421 : canditer_init(&ci, bi->b, s);
734 421 : cnt = ci.ncand;
735 :
736 421 : if (n > cnt)
737 : n = cnt;
738 :
739 421 : if (n == 0) {
740 : /* candidate list might refer only to values outside
741 : * of the bat and hence be effectively empty */
742 0 : if (lastp)
743 0 : *lastp = 0;
744 0 : if (lastgp)
745 0 : *lastgp = 0;
746 0 : bn = BATdense(0, 0, 0);
747 0 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
748 : ",g=" ALGOBATFMT ",n=" BUNFMT " -> " ALGOOPTBATFMT
749 : " (empty -- " LLFMT " usec)\n",
750 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), ALGOBATPAR(g), n,
751 : ALGOOPTBATPAR(bn), GDKusec() - t0);
752 0 : return bn;
753 : }
754 :
755 421 : if (BATtdense(g)) {
756 : /* trivial: g determines ordering, return reference to
757 : * initial part of b (or slice of s) */
758 37 : if (lastgp)
759 16 : *lastgp = g->tseqbase + n - 1;
760 37 : bn = canditer_slice(&ci, 0, n);
761 37 : if (bn && lastp)
762 16 : *lastp = BUNtoid(bn, n - 1);
763 37 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
764 : ",g=" ALGOBATFMT ",n=" BUNFMT " -> " ALGOOPTBATFMT
765 : " (dense group -- " LLFMT " usec)\n",
766 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), ALGOBATPAR(g), n,
767 : ALGOOPTBATPAR(bn), GDKusec() - t0);
768 37 : return bn;
769 : }
770 :
771 384 : bn = COLnew(0, TYPE_oid, n, TRANSIENT);
772 384 : if (bn == NULL)
773 : return NULL;
774 384 : BATsetcount(bn, n);
775 384 : oids = (oid *) Tloc(bn, 0);
776 384 : gv = (const oid *) Tloc(g, 0);
777 384 : goids = GDKmalloc(n * sizeof(oid));
778 384 : if (goids == NULL) {
779 0 : BBPreclaim(bn);
780 0 : return NULL;
781 : }
782 :
783 384 : cmp = ATOMcompare(tpe);
784 384 : nil = ATOMnilptr(tpe);
785 : /* if base type has same comparison function as type itself, we
786 : * can use the base type */
787 384 : tpe = ATOMbasetype(tpe); /* takes care of oid */
788 384 : j = 0;
789 11027738 : for (i = 0; i < n; i++) {
790 11027355 : oids[i] = canditer_next(&ci);
791 11027354 : goids[i] = gv[j++];
792 : }
793 383 : cnt -= n;
794 :
795 383 : if (BATtvoid(bi->b)) {
796 : /* nilslast doesn't make a difference (all nil, or no nil) */
797 0 : if (asc) {
798 0 : heapify(LTvoidgrp, SWAP2);
799 0 : while (cnt > 0) {
800 0 : cnt--;
801 0 : i = canditer_next(&ci);
802 0 : if (gv[j] < goids[0]
803 : /* || (gv[j] == goids[0]
804 : && i < oids[0]) -- always false */) {
805 0 : oids[0] = i;
806 0 : goids[0] = gv[j];
807 0 : siftdown(LTvoidgrp, 0, SWAP2);
808 : }
809 0 : j++;
810 : }
811 : } else {
812 0 : heapify(GTvoidgrp, SWAP2);
813 0 : while (cnt > 0) {
814 0 : cnt--;
815 0 : i = canditer_next(&ci);
816 0 : if (gv[j] < goids[0]
817 0 : || (gv[j] == goids[0]
818 : /* && i > oids[0] -- always true */)) {
819 0 : oids[0] = i;
820 0 : goids[0] = gv[j];
821 0 : siftdown(GTvoidgrp, 0, SWAP2);
822 : }
823 0 : j++;
824 : }
825 : }
826 383 : } else if (asc) {
827 324 : if (nilslast && !bi->nonil) {
828 0 : switch (tpe) {
829 0 : case TYPE_bte:
830 0 : shuffle_unique_with_groups(bte, nLTbte);
831 : break;
832 0 : case TYPE_sht:
833 0 : shuffle_unique_with_groups(sht, nLTsht);
834 : break;
835 0 : case TYPE_int:
836 0 : shuffle_unique_with_groups(int, nLTint);
837 : break;
838 0 : case TYPE_lng:
839 0 : shuffle_unique_with_groups(lng, nLTlng);
840 : break;
841 : #ifdef HAVE_HGE
842 0 : case TYPE_hge:
843 0 : shuffle_unique_with_groups(hge, nLThge);
844 : break;
845 : #endif
846 0 : case TYPE_flt:
847 0 : shuffle_unique_with_groups(flt, nLTflt);
848 : break;
849 0 : case TYPE_dbl:
850 0 : shuffle_unique_with_groups(dbl, nLTdbl);
851 : break;
852 0 : default:
853 0 : heapify(nLTanygrp, SWAP2);
854 0 : while (cnt > 0) {
855 0 : cnt--;
856 0 : i = canditer_next(&ci);
857 0 : if (gv[j] < goids[0]
858 0 : || (gv[j] == goids[0]
859 0 : && cmp(BUNtail(*bi, i - hseq), nil) != 0
860 0 : && (cmp(BUNtail(*bi, oids[0] - hseq), nil) == 0
861 0 : || cmp(BUNtail(*bi, i - hseq),
862 0 : BUNtail(*bi, oids[0] - hseq)) < 0))) {
863 0 : oids[0] = i;
864 0 : goids[0] = gv[j];
865 0 : siftdown(nLTanygrp, 0, SWAP2);
866 : }
867 0 : j++;
868 : }
869 : break;
870 : }
871 : } else {
872 324 : switch (tpe) {
873 21 : case TYPE_bte:
874 1597890 : shuffle_unique_with_groups(bte, LT);
875 : break;
876 14 : case TYPE_sht:
877 1000063 : shuffle_unique_with_groups(sht, LT);
878 : break;
879 109 : case TYPE_int:
880 33754 : shuffle_unique_with_groups(int, LT);
881 : break;
882 10 : case TYPE_lng:
883 597895 : shuffle_unique_with_groups(lng, LT);
884 : break;
885 : #ifdef HAVE_HGE
886 26 : case TYPE_hge:
887 2490 : shuffle_unique_with_groups(hge, LT);
888 : break;
889 : #endif
890 0 : case TYPE_flt:
891 0 : shuffle_unique_with_groups(flt, LTflt);
892 : break;
893 4 : case TYPE_dbl:
894 17 : shuffle_unique_with_groups(dbl, LTdbl);
895 : break;
896 140 : default:
897 259249 : heapify(LTanygrp, SWAP2);
898 15811 : while (cnt > 0) {
899 15671 : cnt--;
900 15671 : i = canditer_next(&ci);
901 15671 : if (gv[j] < goids[0] ||
902 15501 : (gv[j] == goids[0] &&
903 15501 : cmp(BUNtail(*bi, i - hseq),
904 15501 : BUNtail(*bi, oids[0] - hseq)) < 0)) {
905 2321 : oids[0] = i;
906 2321 : goids[0] = gv[j];
907 14484 : siftdown(LTanygrp, 0, SWAP2);
908 : }
909 15671 : j++;
910 : }
911 : break;
912 : }
913 : }
914 59 : } else if (nilslast || bi->nonil) {
915 40 : switch (tpe) {
916 5 : case TYPE_bte:
917 181797 : shuffle_unique_with_groups(bte, GT);
918 : break;
919 0 : case TYPE_sht:
920 0 : shuffle_unique_with_groups(sht, GT);
921 : break;
922 15 : case TYPE_int:
923 5301732 : shuffle_unique_with_groups(int, GT);
924 : break;
925 0 : case TYPE_lng:
926 0 : shuffle_unique_with_groups(lng, GT);
927 : break;
928 : #ifdef HAVE_HGE
929 5 : case TYPE_hge:
930 723 : shuffle_unique_with_groups(hge, GT);
931 : break;
932 : #endif
933 0 : case TYPE_flt:
934 0 : shuffle_unique_with_groups(flt, GTflt);
935 : break;
936 6 : case TYPE_dbl:
937 246225 : shuffle_unique_with_groups(dbl, GTdbl);
938 : break;
939 9 : default:
940 54 : heapify(GTanygrp, SWAP2);
941 12 : while (cnt > 0) {
942 3 : cnt--;
943 3 : i = canditer_next(&ci);
944 3 : if (gv[j] < goids[0] ||
945 2 : (gv[j] == goids[0] &&
946 2 : cmp(BUNtail(*bi, i - hseq),
947 2 : BUNtail(*bi, oids[0] - hseq)) > 0)) {
948 2 : oids[0] = i;
949 2 : goids[0] = gv[j];
950 5 : siftdown(GTanygrp, 0, SWAP2);
951 : }
952 3 : j++;
953 : }
954 : break;
955 : }
956 : } else {
957 19 : switch (tpe) {
958 2 : case TYPE_bte:
959 441468 : shuffle_unique_with_groups(bte, nGTbte);
960 : break;
961 0 : case TYPE_sht:
962 0 : shuffle_unique_with_groups(sht, nGTsht);
963 : break;
964 0 : case TYPE_int:
965 0 : shuffle_unique_with_groups(int, nGTint);
966 : break;
967 0 : case TYPE_lng:
968 0 : shuffle_unique_with_groups(lng, nGTlng);
969 : break;
970 : #ifdef HAVE_HGE
971 17 : case TYPE_hge:
972 1619099 : shuffle_unique_with_groups(hge, nGThge);
973 : break;
974 : #endif
975 0 : case TYPE_flt:
976 0 : shuffle_unique_with_groups(flt, nGTflt);
977 : break;
978 0 : case TYPE_dbl:
979 0 : shuffle_unique_with_groups(dbl, nGTdbl);
980 : break;
981 0 : default:
982 0 : heapify(nGTanygrp, SWAP2);
983 0 : while (cnt > 0) {
984 0 : cnt--;
985 0 : i = canditer_next(&ci);
986 0 : if (gv[j] < goids[0]
987 0 : || (gv[j] == goids[0]
988 0 : && cmp(BUNtail(*bi, oids[0] - hseq), nil) != 0
989 0 : && (cmp(BUNtail(*bi, i - hseq), nil) == 0
990 0 : || cmp(BUNtail(*bi, i - hseq),
991 0 : BUNtail(*bi, oids[0] - hseq)) > 0))) {
992 0 : oids[0] = i;
993 0 : goids[0] = gv[j];
994 0 : siftdown(nGTanygrp, 0, SWAP2);
995 : }
996 0 : j++;
997 : }
998 : break;
999 : }
1000 : }
1001 384 : if (lastp)
1002 215 : *lastp = oids[0];
1003 384 : if (lastgp)
1004 215 : *lastgp = goids[0];
1005 384 : GDKfree(goids);
1006 : /* output must be sorted since it's a candidate list */
1007 384 : GDKqsort(oids, NULL, NULL, (size_t) n, sizeof(oid), 0, TYPE_oid, false, false);
1008 384 : bn->tsorted = true;
1009 384 : bn->trevsorted = n <= 1;
1010 384 : bn->tkey = true;
1011 384 : bn->tseqbase = n <= 1 ? oids[0] : oid_nil;
1012 384 : bn->tnil = false;
1013 384 : bn->tnonil = true;
1014 384 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
1015 : ",g=" ALGOBATFMT ",n=" BUNFMT " -> " ALGOOPTBATFMT
1016 : " (" LLFMT " usec)\n",
1017 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), ALGOBATPAR(g), n,
1018 : ALGOOPTBATPAR(bn), GDKusec() - t0);
1019 : return bn;
1020 : }
1021 :
1022 : static gdk_return
1023 206 : BATfirstn_grouped(BAT **topn, BAT **gids, BATiter *bi, BAT *s, BUN n, bool asc, bool nilslast, bool distinct, lng t0)
1024 : {
1025 206 : BAT *bn, *gn = NULL, *su = NULL;
1026 206 : oid last;
1027 206 : gdk_return rc;
1028 :
1029 206 : MT_thread_setalgorithm(__func__);
1030 206 : if (distinct && !bi->key) {
1031 0 : su = s;
1032 0 : s = BATunique(bi->b, s);
1033 0 : if (s == NULL)
1034 : return GDK_FAIL;
1035 : }
1036 206 : bn = BATfirstn_unique(bi, s, n, asc, nilslast, &last, t0);
1037 206 : if (bn == NULL)
1038 : return GDK_FAIL;
1039 206 : if (BATcount(bn) == 0) {
1040 0 : if (gids) {
1041 0 : gn = BATdense(0, 0, 0);
1042 0 : if (gn == NULL) {
1043 0 : BBPunfix(bn->batCacheid);
1044 0 : return GDK_FAIL;
1045 : }
1046 0 : *gids = gn;
1047 : }
1048 0 : *topn = bn;
1049 0 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
1050 : ",n=" BUNFMT " -> " ALGOOPTBATFMT "," ALGOOPTBATFMT
1051 : " (empty -- " LLFMT " usec)\n",
1052 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), n,
1053 : ALGOOPTBATPAR(bn), ALGOOPTBATPAR(gn), GDKusec() - t0);
1054 0 : return GDK_SUCCEED;
1055 : }
1056 206 : if (!bi->key) {
1057 180 : if (distinct) {
1058 0 : BAT *bn1;
1059 :
1060 0 : bn1 = bn;
1061 0 : BBPunfix(s->batCacheid);
1062 0 : bn = BATintersect(bi->b, bi->b, su, bn1, true, false, BUN_NONE);
1063 0 : BBPunfix(bn1->batCacheid);
1064 0 : if (bn == NULL)
1065 : return GDK_FAIL;
1066 180 : } else if (last != oid_nil) {
1067 94 : BAT *bn1, *bn2;
1068 :
1069 94 : bn1 = bn;
1070 94 : bn2 = BATselect(bi->b, s, BUNtail(*bi, last - bi->b->hseqbase), NULL, true, false, false);
1071 94 : if (bn2 == NULL) {
1072 0 : BBPunfix(bn1->batCacheid);
1073 0 : return GDK_FAIL;
1074 : }
1075 94 : bn = BATmergecand(bn1, bn2);
1076 94 : BBPunfix(bn1->batCacheid);
1077 94 : BBPunfix(bn2->batCacheid);
1078 94 : if (bn == NULL)
1079 : return GDK_FAIL;
1080 : }
1081 : }
1082 206 : if (gids) {
1083 206 : BAT *bn1, *bn2, *bn3, *bn4;
1084 206 : bn1 = BATproject(bn, bi->b);
1085 206 : if (bn1 == NULL) {
1086 0 : BBPunfix(bn->batCacheid);
1087 0 : return GDK_FAIL;
1088 : }
1089 206 : rc = BATsort(NULL, &bn2, &bn3, bn1, NULL, NULL, !asc, !asc, false);
1090 206 : BBPunfix(bn1->batCacheid);
1091 206 : if (rc != GDK_SUCCEED) {
1092 0 : BBPunfix(bn->batCacheid);
1093 0 : return GDK_FAIL;
1094 : }
1095 206 : rc = BATsort(NULL, &bn4, NULL, bn2, NULL, NULL, false, false, false);
1096 206 : BBPunfix(bn2->batCacheid);
1097 206 : if (rc != GDK_SUCCEED) {
1098 0 : BBPunfix(bn->batCacheid);
1099 0 : BBPunfix(bn3->batCacheid);
1100 0 : return GDK_FAIL;
1101 : }
1102 206 : gn = BATproject(bn4, bn3);
1103 206 : BBPunfix(bn3->batCacheid);
1104 206 : BBPunfix(bn4->batCacheid);
1105 206 : if (gn == NULL) {
1106 0 : BBPunfix(bn->batCacheid);
1107 0 : return GDK_FAIL;
1108 : }
1109 206 : *gids = gn;
1110 206 : assert(BATcount(gn) == BATcount(bn));
1111 : }
1112 206 : *topn = bn;
1113 206 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
1114 : ",n=" BUNFMT " -> " ALGOOPTBATFMT "," ALGOOPTBATFMT
1115 : " (" LLFMT " usec)\n",
1116 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), n,
1117 : ALGOOPTBATPAR(bn), ALGOOPTBATPAR(gn), GDKusec() - t0);
1118 : return GDK_SUCCEED;
1119 : }
1120 :
1121 : static gdk_return
1122 231 : BATfirstn_grouped_with_groups(BAT **topn, BAT **gids, BATiter *bi, BAT *s, BAT *g, BUN n, bool asc, bool nilslast, bool distinct, lng t0)
1123 : {
1124 231 : BAT *bn, *gn = NULL;
1125 231 : oid hseq = bi->b->hseqbase;
1126 231 : oid last, lastg;
1127 231 : gdk_return rc;
1128 :
1129 231 : MT_thread_setalgorithm(__func__);
1130 231 : if (distinct) {
1131 0 : BAT *bn1, *bn2, *bn3, *bn4, *bn5, *bn6, *bn7;
1132 0 : if (BATgroup(&bn1, &bn2, NULL, bi->b, s, g, NULL, NULL) != GDK_SUCCEED)
1133 0 : return GDK_FAIL;
1134 0 : bn3 = BATproject(bn2, bi->b);
1135 0 : if (bn3 == NULL) {
1136 0 : BBPunfix(bn1->batCacheid);
1137 0 : BBPunfix(bn2->batCacheid);
1138 0 : return GDK_FAIL;
1139 : }
1140 0 : bn4 = BATintersect(s, bn2, NULL, NULL, false, false, BUN_NONE);
1141 0 : BBPunfix(bn2->batCacheid);
1142 0 : if (bn4 == NULL) {
1143 0 : BBPunfix(bn1->batCacheid);
1144 0 : return GDK_FAIL;
1145 : }
1146 0 : bn5 = BATproject(bn4, g);
1147 0 : BBPunfix(bn4->batCacheid);
1148 0 : if (bn5 == NULL) {
1149 0 : BBPunfix(bn1->batCacheid);
1150 0 : return GDK_FAIL;
1151 : }
1152 0 : BATiter bn3i = bat_iterator(bn3);
1153 0 : bn6 = BATfirstn_unique_with_groups(&bn3i, NULL, bn5, n, asc, nilslast, NULL, NULL, t0);
1154 0 : bat_iterator_end(&bn3i);
1155 0 : BBPunfix(bn3->batCacheid);
1156 0 : BBPunfix(bn5->batCacheid);
1157 0 : if (bn6 == NULL) {
1158 0 : BBPunfix(bn1->batCacheid);
1159 0 : return GDK_FAIL;
1160 : }
1161 0 : rc = BATleftjoin(&bn7, NULL, bn1, bn6, NULL, NULL, false, BUN_NONE);
1162 0 : BBPunfix(bn6->batCacheid);
1163 0 : if (rc != GDK_SUCCEED)
1164 : return GDK_FAIL;
1165 0 : bn = BATproject(bn7, s);
1166 0 : BBPunfix(bn7->batCacheid);
1167 0 : if (bn == NULL)
1168 : return GDK_FAIL;
1169 : } else {
1170 231 : bn = BATfirstn_unique_with_groups(bi, s, g, n, asc, nilslast, &last, &lastg, t0);
1171 231 : if (bn == NULL)
1172 : return GDK_FAIL;
1173 : }
1174 231 : if (BATcount(bn) == 0) {
1175 0 : if (gids) {
1176 0 : gn = BATdense(0, 0, 0);
1177 0 : if (gn == NULL) {
1178 0 : BBPunfix(bn->batCacheid);
1179 0 : return GDK_FAIL;
1180 : }
1181 0 : *gids = gn;
1182 : }
1183 0 : *topn = bn;
1184 0 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
1185 : ",g=" ALGOBATFMT ",n=" BUNFMT
1186 : " -> " ALGOOPTBATFMT "," ALGOOPTBATFMT
1187 : " (empty -- " LLFMT " usec)\n",
1188 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), ALGOBATPAR(g), n,
1189 : ALGOOPTBATPAR(bn), ALGOOPTBATPAR(gn), GDKusec() - t0);
1190 0 : return GDK_SUCCEED;
1191 : }
1192 231 : if (!distinct && !bi->key) {
1193 212 : BAT *bn1, *bn2, *bn3, *bn4;
1194 :
1195 212 : bn1 = bn;
1196 212 : bn2 = BATselect(g, NULL, &lastg, NULL, true, false, false);
1197 212 : if (bn2 == NULL) {
1198 0 : BBPunfix(bn1->batCacheid);
1199 0 : return GDK_FAIL;
1200 : }
1201 212 : bn3 = BATproject(bn2, s);
1202 212 : BBPunfix(bn2->batCacheid);
1203 212 : if (bn3 == NULL) {
1204 0 : BBPunfix(bn1->batCacheid);
1205 0 : return GDK_FAIL;
1206 : }
1207 212 : bn4 = BATselect(bi->b, bn3, BUNtail(*bi, last - hseq), NULL, true, false, false);
1208 212 : BBPunfix(bn3->batCacheid);
1209 212 : if (bn4 == NULL) {
1210 0 : BBPunfix(bn1->batCacheid);
1211 0 : return GDK_FAIL;
1212 : }
1213 212 : bn = BATmergecand(bn1, bn4);
1214 212 : BBPunfix(bn1->batCacheid);
1215 212 : BBPunfix(bn4->batCacheid);
1216 212 : if (bn == NULL)
1217 : return GDK_FAIL;
1218 : }
1219 231 : if (gids) {
1220 231 : BAT *bn1, *bn2, *bn3, *bn4, *bn5, *bn6, *bn7, *bn8;
1221 :
1222 231 : if ((bn1 = BATintersect(s, bn, NULL, NULL, false, false, BUN_NONE)) == NULL) {
1223 0 : BBPunfix(bn->batCacheid);
1224 0 : return GDK_FAIL;
1225 : }
1226 231 : bn2 = BATproject(bn1, g);
1227 231 : BBPunfix(bn1->batCacheid);
1228 231 : if (bn2 == NULL) {
1229 0 : BBPunfix(bn->batCacheid);
1230 0 : return GDK_FAIL;
1231 : }
1232 231 : bn3 = BATproject(bn, bi->b);
1233 231 : if (bn3 == NULL) {
1234 0 : BBPunfix(bn2->batCacheid);
1235 0 : return GDK_FAIL;
1236 : }
1237 231 : rc = BATsort(NULL, &bn4, &bn5, bn2, NULL, NULL, false, false, false);
1238 231 : BBPunfix(bn2->batCacheid);
1239 231 : if (rc != GDK_SUCCEED) {
1240 0 : BBPunfix(bn->batCacheid);
1241 0 : BBPunfix(bn3->batCacheid);
1242 0 : return GDK_FAIL;
1243 : }
1244 231 : rc = BATsort(NULL, &bn6, &bn7, bn3, bn4, bn5, !asc, !asc, false);
1245 231 : BBPunfix(bn3->batCacheid);
1246 231 : BBPunfix(bn4->batCacheid);
1247 231 : BBPunfix(bn5->batCacheid);
1248 231 : if (rc != GDK_SUCCEED) {
1249 0 : BBPunfix(bn->batCacheid);
1250 0 : return GDK_FAIL;
1251 : }
1252 231 : rc = BATsort(NULL, &bn8, NULL, bn6, NULL, NULL, false, false, false);
1253 231 : BBPunfix(bn6->batCacheid);
1254 231 : if (rc != GDK_SUCCEED) {
1255 0 : BBPunfix(bn->batCacheid);
1256 0 : BBPunfix(bn7->batCacheid);
1257 0 : return GDK_FAIL;
1258 : }
1259 231 : gn = BATproject(bn8, bn7);
1260 231 : BBPunfix(bn7->batCacheid);
1261 231 : BBPunfix(bn8->batCacheid);
1262 231 : if (gn == NULL) {
1263 0 : BBPunfix(bn->batCacheid);
1264 0 : return GDK_FAIL;
1265 : }
1266 231 : *gids = gn;
1267 231 : assert(BATcount(gn) == BATcount(bn));
1268 : }
1269 231 : *topn = bn;
1270 231 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
1271 : ",g=" ALGOBATFMT ",n=" BUNFMT
1272 : " -> " ALGOOPTBATFMT "," ALGOOPTBATFMT
1273 : " (" LLFMT " usec)\n",
1274 : ALGOBATPAR(bi->b), ALGOOPTBATPAR(s), ALGOBATPAR(g), n,
1275 : ALGOOPTBATPAR(bn), ALGOOPTBATPAR(gn), GDKusec() - t0);
1276 : return GDK_SUCCEED;
1277 : }
1278 :
1279 : gdk_return
1280 1217 : BATfirstn(BAT **topn, BAT **gids, BAT *b, BAT *s, BAT *g, BUN n, bool asc, bool nilslast, bool distinct)
1281 : {
1282 1217 : lng t0 = 0;
1283 1217 : gdk_return rc = GDK_SUCCEED;
1284 :
1285 1217 : assert(topn != NULL);
1286 1217 : if (b == NULL) {
1287 0 : *topn = NULL;
1288 0 : return GDK_SUCCEED;
1289 : }
1290 :
1291 1217 : TRC_DEBUG_IF(ALGO) t0 = GDKusec();
1292 :
1293 1217 : (void) BATordered(b);
1294 1217 : (void) BATordered_rev(b);
1295 :
1296 1217 : BATiter bi = bat_iterator(b);
1297 :
1298 : /* if g specified, then so must s */
1299 1216 : assert(g == NULL || s != NULL);
1300 : /* g and s must be aligned (same size, same hseqbase) */
1301 1216 : assert(g == NULL || BATcount(s) == BATcount(g));
1302 444 : assert(g == NULL || BATcount(g) == 0 || s->hseqbase == g->hseqbase);
1303 :
1304 1216 : if (n == 0 || BATcount(b) == 0 || (s != NULL && BATcount(s) == 0)) {
1305 : /* trivial: empty result */
1306 59 : *topn = BATdense(0, 0, 0);
1307 59 : if (*topn == NULL) {
1308 : rc = GDK_FAIL;
1309 59 : } else if (gids) {
1310 25 : *gids = BATdense(0, 0, 0);
1311 25 : if (*gids == NULL) {
1312 0 : BBPreclaim(*topn);
1313 : rc = GDK_FAIL;
1314 : }
1315 : }
1316 1157 : } else if (g == NULL) {
1317 736 : if (gids == NULL && !distinct) {
1318 530 : *topn = BATfirstn_unique(&bi, s, n, asc, nilslast, NULL, t0);
1319 531 : rc = *topn ? GDK_SUCCEED : GDK_FAIL;
1320 : } else {
1321 206 : rc = BATfirstn_grouped(topn, gids, &bi, s, n, asc, nilslast, distinct, t0);
1322 : }
1323 421 : } else if (gids == NULL && !distinct) {
1324 190 : *topn = BATfirstn_unique_with_groups(&bi, s, g, n, asc, nilslast, NULL, NULL, t0);
1325 190 : rc = *topn ? GDK_SUCCEED : GDK_FAIL;
1326 : } else {
1327 231 : rc = BATfirstn_grouped_with_groups(topn, gids, &bi, s, g, n, asc, nilslast, distinct, t0);
1328 : }
1329 1217 : bat_iterator_end(&bi);
1330 1217 : return rc;
1331 : }
|