Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "gdk.h"
15 : #include "gdk_private.h"
16 : #include "gdk_calc_private.h"
17 :
18 : #define VALUE(x) (vars ? vars + VarHeapVal(vals, (x), width) : vals + (x) * width)
19 : /* BATunique returns a bat that indicates the unique tail values of
20 : * the input bat. This is essentially the same output as the
21 : * "extents" output of BATgroup. The difference is that BATunique
22 : * does not return the grouping bat.
23 : *
24 : * The first input is the bat from which unique rows are selected, the
25 : * second input is an optional candidate list.
26 : *
27 : * The return value is a candidate list.
28 : */
29 : BAT *
30 1435 : BATunique(BAT *b, BAT *s)
31 : {
32 1435 : BAT *bn;
33 1435 : const void *v;
34 1435 : const char *vals;
35 1435 : const char *vars;
36 1435 : int width;
37 1435 : oid i, o, hseq;
38 1435 : const char *nme;
39 1435 : Hash *hs = NULL;
40 1435 : BUN hb;
41 1435 : int (*cmp)(const void *, const void *);
42 1435 : struct canditer ci;
43 1435 : const char *algomsg = "";
44 1435 : lng t0 = 0;
45 :
46 1435 : lng timeoffset = 0;
47 1435 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
48 1435 : if (qry_ctx != NULL) {
49 1419 : timeoffset = (qry_ctx->starttime && qry_ctx->querytimeout) ? (qry_ctx->starttime + qry_ctx->querytimeout) : 0;
50 : }
51 :
52 1435 : TRC_DEBUG_IF(ALGO) t0 = GDKusec();
53 :
54 1435 : BATcheck(b, NULL);
55 1435 : canditer_init(&ci, b, s);
56 :
57 1435 : (void) BATordered(b);
58 1435 : (void) BATordered_rev(b);
59 1435 : BATiter bi = bat_iterator(b);
60 1435 : if (bi.key || ci.ncand <= 1 || BATtdensebi(&bi)) {
61 : /* trivial: already unique */
62 910 : bn = canditer_slice(&ci, 0, ci.ncand);
63 910 : bat_iterator_end(&bi);
64 910 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT
65 : ",s=" ALGOOPTBATFMT " -> " ALGOOPTBATFMT
66 : " (already unique, slice candidates -- "
67 : LLFMT "usec)\n",
68 : ALGOBATPAR(b), ALGOOPTBATPAR(s),
69 : ALGOOPTBATPAR(bn), GDKusec() - t0);
70 910 : return bn;
71 : }
72 :
73 525 : if ((bi.sorted && bi.revsorted) ||
74 462 : (bi.type == TYPE_void && is_oid_nil(b->tseqbase))) {
75 : /* trivial: all values are the same */
76 63 : bn = BATdense(0, ci.seq, 1);
77 63 : bat_iterator_end(&bi);
78 63 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT
79 : ",s=" ALGOOPTBATFMT " -> " ALGOOPTBATFMT
80 : " (all equal -- "
81 : LLFMT "usec)\n",
82 : ALGOBATPAR(b), ALGOOPTBATPAR(s),
83 : ALGOOPTBATPAR(bn), GDKusec() - t0);
84 63 : return bn;
85 : }
86 :
87 462 : assert(bi.type != TYPE_void);
88 :
89 462 : BUN initsize = BUN_NONE;
90 462 : if (s == NULL) {
91 264 : MT_rwlock_rdlock(&b->thashlock);
92 264 : if (b->thash != NULL && b->thash != (Hash *) 1)
93 5 : initsize = b->thash->nunique;
94 264 : MT_rwlock_rdunlock(&b->thashlock);
95 264 : if (initsize == BUN_NONE && bi.unique_est != 0)
96 132 : initsize = (BUN) bi.unique_est;
97 : }
98 259 : if (initsize == BUN_NONE)
99 : initsize = 1024;
100 462 : bn = COLnew(0, TYPE_oid, initsize, TRANSIENT);
101 462 : if (bn == NULL) {
102 0 : bat_iterator_end(&bi);
103 0 : return NULL;
104 : }
105 462 : vals = bi.base;
106 462 : if (bi.vh && bi.type)
107 91 : vars = bi.vh->base;
108 : else
109 : vars = NULL;
110 462 : width = bi.width;
111 462 : cmp = ATOMcompare(bi.type);
112 462 : hseq = b->hseqbase;
113 :
114 462 : if (ATOMbasetype(bi.type) == TYPE_bte ||
115 50 : (bi.width == 1 &&
116 50 : ATOMstorage(bi.type) == TYPE_str &&
117 50 : GDK_ELIMDOUBLES(bi.vh))) {
118 67 : uint8_t val;
119 :
120 67 : algomsg = "unique: byte-sized atoms";
121 67 : uint32_t seen[256 >> 5];
122 67 : memset(seen, 0, sizeof(seen));
123 304003 : TIMEOUT_LOOP_IDX(i, ci.ncand, timeoffset) {
124 303790 : o = canditer_next(&ci);
125 303790 : val = ((const uint8_t *) vals)[o - hseq];
126 303790 : uint32_t m = UINT32_C(1) << (val & 0x1F);
127 303790 : if (!(seen[val >> 5] & m)) {
128 290 : seen[val >> 5] |= m;
129 290 : if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
130 0 : goto bunins_failed;
131 290 : if (bn->batCount == 256) {
132 : /* there cannot be more than
133 : * 256 distinct values */
134 : break;
135 : }
136 : }
137 : }
138 67 : TIMEOUT_CHECK(timeoffset,
139 : GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed));
140 395 : } else if (ATOMbasetype(bi.type) == TYPE_sht ||
141 35 : (bi.width == 2 &&
142 35 : ATOMstorage(bi.type) == TYPE_str &&
143 35 : GDK_ELIMDOUBLES(bi.vh))) {
144 42 : uint16_t val;
145 :
146 42 : algomsg = "unique: short-sized atoms";
147 42 : uint32_t seen[65536 >> 5];
148 42 : memset(seen, 0, sizeof(seen));
149 50040 : TIMEOUT_LOOP_IDX(i, ci.ncand, timeoffset) {
150 49914 : o = canditer_next(&ci);
151 49914 : val = ((const uint16_t *) vals)[o - hseq];
152 49914 : uint32_t m = UINT32_C(1) << (val & 0x1F);
153 49914 : if (!(seen[val >> 5] & m)) {
154 4573 : seen[val >> 5] |= m;
155 4573 : if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
156 0 : goto bunins_failed;
157 4573 : if (bn->batCount == 65536) {
158 : /* there cannot be more than
159 : * 65536 distinct values */
160 : break;
161 : }
162 : }
163 : }
164 42 : TIMEOUT_CHECK(timeoffset,
165 : GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed));
166 353 : } else if (bi.sorted || bi.revsorted) {
167 44 : const void *prev = NULL;
168 44 : algomsg = "unique: sorted";
169 238753 : TIMEOUT_LOOP_IDX(i, ci.ncand, timeoffset) {
170 238609 : o = canditer_next(&ci);
171 238609 : v = VALUE(o - hseq);
172 238609 : if (prev == NULL || (*cmp)(v, prev) != 0) {
173 23355 : if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
174 0 : goto bunins_failed;
175 : }
176 238609 : prev = v;
177 : }
178 44 : TIMEOUT_CHECK(timeoffset,
179 : GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed));
180 309 : } else if (BATcheckhash(b) ||
181 296 : ((!bi.transient ||
182 85 : (b->batRole == PERSISTENT && GDKinmemory(0))) &&
183 322 : ci.ncand == bi.count &&
184 111 : BAThash(b) == GDK_SUCCEED)) {
185 124 : BUN lo = 0;
186 :
187 : /* we already have a hash table on b, or b is
188 : * persistent and we could create a hash table, or b
189 : * is a view on a bat that already has a hash table */
190 124 : algomsg = "unique: existing hash";
191 124 : MT_rwlock_rdlock(&b->thashlock);
192 124 : hs = b->thash;
193 124 : if (hs == NULL) {
194 0 : MT_rwlock_rdunlock(&b->thashlock);
195 0 : goto lost_hash;
196 : }
197 86163 : TIMEOUT_LOOP_IDX(i, ci.ncand, timeoffset) {
198 85791 : BUN p;
199 :
200 85791 : o = canditer_next(&ci);
201 85791 : p = o - hseq;
202 85791 : v = VALUE(p);
203 85791 : for (hb = HASHgetlink(hs, p + lo);
204 100668 : hb != BUN_NONE && hb >= lo;
205 14877 : hb = HASHgetlink(hs, hb)) {
206 63752 : assert(hb < p + lo);
207 113011 : if (cmp(v, BUNtail(bi, hb)) == 0 &&
208 49259 : canditer_contains(&ci, hb - lo + hseq)) {
209 : /* we've seen this value
210 : * before */
211 : break;
212 : }
213 : }
214 85791 : if (hb == BUN_NONE || hb < lo) {
215 36916 : if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED) {
216 0 : MT_rwlock_rdunlock(&b->thashlock);
217 0 : hs = NULL;
218 0 : goto bunins_failed;
219 : }
220 : }
221 : }
222 124 : MT_rwlock_rdunlock(&b->thashlock);
223 124 : TIMEOUT_CHECK(timeoffset,
224 : GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed));
225 : } else {
226 185 : BUN prb;
227 185 : BUN p;
228 185 : BUN mask;
229 :
230 185 : lost_hash:
231 185 : GDKclrerr(); /* not interested in BAThash errors */
232 185 : algomsg = "unique: new partial hash";
233 185 : nme = BBP_physical(b->batCacheid);
234 185 : if (ATOMbasetype(bi.type) == TYPE_bte) {
235 : mask = (BUN) 1 << 8;
236 : cmp = NULL; /* no compare needed, "hash" is perfect */
237 185 : } else if (ATOMbasetype(bi.type) == TYPE_sht) {
238 : mask = (BUN) 1 << 16;
239 : cmp = NULL; /* no compare needed, "hash" is perfect */
240 : } else {
241 185 : mask = HASHmask(ci.ncand);
242 185 : if (mask < ((BUN) 1 << 16))
243 : mask = (BUN) 1 << 16;
244 : }
245 185 : if ((hs = GDKzalloc(sizeof(Hash))) == NULL) {
246 0 : GDKerror("cannot allocate hash table\n");
247 0 : goto bunins_failed;
248 : }
249 185 : hs->heapbckt.parentid = b->batCacheid;
250 185 : hs->heaplink.parentid = b->batCacheid;
251 185 : if ((hs->heaplink.farmid = BBPselectfarm(TRANSIENT, bi.type, hashheap)) < 0 ||
252 185 : (hs->heapbckt.farmid = BBPselectfarm(TRANSIENT, bi.type, hashheap)) < 0 ||
253 185 : snprintf(hs->heaplink.filename, sizeof(hs->heaplink.filename), "%s.thshunil%x", nme, (unsigned) MT_getpid()) >= (int) sizeof(hs->heaplink.filename) ||
254 370 : snprintf(hs->heapbckt.filename, sizeof(hs->heapbckt.filename), "%s.thshunib%x", nme, (unsigned) MT_getpid()) >= (int) sizeof(hs->heapbckt.filename) ||
255 185 : HASHnew(hs, bi.type, BATcount(b), mask, BUN_NONE, false) != GDK_SUCCEED) {
256 0 : GDKfree(hs);
257 0 : hs = NULL;
258 0 : GDKerror("cannot allocate hash table\n");
259 0 : goto bunins_failed;
260 : }
261 495666 : TIMEOUT_LOOP_IDX(i, ci.ncand, timeoffset) {
262 495084 : o = canditer_next(&ci);
263 468196 : v = VALUE(o - hseq);
264 468196 : prb = HASHprobe(hs, v);
265 468227 : for (hb = HASHget(hs, prb);
266 494316 : hb != BUN_NONE;
267 26089 : hb = HASHgetlink(hs, hb)) {
268 440690 : if (cmp == NULL || cmp(v, BUNtail(bi, hb)) == 0)
269 : break;
270 : }
271 494241 : if (hb == BUN_NONE) {
272 53619 : p = o - hseq;
273 53619 : if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
274 0 : goto bunins_failed;
275 : /* enter into hash table */
276 53619 : HASHputlink(hs, p, HASHget(hs, prb));
277 54053 : HASHput(hs, prb, p);
278 : }
279 : }
280 185 : TIMEOUT_CHECK(timeoffset,
281 : GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed));
282 185 : HEAPfree(&hs->heaplink, true);
283 185 : HEAPfree(&hs->heapbckt, true);
284 185 : GDKfree(hs);
285 : }
286 462 : if (BATcount(bn) == bi.count) {
287 : /* it turns out all values are distinct */
288 71 : MT_lock_set(&b->theaplock);
289 71 : if (BATcount(b) == bi.count) {
290 : /* and the input hasn't changed in the mean
291 : * time--the only allowed change being appends;
292 : * updates not allowed since the candidate list
293 : * covers the complete bat */
294 71 : assert(b->tnokey[0] == 0);
295 71 : assert(b->tnokey[1] == 0);
296 71 : b->tkey = true;
297 : }
298 71 : MT_lock_unset(&b->theaplock);
299 : }
300 462 : bat_iterator_end(&bi);
301 :
302 462 : bn->theap->dirty = true;
303 462 : bn->tsorted = true;
304 462 : bn->trevsorted = BATcount(bn) <= 1;
305 462 : bn->tkey = true;
306 462 : bn->tnil = false;
307 462 : bn->tnonil = true;
308 462 : bn = virtualize(bn);
309 462 : MT_thread_setalgorithm(algomsg);
310 462 : TRC_DEBUG(ALGO, "b=" ALGOBATFMT
311 : ",s=" ALGOOPTBATFMT " -> " ALGOOPTBATFMT
312 : " (%s -- " LLFMT "usec)\n",
313 : ALGOBATPAR(b), ALGOOPTBATPAR(s),
314 : ALGOOPTBATPAR(bn), algomsg, GDKusec() - t0);
315 : return bn;
316 :
317 0 : bunins_failed:
318 0 : bat_iterator_end(&bi);
319 0 : if (hs != NULL) {
320 0 : HEAPfree(&hs->heaplink, true);
321 0 : HEAPfree(&hs->heapbckt, true);
322 0 : GDKfree(hs);
323 : }
324 0 : BBPreclaim(bn);
325 0 : return NULL;
326 : }
|