LCOV - code coverage report
Current view: top level - gdk - gdk_unique.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 175 200 87.5 %
Date: 2024-04-25 20:03:45 Functions: 1 1 100.0 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : #include "monetdb_config.h"
      14             : #include "gdk.h"
      15             : #include "gdk_private.h"
      16             : #include "gdk_calc_private.h"
      17             : 
      18             : #define VALUE(x)        (vars ? vars + VarHeapVal(vals, (x), width) : vals + (x) * width)
      19             : /* BATunique returns a bat that indicates the unique tail values of
      20             :  * the input bat.  This is essentially the same output as the
      21             :  * "extents" output of BATgroup.  The difference is that BATunique
      22             :  * does not return the grouping bat.
      23             :  *
      24             :  * The first input is the bat from which unique rows are selected, the
      25             :  * second input is an optional candidate list.
      26             :  *
      27             :  * The return value is a candidate list.
      28             :  */
      29             : BAT *
      30        1435 : BATunique(BAT *b, BAT *s)
      31             : {
      32        1435 :         BAT *bn;
      33        1435 :         const void *v;
      34        1435 :         const char *vals;
      35        1435 :         const char *vars;
      36        1435 :         int width;
      37        1435 :         oid i, o, hseq;
      38        1435 :         const char *nme;
      39        1435 :         Hash *hs = NULL;
      40        1435 :         BUN hb;
      41        1435 :         int (*cmp)(const void *, const void *);
      42        1435 :         struct canditer ci;
      43        1435 :         const char *algomsg = "";
      44        1435 :         lng t0 = 0;
      45             : 
      46        1435 :         lng timeoffset = 0;
      47        1435 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
      48        1435 :         if (qry_ctx != NULL) {
      49        1419 :                 timeoffset = (qry_ctx->starttime && qry_ctx->querytimeout) ? (qry_ctx->starttime + qry_ctx->querytimeout) : 0;
      50             :         }
      51             : 
      52        1435 :         TRC_DEBUG_IF(ALGO) t0 = GDKusec();
      53             : 
      54        1435 :         BATcheck(b, NULL);
      55        1435 :         canditer_init(&ci, b, s);
      56             : 
      57        1435 :         (void) BATordered(b);
      58        1435 :         (void) BATordered_rev(b);
      59        1435 :         BATiter bi = bat_iterator(b);
      60        1435 :         if (bi.key || ci.ncand <= 1 || BATtdensebi(&bi)) {
      61             :                 /* trivial: already unique */
      62         910 :                 bn = canditer_slice(&ci, 0, ci.ncand);
      63         910 :                 bat_iterator_end(&bi);
      64         910 :                 TRC_DEBUG(ALGO, "b=" ALGOBATFMT
      65             :                           ",s=" ALGOOPTBATFMT " -> " ALGOOPTBATFMT
      66             :                           " (already unique, slice candidates -- "
      67             :                           LLFMT "usec)\n",
      68             :                           ALGOBATPAR(b), ALGOOPTBATPAR(s),
      69             :                           ALGOOPTBATPAR(bn), GDKusec() - t0);
      70         910 :                 return bn;
      71             :         }
      72             : 
      73         525 :         if ((bi.sorted && bi.revsorted) ||
      74         462 :             (bi.type == TYPE_void && is_oid_nil(b->tseqbase))) {
      75             :                 /* trivial: all values are the same */
      76          63 :                 bn = BATdense(0, ci.seq, 1);
      77          63 :                 bat_iterator_end(&bi);
      78          63 :                 TRC_DEBUG(ALGO, "b=" ALGOBATFMT
      79             :                           ",s=" ALGOOPTBATFMT " -> " ALGOOPTBATFMT
      80             :                           " (all equal -- "
      81             :                           LLFMT "usec)\n",
      82             :                           ALGOBATPAR(b), ALGOOPTBATPAR(s),
      83             :                           ALGOOPTBATPAR(bn), GDKusec() - t0);
      84          63 :                 return bn;
      85             :         }
      86             : 
      87         462 :         assert(bi.type != TYPE_void);
      88             : 
      89         462 :         BUN initsize = BUN_NONE;
      90         462 :         if (s == NULL) {
      91         264 :                 MT_rwlock_rdlock(&b->thashlock);
      92         264 :                 if (b->thash != NULL && b->thash != (Hash *) 1)
      93           5 :                         initsize = b->thash->nunique;
      94         264 :                 MT_rwlock_rdunlock(&b->thashlock);
      95         264 :                 if (initsize == BUN_NONE && bi.unique_est != 0)
      96         132 :                         initsize = (BUN) bi.unique_est;
      97             :         }
      98         259 :         if (initsize == BUN_NONE)
      99             :                 initsize = 1024;
     100         462 :         bn = COLnew(0, TYPE_oid, initsize, TRANSIENT);
     101         462 :         if (bn == NULL) {
     102           0 :                 bat_iterator_end(&bi);
     103           0 :                 return NULL;
     104             :         }
     105         462 :         vals = bi.base;
     106         462 :         if (bi.vh && bi.type)
     107          91 :                 vars = bi.vh->base;
     108             :         else
     109             :                 vars = NULL;
     110         462 :         width = bi.width;
     111         462 :         cmp = ATOMcompare(bi.type);
     112         462 :         hseq = b->hseqbase;
     113             : 
     114         462 :         if (ATOMbasetype(bi.type) == TYPE_bte ||
     115          50 :             (bi.width == 1 &&
     116          50 :              ATOMstorage(bi.type) == TYPE_str &&
     117          50 :              GDK_ELIMDOUBLES(bi.vh))) {
     118          67 :                 uint8_t val;
     119             : 
     120          67 :                 algomsg = "unique: byte-sized atoms";
     121          67 :                 uint32_t seen[256 >> 5];
     122          67 :                 memset(seen, 0, sizeof(seen));
     123      304003 :                 TIMEOUT_LOOP_IDX(i, ci.ncand, timeoffset) {
     124      303790 :                         o = canditer_next(&ci);
     125      303790 :                         val = ((const uint8_t *) vals)[o - hseq];
     126      303790 :                         uint32_t m = UINT32_C(1) << (val & 0x1F);
     127      303790 :                         if (!(seen[val >> 5] & m)) {
     128         290 :                                 seen[val >> 5] |= m;
     129         290 :                                 if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
     130           0 :                                         goto bunins_failed;
     131         290 :                                 if (bn->batCount == 256) {
     132             :                                         /* there cannot be more than
     133             :                                          * 256 distinct values */
     134             :                                         break;
     135             :                                 }
     136             :                         }
     137             :                 }
     138          67 :                 TIMEOUT_CHECK(timeoffset,
     139             :                               GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed));
     140         395 :         } else if (ATOMbasetype(bi.type) == TYPE_sht ||
     141          35 :                    (bi.width == 2 &&
     142          35 :                     ATOMstorage(bi.type) == TYPE_str &&
     143          35 :                     GDK_ELIMDOUBLES(bi.vh))) {
     144          42 :                 uint16_t val;
     145             : 
     146          42 :                 algomsg = "unique: short-sized atoms";
     147          42 :                 uint32_t seen[65536 >> 5];
     148          42 :                 memset(seen, 0, sizeof(seen));
     149       50040 :                 TIMEOUT_LOOP_IDX(i, ci.ncand, timeoffset) {
     150       49914 :                         o = canditer_next(&ci);
     151       49914 :                         val = ((const uint16_t *) vals)[o - hseq];
     152       49914 :                         uint32_t m = UINT32_C(1) << (val & 0x1F);
     153       49914 :                         if (!(seen[val >> 5] & m)) {
     154        4573 :                                 seen[val >> 5] |= m;
     155        4573 :                                 if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
     156           0 :                                         goto bunins_failed;
     157        4573 :                                 if (bn->batCount == 65536) {
     158             :                                         /* there cannot be more than
     159             :                                          * 65536 distinct values */
     160             :                                         break;
     161             :                                 }
     162             :                         }
     163             :                 }
     164          42 :                 TIMEOUT_CHECK(timeoffset,
     165             :                               GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed));
     166         353 :         } else if (bi.sorted || bi.revsorted) {
     167          44 :                 const void *prev = NULL;
     168          44 :                 algomsg = "unique: sorted";
     169      238753 :                 TIMEOUT_LOOP_IDX(i, ci.ncand, timeoffset) {
     170      238609 :                         o = canditer_next(&ci);
     171      238609 :                         v = VALUE(o - hseq);
     172      238609 :                         if (prev == NULL || (*cmp)(v, prev) != 0) {
     173       23355 :                                 if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
     174           0 :                                         goto bunins_failed;
     175             :                         }
     176      238609 :                         prev = v;
     177             :                 }
     178          44 :                 TIMEOUT_CHECK(timeoffset,
     179             :                               GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed));
     180         309 :         } else if (BATcheckhash(b) ||
     181         296 :                    ((!bi.transient ||
     182          85 :                      (b->batRole == PERSISTENT && GDKinmemory(0))) &&
     183         322 :                     ci.ncand == bi.count &&
     184         111 :                     BAThash(b) == GDK_SUCCEED)) {
     185         124 :                 BUN lo = 0;
     186             : 
     187             :                 /* we already have a hash table on b, or b is
     188             :                  * persistent and we could create a hash table, or b
     189             :                  * is a view on a bat that already has a hash table */
     190         124 :                 algomsg = "unique: existing hash";
     191         124 :                 MT_rwlock_rdlock(&b->thashlock);
     192         124 :                 hs = b->thash;
     193         124 :                 if (hs == NULL) {
     194           0 :                         MT_rwlock_rdunlock(&b->thashlock);
     195           0 :                         goto lost_hash;
     196             :                 }
     197       86163 :                 TIMEOUT_LOOP_IDX(i, ci.ncand, timeoffset) {
     198       85791 :                         BUN p;
     199             : 
     200       85791 :                         o = canditer_next(&ci);
     201       85791 :                         p = o - hseq;
     202       85791 :                         v = VALUE(p);
     203       85791 :                         for (hb = HASHgetlink(hs, p + lo);
     204      100668 :                              hb != BUN_NONE && hb >= lo;
     205       14877 :                              hb = HASHgetlink(hs, hb)) {
     206       63752 :                                 assert(hb < p + lo);
     207      113011 :                                 if (cmp(v, BUNtail(bi, hb)) == 0 &&
     208       49259 :                                     canditer_contains(&ci, hb - lo + hseq)) {
     209             :                                         /* we've seen this value
     210             :                                          * before */
     211             :                                         break;
     212             :                                 }
     213             :                         }
     214       85791 :                         if (hb == BUN_NONE || hb < lo) {
     215       36916 :                                 if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED) {
     216           0 :                                         MT_rwlock_rdunlock(&b->thashlock);
     217           0 :                                         hs = NULL;
     218           0 :                                         goto bunins_failed;
     219             :                                 }
     220             :                         }
     221             :                 }
     222         124 :                 MT_rwlock_rdunlock(&b->thashlock);
     223         124 :                 TIMEOUT_CHECK(timeoffset,
     224             :                               GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed));
     225             :         } else {
     226         185 :                 BUN prb;
     227         185 :                 BUN p;
     228         185 :                 BUN mask;
     229             : 
     230         185 :           lost_hash:
     231         185 :                 GDKclrerr();    /* not interested in BAThash errors */
     232         185 :                 algomsg = "unique: new partial hash";
     233         185 :                 nme = BBP_physical(b->batCacheid);
     234         185 :                 if (ATOMbasetype(bi.type) == TYPE_bte) {
     235             :                         mask = (BUN) 1 << 8;
     236             :                         cmp = NULL; /* no compare needed, "hash" is perfect */
     237         185 :                 } else if (ATOMbasetype(bi.type) == TYPE_sht) {
     238             :                         mask = (BUN) 1 << 16;
     239             :                         cmp = NULL; /* no compare needed, "hash" is perfect */
     240             :                 } else {
     241         185 :                         mask = HASHmask(ci.ncand);
     242         185 :                         if (mask < ((BUN) 1 << 16))
     243             :                                 mask = (BUN) 1 << 16;
     244             :                 }
     245         185 :                 if ((hs = GDKzalloc(sizeof(Hash))) == NULL) {
     246           0 :                         GDKerror("cannot allocate hash table\n");
     247           0 :                         goto bunins_failed;
     248             :                 }
     249         185 :                 hs->heapbckt.parentid = b->batCacheid;
     250         185 :                 hs->heaplink.parentid = b->batCacheid;
     251         185 :                 if ((hs->heaplink.farmid = BBPselectfarm(TRANSIENT, bi.type, hashheap)) < 0 ||
     252         185 :                     (hs->heapbckt.farmid = BBPselectfarm(TRANSIENT, bi.type, hashheap)) < 0 ||
     253         185 :                     snprintf(hs->heaplink.filename, sizeof(hs->heaplink.filename), "%s.thshunil%x", nme, (unsigned) MT_getpid()) >= (int) sizeof(hs->heaplink.filename) ||
     254         370 :                     snprintf(hs->heapbckt.filename, sizeof(hs->heapbckt.filename), "%s.thshunib%x", nme, (unsigned) MT_getpid()) >= (int) sizeof(hs->heapbckt.filename) ||
     255         185 :                     HASHnew(hs, bi.type, BATcount(b), mask, BUN_NONE, false) != GDK_SUCCEED) {
     256           0 :                         GDKfree(hs);
     257           0 :                         hs = NULL;
     258           0 :                         GDKerror("cannot allocate hash table\n");
     259           0 :                         goto bunins_failed;
     260             :                 }
     261      495666 :                 TIMEOUT_LOOP_IDX(i, ci.ncand, timeoffset) {
     262      495084 :                         o = canditer_next(&ci);
     263      468196 :                         v = VALUE(o - hseq);
     264      468196 :                         prb = HASHprobe(hs, v);
     265      468227 :                         for (hb = HASHget(hs, prb);
     266      494316 :                              hb != BUN_NONE;
     267       26089 :                              hb = HASHgetlink(hs, hb)) {
     268      440690 :                                 if (cmp == NULL || cmp(v, BUNtail(bi, hb)) == 0)
     269             :                                         break;
     270             :                         }
     271      494241 :                         if (hb == BUN_NONE) {
     272       53619 :                                 p = o - hseq;
     273       53619 :                                 if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
     274           0 :                                         goto bunins_failed;
     275             :                                 /* enter into hash table */
     276       53619 :                                 HASHputlink(hs, p, HASHget(hs, prb));
     277       54053 :                                 HASHput(hs, prb, p);
     278             :                         }
     279             :                 }
     280         185 :                 TIMEOUT_CHECK(timeoffset,
     281             :                               GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed));
     282         185 :                 HEAPfree(&hs->heaplink, true);
     283         185 :                 HEAPfree(&hs->heapbckt, true);
     284         185 :                 GDKfree(hs);
     285             :         }
     286         462 :         if (BATcount(bn) == bi.count) {
     287             :                 /* it turns out all values are distinct */
     288          71 :                 MT_lock_set(&b->theaplock);
     289          71 :                 if (BATcount(b) == bi.count) {
     290             :                         /* and the input hasn't changed in the mean
     291             :                          * time--the only allowed change being appends;
     292             :                          * updates not allowed since the candidate list
     293             :                          * covers the complete bat */
     294          71 :                         assert(b->tnokey[0] == 0);
     295          71 :                         assert(b->tnokey[1] == 0);
     296          71 :                         b->tkey = true;
     297             :                 }
     298          71 :                 MT_lock_unset(&b->theaplock);
     299             :         }
     300         462 :         bat_iterator_end(&bi);
     301             : 
     302         462 :         bn->theap->dirty = true;
     303         462 :         bn->tsorted = true;
     304         462 :         bn->trevsorted = BATcount(bn) <= 1;
     305         462 :         bn->tkey = true;
     306         462 :         bn->tnil = false;
     307         462 :         bn->tnonil = true;
     308         462 :         bn = virtualize(bn);
     309         462 :         MT_thread_setalgorithm(algomsg);
     310         462 :         TRC_DEBUG(ALGO, "b=" ALGOBATFMT
     311             :                   ",s=" ALGOOPTBATFMT " -> " ALGOOPTBATFMT
     312             :                   " (%s -- " LLFMT "usec)\n",
     313             :                   ALGOBATPAR(b), ALGOOPTBATPAR(s),
     314             :                   ALGOOPTBATPAR(bn), algomsg, GDKusec() - t0);
     315             :         return bn;
     316             : 
     317           0 :   bunins_failed:
     318           0 :         bat_iterator_end(&bi);
     319           0 :         if (hs != NULL) {
     320           0 :                 HEAPfree(&hs->heaplink, true);
     321           0 :                 HEAPfree(&hs->heapbckt, true);
     322           0 :                 GDKfree(hs);
     323             :         }
     324           0 :         BBPreclaim(bn);
     325           0 :         return NULL;
     326             : }

Generated by: LCOV version 1.14