LCOV - code coverage report
Current view: top level - gdk - gdk_sample.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 90 92 97.8 %
Date: 2024-10-04 20:04:04 Functions: 6 6 100.0 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : /*
      14             :  * @a Lefteris Sidirourgos, Hannes Muehleisen
      15             :  * @* Low level sample facilities
      16             :  *
      17             :  * This sampling implementation generates a sorted set of OIDs by
      18             :  * calling the random number generator, and uses a binary tree to
      19             :  * eliminate duplicates.  The elements of the tree are then used to
      20             :  * create a sorted sample BAT.  This implementation has a logarithmic
      21             :  * complexity that only depends on the sample size.
      22             :  *
      23             :  * There is a pathological case when the sample size is almost the
      24             :  * size of the BAT.  Then, many collisions occur and performance
      25             :  * degrades. To catch this, we switch to antiset semantics when the
      26             :  * sample size is larger than half the BAT size. Then, we generate the
      27             :  * values that should be omitted from the sample.
      28             :  */
      29             : 
      30             : #include "monetdb_config.h"
      31             : #include "gdk.h"
      32             : #include "gdk_private.h"
      33             : #include "xoshiro256starstar.h"
      34             : 
      35             : /* this is a straightforward implementation of a binary tree */
      36             : struct oidtreenode {
      37             :         union {
      38             :                 struct {        /* use as a binary tree */
      39             :                         oid o;
      40             :                         struct oidtreenode *left;
      41             :                         struct oidtreenode *right;
      42             :                 };
      43             :                 uint64_t r;     /* temporary storage for random numbers */
      44             :         };
      45             : };
      46             : 
      47             : static bool
      48    40997556 : OIDTreeMaybeInsert(struct oidtreenode *tree, oid o, BUN allocated)
      49             : {
      50    40997556 :         struct oidtreenode **nodep;
      51             : 
      52    40997556 :         if (allocated == 0) {
      53      128127 :                 tree->left = tree->right = NULL;
      54      128127 :                 tree->o = o;
      55      128127 :                 return true;
      56             :         }
      57             :         nodep = &tree;
      58   435335043 :         while (*nodep) {
      59   398646018 :                 if (o == (*nodep)->o)
      60             :                         return false;
      61   394465614 :                 if (o < (*nodep)->o)
      62   197325751 :                         nodep = &(*nodep)->left;
      63             :                 else
      64   197139863 :                         nodep = &(*nodep)->right;
      65             :         }
      66    36689025 :         *nodep = &tree[allocated];
      67    36689025 :         tree[allocated].left = tree[allocated].right = NULL;
      68    36689025 :         tree[allocated].o = o;
      69    36689025 :         return true;
      70             : }
      71             : 
      72             : /* inorder traversal, gives us a sorted BAT */
      73             : static void
      74    11313918 : OIDTreeToBAT(struct oidtreenode *node, BAT *bn)
      75             : {
      76    22601362 :         if (node->left != NULL)
      77    11291310 :                 OIDTreeToBAT(node->left, bn);
      78    22601362 :         ((oid *) bn->theap->base)[bn->batCount++] = node->o;
      79    22601362 :         if (node->right != NULL )
      80             :                 OIDTreeToBAT(node->right, bn);
      81    11313918 : }
      82             : 
      83             : /* Antiset traversal, give us all values but the ones in the tree */
      84             : static void
      85     7161803 : OIDTreeToBATAntiset(struct oidtreenode *node, BAT *bn, oid start, oid stop)
      86             : {
      87    14215790 :         oid noid;
      88             : 
      89    14215790 :         if (node->left != NULL)
      90     7056284 :                 OIDTreeToBATAntiset(node->left, bn, start, node->o);
      91             :         else
      92    59858797 :                 for (noid = start; noid < node->o; noid++)
      93    52699291 :                         ((oid *) bn->theap->base)[bn->batCount++] = noid;
      94             : 
      95    14215790 :         if (node->right != NULL)
      96     7053987 :                 OIDTreeToBATAntiset(node->right, bn, node->o + 1, stop);
      97             :         else
      98    59977134 :                 for (noid = node->o+1; noid < stop; noid++)
      99    52815331 :                         ((oid *) bn->theap->base)[bn->batCount++] = noid;
     100     7161803 : }
     101             : 
     102             : static BAT *
     103     1004135 : do_batsample(oid hseq, BUN cnt, BUN n, random_state_engine rse, MT_Lock *lock)
     104             : {
     105     1004135 :         BAT *bn;
     106     1004135 :         BUN slen;
     107     1004135 :         BUN rescnt;
     108     1004135 :         struct oidtreenode *tree = NULL;
     109             : 
     110     1004135 :         ERRORcheck(n > BUN_MAX, "sample size larger than BUN_MAX\n", NULL);
     111             :         /* empty sample size */
     112     1004135 :         if (n == 0) {
     113           2 :                 bn = BATdense(0, 0, 0);
     114     1004133 :         } else if (cnt <= n) {
     115             :                 /* sample size is larger than the input BAT, return
     116             :                  * all oids */
     117      876006 :                 bn = BATdense(0, hseq, cnt);
     118             :         } else {
     119      128127 :                 oid minoid = hseq;
     120      128127 :                 oid maxoid = hseq + cnt;
     121             : 
     122             :                 /* if someone samples more than half of our tree, we
     123             :                  * do the antiset */
     124      128127 :                 bool antiset = n > cnt / 2;
     125      128127 :                 slen = n;
     126      128127 :                 if (antiset)
     127      105519 :                         n = cnt - n;
     128             : 
     129      128127 :                 tree = GDKmalloc(n * sizeof(struct oidtreenode));
     130      128127 :                 if (tree == NULL) {
     131             :                         return NULL;
     132             :                 }
     133      128127 :                 bn = COLnew(0, TYPE_oid, slen, TRANSIENT);
     134      128126 :                 if (bn == NULL) {
     135           0 :                         GDKfree(tree);
     136           0 :                         return NULL;
     137             :                 }
     138             : 
     139      128126 :                 if (lock)
     140      128116 :                         MT_lock_set(lock);
     141             :                 /* generate a list of random numbers; note we use the
     142             :                  * "tree" array, but we use the value from each location
     143             :                  * before it is overwritten by the use as part of the
     144             :                  * binary tree */
     145    36945279 :                 for (rescnt = 0; rescnt < n; rescnt++)
     146    36817152 :                         tree[rescnt].r = next(rse);
     147             : 
     148             :                 /* while we do not have enough sample OIDs yet */
     149             :                 BUN rnd = 0;
     150    36945279 :                 for (rescnt = 0; rescnt < n; rescnt++) {
     151    40997556 :                         oid candoid;
     152    40997556 :                         do {
     153    40997556 :                                 if (rnd == n) {
     154             :                                         /* we ran out of random numbers,
     155             :                                          * so generate more */
     156     4431919 :                                         for (rnd = rescnt; rnd < n; rnd++)
     157     4180404 :                                                 tree[rnd].r = next(rse);
     158             :                                         rnd = rescnt;
     159             :                                 }
     160    40997556 :                                 candoid = minoid + tree[rnd++].r % cnt;
     161             :                                 /* if that candidate OID was already
     162             :                                  * generated, try again */
     163    40997556 :                         } while (!OIDTreeMaybeInsert(tree, candoid, rescnt));
     164             :                 }
     165      128127 :                 if (lock)
     166      128117 :                         MT_lock_unset(lock);
     167      128127 :                 if (!antiset) {
     168       22608 :                         OIDTreeToBAT(tree, bn);
     169             :                 } else {
     170      105519 :                         OIDTreeToBATAntiset(tree, bn, minoid, maxoid);
     171             :                 }
     172      128127 :                 GDKfree(tree);
     173             : 
     174      128127 :                 BATsetcount(bn, slen);
     175      128127 :                 bn->trevsorted = bn->batCount <= 1;
     176      128127 :                 bn->tsorted = true;
     177      128127 :                 bn->tkey = true;
     178      128127 :                 bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? *(oid *) Tloc(bn, 0) : oid_nil;
     179             :         }
     180             :         return bn;
     181             : }
     182             : 
     183             : /* BATsample implements sampling for BATs */
     184             : BAT *
     185          10 : BATsample_with_seed(BAT *b, BUN n, uint64_t seed)
     186             : {
     187          10 :         random_state_engine rse;
     188             : 
     189          10 :         init_random_state_engine(rse, seed);
     190             : 
     191          10 :         BAT *bn = do_batsample(b->hseqbase, BATcount(b), n, rse, NULL);
     192          10 :         TRC_DEBUG(ALGO, ALGOBATFMT "," BUNFMT " -> " ALGOOPTBATFMT "\n",
     193             :                   ALGOBATPAR(b), n, ALGOOPTBATPAR(bn));
     194          10 :         return bn;
     195             : }
     196             : 
     197             : static MT_Lock rse_lock = MT_LOCK_INITIALIZER(rse_lock);
     198             : BAT *
     199     1004125 : BATsample(BAT *b, BUN n)
     200             : {
     201     1004125 :         static random_state_engine rse;
     202             : 
     203     1004125 :         MT_lock_set(&rse_lock);
     204     1004125 :         if (rse[0] == 0 && rse[1] == 0 && rse[2] == 0 && rse[3] == 0)
     205         319 :                 init_random_state_engine(rse, (uint64_t) GDKusec());
     206     1004125 :         MT_lock_unset(&rse_lock);
     207     1004125 :         BAT *bn = do_batsample(b->hseqbase, BATcount(b), n, rse, &rse_lock);
     208     1004121 :         TRC_DEBUG(ALGO, ALGOBATFMT "," BUNFMT " -> " ALGOOPTBATFMT "\n",
     209             :                   ALGOBATPAR(b), n, ALGOOPTBATPAR(bn));
     210     1004121 :         return bn;
     211             : }

Generated by: LCOV version 1.14