Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * @a Lefteris Sidirourgos, Hannes Muehleisen
15 : * @* Low level sample facilities
16 : *
17 : * This sampling implementation generates a sorted set of OIDs by
18 : * calling the random number generator, and uses a binary tree to
19 : * eliminate duplicates. The elements of the tree are then used to
20 : * create a sorted sample BAT. This implementation has a logarithmic
21 : * complexity that only depends on the sample size.
22 : *
23 : * There is a pathological case when the sample size is almost the
24 : * size of the BAT. Then, many collisions occur and performance
25 : * degrades. To catch this, we switch to antiset semantics when the
26 : * sample size is larger than half the BAT size. Then, we generate the
27 : * values that should be omitted from the sample.
28 : */
29 :
30 : #include "monetdb_config.h"
31 : #include "gdk.h"
32 : #include "gdk_private.h"
33 : #include "xoshiro256starstar.h"
34 :
35 : /* this is a straightforward implementation of a binary tree */
36 : struct oidtreenode {
37 : union {
38 : struct { /* use as a binary tree */
39 : oid o;
40 : struct oidtreenode *left;
41 : struct oidtreenode *right;
42 : };
43 : uint64_t r; /* temporary storage for random numbers */
44 : };
45 : };
46 :
47 : static bool
48 42299299 : OIDTreeMaybeInsert(struct oidtreenode *tree, oid o, BUN allocated)
49 : {
50 42299299 : struct oidtreenode **nodep;
51 :
52 42299299 : if (allocated == 0) {
53 129802 : tree->left = tree->right = NULL;
54 129802 : tree->o = o;
55 129802 : return true;
56 : }
57 : nodep = &tree;
58 450505663 : while (*nodep) {
59 412653699 : if (o == (*nodep)->o)
60 : return false;
61 408336166 : if (o < (*nodep)->o)
62 204238858 : nodep = &(*nodep)->left;
63 : else
64 204097308 : nodep = &(*nodep)->right;
65 : }
66 37851964 : *nodep = &tree[allocated];
67 37851964 : tree[allocated].left = tree[allocated].right = NULL;
68 37851964 : tree[allocated].o = o;
69 37851964 : return true;
70 : }
71 :
72 : /* inorder traversal, gives us a sorted BAT */
73 : static void
74 11776713 : OIDTreeToBAT(struct oidtreenode *node, BAT *bn)
75 : {
76 23534410 : if (node->left != NULL)
77 11753173 : OIDTreeToBAT(node->left, bn);
78 23534410 : ((oid *) bn->theap->base)[bn->batCount++] = node->o;
79 23534410 : if (node->right != NULL )
80 : OIDTreeToBAT(node->right, bn);
81 11776713 : }
82 :
83 : /* Antiset traversal, give us all values but the ones in the tree */
84 : static void
85 7277510 : OIDTreeToBATAntiset(struct oidtreenode *node, BAT *bn, oid start, oid stop)
86 : {
87 14447169 : oid noid;
88 :
89 14447169 : if (node->left != NULL)
90 7171246 : OIDTreeToBATAntiset(node->left, bn, start, node->o);
91 : else
92 60412277 : for (noid = start; noid < node->o; noid++)
93 53136354 : ((oid *) bn->theap->base)[bn->batCount++] = noid;
94 :
95 14447145 : if (node->right != NULL)
96 7169659 : OIDTreeToBATAntiset(node->right, bn, node->o + 1, stop);
97 : else
98 60398869 : for (noid = node->o+1; noid < stop; noid++)
99 53121383 : ((oid *) bn->theap->base)[bn->batCount++] = noid;
100 7277486 : }
101 :
102 : static BAT *
103 1015037 : do_batsample(oid hseq, BUN cnt, BUN n, random_state_engine rse, MT_Lock *lock)
104 : {
105 1015037 : BAT *bn;
106 1015037 : BUN slen;
107 1015037 : BUN rescnt;
108 1015037 : struct oidtreenode *tree = NULL;
109 :
110 1015037 : ERRORcheck(n > BUN_MAX, "sample size larger than BUN_MAX\n", NULL);
111 : /* empty sample size */
112 1015037 : if (n == 0) {
113 2 : bn = BATdense(0, 0, 0);
114 1015035 : } else if (cnt <= n) {
115 : /* sample size is larger than the input BAT, return
116 : * all oids */
117 885233 : bn = BATdense(0, hseq, cnt);
118 : } else {
119 129802 : oid minoid = hseq;
120 129802 : oid maxoid = hseq + cnt;
121 :
122 : /* if someone samples more than half of our tree, we
123 : * do the antiset */
124 129802 : bool antiset = n > cnt / 2;
125 129802 : slen = n;
126 129802 : if (antiset)
127 106262 : n = cnt - n;
128 :
129 129802 : tree = GDKmalloc(n * sizeof(struct oidtreenode));
130 129801 : if (tree == NULL) {
131 : return NULL;
132 : }
133 129801 : bn = COLnew(0, TYPE_oid, slen, TRANSIENT);
134 129802 : if (bn == NULL) {
135 0 : GDKfree(tree);
136 0 : return NULL;
137 : }
138 :
139 129802 : if (lock)
140 129792 : MT_lock_set(lock);
141 : /* generate a list of random numbers; note we use the
142 : * "tree" array, but we use the value from each location
143 : * before it is overwritten by the use as part of the
144 : * binary tree */
145 38111568 : for (rescnt = 0; rescnt < n; rescnt++)
146 37981766 : tree[rescnt].r = next(rse);
147 :
148 : /* while we do not have enough sample OIDs yet */
149 : BUN rnd = 0;
150 38111568 : for (rescnt = 0; rescnt < n; rescnt++) {
151 42299299 : oid candoid;
152 42299299 : do {
153 42299299 : if (rnd == n) {
154 : /* we ran out of random numbers,
155 : * so generate more */
156 4574615 : for (rnd = rescnt; rnd < n; rnd++)
157 4317533 : tree[rnd].r = next(rse);
158 : rnd = rescnt;
159 : }
160 42299299 : candoid = minoid + tree[rnd++].r % cnt;
161 : /* if that candidate OID was already
162 : * generated, try again */
163 42299299 : } while (!OIDTreeMaybeInsert(tree, candoid, rescnt));
164 : }
165 129802 : if (lock)
166 129792 : MT_lock_unset(lock);
167 129802 : if (!antiset) {
168 23540 : OIDTreeToBAT(tree, bn);
169 : } else {
170 106262 : OIDTreeToBATAntiset(tree, bn, minoid, maxoid);
171 : }
172 129802 : GDKfree(tree);
173 :
174 129802 : BATsetcount(bn, slen);
175 129802 : bn->trevsorted = bn->batCount <= 1;
176 129802 : bn->tsorted = true;
177 129802 : bn->tkey = true;
178 129802 : bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? *(oid *) Tloc(bn, 0) : oid_nil;
179 : }
180 : return bn;
181 : }
182 :
183 : /* BATsample implements sampling for BATs */
184 : BAT *
185 10 : BATsample_with_seed(BAT *b, BUN n, uint64_t seed)
186 : {
187 10 : random_state_engine rse;
188 :
189 10 : init_random_state_engine(rse, seed);
190 :
191 10 : BAT *bn = do_batsample(b->hseqbase, BATcount(b), n, rse, NULL);
192 10 : TRC_DEBUG(ALGO, ALGOBATFMT "," BUNFMT " -> " ALGOOPTBATFMT "\n",
193 : ALGOBATPAR(b), n, ALGOOPTBATPAR(bn));
194 10 : return bn;
195 : }
196 :
197 : static MT_Lock rse_lock = MT_LOCK_INITIALIZER(rse_lock);
198 : BAT *
199 1015027 : BATsample(BAT *b, BUN n)
200 : {
201 1015027 : static random_state_engine rse;
202 :
203 1015027 : MT_lock_set(&rse_lock);
204 1015027 : if (rse[0] == 0 && rse[1] == 0 && rse[2] == 0 && rse[3] == 0)
205 317 : init_random_state_engine(rse, (uint64_t) GDKusec());
206 1015027 : MT_lock_unset(&rse_lock);
207 1015027 : BAT *bn = do_batsample(b->hseqbase, BATcount(b), n, rse, &rse_lock);
208 1015023 : TRC_DEBUG(ALGO, ALGOBATFMT "," BUNFMT " -> " ALGOOPTBATFMT "\n",
209 : ALGOBATPAR(b), n, ALGOOPTBATPAR(bn));
210 1015023 : return bn;
211 : }
|