Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * @a Lefteris Sidirourgos, Hannes Muehleisen
15 : * @* Low level sample facilities
16 : *
17 : * This sampling implementation generates a sorted set of OIDs by
18 : * calling the random number generator, and uses a binary tree to
19 : * eliminate duplicates. The elements of the tree are then used to
20 : * create a sorted sample BAT. This implementation has a logarithmic
21 : * complexity that only depends on the sample size.
22 : *
23 : * There is a pathological case when the sample size is almost the
24 : * size of the BAT. Then, many collisions occur and performance
25 : * degrades. To catch this, we switch to antiset semantics when the
26 : * sample size is larger than half the BAT size. Then, we generate the
27 : * values that should be omitted from the sample.
28 : */
29 :
30 : #include "monetdb_config.h"
31 : #include "gdk.h"
32 : #include "gdk_private.h"
33 : #include "xoshiro256starstar.h"
34 :
35 : /* this is a straightforward implementation of a binary tree */
36 : struct oidtreenode {
37 : union {
38 : struct { /* use as a binary tree */
39 : oid o;
40 : struct oidtreenode *left;
41 : struct oidtreenode *right;
42 : };
43 : uint64_t r; /* temporary storage for random numbers */
44 : };
45 : };
46 :
47 : static bool
48 49932831 : OIDTreeMaybeInsert(struct oidtreenode *tree, oid o, BUN allocated)
49 : {
50 49932831 : struct oidtreenode **nodep;
51 :
52 49932831 : if (allocated == 0) {
53 140902 : tree->left = tree->right = NULL;
54 140902 : tree->o = o;
55 140902 : return true;
56 : }
57 : nodep = &tree;
58 533169835 : while (*nodep) {
59 489120726 : if (o == (*nodep)->o)
60 : return false;
61 483377906 : if (o < (*nodep)->o)
62 241513094 : nodep = &(*nodep)->left;
63 : else
64 241864812 : nodep = &(*nodep)->right;
65 : }
66 44049109 : *nodep = &tree[allocated];
67 44049109 : tree[allocated].left = tree[allocated].right = NULL;
68 44049109 : tree[allocated].o = o;
69 44049109 : return true;
70 : }
71 :
72 : /* inorder traversal, gives us a sorted BAT */
73 : static void
74 12969807 : OIDTreeToBAT(struct oidtreenode *node, BAT *bn)
75 : {
76 25910410 : if (node->left != NULL)
77 12943891 : OIDTreeToBAT(node->left, bn);
78 25910410 : ((oid *) bn->theap->base)[bn->batCount++] = node->o;
79 25910410 : if (node->right != NULL )
80 : OIDTreeToBAT(node->right, bn);
81 12969807 : }
82 :
83 : /* Antiset traversal, give us all values but the ones in the tree */
84 : static void
85 9197702 : OIDTreeToBATAntiset(struct oidtreenode *node, BAT *bn, oid start, oid stop)
86 : {
87 18279598 : oid noid;
88 :
89 18279598 : if (node->left != NULL)
90 9082716 : OIDTreeToBATAntiset(node->left, bn, start, node->o);
91 : else
92 66701403 : for (noid = start; noid < node->o; noid++)
93 57504521 : ((oid *) bn->theap->base)[bn->batCount++] = noid;
94 :
95 18279598 : if (node->right != NULL)
96 9081896 : OIDTreeToBATAntiset(node->right, bn, node->o + 1, stop);
97 : else
98 66674818 : for (noid = node->o+1; noid < stop; noid++)
99 57477116 : ((oid *) bn->theap->base)[bn->batCount++] = noid;
100 9197702 : }
101 :
102 : static BAT *
103 1036400 : do_batsample(oid hseq, BUN cnt, BUN n, random_state_engine rse, MT_Lock *lock)
104 : {
105 1036400 : BAT *bn;
106 1036400 : BUN slen;
107 1036400 : BUN rescnt;
108 1036400 : struct oidtreenode *tree = NULL;
109 :
110 1036400 : ERRORcheck(n > BUN_MAX, "sample size larger than BUN_MAX\n", NULL);
111 : /* empty sample size */
112 1036400 : if (n == 0) {
113 2 : bn = BATdense(0, 0, 0);
114 1036398 : } else if (cnt <= n) {
115 : /* sample size is larger than the input BAT, return
116 : * all oids */
117 895500 : bn = BATdense(0, hseq, cnt);
118 : } else {
119 140898 : oid minoid = hseq;
120 140898 : oid maxoid = hseq + cnt;
121 :
122 : /* if someone samples more than half of our tree, we
123 : * do the antiset */
124 140898 : bool antiset = n > cnt / 2;
125 140898 : slen = n;
126 140898 : if (antiset)
127 114986 : n = cnt - n;
128 :
129 140898 : tree = GDKmalloc(n * sizeof(struct oidtreenode));
130 140896 : if (tree == NULL) {
131 : return NULL;
132 : }
133 140896 : bn = COLnew(0, TYPE_oid, slen, TRANSIENT);
134 140896 : if (bn == NULL) {
135 0 : GDKfree(tree);
136 0 : return NULL;
137 : }
138 :
139 140896 : if (lock)
140 140886 : MT_lock_set(lock);
141 : /* generate a list of random numbers; note we use the
142 : * "tree" array, but we use the value from each location
143 : * before it is overwritten by the use as part of the
144 : * binary tree */
145 44330913 : for (rescnt = 0; rescnt < n; rescnt++)
146 44190011 : tree[rescnt].r = next(rse);
147 :
148 : /* while we do not have enough sample OIDs yet */
149 : BUN rnd = 0;
150 44330913 : for (rescnt = 0; rescnt < n; rescnt++) {
151 49932831 : oid candoid;
152 49932831 : do {
153 49932831 : if (rnd == n) {
154 : /* we ran out of random numbers,
155 : * so generate more */
156 6054704 : for (rnd = rescnt; rnd < n; rnd++)
157 5742820 : tree[rnd].r = next(rse);
158 : rnd = rescnt;
159 : }
160 49932831 : candoid = minoid + tree[rnd++].r % cnt;
161 : /* if that candidate OID was already
162 : * generated, try again */
163 49932831 : } while (!OIDTreeMaybeInsert(tree, candoid, rescnt));
164 : }
165 140902 : if (lock)
166 140892 : MT_lock_unset(lock);
167 140902 : if (!antiset) {
168 25916 : OIDTreeToBAT(tree, bn);
169 : } else {
170 114986 : OIDTreeToBATAntiset(tree, bn, minoid, maxoid);
171 : }
172 140902 : GDKfree(tree);
173 :
174 140902 : BATsetcount(bn, slen);
175 140902 : bn->trevsorted = bn->batCount <= 1;
176 140902 : bn->tsorted = true;
177 140902 : bn->tkey = true;
178 140902 : bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? *(oid *) Tloc(bn, 0) : oid_nil;
179 : }
180 : return bn;
181 : }
182 :
183 : /* BATsample implements sampling for BATs */
184 : BAT *
185 10 : BATsample_with_seed(BAT *b, BUN n, uint64_t seed)
186 : {
187 10 : random_state_engine rse;
188 :
189 10 : init_random_state_engine(rse, seed);
190 :
191 10 : BAT *bn = do_batsample(b->hseqbase, BATcount(b), n, rse, NULL);
192 10 : TRC_DEBUG(ALGO, ALGOBATFMT "," BUNFMT " -> " ALGOOPTBATFMT "\n",
193 : ALGOBATPAR(b), n, ALGOOPTBATPAR(bn));
194 10 : return bn;
195 : }
196 :
197 : static MT_Lock rse_lock = MT_LOCK_INITIALIZER(rse_lock);
198 : BAT *
199 1035747 : BATsample(BAT *b, BUN n)
200 : {
201 1035747 : static random_state_engine rse;
202 :
203 1035747 : MT_lock_set(&rse_lock);
204 1036398 : if (rse[0] == 0 && rse[1] == 0 && rse[2] == 0 && rse[3] == 0)
205 330 : init_random_state_engine(rse, (uint64_t) GDKusec());
206 1036398 : MT_lock_unset(&rse_lock);
207 1036396 : BAT *bn = do_batsample(b->hseqbase, BATcount(b), n, rse, &rse_lock);
208 1035897 : TRC_DEBUG(ALGO, ALGOBATFMT "," BUNFMT " -> " ALGOOPTBATFMT "\n",
209 : ALGOBATPAR(b), n, ALGOOPTBATPAR(bn));
210 1035897 : return bn;
211 : }
|