Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * @a Lefteris Sidirourgos, Hannes Muehleisen
15 : * @* Low level sample facilities
16 : *
17 : * This sampling implementation generates a sorted set of OIDs by
18 : * calling the random number generator, and uses a binary tree to
19 : * eliminate duplicates. The elements of the tree are then used to
20 : * create a sorted sample BAT. This implementation has a logarithmic
21 : * complexity that only depends on the sample size.
22 : *
23 : * There is a pathological case when the sample size is almost the
24 : * size of the BAT. Then, many collisions occur and performance
25 : * degrades. To catch this, we switch to antiset semantics when the
26 : * sample size is larger than half the BAT size. Then, we generate the
27 : * values that should be omitted from the sample.
28 : */
29 :
30 : #include "monetdb_config.h"
31 : #include "gdk.h"
32 : #include "gdk_private.h"
33 : #include "xoshiro256starstar.h"
34 :
35 : /* this is a straightforward implementation of a binary tree */
36 : struct oidtreenode {
37 : union {
38 : struct { /* use as a binary tree */
39 : oid o;
40 : struct oidtreenode *left;
41 : struct oidtreenode *right;
42 : };
43 : uint64_t r; /* temporary storage for random numbers */
44 : };
45 : };
46 :
47 : static bool
48 41320557 : OIDTreeMaybeInsert(struct oidtreenode *tree, oid o, BUN allocated)
49 : {
50 41320557 : struct oidtreenode **nodep;
51 :
52 41320557 : if (allocated == 0) {
53 125774 : tree->left = tree->right = NULL;
54 125774 : tree->o = o;
55 125774 : return true;
56 : }
57 : nodep = &tree;
58 442379173 : while (*nodep) {
59 405552170 : if (o == (*nodep)->o)
60 : return false;
61 401184390 : if (o < (*nodep)->o)
62 200431536 : nodep = &(*nodep)->left;
63 : else
64 200752854 : nodep = &(*nodep)->right;
65 : }
66 36827003 : *nodep = &tree[allocated];
67 36827003 : tree[allocated].left = tree[allocated].right = NULL;
68 36827003 : tree[allocated].o = o;
69 36827003 : return true;
70 : }
71 :
72 : /* inorder traversal, gives us a sorted BAT */
73 : static void
74 11695769 : OIDTreeToBAT(struct oidtreenode *node, BAT *bn)
75 : {
76 23366362 : if (node->left != NULL)
77 11672396 : OIDTreeToBAT(node->left, bn);
78 23366362 : ((oid *) bn->theap->base)[bn->batCount++] = node->o;
79 23366362 : if (node->right != NULL )
80 : OIDTreeToBAT(node->right, bn);
81 11695769 : }
82 :
83 : /* Antiset traversal, give us all values but the ones in the tree */
84 : static void
85 6845811 : OIDTreeToBATAntiset(struct oidtreenode *node, BAT *bn, oid start, oid stop)
86 : {
87 13586415 : oid noid;
88 :
89 13586415 : if (node->left != NULL)
90 6743410 : OIDTreeToBATAntiset(node->left, bn, start, node->o);
91 : else
92 58076485 : for (noid = start; noid < node->o; noid++)
93 51233480 : ((oid *) bn->theap->base)[bn->batCount++] = noid;
94 :
95 13586415 : if (node->right != NULL)
96 6740604 : OIDTreeToBATAntiset(node->right, bn, node->o + 1, stop);
97 : else
98 58009451 : for (noid = node->o+1; noid < stop; noid++)
99 51163640 : ((oid *) bn->theap->base)[bn->batCount++] = noid;
100 6845811 : }
101 :
102 : static BAT *
103 1033945 : do_batsample(oid hseq, BUN cnt, BUN n, random_state_engine rse, MT_Lock *lock)
104 : {
105 1033945 : BAT *bn;
106 1033945 : BUN slen;
107 1033945 : BUN rescnt;
108 1033945 : struct oidtreenode *tree = NULL;
109 :
110 1033945 : ERRORcheck(n > BUN_MAX, "sample size larger than BUN_MAX\n", NULL);
111 : /* empty sample size */
112 1033945 : if (n == 0) {
113 2 : bn = BATdense(0, 0, 0);
114 1033943 : } else if (cnt <= n) {
115 : /* sample size is larger than the input BAT, return
116 : * all oids */
117 908170 : bn = BATdense(0, hseq, cnt);
118 : } else {
119 125773 : oid minoid = hseq;
120 125773 : oid maxoid = hseq + cnt;
121 :
122 : /* if someone samples more than half of our tree, we
123 : * do the antiset */
124 125773 : bool antiset = n > cnt / 2;
125 125773 : slen = n;
126 125773 : if (antiset)
127 102401 : n = cnt - n;
128 :
129 125773 : tree = GDKmalloc(n * sizeof(struct oidtreenode));
130 125772 : if (tree == NULL) {
131 : return NULL;
132 : }
133 125772 : bn = COLnew(0, TYPE_oid, slen, TRANSIENT);
134 125769 : if (bn == NULL) {
135 0 : GDKfree(tree);
136 0 : return NULL;
137 : }
138 :
139 125769 : if (lock)
140 125759 : MT_lock_set(lock);
141 : /* generate a list of random numbers; note we use the
142 : * "tree" array, but we use the value from each location
143 : * before it is overwritten by the use as part of the
144 : * binary tree */
145 37078551 : for (rescnt = 0; rescnt < n; rescnt++)
146 36952777 : tree[rescnt].r = next(rse);
147 :
148 : /* while we do not have enough sample OIDs yet */
149 : BUN rnd = 0;
150 37078551 : for (rescnt = 0; rescnt < n; rescnt++) {
151 41320557 : oid candoid;
152 41320557 : do {
153 41320557 : if (rnd == n) {
154 : /* we ran out of random numbers,
155 : * so generate more */
156 4614754 : for (rnd = rescnt; rnd < n; rnd++)
157 4367780 : tree[rnd].r = next(rse);
158 : rnd = rescnt;
159 : }
160 41320557 : candoid = minoid + tree[rnd++].r % cnt;
161 : /* if that candidate OID was already
162 : * generated, try again */
163 41320557 : } while (!OIDTreeMaybeInsert(tree, candoid, rescnt));
164 : }
165 125774 : if (lock)
166 125764 : MT_lock_unset(lock);
167 125774 : if (!antiset) {
168 23373 : OIDTreeToBAT(tree, bn);
169 : } else {
170 102401 : OIDTreeToBATAntiset(tree, bn, minoid, maxoid);
171 : }
172 125774 : GDKfree(tree);
173 :
174 125774 : BATsetcount(bn, slen);
175 125774 : bn->trevsorted = bn->batCount <= 1;
176 125774 : bn->tsorted = true;
177 125774 : bn->tkey = true;
178 125774 : bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? *(oid *) Tloc(bn, 0) : oid_nil;
179 : }
180 : return bn;
181 : }
182 :
183 : /* BATsample implements sampling for BATs */
184 : BAT *
185 10 : BATsample_with_seed(BAT *b, BUN n, uint64_t seed)
186 : {
187 10 : random_state_engine rse;
188 :
189 10 : init_random_state_engine(rse, seed);
190 :
191 10 : BAT *bn = do_batsample(b->hseqbase, BATcount(b), n, rse, NULL);
192 10 : TRC_DEBUG(ALGO, ALGOBATFMT "," BUNFMT " -> " ALGOOPTBATFMT "\n",
193 : ALGOBATPAR(b), n, ALGOOPTBATPAR(bn));
194 10 : return bn;
195 : }
196 :
197 : static MT_Lock rse_lock = MT_LOCK_INITIALIZER(rse_lock);
198 : BAT *
199 1033510 : BATsample(BAT *b, BUN n)
200 : {
201 1033510 : static random_state_engine rse;
202 :
203 1033510 : MT_lock_set(&rse_lock);
204 1033937 : if (rse[0] == 0 && rse[1] == 0 && rse[2] == 0 && rse[3] == 0)
205 341 : init_random_state_engine(rse, (uint64_t) GDKusec());
206 1033937 : MT_lock_unset(&rse_lock);
207 1033936 : BAT *bn = do_batsample(b->hseqbase, BATcount(b), n, rse, &rse_lock);
208 1033706 : TRC_DEBUG(ALGO, ALGOBATFMT "," BUNFMT " -> " ALGOOPTBATFMT "\n",
209 : ALGOBATPAR(b), n, ALGOOPTBATPAR(bn));
210 1033706 : return bn;
211 : }
|