Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * @a Lefteris Sidirourgos
15 : * @d 30/08/2011
16 : * @+ The sampling facilities
17 : *
18 : * In the context of the SciBORQ project, we introduce a number of sampling
19 : * techniques in the MonetDB software stack. Our goal is to provide methods
20 : * for performing sampling (uniform and weighted) over a) the result of a
21 : * query, b) the base tables, and c) the entire database schema. Sampling
22 : * can be performed during query execution, as well as during data loading in
23 : * the case of predefined sampling indexes. In addition to the sampling
24 : * methods, a number of query plan optimisations for sampling are introduced on
25 : * the SQL and MAL level.
26 : *
27 : * Besides the sampling methods, SciBORQ also aims at multi-layered bounded
28 : * query execution. That is, query execution is steered over many layers of
29 : * samples of different sizes in order to achieve either strict error bounds
30 : * or limited execution time. For more details see the SciBORQ module.
31 : *
32 : * In the following, details are presented on the implementation and the usage
33 : * of each sampling method.
34 : */
35 :
36 : #include "monetdb_config.h"
37 : #include "gdk.h"
38 : #include "mal_exception.h"
39 : #include "mal_interpreter.h"
40 :
41 : // TODO: Go through this documentation and update it with an explanation about seeds.
42 : /*
43 : * @- Uniform Sampling.
44 : *
45 : * A new SQL operator has been added to support sampling the result of a query.
46 : * The syntax for sampling is:
47 : * SELECT ... FROM ... WHERE ... SAMPLE s
48 : *
49 : * where s, if it is an integer greater than 1, defines the number of rows
50 : * in the sample. If s is a double in the range [0.0,1.0], then it refers to
51 : * the fraction of the result to be sampled. That is, if s=0.3 then the sample
52 : * will be 30% of the size of the query result.
53 : *
54 : * SAMPLE is treated like LIMIT, ORDER BY, etc., which means that it can only
55 : * be in the outermost SELECT clause, i.e., SAMPLE cannot appear in a
56 : * subquery. However, if this is needed, then one may define a function, for
57 : * example
58 : *
59 : * CREATE FUNCTION mysample ()
60 : * RETURNS TABLE(col a,...)
61 : * BEGIN
62 : * RETURN
63 : * SELECT a,...
64 : * FROM name_table
65 : * SAMPLE 100;
66 : * end;
67 : *
68 : * and then use function mysample() for example to populate a new table with
69 : * the sample. E.g.,
70 : *
71 : * INSERT INTO sample_table (SELECT * FROM mysample());
72 : *
73 : */
74 :
75 : static str
76 18 : SAMPLEuniform(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
77 : {
78 :
79 18 : bat *r, *b;
80 18 : lng sample_size;
81 18 : unsigned seed;
82 18 : (void) cntxt;
83 :
84 18 : BAT *br, *bb;
85 :
86 18 : r = getArgReference_bat(stk, pci, 0);
87 18 : b = getArgReference_bat(stk, pci, 1);
88 :
89 18 : if ((bb = BATdescriptor(*b)) == NULL) {
90 0 : throw(MAL, "sample.subuniform", INTERNAL_BAT_ACCESS);
91 : }
92 :
93 18 : if (getArgType(mb, pci, 2) == TYPE_dbl) {
94 8 : dbl pr = *getArgReference_dbl(stk, pci, 2);
95 :
96 8 : if (pr < 0.0 || pr > 1.0) {
97 1 : BBPunfix(bb->batCacheid);
98 1 : throw(MAL, "sample.subuniform", ILLEGAL_ARGUMENT
99 : " p should be between 0 and 1.0");
100 7 : } else if (pr == 0) { /* special case */
101 : sample_size = 0;
102 : // TODO: Add special case for pr == 1.0.
103 : } else {
104 6 : sample_size = (lng) (pr * (double) BATcount(bb));
105 : }
106 : } else {
107 10 : sample_size = *getArgReference_lng(stk, pci, 2);
108 : }
109 :
110 17 : if (pci->argc == 4) {
111 10 : seed = (unsigned) *getArgReference_int(stk, pci, 3);
112 10 : br = BATsample_with_seed(bb, (BUN) sample_size, seed);
113 : } else {
114 7 : br = BATsample(bb, (BUN) sample_size);
115 : }
116 :
117 17 : BBPunfix(bb->batCacheid);
118 17 : if (br == NULL)
119 0 : throw(MAL, "sample.subuniform", OPERATION_FAILED);
120 :
121 17 : *r = br->batCacheid;
122 17 : BBPkeepref(br);
123 17 : return MAL_SUCCEED;
124 : }
125 :
126 : #include "mel.h"
127 : mel_func sample_init_funcs[] = {
128 : pattern("sample", "subuniform", SAMPLEuniform, false, "Returns the oids of a uniform sample of size s", args(1,3, batarg("",oid),batargany("b",0),arg("sample_size",lng))),
129 : pattern("sample", "subuniform", SAMPLEuniform, false, "Returns the oids of a uniform sample of size s and where the prg is seeded with sample_seed", args(1,4, batarg("",oid),batargany("b",0),arg("sample_size",lng),arg("sample_seed",int))),
130 : pattern("sample", "subuniform", SAMPLEuniform, false, "Returns the oids of a uniform sample of size = (p x count(b)), where 0 <= p <= 1.0", args(1,3, batarg("",oid),batargany("b",0),arg("p",dbl))),
131 : pattern("sample", "subuniform", SAMPLEuniform, false, "Returns the oids of a uniform sample of size = (p x count(b)), where 0 <= p <= 1.0 and where the prg is seeded with sample_seed", args(1,4, batarg("",oid),batargany("b",0),arg("p",dbl),arg("sample_seed",int))),
132 : { .imp=NULL }
133 : };
134 : #include "mal_import.h"
135 : #ifdef _MSC_VER
136 : #undef read
137 : #pragma section(".CRT$XCU",read)
138 : #endif
139 325 : LIB_STARTUP_FUNC(init_sample_mal)
140 325 : { mal_module("sample", NULL, sample_init_funcs); }
|