LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - sample.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 28 30 93.3 %
Date: 2024-12-20 21:24:02 Functions: 2 2 100.0 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : /*
      14             :  * @a Lefteris Sidirourgos
      15             :  * @d 30/08/2011
      16             :  * @+ The sampling facilities
      17             :  *
      18             :  * In the context of the SciBORQ project, we introduce a number of sampling
      19             :  * techniques in the MonetDB software stack. Our goal is to provide methods
      20             :  * for performing sampling (uniform and weighted) over a) the result of a
      21             :  * query, b) the base tables, and c) the entire database schema. Sampling
      22             :  * can be performed during query execution, as well as during data loading in
      23             :  * the case of predefined sampling indexes. In addition to the sampling
      24             :  * methods, a number of query plan optimisations for sampling are introduced on
      25             :  * the SQL and MAL level.
      26             :  *
      27             :  * Besides the sampling methods, SciBORQ also aims at multi-layered bounded
      28             :  * query execution, that is, steering query execution over many layers of
      29             :  * samples of different sizes in order to achieve either strict error bounds
      30             :  * or limited execution time. For more details see the SciBORQ module.
      31             :  *
      32             :  * In the following, details are presented on the implementation and the usage
      33             :  * of each sampling method.
      34             :  */
      35             : 
      36             : #include "monetdb_config.h"
      37             : #include "gdk.h"
      38             : #include "mal_exception.h"
      39             : #include "mal_interpreter.h"
      40             : 
      41             : // TODO: Go through this documentation and update it with an explanation about seeds.
      42             : /*
      43             :  * @- Uniform Sampling.
      44             :  *
      45             :  * A new SQL operator has been added to support sampling the result of a query.
      46             :  * The syntax for sampling is:
      47             :  * SELECT ... FROM ... WHERE ... SAMPLE s
      48             :  *
      49             :  * where, if s is an integer greater than 1, it defines the number of rows
      50             :  * in the sample. If s is a double in [0.0,1.0] then it refers to the
      51             :  * fraction of the result to be sampled. That is, if s=0.3 then the sample
      52             :  * will be 30% of the size of the query result.
      53             :  *
      54             :  * SAMPLE is treated like LIMIT, ORDER BY, etc., which means that it can only
      55             :  * appear in the outermost SELECT clause, i.e., SAMPLE cannot appear in a
      56             :  * subquery. However, if this is needed, then one may define a function, for
      57             :  * example
      58             :  *
      59             :  * CREATE FUNCTION mysample ()
      60             :  * RETURNS TABLE(col a,...)
      61             :  * BEGIN
      62             :  *    RETURN
      63             :  *      SELECT a,...
      64             :  *      FROM name_table
      65             :  *      SAMPLE 100;
      66             :  * end;
      67             :  *
      68             :  * and then use function mysample() for example to populate a new table with
      69             :  * the sample. E.g.,
      70             :  *
      71             :  * INSERT INTO sample_table (SELECT * FROM mysample());
      72             :  *
      73             :  */
      74             : 
      75             : static str
      76          18 : SAMPLEuniform(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
      77             : {
      78             :         /* MAL pattern behind sample.subuniform: arg 0 receives the id of the result BAT, arg 1 is the input BAT id, arg 2 is either a row count (lng) or a fraction (dbl), and an optional arg 3 is an int seed. Returns MAL_SUCCEED or throws a MAL exception. */
      79          18 :         bat *r, *b;
      80          18 :         lng sample_size;
      81          18 :         unsigned seed;
      82          18 :         (void) cntxt;   /* the client context is not used here */
      83             : 
      84          18 :         BAT *br, *bb;   /* br: sample returned by BATsample*, bb: input BAT */
      85             : 
      86          18 :         r = getArgReference_bat(stk, pci, 0);
      87          18 :         b = getArgReference_bat(stk, pci, 1);
      88             :         /* look up the input BAT from its id; once this succeeds, every exit path below calls BBPunfix on it */
      89          18 :         if ((bb = BATdescriptor(*b)) == NULL) {
      90           0 :                 throw(MAL, "sample.subuniform", INTERNAL_BAT_ACCESS);
      91             :         }
      92             :         /* a dbl third argument is a fraction of BATcount(bb); any other type is read as an absolute lng row count */
      93          18 :         if (getArgType(mb, pci, 2) == TYPE_dbl) {
      94           8 :                 dbl pr = *getArgReference_dbl(stk, pci, 2);
      95             :                 /* NOTE(review): a NaN pr fails both range tests and the pr == 0 test, so it reaches the (lng) cast in the final else -- confirm nil/NaN cannot arrive here */
      96           8 :                 if (pr < 0.0 || pr > 1.0) {
      97           1 :                         BBPunfix(bb->batCacheid);
      98           1 :                         throw(MAL, "sample.subuniform", ILLEGAL_ARGUMENT
      99             :                                   " p should be between 0 and 1.0");
     100           7 :                 } else if (pr == 0) {   /* special case */
     101             :                         sample_size = 0;
     102             :                         // TODO: Add special case for pr == 1.0.
     103             :                 } else {
     104           6 :                         sample_size = (lng) (pr * (double) BATcount(bb));   /* the cast drops the fractional part */
     105             :                 }
     106             :         } else {
     107          10 :                 sample_size = *getArgReference_lng(stk, pci, 2);
     108             :         }
     109             :         /* NOTE(review): a negative lng sample_size is not rejected in this function; confirm how BATsample treats the result of the (BUN) cast below */
     110          17 :         if (pci->argc == 4) {   /* a fourth argument carries the seed */
     111          10 :                 seed = (unsigned) *getArgReference_int(stk, pci, 3);
     112          10 :                 br = BATsample_with_seed(bb, (BUN) sample_size, seed);
     113             :         } else {
     114           7 :                 br = BATsample(bb, (BUN) sample_size);
     115             :         }
     116             :         /* the input BAT is released here whether or not sampling succeeded */
     117          17 :         BBPunfix(bb->batCacheid);
     118          17 :         if (br == NULL)
     119           0 :                 throw(MAL, "sample.subuniform", OPERATION_FAILED);
     120             :         /* hand the id of the sample BAT back through arg 0 */
     121          17 :         *r = br->batCacheid;
     122          17 :         BBPkeepref(br);
     123          17 :         return MAL_SUCCEED;
     124             : }
     125             : 
     126             : #include "mel.h"
     127             : mel_func sample_init_funcs[] = { /* four sample.subuniform signatures, all implemented by SAMPLEuniform: lng sample_size or dbl p, each with and without an int sample_seed */
     128             :  pattern("sample", "subuniform", SAMPLEuniform, false, "Returns the oids of a uniform sample of size s", args(1,3, batarg("",oid),batargany("b",0),arg("sample_size",lng))),
     129             :  pattern("sample", "subuniform", SAMPLEuniform, false, "Returns the oids of a uniform sample of size s and where the prg is seeded with sample_seed", args(1,4, batarg("",oid),batargany("b",0),arg("sample_size",lng),arg("sample_seed",int))),
     130             :  pattern("sample", "subuniform", SAMPLEuniform, false, "Returns the oids of a uniform sample of size = (p x count(b)), where 0 <= p <= 1.0", args(1,3, batarg("",oid),batargany("b",0),arg("p",dbl))),
     131             :  pattern("sample", "subuniform", SAMPLEuniform, false, "Returns the oids of a uniform sample of size = (p x count(b)), where 0 <= p <= 1.0 and where the prg is seeded with sample_seed", args(1,4, batarg("",oid),batargany("b",0),arg("p",dbl),arg("sample_seed",int))),
     132             :  { .imp=NULL } /* presumably the end-of-table sentinel -- confirm against mel.h */
     133             : };
     134             : #include "mal_import.h"
     135             : #ifdef _MSC_VER /* NOTE(review): MSVC-only preamble, presumably required by LIB_STARTUP_FUNC -- confirm */
     136             : #undef read
     137             : #pragma section(".CRT$XCU",read)
     138             : #endif
     139         345 : LIB_STARTUP_FUNC(init_sample_mal)
     140         345 : { mal_module("sample", NULL, sample_init_funcs); } /* passes the function table to mal_module under the module name "sample" */

Generated by: LCOV version 1.14