LCOV - code coverage report
Current view: top level - gdk - gdk_strimps.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 388 444 87.4 %
Date: 2024-11-15 19:37:45 Functions: 17 17 100.0 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : /* Author: Panagiotis Koutsourakis
      14             :  *
      15             :  * A string imprint is an index that can be used as a prefilter in LIKE
      16             :  * queries. It has 2 components:
      17             :  *
      18             :  * - a header of 63 string element pairs.
      19             :  *
      20             :  * - a 64 bit mask for each string in the BAT that encodes the presence
      21             :  *   or absence of each element of the header in the specific item or if
      22             :  *   the corresponding entry in the BAT is nil.
      23             :  *
      24             :  * A string imprint is stored in a new Heap in the BAT, aligned in 8
      25             :  * byte (64 bit) words.
      26             :  *
      27             :  * The first 64 bit word, the header descriptor, describes how the
      28             :  * header of the strimp is encoded. The least significant byte (v in the
      29             :  * schematic below) is the version number. The second (np) is the number
      30             :  * of pairs in the header. In the current implementation this is always
      31             :  * 63. The next 2 bytes (hs) is the total size of the header in
      32             :  * bytes. Finally the fifth byte is the persistence byte. The last 3
      33             :  * bytes needed to align to the 8 byte boundary should be zero, and are
      34             :  * reserved for future use.
      35             :  *
      36             :  * The following np + 1 bytes are the sizes of the pairs. These can have
      37             :  * values from 2 to 8 and are the number of bytes that the corresponding
      38             :  * pair takes up. Following that there are the bytes encoding the actual
      39             :  * pairs.
      40             :  *
      41             :  * | 1byte | 1byte | 1byte | 1byte | 1byte | 1byte | 1byte | 1byte |
      42             :  * |---------------------------------------------------------------|
      43             :  * |   v   |  np   |      hs       |   p   |      reserved         |  8bytes     ---
      44             :  * |---------------------------------------------------------------|  ___         |
      45             :  * | psz_0 | psz_1 | ...                                           |   |          |
      46             :  * |                                                               |   |          |
      47             :  * |                                                               |np bytes      |
      48             :  * |                                                               |   |          |
      49             :  * |                                                   ... | psz_n |   |       hs bytes
      50             :  * |---------------------------------------------------------------|  ___         |
      51             :  * |             pair_0            |             pair_1            |              |
      52             :  * |                              ...                              |              |
      53             :  * |                 pair_k-1                   |   pair_k         |              |
      54             :  * |                          pair_n                               |              |
      55             :  * |---------------------------------------------------------------|             ---
      56             :  *
      57             :  *
      58             :  * The bitmasks for each string in the BAT follow after this, aligned to
      59             :  * the string BAT.
      60             :  *
      61             :  * Strimp creation goes as follows:
      62             :  *
      63             :  * - Construct a histogram of all the element pairs for all the strings
      64             :  *   in the BAT.
      65             :  *
      66             :  * - Take the np most frequent pairs as the Strimp Header.
      67             :  *
      68             :  * - For each string s in the BAT, construct an (np + 1)-bit mask, m_s
      69             :  *   that encodes the presence or absence of each member of the header
      70             :  *   in the string or if s is nil.
      71             :  *
      72             :  * Filtering with a query string q goes as follows:
      73             :  *
      74             :  * - Use the strimp header to construct an (np + 1)-bit mask for q
      75             :  *   encoding the presence or absence of each member of the header in q.
      76             :  *
      77             :  * - For each bitmask in the strimp, first check if it encodes a nil
      78             :  *   value and keep it if it needs to be kept (this happens for NOT LIKE
      79             :  *   queries). Otherwise compute the bitwise AND of m_s and q. If the
      80             :  *   result is equal to q, that means that string s contains the same
      81             :  *   strimp header elements as q, so it is kept for more detailed
      82             :  *   examination.
      83             :  */
      84             : 
      85             : #include "monetdb_config.h"
      86             : #include "gdk.h"
      87             : #include "gdk_private.h"
      88             : 
      89             : #include <stdint.h>
      90             : #include <inttypes.h>
      91             : 
      92             : #define STRIMP_VERSION (uint64_t)2
      93             : #define STRIMP_HISTSIZE (256*256)
      94             : #define STRIMP_HEADER_SIZE 64
      95             : #define STRIMP_PAIRS (STRIMP_HEADER_SIZE - 1)
      96             : #define STRIMP_CREATION_THRESHOLD                               \
      97             :         ((BUN) ((ATOMIC_GET(&GDKdebug) & TESTINGMASK)? 100 : 5000))
      98             : 
      99             : typedef struct {
     100             : #ifdef UTF8STRIMPS
     101             :         uint8_t *pbytes;
     102             : #else
     103             :         uint8_t pbytes[2];
     104             : #endif //UTF8STRIMPSX
     105             :         uint8_t psize;
     106             :         size_t idx;
     107             :         uint64_t mask;
     108             : } CharPair;
     109             : 
     110             : typedef struct {
     111             :         size_t pos;
     112             :         size_t lim;
     113             :         const char *s;
     114             : } PairIterator;
     115             : 
     116             : typedef struct {
     117             :         uint64_t cnt;
     118             : } PairHistogramElem;
     119             : 
     120             : 
     121             : /* Macros for accessing metadada of a strimp. These are recorded in the
     122             :  * first 8 bytes of the heap.
     123             :  */
     124             : #define NPAIRS(d) (size_t)(((d) >> 8) & 0xff)
     125             : #define HSIZE(d) (size_t)(((d) >> 16) & 0xffff)
     126             : 
     127             : #undef UTF8STRIMPS              /* Not using utf8 for now */
     128             : #ifdef UTF8STRIMPS
     129             : static bool
     130             : pair_equal(const CharPair *p1, const CharPair *p2)
     131             : {
     132             :         if(p1->psize != p2->psize)
     133             :                 return false;
     134             : 
     135             :         for(size_t i = 0; i < p1->psize; i++)
     136             :                 if (p1->pbytes[i] != p2->pbytes[i])
     137             :                         return false;
     138             : 
     139             :         return true;
     140             : }
     141             : #else
     142             : /* BytePairs implementation.
     143             :  *
     144             :  * The header elements are pairs of bytes. In this case the histogram is
     145             :  * 256*256=65536 entries long. We use the numeric value of the 2 byte
     146             :  * sequence of the pair as the index to the histogram.
     147             :  *
     148             :  * Note: All the of the following functions and macros up to #endif need to be
     149             :  * implemented for the UTF8 case.
     150             :  */
     151             : 
     152             : /* We disregard spaces, digits and punctuation characters */
     153             : #define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x)))
     154             : #define pairToIndex(b1, b2) (size_t)(((uint16_t)b2)<<8 | ((uint16_t)b1))
     155             : 
     156             : inline static size_t
     157          63 : bytes2histindex(uint8_t *bytes, uint8_t psize)
     158             : {
     159          63 :         (void)psize;
     160          63 :         return pairToIndex(bytes[0], bytes[1]);
     161             : }
     162             : 
     163             : inline static bool
     164     6301823 : pair_at(const PairIterator *pi, CharPair *p)
     165             : {
     166     6301823 :         if (pi->pos >= pi->lim - 1)
     167             :                 return false;
     168             : 
     169     6119264 :         p->pbytes[0] = (uint8_t)tolower((unsigned char) pi->s[pi->pos]);
     170     6119264 :         p->pbytes[1] = (uint8_t)tolower((unsigned char) pi->s[pi->pos + 1]);
     171             : 
     172     6119264 :         p->psize = 2;
     173     6119264 :         p->idx = pairToIndex(p->pbytes[0], p->pbytes[1]);
     174     6119264 :         return true;
     175             : }
     176             : 
     177             : inline static bool
     178     6837710 : next_pair(PairIterator *pi)
     179             : {
     180     6837710 :         if (pi->pos >= pi->lim - 1)
     181             :                 return false;
     182     6822350 :         pi->pos++;
     183     6822350 :         return true;
     184             : }
     185             : 
     186             : /* Returns true if the specified char is ignored.
     187             :  */
     188             : inline static bool
     189     9157304 : ignored(const CharPair *p, uint8_t elm)
     190             : {
     191     9157304 :         assert(elm == 0 || elm == 1);
     192     9157304 :         return isIgnored(p->pbytes[elm]);
     193             : }
     194             : 
     195             : inline static strimp_masks_t
     196     1119868 : STRMPget_mask(const Strimps *r, uint64_t idx)
     197             : {
     198     1119868 :         return r->masks[idx];
     199             : }
     200             : 
     201             : inline static void
     202          63 : STRMPset_mask(Strimps *r, uint64_t idx, strimp_masks_t val)
     203             : {
     204          63 :         r->masks[idx] = val;
     205             : }
     206             : 
     207             : /* Looks up a given pair in the strimp header and returns the index as a
     208             :  * 64 bit integer. The return value has a bit on in the position
     209             :  * corresponding to the index of the pair in the strimp header, or is 0
     210             :  * if the pair does not occur.
     211             :  */
     212             : inline static uint64_t
     213     1119868 : STRMPpairLookup(const Strimps *s, const CharPair *p)
     214             : {
     215     1119868 :         return STRMPget_mask(s, p->idx);
     216             : }
     217             : 
     218             : 
     219             : #endif // UTF8STRIMPS
     220             : 
     221             : 
     222             : /* Computes the bitstring of a string s with respect to the strimp r.
     223             :  *
     224             :  */
     225             : static uint64_t
     226       38530 : STRMPmakebitstring(const char *s, Strimps *r)
     227             : {
     228       38530 :         uint64_t ret = 0;
     229             :         /* int8_t pair_idx = 0; */
     230       38530 :         PairIterator pi;
     231       38530 :         CharPair cp;
     232             : 
     233       38530 :         pi.s = s;
     234       38530 :         pi.pos = 0;
     235       38530 :         pi.lim = strlen(s);
     236             : 
     237       38530 :         if (pi.lim < 2) {
     238             :                 return ret;
     239             :         }
     240             : 
     241     1182427 :         while(pair_at(&pi, &cp)) {
     242     1119868 :                 ret |= STRMPpairLookup(r, &cp);
     243     2278264 :                 next_pair(&pi);
     244             :         }
     245             : 
     246             :         return ret;
     247             : }
     248             : 
     249             : #define SWAP(_a, _i, _j, TPE)                           \
     250             :         do {                                            \
     251             :                 TPE _t = ((TPE *)_a)[_i];               \
     252             :                 ((TPE *) _a)[_i] = ((TPE *) _a)[_j];    \
     253             :                 ((TPE *) _a)[_j] = _t;                  \
     254             :         } while(0)
     255             : 
     256             : 
     257             : 
     258             : static void
     259           5 : STRMPchoosePairs(const PairHistogramElem *hist, size_t hist_size, CharPair *cp)
     260             : {
     261           5 :         lng t0 = 0;
     262           5 :         size_t i;
     263           5 :         uint64_t max_counts[STRIMP_HEADER_SIZE] = {0};
     264           5 :         size_t indices[STRIMP_HEADER_SIZE] = {0};
     265           5 :         const size_t cmin_max = STRIMP_HEADER_SIZE - 1;
     266           5 :         size_t hidx;
     267             : 
     268           5 :         TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
     269             : 
     270      327685 :         for(i = 0; i < hist_size; i++) {
     271      327680 :                 if (max_counts[cmin_max] < hist[i].cnt) {
     272         660 :                         max_counts[cmin_max] = hist[i].cnt;
     273         660 :                         indices[cmin_max] = i;
     274       26237 :                         for(hidx = cmin_max; hidx > 0 && max_counts[hidx] > max_counts[hidx-1]; hidx--) {
     275       25577 :                                 SWAP(max_counts, hidx, hidx-1, uint64_t);
     276       25577 :                                 SWAP(indices, hidx, hidx-1, size_t);
     277             :                         }
     278             :                 }
     279             :         }
     280             : 
     281         320 :         for(i = 0; i < STRIMP_PAIRS; i++) {
     282         315 :                 cp[i].pbytes[1] = (uint8_t)(indices[i] & 0xFF);
     283         315 :                 cp[i].pbytes[0] = (uint8_t)((indices[i] >> 8) & 0xFF);
     284         315 :                 cp[i].idx = indices[i];
     285         315 :                 cp[i].psize = 2;
     286         315 :                 cp[i].mask = ((uint64_t)0x1) << (STRIMP_PAIRS - i - 1);
     287             :         }
     288           5 :         cp[STRIMP_PAIRS] = (CharPair) {.psize = 2};
     289             : 
     290           5 :         TRC_DEBUG(ACCELERATOR, LLFMT " usec\n", GDKusec() - t0);
     291           5 : }
     292             : 
     293             : /* Given a BAT b and a candidate list s constructs the header elements
     294             :  * of the strimp.
     295             :  *
     296             :  * Initially creates the histogram for the all the pairs in the candidate
     297             :  * and then chooses the STRIMP_HEADER_SIZE most frequent of them.
     298             :  */
     299             : static bool
     300           5 : STRMPbuildHeader(BAT *b, BAT *s, CharPair *hpairs)
     301             : {
     302           5 :         lng t0 = 0;
     303           5 :         BATiter bi;
     304           5 :         BUN i;
     305           5 :         size_t hidx;
     306           5 :         oid x;
     307           5 :         size_t hlen;
     308           5 :         PairHistogramElem *hist;
     309           5 :         PairIterator pi;
     310           5 :         CharPair cp;
     311           5 :         struct canditer ci;
     312           5 :         size_t values = 0;
     313           5 :         bool res;
     314             : 
     315           5 :         TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
     316             : 
     317           5 :         canditer_init(&ci, b, s);
     318           5 :         if (ci.ncand == 0) {
     319           0 :                 GDKerror("Not enough distinct values to create strimp index\n");
     320           0 :                 return false;
     321             :         }
     322             : 
     323           5 :         hlen = STRIMP_HISTSIZE;
     324           5 :         if ((hist = (PairHistogramElem *)GDKzalloc(hlen*sizeof(PairHistogramElem))) == NULL) {
     325             :                 return false;
     326             :         }
     327             : 
     328             :         // Create Histogram
     329           5 :         bi = bat_iterator(b);
     330      120310 :         for (i = 0; i < ci.ncand; i++) {
     331      120300 :                 x = canditer_next(&ci) - b->hseqbase;
     332      120300 :                 const char *cs = BUNtvar(bi, x);
     333      240600 :                 if (!strNil(cs)) {
     334      120296 :                         pi.s = cs;
     335      120296 :                         pi.pos = 0;
     336      120296 :                         pi.lim = strlen(pi.s);
     337      120296 :                         if (pi.lim < 2) {
     338           2 :                                 continue;
     339             :                         }
     340     5078676 :                         while (pair_at(&pi, &cp)) {
     341     4958382 :                                 if(ignored(&cp, 1)) {
     342             :                                         /* Skip this AND the next pair
     343             :                                          * if the second char of the
     344             :                                          * pair is ignored.
     345             :                                          */
     346      759460 :                                         next_pair(&pi);
     347     4198922 :                                 } else if (ignored(&cp, 0)) {
     348             :                                         /* Skip this pair if the first
     349             :                                          * char is ignored. This should
     350             :                                          * only happen at the beginning
     351             :                                          * of a string, since the pair
     352             :                                          * will have been ignored in the
     353             :                                          * previous case.
     354             :                                          */
     355             :                                         ;
     356             : 
     357             :                                 } else {
     358             :                                         /* hidx = histogram_index(hist, hlen, &cp); */
     359     4072175 :                                         hidx = cp.idx;
     360             : #ifndef UTF8STRINGS
     361     4072175 :                                         assert(hidx < hlen);
     362             : #else
     363             :                                         if (hidx >= hlen) {
     364             :                                                 // TODO: Note and realloc. Should not happen for bytepairs.
     365             :                                                 continue;
     366             :                                         }
     367             : #endif
     368     4072175 :                                         if (!hist[hidx].cnt)
     369        1194 :                                                 values++;
     370     4072175 :                                         hist[hidx].cnt++;
     371             :                                 }
     372    10037058 :                                 next_pair(&pi);
     373             :                         }
     374             :                 }
     375             :         }
     376           5 :         bat_iterator_end(&bi);
     377             : 
     378             :         // Check that we have enough values in the histogram.
     379           5 :         if(values >= STRIMP_HEADER_SIZE) {
     380             :                 // Choose the header pairs
     381           5 :                 STRMPchoosePairs(hist, hlen, hpairs);
     382             :         }
     383             : 
     384           5 :         GDKfree(hist);
     385             : 
     386           5 :         TRC_DEBUG(ACCELERATOR, LLFMT " usec\n", GDKusec() - t0);
     387           5 :         if (!(res = values >= STRIMP_HEADER_SIZE))
     388           0 :                 GDKerror("Not enough distinct values to create strimp index\n");
     389             :         return res;
     390             : }
     391             : 
     392             : /* Read a strimp structure from disk.
     393             :  *
     394             :  * If the pointer b->tstrimps has the value 1, it means that the strimp
     395             :  * is on disk. This routine attempts to read it so that it can be used.
     396             :  *
     397             :  * There are a number of checks made for example we check that the
     398             :  * strimps version on disk matches the one the code recognizes, and that
     399             :  * the number of pairs encoded on disk matches the one we expect. If any
     400             :  * of these checks fail, we remove the file from disk since it is now
     401             :  * unusable, and set the pointer b->tstrimps to 2 so that the strimp
     402             :  * will be recreated.
     403             :  *
     404             :  * This function returns true if at the end we have a valid pointer.
     405             :  */
     406             : static bool
     407        1662 : BATcheckstrimps(BAT *b)
     408             : {
     409        1662 :         bool ret;
     410        1662 :         lng t = GDKusec();
     411             : 
     412        1662 :         if (b == NULL)
     413             :                 return false;
     414             : 
     415        1662 :         assert(b->batCacheid > 0);
     416             : 
     417        1662 :         if (b->tstrimps == (Strimps *)1) {
     418           1 :                 Strimps *hp;
     419           1 :                 const char *nme = BBP_physical(b->batCacheid);
     420           1 :                 int fd;
     421             : 
     422           1 :                 MT_thread_setalgorithm("read strimp index from disk");
     423             : 
     424           1 :                 b->tstrimps = NULL;
     425           1 :                 if ((hp = GDKzalloc(sizeof(Strimps))) != NULL &&
     426           1 :                     (hp->strimps.farmid = BBPselectfarm(b->batRole, b->ttype, strimpheap)) >= 0) {
     427           1 :                         strconcat_len(hp->strimps.filename,
     428             :                                       sizeof(hp->strimps.filename),
     429             :                                       nme, ".tstrimps", NULL);
     430           1 :                         hp->strimps.parentid = b->batCacheid;
     431             : 
     432             :                         /* check whether a persisted strimp can be found */
     433           1 :                         if ((fd = GDKfdlocate(hp->strimps.farmid, nme, "rb+", "tstrimps")) >= 0) {
     434           1 :                                 struct stat st;
     435           1 :                                 uint64_t desc;
     436           1 :                                 size_t npairs;
     437           1 :                                 size_t hsize;
     438             :                                 /* Read the 8 byte long strimp
     439             :                                  * descriptor.
     440             :                                  *
     441             :                                  * HSIZE must be between 200 and
     442             :                                  * 584 (inclusive): 8 bytes the
     443             :                                  * descriptor, 64 bytes the pair
     444             :                                  * sizes and n*64 bytes the
     445             :                                  * actual pairs where 2 <= n <=
     446             :                                  * 8.
     447             :                                  */
     448           1 :                                 if (read(fd, &desc, 8) == 8
     449           1 :                                     && (desc & 0xff) == STRIMP_VERSION
     450           1 :                                     && ((npairs = NPAIRS(desc)) == STRIMP_PAIRS)
     451           1 :                                     && (hsize = HSIZE(desc)) >= 200 && hsize <= 584
     452           1 :                                     && ((desc >> 32) & 0xff) == 1 /* check the persistence byte */
     453           1 :                                     && fstat(fd, &st) == 0
     454             :                                     /* TODO: We might need padding in the UTF-8 case. */
     455           1 :                                     && st.st_size >= (off_t) (hp->strimps.free = hp->strimps.size =
     456             :                                                               /* header size (desc + offsets + pairs) */
     457           1 :                                                               hsize +
     458             :                                                               /* bitmasks */
     459           1 :                                                               BATcount(b)*sizeof(uint64_t))
     460           1 :                                     && HEAPload(&hp->strimps, nme, "tstrimps", false) == GDK_SUCCEED) {
     461           1 :                                         hp->sizes_base = (uint8_t *)hp->strimps.base + 8; /* sizes just after the descriptor */
     462           1 :                                         hp->pairs_base = hp->sizes_base + STRIMP_HEADER_SIZE;   /* pairs just after the offsets. */
     463           1 :                                         hp->bitstrings_base = hp->strimps.base + hsize;   /* bitmasks just after the pairs */
     464           1 :                                         hp->masks = (strimp_masks_t *)GDKzalloc(STRIMP_HISTSIZE*sizeof(strimp_masks_t));
     465           1 :                                         if (hp->masks != NULL) {
     466             :                                                 /* init */
     467             :                                                 size_t offset = 0;
     468          64 :                                                 for (size_t idx = 0; idx < STRIMP_PAIRS; idx++) {
     469          63 :                                                         strimp_masks_t mask = ((strimp_masks_t)0x1) << (STRIMP_PAIRS - idx - 1);
     470          63 :                                                         uint8_t pair_size = hp->sizes_base[idx];
     471          63 :                                                         uint8_t *pair = hp->pairs_base + offset;
     472             : 
     473          63 :                                                         size_t i = bytes2histindex(pair, pair_size);
     474          63 :                                                         STRMPset_mask(hp, i, mask);
     475             : 
     476          63 :                                                         offset += pair_size;
     477             : 
     478             :                                                 }
     479             : 
     480           1 :                                                 close(fd);
     481           1 :                                                 ATOMIC_INIT(&hp->strimps.refs, 1);
     482           1 :                                                 b->tstrimps = hp;
     483           1 :                                                 hp->strimps.hasfile = true;
     484           1 :                                                 TRC_DEBUG(ACCELERATOR, "BATcheckstrimps(" ALGOBATFMT "): reusing persisted strimp\n", ALGOBATPAR(b));
     485           1 :                                                 return true;
     486             :                                         }
     487             :                                         /* We failed to allocate the
     488             :                                          * masks field. In principle we
     489             :                                          * can try to re-create the
     490             :                                          * strimp from scratch.
     491             :                                          */
     492             :                                 }
     493           0 :                                 close(fd);
     494             :                                 /* unlink unusable file */
     495           0 :                                 GDKunlink(hp->strimps.farmid, BATDIR, nme, "tstrimps");
     496           0 :                                 hp->strimps.hasfile = false;
     497             : 
     498             :                         }
     499             :                 }
     500             :                 /* For some reason the index exists but was not read
     501             :                  * correctly from disk. Set the pointer to the value 2
     502             :                  * to signify that it needs to be recreated.
     503             :                  */
     504           0 :                 b->tstrimps = (Strimps *)2;
     505           0 :                 GDKfree(hp);
     506           0 :                 GDKclrerr();    /* we're not currently interested in errors */
     507             :         }
     508             : 
     509        1661 :         ret = b->tstrimps != NULL && b->tstrimps != (Strimps *)2;
     510        1661 :         if (ret) {
     511        1656 :                 TRC_DEBUG(ACCELERATOR,
     512             :                           "BATcheckstrimps(" ALGOBATFMT "): already has strimps, waited " LLFMT " usec\n",
     513             :                           ALGOBATPAR(b), GDKusec() - t);
     514             :         }
     515             : 
     516             :         return ret;
     517             : }
     518             : 
     519             : #define STRMPfilterloop(next)                                           \
     520             :         do {                                                            \
     521             :                 for (i = 0; i < ci.ncand; i++) {                     \
     522             :                         x = next(&ci);                                      \
     523             :                         if ((bitstring_array[x] & qbmask) == qbmask || \
     524             :                             (keep_nils && (bitstring_array[x] & ((uint64_t)0x1 << (STRIMP_HEADER_SIZE - 1))))) { \
     525             :                                 rvals[j++] = x;                         \
     526             :                         }                                               \
     527             :                 }                                                       \
     528             :         } while (0)
     529             : 
     530             : /* Filter a slice of a BAT b as defined by a candidate list s using a
     531             :  * string q. Return the result as a candidate list.
     532             :  *
     533             :  * This function also takes a boolean that controls its behavior with
     534             :  * respect to nil values. It should be true only for NOT LIKE queries
     535             :  * and in that case the nil values get included in the result. Later we
     536             :  * will take the complement and the nil values will be dropped from the
     537             :  * final result.
     538             :  */
     539             : BAT *
     540        1610 : STRMPfilter(BAT *b, BAT *s, const char *q, const bool keep_nils)
     541             : {
     542        1610 :         BAT *r = NULL;
     543        1610 :         BUN i, j = 0;
     544        1610 :         uint64_t qbmask;
     545        1610 :         uint64_t *bitstring_array;
     546        1610 :         Strimps *strmps;
     547        1610 :         oid x, *restrict rvals;
     548        1610 :         struct canditer ci;
     549        1610 :         lng t0 = 0;
     550        1610 :         BAT *pb;
     551             : 
     552        1610 :         TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
     553             : 
     554        1610 :         if (VIEWtparent(b)) {
     555        1610 :                 pb = BATdescriptor(VIEWtparent(b));
     556        1653 :                 if (pb == NULL)
     557             :                         return NULL;
     558             :         }
     559             :         else {
     560             :                 pb = b;
     561             :         }
     562             : 
     563        1653 :         MT_lock_set(&pb->batIdxLock);
     564        1656 :         if (!BATcheckstrimps(pb)) {
     565           0 :                 MT_lock_unset(&pb->batIdxLock);
     566           0 :                 goto sfilter_fail;
     567             :         }
     568        1656 :         strmps = pb->tstrimps;
     569        1656 :         STRMPincref(strmps);
     570        1656 :         MT_lock_unset(&pb->batIdxLock);
     571             : 
     572        1656 :         canditer_init(&ci, b, s);
     573        1656 :         if (ci.ncand == 0) {
     574           0 :                 STRMPdecref(strmps, false);
     575           0 :                 r = BATdense(b->hseqbase, 0, 0);
     576           0 :                 if (pb != b)
     577           0 :                         BBPunfix(pb->batCacheid);
     578           0 :                 return r;
     579             :         }
     580        1656 :         r = COLnew(b->hseqbase, TYPE_oid, ci.ncand, TRANSIENT);
     581        1651 :         if (r == NULL) {
     582           0 :                 STRMPdecref(strmps, false);
     583           0 :                 goto sfilter_fail;
     584             :         }
     585             : 
     586        1651 :         qbmask = STRMPmakebitstring(q, strmps);
     587        1653 :         assert((qbmask & ((uint64_t)0x1 << (STRIMP_HEADER_SIZE - 1))) == 0);
     588        1653 :         TRC_DEBUG(ACCELERATOR, "strimp filtering with pattern '%s' bitmap: %#016" PRIx64 "\n",
     589             :                   q, qbmask);
     590        1653 :         bitstring_array = (uint64_t *)strmps->bitstrings_base;
     591        1653 :         rvals = Tloc(r, 0);
     592             : 
     593        1653 :         if (ci.tpe == cand_dense) {
     594      103789 :                 STRMPfilterloop(canditer_next_dense);
     595             :         } else {
     596           0 :                 STRMPfilterloop(canditer_next);
     597             :         }
     598             : 
     599        1655 :         BATsetcount(r, j);
     600        1653 :         r->tkey = true;
     601        1653 :         r->tsorted = true;
     602        1653 :         r->trevsorted = BATcount(r) <= 1;
     603        1653 :         r->tnil = false;
     604        1653 :         r->tnonil = true;
     605        1653 :         TRC_DEBUG(ACCELERATOR, "strimp prefiltering of " BUNFMT
     606             :                   " items took " LLFMT " usec. Keeping " BUNFMT
     607             :                   " items (%.2f%%).\n",
     608             :                   ci.ncand, GDKusec()-t0, r->batCount,
     609             :                   100*r->batCount/(double)ci.ncand);
     610        1653 :         TRC_DEBUG(ACCELERATOR, "r->" ALGOBATFMT "\n", ALGOBATPAR(r) );
     611        1653 :         STRMPdecref(strmps, false);
     612        1651 :         if (pb != b)
     613        1651 :                 BBPunfix(pb->batCacheid);
     614        1648 :         return virtualize(r);
     615             : 
     616           0 :  sfilter_fail:
     617           0 :         if (pb != b)
     618           0 :                 BBPunfix(pb->batCacheid);
     619             :         return NULL;
     620             : }
     621             : 
     622             : /* Write the strimp to disk */
     623             : static void
     624           5 : BATstrimpsync(BAT *b)
     625             : {
     626           5 :         lng t0 = 0;
     627           5 :         Heap *hp;
     628           5 :         int fd;
     629           5 :         const char *failed = " failed";
     630             : 
     631           5 :         TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
     632             : 
     633           5 :         if ((hp = &b->tstrimps->strimps)) {
     634           5 :                 if (HEAPsave(hp, hp->filename, NULL, true, hp->free, NULL) == GDK_SUCCEED) {
     635           5 :                         if (hp->storage == STORE_MEM) {
     636           3 :                                 if ((fd = GDKfdlocate(hp->farmid, hp->filename, "rb+", NULL)) >= 0) {
     637           3 :                                         ((uint64_t *)hp->base)[0] |= (uint64_t) 1 << 32;
     638           3 :                                         if (write(fd, hp->base, sizeof(uint64_t)) >= 0) {
     639           3 :                                                 failed = "";
     640           3 :                                                 if (!(ATOMIC_GET(&GDKdebug) & NOSYNCMASK)) {
     641             : #if defined(NATIVE_WIN32)
     642             :                                                         _commit(fd);
     643             : #elif defined(HAVE_FDATASYNC)
     644           0 :                                                         fdatasync(fd);
     645             : #elif defined(HAVE_FSYNC)
     646             :                                                         fsync(fd);
     647             : #endif
     648             :                                                 }
     649           3 :                                                 hp->dirty = false;
     650             :                                         } else {
     651           0 :                                                 perror("write strimps");
     652             :                                         }
     653           3 :                                         close(fd);
     654             :                                 }
     655             :                         } else {
     656           2 :                                 ((uint64_t *)hp->base)[0] |= (uint64_t) 1 << 32;
     657           2 :                                 if (!(ATOMIC_GET(&GDKdebug) & NOSYNCMASK) &&
     658           0 :                                     MT_msync(hp->base, sizeof(uint64_t)) < 0) {
     659           0 :                                         ((uint64_t *)hp->base)[0] &= ~((uint64_t) 1 << 32);
     660             :                                 } else {
     661           2 :                                         hp->dirty = false;
     662           2 :                                         failed = "";
     663             :                                 }
     664             :                         }
     665           5 :                         TRC_DEBUG(ACCELERATOR, "BATstrimpsync(%s): strimp persisted"
     666             :                                   " (" LLFMT " usec)%s\n",
     667             :                                   BATgetId(b), GDKusec() - t0, failed);
     668             :                 }
     669             :         }
     670           5 :         BBPunfix(b->batCacheid);
     671           5 : }
     672             : 
     673             : /* Perform some checks to see if it makes sense to persist the strimp
     674             :  * and if so call the routine that writes the strimp to disk.
     675             :  */
     676             : static void
     677           5 : persistStrimp(BAT *b)
     678             : {
     679           5 :         if((BBP_status(b->batCacheid) & BBPEXISTING)
     680           5 :            && b->batInserted == b->batCount
     681           5 :            && !b->theap->dirty
     682          10 :            && !GDKinmemory(b->theap->farmid)) {
     683           5 :                 BBPfix(b->batCacheid);
     684           5 :                 char name[MT_NAME_LEN];
     685           5 :                 snprintf(name, sizeof(name), "strimpsync%d", b->batCacheid);
     686           5 :                 BATstrimpsync(b);
     687             :         } else
     688           0 :                 TRC_DEBUG(ACCELERATOR, "persistStrimp(" ALGOBATFMT "): NOT persisting strimp\n", ALGOBATPAR(b));
     689           5 : }
     690             : 
     691             : 
     692             : /* This function calls all the necessary routines to create the strimp
     693             :  * header, allocates enough space for the heap and encodes the header.
     694             :  * It returns NULL if anything fails.
     695             :  *
     696             :  */
     697             : static Strimps *
     698           5 : STRMPcreateStrimpHeap(BAT *b, BAT *s)
     699             : {
     700           5 :         uint8_t *h1, *h2;
     701           5 :         Strimps *r = NULL;
     702           5 :         uint64_t descriptor;
     703           5 :         size_t i;
     704           5 :         uint16_t sz;
     705           5 :         CharPair *hpairs = (CharPair*)GDKzalloc(sizeof(CharPair)*STRIMP_HEADER_SIZE);
     706           5 :         const char *nme;
     707             : 
     708           5 :         if (!hpairs)
     709             :                 return NULL;
     710             : 
     711          10 :         if ((r = b->tstrimps) == NULL &&
     712           5 :                 STRMPbuildHeader(b, s, hpairs)) { /* Find the header pairs, put
     713             :                                                  the result in hpairs */
     714             :                 /* The 64th bit in the bit string is used to indicate if
     715             :                    the string is NULL. So the corresponding pair does
     716             :                    not encode useful information. We need to keep it for
     717             :                    alignment but we must make sure that it will not
     718             :                    match an actual pair of characters we encounter in
     719             :                    strings.*/
     720          15 :                 for (i = 0; i < hpairs[STRIMP_HEADER_SIZE - 1].psize; i++)
     721          10 :                         hpairs[STRIMP_HEADER_SIZE - 1].pbytes[i] = 0;
     722             :                 sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the descriptor and
     723             :                                                 the pair sizes */
     724         325 :                 for (i = 0; i < STRIMP_HEADER_SIZE; i++) {
     725         320 :                         sz += hpairs[i].psize;
     726             :                 }
     727             : 
     728           5 :                 nme = GDKinmemory(b->theap->farmid) ? ":memory:"
     729           5 :                         : BBP_physical(b->batCacheid);
     730             :                 /* Allocate the strimps heap */
     731           5 :                 if ((r = GDKzalloc(sizeof(Strimps))) == NULL ||
     732           5 :                     (r->strimps.farmid =
     733           5 :                      BBPselectfarm(b->batRole, b->ttype, strimpheap)) < 0 ||
     734          10 :                     (r->strimps.parentid = b->batCacheid) <= 0 ||
     735           5 :                     strconcat_len(r->strimps.filename, sizeof(r->strimps.filename),
     736             :                                   nme, ".tstrimps",
     737           5 :                                   NULL) >= sizeof(r->strimps.filename) ||
     738           5 :                     HEAPalloc(&r->strimps, BATcount(b) * sizeof(uint64_t) + sz,
     739             :                               sizeof(uint8_t)) != GDK_SUCCEED) {
     740           0 :                         GDKfree(r);
     741           0 :                         GDKfree(hpairs);
     742           0 :                         return NULL;
     743             :                 }
     744             : 
     745           5 :                 if ((r->masks = (strimp_masks_t *)GDKzalloc(STRIMP_HISTSIZE*sizeof(strimp_masks_t))) == NULL) {
     746           0 :                         HEAPfree(&r->strimps, true);
     747           0 :                         GDKfree(r);
     748           0 :                         return NULL;
     749             :                 }
     750             : 
     751         320 :                 for (size_t i = 0; i < STRIMP_PAIRS; i++) {
     752         315 :                         r->masks[hpairs[i].idx] = hpairs[i].mask;
     753             :                 }
     754             : 
     755           5 :                 descriptor = STRIMP_VERSION | ((uint64_t)(STRIMP_PAIRS)) << 8 |
     756           5 :                         ((uint64_t)sz) << 16;
     757             : 
     758           5 :                 ((uint64_t *)r->strimps.base)[0] = descriptor;
     759           5 :                 r->sizes_base = h1 = (uint8_t *)r->strimps.base + 8;
     760           5 :                 r->pairs_base = h2 = (uint8_t *)h1 + STRIMP_HEADER_SIZE;
     761             : 
     762         325 :                 for (i = 0; i < STRIMP_HEADER_SIZE; i++) {
     763         320 :                         uint8_t psize = hpairs[i].psize;
     764         320 :                         h1[i] = psize;
     765         320 :                         memcpy(h2, hpairs[i].pbytes, psize);
     766         320 :                         h2 += psize;
     767             :                 }
     768           5 :                 r->bitstrings_base = h2;
     769           5 :                 r->strimps.free = sz;
     770           5 :                 r->rec_cnt = 0;
     771           5 :                 ATOMIC_INIT(&r->strimps.refs, 1);
     772             :         }
     773           5 :         GDKfree(hpairs);
     774           5 :         return r;
     775             : }
     776             : 
     777             : /* Check if there is a strimp index for the given BAT.
     778             :  */
     779             : bool
     780        5329 : BAThasstrimps(BAT *b)
     781             : {
     782        5329 :         BAT *pb;
     783        5329 :         bool ret;
     784        5329 :         if (VIEWtparent(b)) {
     785        4356 :                 pb = BATdescriptor(VIEWtparent(b));
     786        4391 :                 assert(pb);
     787             :         } else {
     788             :                 pb = b;
     789             :         }
     790             : 
     791        5364 :         MT_lock_set(&pb->batIdxLock);
     792        5381 :         ret = pb->tstrimps != NULL;
     793        5381 :         MT_lock_unset(&pb->batIdxLock);
     794             : 
     795        5382 :         if (pb != b)
     796        4391 :                 BBPunfix(pb->batCacheid);
     797        5381 :         return ret;
     798             : 
     799             : }
     800             : 
     801             : /* Signal strimp creation. The SQL layer uses this function to notify
     802             :  * the kernel that a strimp index should be created for this BAT. The
     803             :  * only way that this might fail is if the BAT is not large enough.
     804             :  */
     805             : gdk_return
     806           6 : BATsetstrimps(BAT *b)
     807             : {
     808           6 :         BAT *pb;
     809           6 :         if (VIEWtparent(b)) {
     810           0 :                 pb = BATdescriptor(VIEWtparent(b));
     811           0 :                 assert(pb);
     812             :         } else {
     813             :                 pb = b;
     814             :         }
     815             : 
     816           6 :         if (pb->batCount < STRIMP_CREATION_THRESHOLD) {
     817           0 :                 GDKerror("Cannot create strimps index on columns with fewer than " BUNFMT " elements\n", STRIMP_CREATION_THRESHOLD);
     818           0 :                 if (pb != b)
     819           0 :                         BBPunfix(pb->batCacheid);
     820           0 :                 return GDK_FAIL;
     821             :         }
     822             : 
     823             : 
     824           6 :         MT_lock_set(&pb->batIdxLock);
     825           6 :         if (pb->tstrimps == NULL) {
     826           6 :                 pb->tstrimps = (Strimps *)2;
     827             :         }
     828           6 :         MT_lock_unset(&pb->batIdxLock);
     829             : 
     830           6 :         if (pb != b)
     831           0 :                 BBPunfix(pb->batCacheid);
     832             :         return GDK_SUCCEED;
     833             : }
     834             : 
     835             : /* This macro takes a bat and checks if the strimp construction has been
     836             :  * completed. It is completed when it is an actual pointer and the
     837             :  * number of bitstrings computed is the same as the number of elements
     838             :  * in the BAT.
     839             :  */
     840             : #define STRIMP_COMPLETE(b)                                              \
     841             :         ((b)->tstrimps != NULL &&                                    \
     842             :          (b)->tstrimps != (Strimps *)1 &&                            \
     843             :          (b)->tstrimps != (Strimps *)2 &&                            \
     844             :          (((b)->tstrimps->strimps.free - ((char *)(b)->tstrimps->bitstrings_base - (b)->tstrimps->strimps.base)) == (b)->batCount*sizeof(uint64_t)))
     845             : 
     846             : 
     847             : /* Strimp creation.
     848             :  *
     849             :  * First we attempt to take the index lock of the BAT. The first thread
     850             :  * that succeeds, checks if the strimp already exists on disk and
     851             :  * attempts to read it. If this succeeds then strimp creation is
     852             :  * complete. If it does not either because the strimp does not exist or
     853             :  * because it is outdated (if for example there is a version mismatch),
     854             :  * the same thread that still holds the lock attempts to create the
     855             :  * strimp header and heap. If this fails then we cannot have a strimp on
     856             :  * this BAT and we report a failure after releasing the lock.
     857             :  *
     858             :  * If the strimp header is successfully created, then we release the
     859             :  * lock and allow the rest of the threads to compute the bitstrings of
     860             :  * the slice they have been assigned.
     861             :  *
     862             :  */
     863             : gdk_return
     864          72 : STRMPcreate(BAT *b, BAT *s)
     865             : {
     866          72 :         lng t0 = 0;
     867          72 :         BAT *pb;
     868          72 :         Strimps *r = NULL;
     869          72 :         BATiter bi;
     870          72 :         BUN i;
     871          72 :         oid x;
     872          72 :         struct canditer ci;
     873          72 :         uint64_t *dh;
     874             : 
     875          72 :         TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
     876          72 :         TRC_DEBUG(ACCELERATOR, "creating strimp");
     877          72 :         if (ATOMstorage(b->ttype) != TYPE_str) {
     878           0 :                 GDKerror("Cannot create strimps index for non string bats\n");
     879           0 :                 return GDK_FAIL;
     880             :         }
     881             : 
     882          72 :         if (VIEWtparent(b)) {
     883          72 :                 pb = BATdescriptor(VIEWtparent(b));
     884          72 :                 assert(pb);
     885             :         } else {
     886             :                 pb = b;
     887             :         }
     888             : 
     889             :         /* First thread to take the lock will read the strimp from disk
     890             :          * or construct the strimp header.
     891             :          */
     892          72 :         MT_lock_set(&pb->batIdxLock);
     893             : 
     894             :         /* Strimp creation was requested. There are three cases:
     895             :          *  - The strimp is on disk (pb->tstrimps == 1)
     896             :          *  - The strimp needs to be created (pb->tstrimps == 2)
     897             :          *  - Finally the pointer might have been changed to NULL in another thread.
     898             :          */
     899          72 :         if (pb->tstrimps == NULL || pb->tstrimps == (Strimps*)1 || pb->tstrimps == (Strimps*)2) {
     900             :                 /* The strimp needs to be created. The rest of the
     901             :                  * creation code assumes that the pointer to the strimps
     902             :                  * is NULL. Make it so.
     903             :                  */
     904           6 :                 if (pb->tstrimps == (Strimps *)2)
     905           5 :                         pb->tstrimps = NULL;
     906           6 :                 if (pb->tstrimps == NULL || pb->tstrimps == (Strimps*)1) {
     907           6 :                         if (BATcheckstrimps(pb)) {
     908           1 :                                 MT_lock_unset(&pb->batIdxLock);
     909           1 :                                 if (pb != b)
     910           1 :                                         BBPunfix(pb->batCacheid);
     911           1 :                                 return GDK_SUCCEED;
     912             :                         }
     913             : 
     914             :                         /* BATcheckstrimps, might set the pointer to 2.
     915             :                          * Set it to NULL so that strimp creation will
     916             :                          * proceed as if the strimp has never existed.
     917             :                          */
     918           5 :                         if (pb->tstrimps == (Strimps *)2)
     919           0 :                                 pb->tstrimps = NULL;
     920             : 
     921           5 :                         assert(pb->tstrimps == NULL);
     922             : 
     923           5 :                         if ((r = STRMPcreateStrimpHeap(pb, s)) == NULL) {
     924             :                                 /* Strimp creation failed, but it still
     925             :                                  * exists in the SQL layer. Set the
     926             :                                  * pointer to 2 so that construction
     927             :                                  * will be attempted again next time.
     928             :                                  */
     929           0 :                                 pb->tstrimps = (Strimps *)2;
     930           0 :                                 MT_lock_unset(&pb->batIdxLock);
     931           0 :                                 if (pb != b)
     932           0 :                                         BBPunfix(pb->batCacheid);
     933           0 :                                 return GDK_FAIL;
     934             :                         }
     935           5 :                         pb->tstrimps = r;
     936             :                 }
     937             :         }
     938             : 
     939          71 :         if (STRIMP_COMPLETE(pb)) {
     940          31 :                 MT_lock_unset(&pb->batIdxLock);
     941          31 :                 if (pb != b)
     942          31 :                         BBPunfix(pb->batCacheid);
     943          31 :                 return GDK_SUCCEED;
     944             :         }
     945             : 
     946             :         /* At this point pb->tstrimps should be a valid strimp heap. */
     947          40 :         assert(pb->tstrimps);
     948          40 :         MT_thread_setalgorithm("create strimp index");
     949          40 :         r = pb->tstrimps;
     950          40 :         STRMPincref(r);
     951          40 :         if (pb != b) {
     952          40 :                 MT_lock_unset(&pb->batIdxLock);
     953          40 :                 MT_lock_set(&b->batIdxLock);
     954             :         }
     955          40 :         dh = (uint64_t *)r->bitstrings_base + b->hseqbase;
     956          40 :         canditer_init(&ci, b, s);
     957             : 
     958          40 :         bi = bat_iterator(b);
     959       46712 :         for (i = 0; i < ci.ncand; i++) {
     960       46632 :                 x = canditer_next(&ci) - b->hseqbase;
     961       43889 :                 const char *cs = BUNtvar(bi, x);
     962       87778 :                 if (!strNil(cs))
     963       44445 :                         *dh++ = STRMPmakebitstring(cs, r);
     964             :                 else
     965           0 :                         *dh++ = (uint64_t)0x1 << (STRIMP_HEADER_SIZE - 1); /* Encode NULL strings in the most significant bit */
     966             :         }
     967          40 :         bat_iterator_end(&bi);
     968             : 
     969          40 :         if (pb != b) {
     970          40 :                 MT_lock_unset(&b->batIdxLock);
     971          40 :                 MT_lock_set(&pb->batIdxLock);
     972             :         }
     973          40 :         r->strimps.free += b->batCount*sizeof(uint64_t);
     974             :         /* The thread that reaches this point last needs to write the strimp to disk. */
     975          40 :         if ((r->strimps.free - ((char *)r->bitstrings_base - r->strimps.base)) == b->batCount*sizeof(uint64_t)) {
     976           5 :                 persistStrimp(pb);
     977             :         }
     978          40 :         MT_lock_unset(&pb->batIdxLock);
     979          40 :         STRMPdecref(r, false);
     980             : 
     981          40 :         TRC_DEBUG(ACCELERATOR, "strimp creation took " LLFMT " usec\n", GDKusec()-t0);
     982          40 :         if (pb != b)
     983          40 :                 BBPunfix(pb->batCacheid);
     984             :         return GDK_SUCCEED;
     985             : }
     986             : 
     987             : 
     988             : void
     989        1680 : STRMPdecref(Strimps *strimps, bool remove)
     990             : {
     991        1680 :         if (remove)
     992           2 :                 ATOMIC_OR(&strimps->strimps.refs, HEAPREMOVE);
     993        1680 :         ATOMIC_BASE_TYPE refs = ATOMIC_DEC(&strimps->strimps.refs);
     994        1680 :         TRC_DEBUG(ACCELERATOR, "Decrement ref count of %s to " BUNFMT "\n",
     995             :                   strimps->strimps.filename, (BUN) (refs & HEAPREFS));
     996        1680 :         if ((refs & HEAPREFS) == 0) {
     997           6 :                 HEAPfree(&strimps->strimps, (bool) (refs & HEAPREMOVE));
     998           6 :                 GDKfree(strimps->masks);
     999           6 :                 GDKfree(strimps);
    1000             :         }
    1001             : 
    1002        1680 : }
    1003             : 
    1004             : void
    1005        1696 : STRMPincref(Strimps *strimps)
    1006             : {
    1007        1696 :         ATOMIC_BASE_TYPE refs = ATOMIC_INC(&strimps->strimps.refs);
    1008        1696 :         TRC_DEBUG(ACCELERATOR, "Increment ref count of %s to " BUNFMT "\n",
    1009             :                   strimps->strimps.filename, (BUN) (refs & HEAPREFS));
    1010        1696 : }
    1011             : 
    1012             : void
    1013    75033003 : STRMPdestroy(BAT *b)
    1014             : {
    1015    75033003 :         if (b) {
    1016    75033003 :                 MT_lock_set(&b->batIdxLock);
    1017    75068862 :                 if (b->tstrimps == (Strimps *)1) {
    1018           0 :                         b->tstrimps = NULL;
    1019           0 :                         GDKunlink(BBPselectfarm(b->batRole, b->ttype, strimpheap),
    1020             :                                   BATDIR,
    1021           0 :                                   BBP_physical(b->batCacheid),
    1022             :                                   "tstrimps");
    1023    75068862 :                 } else if (b->tstrimps == (Strimps *)2) {
    1024           0 :                         b->tstrimps = NULL;
    1025    75068862 :                 } else if (b->tstrimps != NULL) {
    1026           2 :                         STRMPdecref(b->tstrimps, b->tstrimps->strimps.parentid == b->batCacheid);
    1027           2 :                         b->tstrimps = NULL;
    1028             :                 }
    1029    75068862 :                 MT_lock_unset(&b->batIdxLock);
    1030             :         }
    1031    75139633 : }
    1032             : 
    1033             : void
    1034      443588 : STRMPfree(BAT *b)
    1035             : {
    1036      443588 :         if (b) {
    1037      443588 :                 Strimps *s;
    1038      443588 :                 MT_lock_set(&b->batIdxLock);
    1039      443588 :                 if ((s = b->tstrimps) != NULL && s != (Strimps *)1 && s != (Strimps *)2) {
    1040           4 :                         if (GDKinmemory(s->strimps.farmid)) {
    1041           0 :                                 b->tstrimps = NULL;
    1042           0 :                                 STRMPdecref(s, s->strimps.parentid == b->batCacheid);
    1043             :                         }
    1044             :                         else {
    1045           4 :                                 if (s->strimps.parentid == b->batCacheid)
    1046           4 :                                         b->tstrimps = (Strimps *)1;
    1047             :                                 else
    1048           0 :                                         b->tstrimps = NULL;
    1049           4 :                                 STRMPdecref(s, false);
    1050             :                         }
    1051             : 
    1052             :                 }
    1053      443588 :                 MT_lock_unset(&b->batIdxLock);
    1054             :         }
    1055      443588 : }
    1056             : 
    1057             : #if 0
    1058             : /* Update the strimp by computing a bitstring and adding it to the heap.
    1059             :    This will probably be useful later when strimps take updates into
    1060             :    account. */
    1061             : gdk_return
    1062             : STRMPappendBitstring(BAT *b, const char *s)
    1063             : {
    1064             :         lng t0 = 0;
    1065             :         BAT *pb;
    1066             :         uint64_t *dh;
    1067             :         Strimps *strmp;
    1068             :         const float extend_factor = 1.5;
    1069             : 
    1070             :         TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
    1071             :         if (ATOMstorage(b->ttype) != TYPE_str) {
    1072             :                 GDKerror("Cannot manipulate strimps index for non string bats\n");
    1073             :                 return GDK_FAIL;
    1074             :         }
    1075             : 
    1076             :         if (VIEWtparent(b)) {
    1077             :                 pb = BATdescriptor(VIEWtparent(b));
    1078             :                 assert(pb);
    1079             :         } else {
    1080             :                 pb = b;
    1081             :         }
    1082             : 
    1083             :         if (!BATcheckstrimps(pb)) {
    1084             :                 GDKerror("Strimp missing, cannot append value\n");
    1085             :                 if (pb != b)
    1086             :                         BBPunfix(pb->batCacheid);
    1087             :                 return GDK_FAIL;
    1088             :         }
    1089             :         MT_lock_set(&pb->batIdxLock);
    1090             :         strmp = pb->tstrimps;
    1091             :         /* Extend heap if there is not enough space */
    1092             :         if (strmp->strimps.free >= strmp->strimps.size + sizeof(uint64_t)) {
    1093             :                 size_t sizes_offset = (char *)strmp->sizes_base - strmp->strimps.base;
    1094             :                 size_t pairs_offset = (char *)strmp->pairs_base - strmp->strimps.base;
    1095             :                 size_t bitstrings_offset = (char *)strmp->bitstrings_base - strmp->strimps.base;
    1096             :                 if (HEAPextend(&(strmp->strimps), (size_t)(extend_factor*BATcount(pb)*sizeof(uint64_t)), false) != GDK_SUCCEED) {
    1097             :                         MT_lock_unset(&pb->batIdxLock);
    1098             :                         GDKerror("Cannot extend heap\n");
    1099             :                         if (pb != b)
    1100             :                                 BBPunfix(pb->batCacheid);
    1101             :                         return GDK_FAIL;
    1102             :                 }
    1103             :                 strmp->sizes_base = (uint8_t *)strmp->strimps.base + sizes_offset;
    1104             :                 strmp->pairs_base = (uint8_t *)strmp->strimps.base + pairs_offset;
    1105             :                 strmp->bitstrings_base = strmp->strimps.base + bitstrings_offset;
    1106             :         }
    1107             :         dh = (uint64_t *)strmp->strimps.base + pb->tstrimps->strimps.free;
    1108             :         *dh = STRMPmakebitstring(s, strmp);
    1109             :         strmp->strimps.free += sizeof(uint64_t);
    1110             : 
    1111             :         strmp->rec_cnt++;
    1112             :         MT_lock_unset(&pb->batIdxLock);
    1113             : 
    1114             :         TRC_DEBUG(ACCELERATOR, "appending to strimp took " LLFMT " usec\n", GDKusec()-t0);
    1115             :         if (pb != b)
    1116             :                 BBPunfix(pb->batCacheid);
    1117             :         return GDK_SUCCEED;
    1118             : }
    1119             : #endif

Generated by: LCOV version 1.14