Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #ifndef _GDK_ATOMS_H_
14 : #define _GDK_ATOMS_H_
15 :
16 : /* atomFromStr returns the number of bytes of the input string that
17 : * were processed. atomToStr returns the length of the string
18 : * produced. Both functions return -1 on (any kind of) failure. If
19 : * *dst is not NULL, *len specifies the available space. If there is
20 : * not enough space, or if *dst is NULL, *dst will be freed (if not
21 : * NULL) and a new buffer will be allocated and returned in *dst.
22 : * *len will be set to reflect the actual size allocated. If
23 : * allocation fails, *dst will be NULL on return and *len is
24 : * undefined. In any case, if the function returns, *buf is either
25 : * NULL or a valid pointer and then *len is the size of the area *buf
26 : * points to.
27 : *
28 : * atomCmp returns a value less than zero/equal to zero/greater than
29 : * zer if the first argument points to a values which is deemed
30 : * smaller/equal to/larger than the value pointed to by the second
31 : * argument.
32 : *
33 : * atomHash calculates a hash function for the value pointed to by the
34 : * argument.
35 : */
36 :
37 : #define IDLENGTH 64 /* maximum BAT id length */
38 :
39 : typedef struct {
40 : /* simple attributes */
41 : char name[IDLENGTH];
42 : uint8_t storage; /* stored as another type? */
43 : bool linear; /* atom can be ordered linearly */
44 : uint16_t size; /* fixed size of atom */
45 :
46 : /* automatically generated fields */
47 : const void *atomNull; /* global nil value */
48 :
49 : /* generic (fixed + varsized atom) ADT functions */
50 : ssize_t (*atomFromStr) (const char *src, size_t *len, void **dst, bool external);
51 : ssize_t (*atomToStr) (char **dst, size_t *len, const void *src, bool external);
52 : void *(*atomRead) (void *dst, size_t *dstlen, stream *s, size_t cnt);
53 : gdk_return (*atomWrite) (const void *src, stream *s, size_t cnt);
54 : int (*atomCmp) (const void *v1, const void *v2);
55 : BUN (*atomHash) (const void *v);
56 : /* optional functions */
57 : gdk_return (*atomFix) (const void *atom);
58 : gdk_return (*atomUnfix) (const void *atom);
59 :
60 : /* varsized atom-only ADT functions */
61 : var_t (*atomPut) (BAT *, var_t *off, const void *src);
62 : void (*atomDel) (Heap *, var_t *atom);
63 : size_t (*atomLen) (const void *atom);
64 : gdk_return (*atomHeap) (Heap *, size_t);
65 : } atomDesc;
66 :
67 : #define MAXATOMS 128
68 :
69 : gdk_export atomDesc BATatoms[MAXATOMS];
70 : gdk_export int GDKatomcnt;
71 :
72 : gdk_export int ATOMallocate(const char *nme);
73 : gdk_export int ATOMindex(const char *nme);
74 :
75 : gdk_export const char *ATOMname(int id);
76 : gdk_export size_t ATOMlen(int id, const void *v);
77 : gdk_export void *ATOMnil(int id)
78 : __attribute__((__malloc__));
79 : gdk_export int ATOMprint(int id, const void *val, stream *fd);
80 : gdk_export char *ATOMformat(int id, const void *val)
81 : __attribute__((__warn_unused_result__));
82 :
83 : gdk_export void *ATOMdup(int id, const void *val);
84 :
85 : /*
86 : * @- maximum atomic string lengths
87 : */
88 : #define bitStrlen 8
89 : #define bteStrlen 8
90 : #define shtStrlen 12
91 : #define intStrlen 24
92 : #if SIZEOF_OID == SIZEOF_INT
93 : #define oidStrlen 24
94 : #else
95 : #define oidStrlen 48
96 : #endif
97 : #if SIZEOF_PTR == SIZEOF_INT
98 : #define ptrStrlen 24
99 : #else
100 : #define ptrStrlen 48
101 : #endif
102 : #define lngStrlen 48
103 : #ifdef HAVE_HGE
104 : #define hgeStrlen 96
105 : #endif
106 : #define fltStrlen 48
107 : #define dblStrlen 96
108 :
109 : /*
110 : * The system comes with the traditional atomic types: int (4 bytes),
111 : * bool(1 byte) and str (variable). In addition, we support the notion
112 : * of an OID type, which ensures uniqueness of its members. This
113 : * leads to the following type descriptor table.
114 : */
115 :
116 : #ifdef HAVE_HGE
117 : gdk_export ssize_t hgeFromStr(const char *src, size_t *len, hge **dst, bool external);
118 : gdk_export ssize_t hgeToStr(str *dst, size_t *len, const hge *src, bool external);
119 : #endif
120 : gdk_export ssize_t lngFromStr(const char *src, size_t *len, lng **dst, bool external);
121 : gdk_export ssize_t lngToStr(str *dst, size_t *len, const lng *src, bool external);
122 : gdk_export ssize_t intFromStr(const char *src, size_t *len, int **dst, bool external);
123 : gdk_export ssize_t intToStr(str *dst, size_t *len, const int *src, bool external);
124 : gdk_export ssize_t batFromStr(const char *src, size_t *len, bat **dst, bool external);
125 : gdk_export ssize_t batToStr(str *dst, size_t *len, const bat *src, bool external);
126 : gdk_export ssize_t ptrFromStr(const char *src, size_t *len, ptr **dst, bool external);
127 : gdk_export ssize_t ptrToStr(str *dst, size_t *len, const ptr *src, bool external);
128 : gdk_export ssize_t bitFromStr(const char *src, size_t *len, bit **dst, bool external);
129 : gdk_export ssize_t bitToStr(str *dst, size_t *len, const bit *src, bool external);
130 : gdk_export ssize_t OIDfromStr(const char *src, size_t *len, oid **dst, bool external);
131 : gdk_export ssize_t OIDtoStr(str *dst, size_t *len, const oid *src, bool external);
132 : gdk_export ssize_t shtFromStr(const char *src, size_t *len, sht **dst, bool external);
133 : gdk_export ssize_t shtToStr(str *dst, size_t *len, const sht *src, bool external);
134 : gdk_export ssize_t bteFromStr(const char *src, size_t *len, bte **dst, bool external);
135 : gdk_export ssize_t bteToStr(str *dst, size_t *len, const bte *src, bool external);
136 : gdk_export ssize_t fltFromStr(const char *src, size_t *len, flt **dst, bool external);
137 : gdk_export ssize_t fltToStr(str *dst, size_t *len, const flt *src, bool external);
138 : gdk_export ssize_t dblFromStr(const char *src, size_t *len, dbl **dst, bool external);
139 : gdk_export ssize_t dblToStr(str *dst, size_t *len, const dbl *src, bool external);
140 : gdk_export ssize_t GDKstrFromStr(unsigned char *restrict dst, const unsigned char *restrict src, ssize_t len, char quote);
141 : gdk_export ssize_t strFromStr(const char *restrict src, size_t *restrict len, str *restrict dst, bool external);
142 : gdk_export size_t escapedStrlen(const char *restrict src, const char *sep1, const char *sep2, int quote);
143 : gdk_export size_t escapedStr(char *restrict dst, const char *restrict src, size_t dstlen, const char *sep1, const char *sep2, int quote);
144 : /*
145 : * @- nil values
146 : * All types have a single value designated as a NIL value. It
147 : * designates a missing value and it is ignored (forbidden) in several
148 : * primitives. The current policy is to use the smallest value in any
149 : * ordered domain. The routine atomnil returns a pointer to the nil
150 : * value representation.
151 : */
152 : #define GDK_bit_max ((bit) 1)
153 : #define GDK_bit_min ((bit) 0)
154 : #define GDK_bte_max ((bte) INT8_MAX)
155 : #define GDK_bte_min ((bte) INT8_MIN+1)
156 : #define GDK_sht_max ((sht) INT16_MAX)
157 : #define GDK_sht_min ((sht) INT16_MIN+1)
158 : #define GDK_int_max ((int) INT32_MAX)
159 : #define GDK_int_min ((int) INT32_MIN+1)
160 : #define GDK_lng_max ((lng) INT64_MAX)
161 : #define GDK_lng_min ((lng) INT64_MIN+1)
162 : #ifdef HAVE_HGE
163 : #define GDK_hge_max ((((hge) 1) << 126) - 1 + (((hge) 1) << 126))
164 : #define GDK_hge_min (-GDK_hge_max)
165 : #endif
166 : #define GDK_flt_max ((flt) FLT_MAX)
167 : #define GDK_flt_min ((flt) -FLT_MAX)
168 : #define GDK_dbl_max ((dbl) DBL_MAX)
169 : #define GDK_dbl_min ((dbl) -DBL_MAX)
170 : #define GDK_oid_max (((oid) 1 << ((8 * SIZEOF_OID) - 1)) - 1)
171 : #define GDK_oid_min ((oid) 0)
172 : /* representation of the nil */
173 : gdk_export const bte bte_nil;
174 : gdk_export const sht sht_nil;
175 : gdk_export const int int_nil;
176 : #ifdef NAN_CANNOT_BE_USED_AS_INITIALIZER
177 : /* Definition of NAN is seriously broken on Intel compiler (at least
178 : * in some versions), so we work around it. */
179 : union _flt_nil_t {
180 : uint32_t l;
181 : flt f;
182 : };
183 : gdk_export const union _flt_nil_t _flt_nil_;
184 : #define flt_nil (_flt_nil_.f)
185 : union _dbl_nil_t {
186 : uint64_t l;
187 : dbl d;
188 : };
189 : gdk_export const union _dbl_nil_t _dbl_nil_;
190 : #define dbl_nil (_dbl_nil_.d)
191 : #else
192 : gdk_export const flt flt_nil;
193 : gdk_export const dbl dbl_nil;
194 : #endif
195 : gdk_export const lng lng_nil;
196 : #ifdef HAVE_HGE
197 : gdk_export const hge hge_nil;
198 : #endif
199 : gdk_export const oid oid_nil;
200 : gdk_export const char str_nil[2];
201 : gdk_export const ptr ptr_nil;
202 : gdk_export const uuid uuid_nil;
203 :
204 : /* derived NIL values - OIDDEPEND */
205 : #define bit_nil ((bit) bte_nil)
206 : #define bat_nil ((bat) int_nil)
207 :
208 : #define void_nil oid_nil
209 :
210 : #define is_bit_nil(v) ((v) == GDK_bte_min-1)
211 : #define is_bte_nil(v) ((v) == GDK_bte_min-1)
212 : #define is_sht_nil(v) ((v) == GDK_sht_min-1)
213 : #define is_int_nil(v) ((v) == GDK_int_min-1)
214 : #define is_lng_nil(v) ((v) == GDK_lng_min-1)
215 : #ifdef HAVE_HGE
216 : #define is_hge_nil(v) ((v) == GDK_hge_min-1)
217 : #endif
218 : #define is_oid_nil(v) ((v) == ((oid) 1 << ((8 * SIZEOF_OID) - 1)))
219 : #define is_flt_nil(v) isnan(v)
220 : #define is_dbl_nil(v) isnan(v)
221 : #define is_bat_nil(v) (((v) & 0x7FFFFFFF) == 0) /* v == bat_nil || v == 0 */
222 :
223 : #include <math.h>
224 :
225 : #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && _MSC_VER < 1800
226 : #include <float.h>
227 : #define isnan(x) _isnan(x)
228 : #define isinf(x) (_fpclass(x) & (_FPCLASS_NINF | _FPCLASS_PINF))
229 : #define isfinite(x) _finite(x)
230 : #endif
231 :
232 : #ifdef HAVE_HGE
233 : #define is_uuid_nil(x) ((x).h == 0)
234 : #else
235 : #define is_uuid_nil(x) (memcmp((x).u, uuid_nil.u, UUID_SIZE) == 0)
236 : #endif
237 :
238 : #define is_blob_nil(x) ((x)->nitems == ~(size_t)0)
239 :
240 : /*
241 : * @- Derived types
242 : * In all algorithms across GDK, you will find switches on the types
243 : * (bte, sht, int, flt, dbl, lng, hge, str). They respectively
244 : * represent an octet, a 16-bit int, a 32-bit int, a 32-bit float, a
245 : * 64-bit double, a 64-bit int, a 128-bit int, and a pointer-sized location
246 : * of a char-buffer (ended by a zero char).
247 : *
248 : * In contrast, the types (bit, ptr, bat, oid) are derived types. They
249 : * do not occur in the switches. The ATOMstorage macro maps them
250 : * respectively onto a @code{ bte}, @code{ int} (pointers are 32-bit),
251 : * @code{ int}, and @code{ int}. OIDs are 32-bit.
252 : *
253 : * This approach makes it tractable to switch to 64-bits OIDs, or to a
254 : * fully 64-bits OS easily. One only has to map the @code{ oid} and
255 : * @code{ ptr} types to @code{ lng} instead of @code{ int}.
256 : *
257 : * Derived types mimic their fathers in many ways. They inherit the
258 : * @code{ size}, @code{ linear}, and @code{ null}
259 : * properties of their father. The same goes for the
260 : * ADT functions HASH, CMP, PUT, NULL, DEL, LEN, and HEAP. So, a
261 : * derived type differs in only two ways from its father:
262 : * @table @code
263 : * @item [string representation]
264 : * the only two ADT operations specific for a derived type are FROMSTR
265 : * and TOSTR.
266 : * @item [identity]
267 : * (a @code{ bit} is really of a different type than @code{ bte}). The
268 : * set of operations on derived type values or BATs of such types may
269 : * differ from the sets of operations on the father type.
270 : * @end table
271 : */
272 : /* use "do ... while(0)" so that lhs can safely be used in if statements */
273 : #define ATOMstorage(t) BATatoms[t].storage
274 : #define ATOMsize(t) BATatoms[t].size
275 : #define ATOMfromstr(t,s,l,src,ext) BATatoms[t].atomFromStr(src,l,s,ext)
276 : #define ATOMnilptr(t) BATatoms[t].atomNull
277 : #define ATOMcompare(t) BATatoms[t].atomCmp
278 : #define ATOMcmp(t,l,r) ((*ATOMcompare(t))(l, r))
279 : #define ATOMhash(t,src) BATatoms[t].atomHash(src)
280 : #define ATOMdel(t,hp,src) do if (BATatoms[t].atomDel) BATatoms[t].atomDel(hp,src); while (0)
281 : #define ATOMvarsized(t) (BATatoms[t].atomPut != NULL)
282 : #define ATOMlinear(t) BATatoms[t].linear
283 : #define ATOMtype(t) ((t) == TYPE_void ? TYPE_oid : (t))
284 : #define ATOMfix(t,v) (BATatoms[t].atomFix ? BATatoms[t].atomFix(v) : GDK_SUCCEED)
285 : #define ATOMunfix(t,v) (BATatoms[t].atomUnfix ? BATatoms[t].atomUnfix(v) : GDK_SUCCEED)
286 :
287 : /* The base type is the storage type if the comparison function, the
288 : * hash function, and the nil value are the same as those of the
289 : * storage type; otherwise it is the type itself. */
290 : #define ATOMbasetype(t) ((t) != ATOMstorage(t) && \
291 : ATOMnilptr(t) == ATOMnilptr(ATOMstorage(t)) && \
292 : ATOMcompare(t) == ATOMcompare(ATOMstorage(t)) && \
293 : BATatoms[t].atomHash == BATatoms[ATOMstorage(t)].atomHash ? \
294 : ATOMstorage(t) : (t))
295 :
296 : /*
297 : * In case that atoms are added to a bat, their logical reference
298 : * count should be incremented (and decremented if deleted). Notice
299 : * that BATs with atomic types that have logical references (e.g. BATs
300 : * of BATs but also BATs of ODMG odSet) can never be persistent, as
301 : * this would make the commit tremendously complicated.
302 : */
303 :
304 : static inline gdk_return __attribute__((__warn_unused_result__))
305 101157836 : ATOMputVAR(BAT *b, var_t *dst, const void *src)
306 : {
307 101157836 : assert(BATatoms[b->ttype].atomPut != NULL);
308 101157836 : if ((*BATatoms[b->ttype].atomPut)(b, dst, src) == (var_t) -1)
309 0 : return GDK_FAIL;
310 : return GDK_SUCCEED;
311 : }
312 :
313 :
314 : static inline gdk_return __attribute__((__warn_unused_result__))
315 474986587 : ATOMputFIX(int type, void *dst, const void *src)
316 : {
317 474986587 : gdk_return rc;
318 :
319 474986587 : assert(BATatoms[type].atomPut == NULL);
320 474986587 : rc = ATOMfix(type, src);
321 0 : if (rc != GDK_SUCCEED)
322 : return rc;
323 504052571 : switch (ATOMsize(type)) {
324 : case 0: /* void */
325 : break;
326 24299415 : case 1:
327 24299415 : * (bte *) dst = * (bte *) src;
328 24299415 : break;
329 17794984 : case 2:
330 17794984 : * (sht *) dst = * (sht *) src;
331 17794984 : break;
332 416097694 : case 4:
333 416097694 : * (int *) dst = * (int *) src;
334 416097694 : break;
335 29450789 : case 8:
336 29450789 : * (lng *) dst = * (lng *) src;
337 29450789 : break;
338 16409689 : case 16:
339 : #ifdef HAVE_HGE
340 16409689 : * (hge *) dst = * (hge *) src;
341 : #else
342 : * (uuid *) dst = * (uuid *) src;
343 : #endif
344 16409689 : break;
345 0 : default:
346 0 : memcpy(dst, src, ATOMsize(type));
347 0 : break;
348 : }
349 : return GDK_SUCCEED;
350 : }
351 :
352 : static inline gdk_return __attribute__((__warn_unused_result__))
353 1125150 : ATOMreplaceVAR(BAT *b, var_t *dst, const void *src)
354 : {
355 1125150 : var_t loc = *dst;
356 1125150 : int type = b->ttype;
357 :
358 1125150 : assert(BATatoms[type].atomPut != NULL);
359 1125150 : if ((*BATatoms[type].atomPut)(b, &loc, src) == (var_t) -1)
360 : return GDK_FAIL;
361 1125222 : if (ATOMunfix(type, dst) != GDK_SUCCEED)
362 : return GDK_FAIL;
363 1125227 : ATOMdel(type, b->tvheap, dst);
364 1125228 : *dst = loc;
365 1125228 : return ATOMfix(type, src);
366 : }
367 :
368 : /* string heaps:
369 : * - strings are 8 byte aligned
370 : * - start with a 1024 bucket hash table
371 : * - heaps < 64KiB are fully duplicate eliminated with this hash tables
372 : * - heaps >= 64KiB are opportunistically (imperfect) duplicate
373 : * eliminated as only the last 128KiB chunk is considered and there
374 : * is no linked list
375 : * - buckets and next pointers are unsigned short "indices"
376 : * - indices should be multiplied by 8 and takes from ELIMBASE to get
377 : * an offset
378 : * Note that a 64KiB chunk of the heap contains at most 8K 8-byte
379 : * aligned strings. The 1K bucket list means that in worst load, the
380 : * list length is 8 (OK).
381 : */
382 : #define GDK_STRHASHTABLE (1<<10) /* 1024 */
383 : #define GDK_STRHASHMASK (GDK_STRHASHTABLE-1)
384 : #define GDK_STRHASHSIZE (GDK_STRHASHTABLE * sizeof(stridx_t))
385 : #define GDK_ELIMPOWER 16 /* 64KiB is the threshold */
386 : #define GDK_ELIMDOUBLES(h) ((h)->free < GDK_ELIMLIMIT)
387 : #define GDK_ELIMLIMIT (1<<GDK_ELIMPOWER) /* equivalently: ELIMBASE == 0 */
388 : #define GDK_ELIMBASE(x) (((x) >> GDK_ELIMPOWER) << GDK_ELIMPOWER)
389 : #define GDK_VAROFFSET ((var_t) GDK_STRHASHSIZE)
390 :
391 : /*
392 : * @- String Comparison, NILs and UTF-8
393 : *
394 : * Using the char* type for strings is handy as this is the type of
395 : * any constant strings in a C/C++ program. Therefore, MonetDB uses
396 : * this definition for str. However, different compilers and
397 : * platforms use either signed or unsigned characters for the char
398 : * type. It is required that string ordering in MonetDB is consistent
399 : * over platforms though.
400 : *
401 : * As for the choice how strings should be ordered, our support for
402 : * UTF-8 actually imposes that it should follow 'unsigned char'
403 : * doctrine (like in the AIX native compiler). In this semantics,
404 : * though we have to take corrective action to ensure that str(nil) is
405 : * the smallest value of the domain.
406 : */
407 : static inline bool __attribute__((__pure__))
408 324623 : strEQ(const char *l, const char *r)
409 : {
410 324623 : return strcmp(l, r) == 0;
411 : }
412 :
413 : static inline bool __attribute__((__pure__))
414 1273950932 : strNil(const char *s)
415 : {
416 1280893763 : return s == NULL || (s[0] == '\200' && s[1] == '\0');
417 : }
418 :
419 : static inline size_t __attribute__((__pure__))
420 52794710 : strLen(const char *s)
421 : {
422 104418768 : return strNil(s) ? 2 : strlen(s) + 1;
423 : }
424 :
425 : static inline int __attribute__((__pure__))
426 307002871 : strCmp(const char *l, const char *r)
427 : {
428 610359263 : return strNil(r)
429 134806157 : ? !strNil(l)
430 298044301 : : strNil(l) ? -1 : strcmp(l, r);
431 : }
432 :
433 : static inline size_t
434 445519269 : VarHeapVal(const void *b, BUN p, int w)
435 : {
436 445519269 : switch (w) {
437 121132696 : case 1:
438 121132696 : return (size_t) ((const uint8_t *) b)[p] + GDK_VAROFFSET;
439 50870554 : case 2:
440 50870554 : return (size_t) ((const uint16_t *) b)[p] + GDK_VAROFFSET;
441 220425485 : case 4:
442 220425485 : return (size_t) ((const uint32_t *) b)[p];
443 : #if SIZEOF_VAR_T == 8
444 53090534 : case 8:
445 53090534 : return (size_t) ((const uint64_t *) b)[p];
446 : #endif
447 : default:
448 0 : MT_UNREACHABLE();
449 : }
450 : }
451 :
452 : static inline BUN __attribute__((__pure__))
453 225962014 : strHash(const char *key)
454 : {
455 225962014 : BUN y = 0;
456 :
457 4050825012 : for (BUN i = 0; key[i]; i++) {
458 3824862998 : y += key[i];
459 3824862998 : y += (y << 10);
460 3824862998 : y ^= (y >> 6);
461 : }
462 225962014 : y += (y << 3);
463 225962014 : y ^= (y >> 11);
464 225962014 : y += (y << 15);
465 225962014 : return y;
466 : }
467 :
468 : #endif /* _GDK_ATOMS_H_ */
|