Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /* monetdb_config.h must be the first include in each .c file */
14 : #include "monetdb_config.h"
15 : #include "udf.h"
16 : #include "str.h"
17 :
18 : /* Reverse a string */
19 :
20 : /* actual implementation */
21 : /* all non-exported functions must be declared static */
22 : static str
23 15 : UDFreverse_(str *buf, size_t *buflen, const char *src)
24 : {
25 15 : size_t len = strlen(src);
26 15 : char *dst = NULL;
27 :
28 : /* assert calling sanity */
29 15 : assert(buf);
30 : /* test if input buffer is large enough for result string, otherwise re-allocate it */
31 15 : CHECK_STR_BUFFER_LENGTH(buf, buflen, (len + 1), "udf.reverse");
32 15 : dst = *buf;
33 :
34 15 : dst[len] = 0;
35 : /* all strings in MonetDB are encoded using UTF-8; we must
36 : * make sure that the reversed string is also encoded in valid
37 : * UTF-8, so we treat multibyte characters as single units */
38 257 : while (*src) {
39 242 : if ((*src & 0xF8) == 0xF0) {
40 : /* 4 byte UTF-8 sequence */
41 0 : assert(len >= 4);
42 0 : dst[len - 4] = *src++;
43 0 : assert((*src & 0xC0) == 0x80);
44 0 : dst[len - 3] = *src++;
45 0 : assert((*src & 0xC0) == 0x80);
46 0 : dst[len - 2] = *src++;
47 0 : assert((*src & 0xC0) == 0x80);
48 0 : dst[len - 1] = *src++;
49 0 : len -= 4;
50 242 : } else if ((*src & 0xF0) == 0xE0) {
51 : /* 3 byte UTF-8 sequence */
52 6 : assert(len >= 3);
53 6 : dst[len - 3] = *src++;
54 6 : assert((*src & 0xC0) == 0x80);
55 6 : dst[len - 2] = *src++;
56 6 : assert((*src & 0xC0) == 0x80);
57 6 : dst[len - 1] = *src++;
58 6 : len -= 3;
59 236 : } else if ((*src & 0xE0) == 0xC0) {
60 : /* 2 byte UTF-8 sequence */
61 0 : assert(len >= 2);
62 0 : dst[len - 2] = *src++;
63 0 : assert((*src & 0xC0) == 0x80);
64 0 : dst[len - 1] = *src++;
65 0 : len -= 2;
66 : } else {
67 : /* 1 byte UTF-8 "sequence" */
68 236 : assert(len >= 1);
69 236 : assert((*src & 0x80) == 0);
70 236 : dst[--len] = *src++;
71 : }
72 : }
73 15 : assert(len == 0);
74 :
75 : return MAL_SUCCEED;
76 : }
77 :
78 : /* MAL wrapper */
79 : str
80 7 : UDFreverse(str *res, const str *arg)
81 : {
82 7 : str msg = MAL_SUCCEED, s;
83 :
84 : /* assert calling sanity */
85 7 : assert(res && arg);
86 7 : s = *arg;
87 7 : if (strNil(s)) {
88 0 : if (!(*res = GDKstrdup(str_nil)))
89 0 : throw(MAL, "udf.reverse", SQLSTATE(HY013) MAL_MALLOC_FAIL);
90 : } else {
91 7 : size_t buflen = strlen(s) + 1;
92 :
93 7 : if (!(*res = GDKmalloc(buflen)))
94 0 : throw(MAL, "udf.reverse", SQLSTATE(HY013) MAL_MALLOC_FAIL);
95 7 : if ((msg = UDFreverse_(res, &buflen, s)) != MAL_SUCCEED) {
96 0 : GDKfree(*res);
97 0 : *res = NULL;
98 0 : return msg;
99 : }
100 : }
101 : return msg;
102 : }
103 :
104 : /* Reverse a BAT of strings */
105 : /*
106 : * Generic "type-oblivious" version,
107 : * using generic "type-oblivious" BAT access interface.
108 : */
109 :
110 : /* actual implementation */
111 : static char *
112 2 : UDFBATreverse_(BAT **ret, BAT *src)
113 : {
114 2 : BATiter li;
115 2 : BAT *bn = NULL;
116 2 : BUN p = 0, q = 0;
117 2 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
118 2 : str msg = MAL_SUCCEED, buf;
119 2 : bool nils = false;
120 :
121 : /* assert calling sanity */
122 2 : assert(ret);
123 :
124 : /* handle NULL pointer */
125 2 : if (src == NULL)
126 0 : throw(MAL, "batudf.reverse", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
127 : /* check tail type */
128 2 : if (src->ttype != TYPE_str)
129 0 : throw(MAL, "batudf.reverse", SQLSTATE(42000) "tail-type of input BAT must be TYPE_str");
130 :
131 : /* to avoid many allocations, we allocate a single buffer, which will reallocate if a
132 : larger string is found and freed at the end */
133 2 : if (!(buf = GDKmalloc(buflen))) {
134 0 : msg = createException(MAL, "batudf.reverse", SQLSTATE(HY013) MAL_MALLOC_FAIL);
135 0 : goto bailout;
136 : }
137 2 : q = BATcount(src);
138 : /* allocate void-headed result BAT */
139 2 : if (!(bn = COLnew(src->hseqbase, TYPE_str, q, TRANSIENT))) {
140 0 : msg = createException(MAL, "batudf.reverse", SQLSTATE(HY013) MAL_MALLOC_FAIL);
141 0 : goto bailout;
142 : }
143 :
144 : /* create BAT iterator */
145 2 : li = bat_iterator(src);
146 : /* the core of the algorithm */
147 12 : for (p = 0; p < q ; p++) {
148 8 : const char *x = BUNtvar(li, p);
149 :
150 8 : if (strNil(x)) {
151 : /* if the input string is null, then append directly */
152 0 : if (tfastins_nocheckVAR(bn, p, str_nil) != GDK_SUCCEED) {
153 0 : msg = createException(MAL, "batudf.reverse", SQLSTATE(HY013) MAL_MALLOC_FAIL);
154 0 : goto bailout1;
155 : }
156 : nils = true;
157 : } else {
158 : /* revert tail value */
159 8 : if ((msg = UDFreverse_(&buf, &buflen, x)) != MAL_SUCCEED)
160 0 : goto bailout1;
161 : /* assert logical sanity */
162 8 : assert(buf && x);
163 : /* append to the output BAT. We are using a faster route, because we know what we are doing */
164 8 : if (tfastins_nocheckVAR(bn, p, buf) != GDK_SUCCEED) {
165 0 : msg = createException(MAL, "batudf.reverse", SQLSTATE(HY013) MAL_MALLOC_FAIL);
166 0 : goto bailout1;
167 : }
168 : }
169 : }
170 2 : bailout1:
171 2 : bat_iterator_end(&li);
172 :
173 2 : bailout:
174 2 : GDKfree(buf);
175 2 : if (bn && !msg) {
176 2 : BATsetcount(bn, q);
177 2 : bn->tnil = nils;
178 2 : bn->tnonil = !nils;
179 2 : bn->tkey = BATcount(bn) <= 1;
180 2 : bn->tsorted = BATcount(bn) <= 1;
181 2 : bn->trevsorted = BATcount(bn) <= 1;
182 0 : } else if (bn) {
183 0 : BBPreclaim(bn);
184 0 : bn = NULL;
185 : }
186 2 : *ret = bn;
187 2 : return msg;
188 : }
189 :
190 : /* MAL wrapper */
191 : char *
192 2 : UDFBATreverse(bat *ret, const bat *arg)
193 : {
194 2 : BAT *res = NULL, *src = NULL;
195 2 : char *msg = NULL;
196 :
197 : /* assert calling sanity */
198 2 : assert(ret != NULL && arg != NULL);
199 :
200 : /* bat-id -> BAT-descriptor */
201 2 : if ((src = BATdescriptor(*arg)) == NULL)
202 0 : throw(MAL, "batudf.reverse", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
203 :
204 : /* do the work */
205 2 : msg = UDFBATreverse_( &res, src );
206 :
207 : /* release input BAT-descriptor */
208 2 : BBPunfix(src->batCacheid);
209 :
210 2 : if (msg == MAL_SUCCEED) {
211 : /* register result BAT in buffer pool */
212 2 : *ret = res->batCacheid;
213 2 : BBPkeepref(res);
214 : }
215 :
216 : return msg;
217 : }
218 :
219 : /* fuse */
220 :
221 : /* instantiate type-specific functions */
222 :
223 : #define UI bte
224 : #define UU unsigned char
225 : #define UO sht
226 : #include "udf_impl.h"
227 : #undef UI
228 : #undef UU
229 : #undef UO
230 :
231 : #define UI sht
232 : #define UU unsigned short
233 : #define UO int
234 : #include "udf_impl.h"
235 : #undef UI
236 : #undef UU
237 : #undef UO
238 :
239 : #define UI int
240 : #define UU unsigned int
241 : #define UO lng
242 : #include "udf_impl.h"
243 : #undef UI
244 : #undef UU
245 : #undef UO
246 :
247 : #ifdef HAVE_HGE
248 : #define UI lng
249 : #define UU ulng
250 : #define UO hge
251 : #include "udf_impl.h"
252 : #undef UI
253 : #undef UU
254 : #undef UO
255 : #endif
256 :
257 : /* BAT fuse */
258 :
259 : /* actual implementation */
260 : static char *
261 8 : UDFBATfuse_(BAT **ret, BAT *bone, BAT *btwo)
262 : {
263 8 : BAT *bres = NULL;
264 8 : bit two_tail_sorted_unsigned = FALSE;
265 8 : bit two_tail_revsorted_unsigned = FALSE;
266 8 : BUN n;
267 8 : char *msg = NULL;
268 :
269 : /* assert calling sanity */
270 8 : assert(ret != NULL);
271 :
272 : /* handle NULL pointer */
273 8 : if (bone == NULL || btwo == NULL)
274 0 : throw(MAL, "batudf.fuse", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
275 :
276 : /* check for aligned heads */
277 8 : if (BATcount(bone) != BATcount(btwo) ||
278 8 : bone->hseqbase != btwo->hseqbase) {
279 0 : throw(MAL, "batudf.fuse",
280 : "heads of input BATs must be aligned");
281 : }
282 8 : n = BATcount(bone);
283 :
284 : /* check tail types */
285 8 : if (bone->ttype != btwo->ttype) {
286 0 : throw(MAL, "batudf.fuse",
287 : "tails of input BATs must be identical");
288 : }
289 :
290 : /* allocate result BAT */
291 8 : switch (bone->ttype) {
292 2 : case TYPE_bte:
293 2 : bres = COLnew(bone->hseqbase, TYPE_sht, n, TRANSIENT);
294 2 : break;
295 2 : case TYPE_sht:
296 2 : bres = COLnew(bone->hseqbase, TYPE_int, n, TRANSIENT);
297 2 : break;
298 4 : case TYPE_int:
299 4 : bres = COLnew(bone->hseqbase, TYPE_lng, n, TRANSIENT);
300 4 : break;
301 : #ifdef HAVE_HGE
302 0 : case TYPE_lng:
303 0 : bres = COLnew(bone->hseqbase, TYPE_hge, n, TRANSIENT);
304 0 : break;
305 : #endif
306 0 : default:
307 0 : throw(MAL, "batudf.fuse",
308 : "tails of input BATs must be one of {bte, sht, int"
309 : #ifdef HAVE_HGE
310 : ", lng"
311 : #endif
312 : "}");
313 : }
314 8 : if (bres == NULL)
315 0 : throw(MAL, "batudf.fuse", SQLSTATE(HY013) MAL_MALLOC_FAIL);
316 :
317 : /* call type-specific core algorithm */
318 8 : switch (bone->ttype) {
319 2 : case TYPE_bte:
320 2 : msg = UDFBATfuse_bte_sht ( bres, bone, btwo, n,
321 : &two_tail_sorted_unsigned, &two_tail_revsorted_unsigned );
322 2 : break;
323 2 : case TYPE_sht:
324 2 : msg = UDFBATfuse_sht_int ( bres, bone, btwo, n,
325 : &two_tail_sorted_unsigned, &two_tail_revsorted_unsigned );
326 2 : break;
327 4 : case TYPE_int:
328 4 : msg = UDFBATfuse_int_lng ( bres, bone, btwo, n,
329 : &two_tail_sorted_unsigned, &two_tail_revsorted_unsigned );
330 4 : break;
331 : #ifdef HAVE_HGE
332 0 : case TYPE_lng:
333 0 : msg = UDFBATfuse_lng_hge ( bres, bone, btwo, n,
334 : &two_tail_sorted_unsigned, &two_tail_revsorted_unsigned );
335 0 : break;
336 : #endif
337 0 : default:
338 0 : BBPunfix(bres->batCacheid);
339 0 : throw(MAL, "batudf.fuse",
340 : "tails of input BATs must be one of {bte, sht, int"
341 : #ifdef HAVE_HGE
342 : ", lng"
343 : #endif
344 : "}");
345 : }
346 :
347 8 : if (msg != MAL_SUCCEED) {
348 0 : BBPunfix(bres->batCacheid);
349 : } else {
350 : /* set number of tuples in result BAT */
351 8 : BATsetcount(bres, n);
352 :
353 : /* Result tail is sorted, if the left/first input tail is
354 : * sorted and key (unique), or if the left/first input tail is
355 : * sorted and the second/right input tail is sorted and the
356 : * second/right tail values are either all >= 0 or all < 0;
357 : * otherwise, we cannot tell.
358 : */
359 8 : if (bone->tsorted
360 8 : && (BATtkey(bone) || two_tail_sorted_unsigned))
361 6 : bres->tsorted = true;
362 : else
363 2 : bres->tsorted = (BATcount(bres) <= 1);
364 8 : if (bone->trevsorted
365 0 : && (BATtkey(bone) || two_tail_revsorted_unsigned))
366 0 : bres->trevsorted = true;
367 : else
368 8 : bres->trevsorted = (BATcount(bres) <= 1);
369 : /* result tail is key (unique), iff both input tails are */
370 14 : BATkey(bres, BATtkey(bone) || BATtkey(btwo));
371 :
372 8 : *ret = bres;
373 : }
374 :
375 : return msg;
376 : }
377 :
378 : /* MAL wrapper */
379 : char *
380 8 : UDFBATfuse(bat *ires, const bat *ione, const bat *itwo)
381 : {
382 8 : BAT *bres = NULL, *bone = NULL, *btwo = NULL;
383 8 : char *msg = NULL;
384 :
385 : /* assert calling sanity */
386 8 : assert(ires != NULL && ione != NULL && itwo != NULL);
387 :
388 : /* bat-id -> BAT-descriptor */
389 8 : if ((bone = BATdescriptor(*ione)) == NULL)
390 0 : throw(MAL, "batudf.fuse", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
391 :
392 : /* bat-id -> BAT-descriptor */
393 8 : if ((btwo = BATdescriptor(*itwo)) == NULL) {
394 0 : BBPunfix(bone->batCacheid);
395 0 : throw(MAL, "batudf.fuse", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
396 : }
397 :
398 : /* do the work */
399 8 : msg = UDFBATfuse_ ( &bres, bone, btwo );
400 :
401 : /* release input BAT-descriptors */
402 8 : BBPunfix(bone->batCacheid);
403 8 : BBPunfix(btwo->batCacheid);
404 :
405 8 : if (msg == MAL_SUCCEED) {
406 : /* register result BAT in buffer pool */
407 8 : *ires = bres->batCacheid;
408 8 : BBPkeepref(bres);
409 : }
410 :
411 : return msg;
412 : }
413 :
414 : #include "mel.h"
415 : static mel_func udf_init_funcs[] = {
416 : command("udf", "reverse", UDFreverse, false, "Reverse a string", args(1,2, arg("",str),arg("ra1",str))),
417 : command("batudf", "reverse", UDFBATreverse, false, "Reverse a BAT of strings", args(1,2, batarg("",str),batarg("b",str))),
418 : command("udf", "fuse", UDFfuse_bte_sht, false, "fuse two (1-byte) bte values into one (2-byte) sht value", args(1,3, arg("",sht),arg("one",bte),arg("two",bte))),
419 : command("udf", "fuse", UDFfuse_sht_int, false, "fuse two (2-byte) sht values into one (4-byte) int value", args(1,3, arg("",int),arg("one",sht),arg("two",sht))),
420 : command("udf", "fuse", UDFfuse_int_lng, false, "fuse two (4-byte) int values into one (8-byte) lng value", args(1,3, arg("",lng),arg("one",int),arg("two",int))),
421 : command("batudf", "fuse", UDFBATfuse, false, "fuse two (1-byte) bte values into one (2-byte) sht value", args(1,3, batarg("",sht),batarg("one",bte),batarg("two",bte))),
422 : command("batudf", "fuse", UDFBATfuse, false, "fuse two (2-byte) sht values into one (4-byte) int value", args(1,3, batarg("",int),batarg("one",sht),batarg("two",sht))),
423 : command("batudf", "fuse", UDFBATfuse, false, "fuse two (4-byte) int values into one (8-byte) lng value", args(1,3, batarg("",lng),batarg("one",int),batarg("two",int))),
424 : #ifdef HAVE_HGE
425 : command("udf", "fuse", UDFfuse_lng_hge, false, "fuse two (8-byte) lng values into one (16-byte) hge value", args(1,3, arg("",hge),arg("one",lng),arg("two",lng))),
426 : command("batudf", "fuse", UDFBATfuse, false, "fuse two (8-byte) lng values into one (16-byte) hge value", args(1,3, batarg("",hge),batarg("one",lng),batarg("two",lng))),
427 : #endif
428 : { .imp=NULL }
429 : };
430 : #include "mal_import.h"
431 : #ifdef _MSC_VER
432 : #undef read
433 : #pragma section(".CRT$XCU",read)
434 : #endif
435 4 : LIB_STARTUP_FUNC(init_udf_mal)
436 4 : { mal_module("udf", NULL, udf_init_funcs); }
|