Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "gdk.h"
15 : #include "str.h"
16 : #include "mal.h"
17 : #include "mal_exception.h"
18 : #include "mal_interpreter.h"
19 : #include <string.h>
20 : #include <limits.h>
21 :
22 : #define MIN3(X,Y,Z) (MIN(MIN((X),(Y)),(Z)))
23 : #define MAX3(X,Y,Z) (MAX(MAX((X),(Y)),(Z)))
24 : #define MIN4(W,X,Y,Z) (MIN(MIN(MIN(W,X),Y),Z))
25 :
26 : static void
27 15 : reclaim_bats(int nargs, ...)
28 : {
29 15 : va_list valist;
30 :
31 15 : va_start(valist, nargs);
32 90 : for (int i = 0; i < nargs; i++) {
33 75 : BAT *b = va_arg(valist, BAT *);
34 120 : BBPreclaim(b);
35 : }
36 15 : va_end(valist);
37 15 : }
38 :
39 : static inline int *
40 2143 : damerau_get_cellpointer(int *pOrigin, int col, int row, int nCols)
41 : {
42 2143 : return pOrigin + col + (row * (nCols + 1));
43 : }
44 :
45 : static inline int
46 1134 : damerau_getat(int *pOrigin, int col, int row, int nCols)
47 : {
48 1134 : int *pCell;
49 1134 : pCell = damerau_get_cellpointer(pOrigin, col, row, nCols);
50 1134 : return *pCell;
51 : }
52 :
53 : static inline void
54 1009 : damerau_putat(int *pOrigin, int col, int row, int nCols, int x)
55 : {
56 1009 : int *pCell;
57 1009 : pCell = damerau_get_cellpointer(pOrigin, col, row, nCols);
58 1009 : *pCell = x;
59 : }
60 :
61 : static str
62 55 : dameraulevenshtein(int *res, str *S, str *T, int insdel_cost, int replace_cost,
63 : int transpose_cost)
64 : {
65 55 : char *s = *S;
66 55 : char *t = *T;
67 55 : int *d; /* pointer to matrix */
68 55 : int n; /* length of s */
69 55 : int m; /* length of t */
70 55 : int i; /* iterates through s */
71 55 : int j; /* iterates through t */
72 55 : char s_i; /* ith character of s */
73 55 : char t_j; /* jth character of t */
74 55 : int cost; /* cost */
75 55 : int cell; /* contents of target cell */
76 55 : int above; /* contents of cell immediately above */
77 55 : int left; /* contents of cell immediately to left */
78 55 : int diag; /* contents of cell immediately above and to left */
79 55 : lng sz; /* number of cells in matrix */
80 55 : int diag2 = 0, cost2 = 0;
81 :
82 104 : if (strNil(*S) || strNil(*T)) {
83 9 : *res = int_nil;
84 9 : return MAL_SUCCEED;
85 : }
86 :
87 : /* Step 1 */
88 46 : n = (int) strlen(s); /* 64bit: assume strings are less than 2 GB */
89 46 : m = (int) strlen(t);
90 46 : if (n == 0) {
91 8 : *res = m;
92 8 : return MAL_SUCCEED;
93 : }
94 38 : if (m == 0) {
95 8 : *res = n;
96 8 : return MAL_SUCCEED;
97 : }
98 30 : sz = (((lng)n) + 1) * (((lng)m) + 1) * sizeof(int);
99 30 : if (sz > (LL_CONSTANT(1)<<28))
100 0 : throw(MAL, "dameraulevenshtein", SQLSTATE(HY013) MAL_MALLOC_FAIL);
101 30 : d = (int *) GDKmalloc((size_t)sz);
102 31 : if (d == NULL)
103 0 : throw(MAL, "dameraulevenshtein", SQLSTATE(HY013) MAL_MALLOC_FAIL);
104 :
105 : /* Step 2 */
106 214 : for (i = 0; i <= n; i++) {
107 183 : damerau_putat(d, i, 0, n, i);
108 : }
109 187 : for (j = 0; j <= m; j++) {
110 156 : damerau_putat(d, 0, j, n, j);
111 : }
112 : /* Step 3 */
113 175 : for (i = 1; i <= n; i++) {
114 144 : s_i = s[i - 1];
115 : /* Step 4 */
116 814 : for (j = 1; j <= m; j++) {
117 670 : t_j = t[j - 1];
118 : /* Step 5 */
119 670 : if (s_i == t_j) {
120 : cost = 0;
121 : } else {
122 546 : cost = replace_cost;
123 : }
124 : /* Step 6 */
125 670 : above = damerau_getat(d, i - 1, j, n);
126 670 : left = damerau_getat(d, i, j - 1, n);
127 670 : diag = damerau_getat(d, i - 1, j - 1, n);
128 670 : if (j >= 2 && i >= 2) {
129 : /* NEW: detect transpositions */
130 433 : diag2 = damerau_getat(d, i - 2, j - 2, n);
131 433 : if (s[i - 2] == t[j - 1] && s[i - 1] == t[j - 2]) {
132 : cost2 = transpose_cost;
133 : } else {
134 412 : cost2 = 2;
135 : }
136 433 : cell = MIN4(above + insdel_cost, left + insdel_cost,
137 : diag + cost, diag2 + cost2);
138 : } else {
139 237 : cell = MIN3(above + insdel_cost, left + insdel_cost,
140 : diag + cost);
141 : }
142 670 : damerau_putat(d, i, j, n, cell);
143 : }
144 : }
145 : /* Step 7 */
146 31 : *res = damerau_getat(d, n, m, n);
147 31 : GDKfree(d);
148 31 : return MAL_SUCCEED;
149 : }
150 :
151 : static str
152 37 : TXTSIMdameraulevenshtein1(int *result, str *s, str *t)
153 : {
154 8 : return dameraulevenshtein(result, s, t, 1, 1, 2);
155 : }
156 :
157 : static str
158 9 : TXTSIMdameraulevenshtein2(int *result, str *s, str *t)
159 : {
160 9 : return dameraulevenshtein(result, s, t, 1, 1, 1);
161 : }
162 :
163 : static str
164 2 : TXTSIMdameraulevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
165 : InstrPtr pci)
166 : {
167 2 : (void) cntxt;
168 2 : (void) mb;
169 2 : int *res = getArgReference_int(stk, pci, 0);
170 2 : str *X = getArgReference_str(stk, pci, 1),
171 2 : *Y = getArgReference_str(stk, pci, 2);
172 2 : int insdel_cost, replace_cost, transpose_cost;
173 :
174 2 : assert(pci->argc == 3 || pci->argc == 6);
175 :
176 2 : if (pci->argc == 3) {
177 : insdel_cost = 1;
178 : replace_cost = 1;
179 : transpose_cost = 2;
180 : } else {
181 1 : insdel_cost = *getArgReference_int(stk, pci, 3);
182 1 : replace_cost = *getArgReference_int(stk, pci, 4);
183 1 : transpose_cost = *getArgReference_int(stk, pci, 5);
184 : }
185 :
186 2 : return dameraulevenshtein(res, X, Y, insdel_cost, replace_cost,
187 : transpose_cost);
188 : }
189 :
190 : static inline str
191 12 : levenshtein(int *res, const str *X, const str *Y, int insdel_cost,
192 : int replace_cost, int max)
193 : {
194 12 : str x = *X, y = *Y, x_iter = x;
195 12 : unsigned int xlen, ylen, i = 0, j = 0;
196 12 : unsigned int last_diagonal, old_diagonal;
197 12 : int cx, cy;
198 12 : unsigned int *column, min;
199 :
200 22 : if (strNil(*X) || strNil(*Y)) {
201 3 : *res = int_nil;
202 3 : return MAL_SUCCEED;
203 : }
204 :
205 9 : xlen = UTF8_strlen(x);
206 9 : ylen = UTF8_strlen(y);
207 :
208 9 : if (xlen == ylen && (strcmp(x, y) == 0))
209 : return MAL_SUCCEED;
210 :
211 8 : column = GDKmalloc((xlen + 1) * sizeof(unsigned int));
212 8 : if (column == NULL)
213 0 : throw(MAL, "levenshtein", MAL_MALLOC_FAIL);
214 :
215 59 : for (i = 1; i <= xlen; i++)
216 51 : column[i] = i;
217 :
218 57 : for (j = 1; j <= ylen; j++) {
219 49 : column[0] = j;
220 49 : min = INT_MAX;
221 49 : x_iter = x;
222 49 : UTF8_GETCHAR(cy, y);
223 306 : for (i = 1, last_diagonal = j - 1; i <= xlen; i++) {
224 257 : UTF8_GETCHAR(cx, x_iter);
225 257 : old_diagonal = column[i];
226 257 : column[i] = MIN3(column[i] + insdel_cost,
227 : column[i - 1] + insdel_cost,
228 : last_diagonal + (cx == cy ? 0 : replace_cost));
229 257 : last_diagonal = old_diagonal;
230 257 : if (last_diagonal < min)
231 : min = last_diagonal;
232 : }
233 49 : if (max != -1 && min > (unsigned int) max) {
234 0 : *res = INT_MAX;
235 0 : GDKfree(column);
236 0 : return MAL_SUCCEED;
237 : }
238 49 : (*x)++;
239 : }
240 :
241 8 : *res = column[xlen];
242 8 : GDKfree(column);
243 8 : return MAL_SUCCEED;
244 0 : illegal:
245 : /* UTF8_GETCHAR bail */
246 0 : GDKfree(column);
247 0 : throw(MAL, "txtsim.levenshtein", "Illegal unicode code point");
248 : }
249 :
250 : /* Levenshtein OP but with column externaly allocated */
251 : static inline int
252 0 : levenshtein2(const str X, const str Y, const size_t xlen, const size_t ylen,
253 : unsigned int *column, const int insdel_cost,
254 : const int replace_cost, const int max)
255 : {
256 0 : str x = X, y = Y;
257 0 : str x_iter = x;
258 0 : unsigned int i = 0, j = 0, min;;
259 0 : unsigned int last_diagonal, old_diagonal;
260 0 : int cx, cy;
261 :
262 0 : if (strNil(x) || strNil(y))
263 0 : return int_nil;
264 :
265 0 : if (xlen == ylen && (strcmp(x, y) == 0))
266 : return 0;
267 :
268 0 : for (i = 1; i <= xlen; i++)
269 0 : column[i] = i;
270 :
271 0 : for (j = 1; j <= ylen; j++) {
272 0 : column[0] = j;
273 0 : min = INT_MAX;
274 0 : x_iter = x;
275 0 : UTF8_GETCHAR(cy, y);
276 0 : for (i = 1, last_diagonal = j - 1; i <= xlen; i++) {
277 0 : UTF8_GETCHAR(cx, x_iter);
278 0 : old_diagonal = column[i];
279 0 : column[i] = MIN3(column[i] + insdel_cost,
280 : column[i - 1] + insdel_cost,
281 : last_diagonal + (cx == cy ? 0 : replace_cost));
282 0 : last_diagonal = old_diagonal;
283 0 : if (last_diagonal < min)
284 : min = last_diagonal;
285 : }
286 0 : if (max != -1 && min > (unsigned int) max)
287 : return INT_MAX;
288 0 : (*x)++;
289 : }
290 0 : return column[xlen];
291 : illegal:
292 : /* UTF8_GETCHAR bail */
293 : return INT_MAX;
294 : }
295 :
296 : static str
297 18 : TXTSIMlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
298 : {
299 18 : (void) cntxt;
300 18 : (void) mb;
301 18 : int *res = getArgReference_int(stk, pci, 0);
302 18 : str *X = getArgReference_str(stk, pci, 1),
303 18 : *Y = getArgReference_str(stk, pci, 2);
304 18 : int insdel_cost, replace_cost;
305 :
306 18 : if (pci->argc == 3) {
307 : insdel_cost = 1;
308 : replace_cost = 1;
309 7 : } else if (pci->argc == 5 || pci->argc == 6) {
310 7 : insdel_cost = *getArgReference_int(stk, pci, 3);
311 7 : replace_cost = *getArgReference_int(stk, pci, 4);
312 : /* Backwards compatibility purposes */
313 7 : if (pci->argc == 6) {
314 6 : int transposition_cost = *getArgReference_int(stk, pci, 5);
315 6 : return dameraulevenshtein(res, X, Y, insdel_cost, replace_cost,
316 : transposition_cost);
317 : }
318 : } else {
319 0 : throw(MAL, "txtsim.levenshtein", RUNTIME_SIGNATURE_MISSING);
320 : }
321 :
322 12 : return levenshtein(res, X, Y, insdel_cost, replace_cost, -1);;
323 : }
324 :
325 : static str
326 0 : TXTSIMmaxlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
327 : {
328 0 : (void) cntxt;
329 0 : (void) mb;
330 0 : int *res = getArgReference_int(stk, pci, 0);
331 0 : const str *X = getArgReference_str(stk, pci, 1),
332 0 : *Y = getArgReference_str(stk, pci, 2);
333 0 : const int *k = getArgReference_int(stk, pci, 3);
334 0 : int insdel_cost, replace_cost;
335 :
336 0 : if (pci->argc == 4) {
337 : insdel_cost = 1;
338 : replace_cost = 1;
339 0 : } else if (pci->argc == 6) {
340 0 : insdel_cost = *getArgReference_int(stk, pci, 4);
341 0 : replace_cost = *getArgReference_int(stk, pci, 5);
342 : } else {
343 0 : throw(MAL, "txtsim.maxlevenshtein", RUNTIME_SIGNATURE_MISSING);
344 : }
345 :
346 0 : return levenshtein(res, X, Y, insdel_cost, replace_cost, *k);
347 : }
348 :
349 : static str
350 0 : BATTXTSIMmaxlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
351 : {
352 0 : (void) cntxt;
353 0 : (void) mb;
354 0 : bat *res = getArgReference_bat(stk, pci, 0);
355 0 : bat *lid = getArgReference_bat(stk, pci, 1);
356 0 : bat *rid = getArgReference_bat(stk, pci, 2);
357 0 : int *k = getArgReference_int(stk, pci, 3);
358 0 : int insdel_cost = pci->argc == 6 ? *getArgReference_int(stk, pci, 4) : 1,
359 0 : replace_cost = pci->argc == 6 ? *getArgReference_int(stk, pci, 5) : 1;
360 0 : BAT *left = NULL, *right = NULL, *bn = NULL;
361 0 : BUN p, q;
362 0 : BATiter li, ri;
363 0 : unsigned int *buffer = NULL;
364 0 : str lv, rv, msg = MAL_SUCCEED;
365 0 : size_t llen = 0, rlen = 0, maxlen = 0;
366 0 : int d;
367 0 : bit v;
368 :
369 0 : if ((left = BATdescriptor(*lid)) == NULL) {
370 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
371 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
372 0 : goto exit;
373 : }
374 0 : if ((right = BATdescriptor(*rid)) == NULL) {
375 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
376 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
377 0 : goto exit;
378 : }
379 0 : if (BATcount(left) != BATcount(right)) {
380 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
381 : "Columns must be aligned");
382 0 : goto exit;
383 : }
384 0 : if ((bn = COLnew(0, TYPE_bit, BATcount(left), TRANSIENT)) == NULL) {
385 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
386 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
387 0 : goto exit;
388 : }
389 :
390 0 : li = bat_iterator(left);
391 0 : ri = bat_iterator(right);
392 0 : BATloop(left, p, q) {
393 0 : lv = (str) BUNtail(li, p);
394 0 : rv = (str) BUNtail(ri, p);
395 0 : llen = UTF8_strlen(lv);
396 0 : rlen = UTF8_strlen(rv);
397 0 : if (abs((int) llen - (int) rlen) > (int) *k)
398 0 : v = false;
399 : else {
400 0 : if (llen > maxlen) {
401 0 : maxlen = llen;
402 0 : unsigned int *tmp = GDKrealloc(buffer, (maxlen + 1) * sizeof(unsigned int));
403 0 : if (tmp == NULL) {
404 0 : bat_iterator_end(&li);
405 0 : bat_iterator_end(&ri);
406 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
407 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
408 0 : goto exit;
409 : }
410 : buffer = tmp;
411 : }
412 0 : d = levenshtein2(lv, rv, llen, rlen, buffer, insdel_cost,
413 : replace_cost, (int) *k);
414 0 : v = (bit) (d <= (int) *k);
415 : }
416 0 : if (BUNappend(bn, (const void *) &v, false) != GDK_SUCCEED) {
417 0 : bat_iterator_end(&li);
418 0 : bat_iterator_end(&ri);
419 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
420 : "BUNappend failed");
421 0 : goto exit;
422 : }
423 : }
424 0 : bat_iterator_end(&li);
425 0 : bat_iterator_end(&ri);
426 :
427 0 : *res = bn->batCacheid;
428 0 : BBPkeepref(bn);
429 0 : exit:
430 0 : GDKfree(buffer);
431 0 : BBPreclaim(left);
432 0 : BBPreclaim(right);
433 0 : if (msg != MAL_SUCCEED)
434 0 : BBPreclaim(bn);
435 0 : return msg;
436 : }
437 :
438 : #define JARO_WINKLER_SCALING_FACTOR 0.1
439 : #define JARO_WINKLER_PREFIX_LEN 4
440 :
441 : typedef struct {
442 : size_t matches; /* accumulator for number of matches for this item */
443 : BUN o; /* position in the BAT */
444 : str val; /* string value */
445 : int *cp_sequence; /* string as array of Unicode codepoints */
446 : int len; /* string length in characters (multi-byte characters count as 1) */
447 : int cp_seq_len; /* string length in bytes */
448 : uint64_t abm; /* 64bit alphabet bitmap */
449 : int abm_popcount; /* hamming weight of abm */
450 : } str_item;
451 :
452 : static inline int
453 95 : popcount64(uint64_t x)
454 : {
455 : #if defined(__GNUC__)
456 95 : return (int) __builtin_popcountll(x);
457 : #elif defined(_MSC_VER)
458 : #if SIZEOF_OID == 4
459 : /* no __popcnt64 on 32 bit Windows */
460 : return (int) (__popcnt((uint32_t) x) + __popcnt((uint32_t) (x >> 32)));
461 : #else
462 : return (int) __popcnt64(x);
463 : #endif
464 : #else
465 : x = (x & UINT64_C(0x5555555555555555)) + ((x >> 1) & UINT64_C(0x5555555555555555));
466 : x = (x & UINT64_C(0x3333333333333333)) + ((x >> 2) & UINT64_C(0x3333333333333333));
467 : x = (x & UINT64_C(0x0F0F0F0F0F0F0F0F)) + ((x >> 4) & UINT64_C(0x0F0F0F0F0F0F0F0F));
468 : return (int) ((x * UINT64_C(0x0101010101010101)) >> 56);
469 : #endif
470 : }
471 :
472 : static int
473 39 : str_item_lenrev_cmp(const void *a, const void *b)
474 : {
475 39 : return ((int) ((str_item *) b)->len - (int) ((str_item *) a)->len);
476 : }
477 :
478 : static str
479 61 : str_2_codepointseq(str_item *s)
480 : {
481 61 : str p = s->val;
482 61 : int c;
483 :
484 61 : s->cp_sequence = GDKmalloc(s->len * sizeof(int));
485 63 : if (s->cp_sequence == NULL)
486 0 : throw(MAL, "str_2_byteseq", SQLSTATE(HY013) MAL_MALLOC_FAIL);
487 :
488 423 : for (int i = 0; i < s->len; i++) {
489 360 : UTF8_GETCHAR(c, p);
490 360 : if (c == 0)
491 : break;
492 360 : s->cp_sequence[i] = c;
493 : }
494 : return MAL_SUCCEED;
495 0 : illegal:
496 0 : throw(MAL, "str_2_byteseq", SQLSTATE(42000) "Illegal unicode code point");
497 : }
498 :
499 : static void
500 60 : str_alphabet_bitmap(str_item *s)
501 : {
502 60 : int i;
503 :
504 60 : s->abm = UINT64_C(0);
505 :
506 400 : for (i = 0; i < s->len; i++)
507 340 : s->abm |= UINT64_C(1) << (s->cp_sequence[i] % 64);
508 :
509 60 : s->abm_popcount = popcount64(s->abm);
510 60 : }
511 :
512 : static inline double
513 13 : jarowinkler_lp(const str_item *a, const str_item *b)
514 : {
515 13 : unsigned int l;
516 :
517 : /* calculate common string prefix up to prefixlen chars */
518 13 : l = 0;
519 65 : for (int i = 0; i < MIN3(a->len, b->len, JARO_WINKLER_PREFIX_LEN); i++)
520 52 : l += (a->cp_sequence[i] == b->cp_sequence[i]);
521 :
522 13 : return (double) l *JARO_WINKLER_SCALING_FACTOR;
523 : }
524 :
525 : static inline double
526 15 : jarowinkler(const str_item *x, const str_item *y, double lp, int *x_flags,
527 : int *y_flags)
528 : {
529 15 : int xlen = x->len, ylen = y->len;
530 15 : int range = MAX(0, MAX(xlen, ylen) / 2 - 1);
531 15 : int *x1 = x->cp_sequence, *x2 = y->cp_sequence;
532 15 : int m = 0, t = 0;
533 15 : int i, j, l;
534 15 : double dw;
535 :
536 15 : if (!xlen || !ylen)
537 : return 0.0;
538 89 : for (i = 0; i < xlen; i++)
539 74 : x_flags[i] = 0;
540 102 : for (i = 0; i < ylen; i++)
541 87 : y_flags[i] = 0;
542 : /* matching chars */
543 102 : for (i = 0; i < ylen; i++) {
544 280 : for (j = MAX(i - range, 0), l = MIN(i + range + 1, xlen); j < l; j++) {
545 225 : if (x2[i] == x1[j] && !x_flags[j]) {
546 32 : x_flags[j] = 1;
547 32 : y_flags[i] = 1;
548 32 : m++;
549 32 : break;
550 : }
551 : }
552 : }
553 15 : if (!m)
554 : return 0.0;
555 : /* char transpositions */
556 : l = 0;
557 90 : for (i = 0; i < ylen; i++) {
558 77 : if (y_flags[i] == 1) {
559 48 : for (j = l; j < xlen; j++) {
560 48 : if (x_flags[j] == 1) {
561 32 : l = j + 1;
562 32 : break;
563 : }
564 : }
565 32 : if (x2[i] != x1[j])
566 4 : t++;
567 : }
568 : }
569 13 : t /= 2;
570 :
571 : /* jaro similarity */
572 13 : dw = (((double) m / xlen) + ((double) m / ylen) +
573 13 : ((double) (m - t) / m)) / 3.0;
574 : /* calculate common string prefix up to prefixlen chars */
575 13 : if (lp == -1)
576 13 : lp = jarowinkler_lp(x, y);
577 : /* jaro-winkler similarity */
578 13 : dw = dw + (lp * (1 - dw));
579 13 : return dw;
580 : }
581 :
582 : static str
583 2 : TXTSIMjarowinkler(dbl *res, str *x, str *y)
584 : {
585 2 : int *x_flags = NULL, *y_flags = NULL;
586 2 : str_item xi = { 0 }, yi = { 0 };
587 2 : str msg = MAL_SUCCEED;
588 :
589 2 : xi.val = *x;
590 2 : xi.len = UTF8_strlen(*x);
591 2 : if ((msg = str_2_codepointseq(&xi)) != MAL_SUCCEED)
592 0 : goto bailout;
593 :
594 2 : yi.val = *y;
595 2 : yi.len = UTF8_strlen(*y);
596 2 : if ((msg = str_2_codepointseq(&yi)) != MAL_SUCCEED)
597 0 : goto bailout;
598 :
599 2 : x_flags = GDKmalloc(xi.len * sizeof(int));
600 2 : y_flags = GDKmalloc(yi.len * sizeof(int));
601 :
602 2 : if (x_flags == NULL || y_flags == NULL)
603 0 : goto bailout;
604 :
605 2 : *res = jarowinkler(&xi, &yi, -1, x_flags, y_flags);
606 :
607 2 : bailout:
608 2 : GDKfree(x_flags);
609 2 : GDKfree(y_flags);
610 2 : GDKfree(xi.cp_sequence);
611 2 : GDKfree(yi.cp_sequence);
612 2 : return msg;
613 : }
614 :
615 : static str
616 0 : TXTSIMminjarowinkler(bit *res, str *x, str *y, const dbl *threshold)
617 : {
618 0 : str msg = MAL_SUCCEED;
619 0 : double s = 1;
620 :
621 0 : msg = TXTSIMjarowinkler(&s, x, y);
622 0 : if (msg != MAL_SUCCEED)
623 0 : throw(MAL, "txt.minjarowinkler", OPERATION_FAILED);
624 :
625 0 : *res = (s > *threshold);
626 0 : return MAL_SUCCEED;
627 : }
628 :
629 : #define VALUE(s, x) (s##vars + VarHeapVal(s##vals, (x), s##width))
630 : #define APPEND(b, o) (((oid *) b->theap->base)[b->batCount++] = (o))
631 :
632 : #define PREP_BAT_STRITEM(B, CI, SI) \
633 : do { \
634 : for (n = 0; n < CI.ncand; n++) { \
635 : SI[n].matches = 0; \
636 : SI[n].o = canditer_next(&CI); \
637 : SI[n].val = (str) VALUE(B, SI[n].o - B->hseqbase); \
638 : SI[n].cp_sequence = NULL; \
639 : SI[n].len = UTF8_strlen(SI[n].val); \
640 : SI[n].cp_seq_len = str_strlen(SI[n].val); \
641 : if ((msg = str_2_codepointseq(&SI[n])) != MAL_SUCCEED) \
642 : goto exit; \
643 : str_alphabet_bitmap(&SI[n]); \
644 : } \
645 : } while (false)
646 :
647 : #define FINALIZE_BATS(L, R, LCI, RCI, LSI, RSI) \
648 : do { \
649 : assert(BATcount(L) == BATcount(R)); \
650 : BATsetcount(L, BATcount(L)); \
651 : BATsetcount(R, BATcount(R)); \
652 : for (n = 0; n < LCI.ncand; n++) { \
653 : if (LSI[n].matches > 1) { \
654 : L->tkey = false; \
655 : break; \
656 : } \
657 : } \
658 : if (n == LCI.ncand) { \
659 : L->tkey = true; \
660 : } \
661 : for (n = 0; n < RCI.ncand; n++) { \
662 : if (RSI[n].matches > 1) { \
663 : R->tkey = false; \
664 : break; \
665 : } \
666 : } \
667 : if (n == RCI.ncand) { \
668 : R->tkey = true; \
669 : } \
670 : BATordered(L); \
671 : BATordered(R); \
672 : L->theap->dirty |= BATcount(L) > 0; \
673 : R->theap->dirty |= BATcount(R) > 0; \
674 : } while (false)
675 :
676 : static inline int
677 12 : maxlevenshtein_extcol_stritem(const str_item *si1, const str_item *si2,
678 : unsigned int *column, const int k)
679 : {
680 12 : unsigned int lastdiag, olddiag;
681 12 : int c1, c2, x, y;
682 12 : unsigned int min;
683 12 : int s1len = si1->len, s2len = si2->len;
684 12 : int *s1 = si1->cp_sequence, *s2 = si2->cp_sequence;
685 : /* first test if the strings are equal */
686 12 : if (s1len == s2len) {
687 1 : for (x = 0; x < s1len; x++)
688 1 : if (s1[x] != s2[x])
689 : break;
690 1 : if (x == s1len)
691 : return 0;
692 : }
693 68 : for (y = 1; y <= s1len; y++)
694 56 : column[y] = y;
695 78 : for (x = 1; x <= s2len; x++) {
696 69 : c2 = s2[x - 1];
697 69 : column[0] = x;
698 69 : min = INT_MAX;
699 395 : for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
700 326 : olddiag = column[y];
701 326 : c1 = s1[y - 1];
702 326 : column[y] = MIN3(column[y] + 1, column[y - 1] + 1, lastdiag + (c1 == c2 ? 0 : 1));
703 326 : lastdiag = olddiag;
704 326 : if (lastdiag < min)
705 : min = lastdiag;
706 : }
707 69 : if (min > (unsigned int) k)
708 : return INT_MAX;
709 : }
710 9 : return column[s1len];
711 : }
712 :
713 : static str
714 6 : maxlevenshteinjoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int k)
715 : {
716 6 : BAT *r1t = NULL, *r2t = NULL;
717 6 : BUN n;
718 6 : struct canditer lci, rci;
719 6 : const char *lvals, *rvals, *lvars, *rvars;;
720 6 : int lwidth, rwidth, d;
721 6 : str_item *lsi = NULL, *rsi = NULL;
722 6 : str msg = MAL_SUCCEED;
723 6 : unsigned int *buffer = NULL;
724 :
725 18 : assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
726 6 : assert(ATOMtype(l->ttype) == TYPE_str);
727 :
728 6 : canditer_init(&lci, l, sl);
729 6 : canditer_init(&rci, r, sr);
730 :
731 6 : if (lci.ncand == 0 || rci.ncand == 0)
732 0 : goto exit;
733 :
734 6 : lvals = (const char *) Tloc(l, 0);
735 6 : rvals = (const char *) Tloc(r, 0);
736 6 : lvars = l->tvheap->base;
737 6 : rvars = r->tvheap->base;
738 6 : lwidth = l->twidth;
739 6 : rwidth = r->twidth;
740 :
741 6 : if ((r1t = COLnew(0, TYPE_oid, lci.ncand, TRANSIENT)) == NULL ||
742 6 : (r2t = COLnew(0, TYPE_oid, rci.ncand, TRANSIENT)) == NULL) {
743 0 : reclaim_bats(2, r1t, r2t);
744 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
745 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
746 0 : goto exit;
747 : }
748 :
749 6 : r1t->tsorted = r1t->trevsorted = false;
750 6 : r2t->tsorted = r2t->trevsorted = false;
751 :
752 6 : if ((lsi = GDKmalloc(lci.ncand * sizeof(str_item))) == NULL ||
753 6 : (rsi = GDKmalloc(rci.ncand * sizeof(str_item))) == NULL) {
754 0 : reclaim_bats(2, r1t, r2t);
755 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
756 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
757 0 : goto exit;
758 : }
759 :
760 12 : PREP_BAT_STRITEM(l, lci, lsi);
761 24 : PREP_BAT_STRITEM(r, rci, rsi);
762 6 : qsort(lsi, lci.ncand, sizeof(str_item), str_item_lenrev_cmp);
763 6 : qsort(rsi, rci.ncand, sizeof(str_item), str_item_lenrev_cmp);
764 :
765 6 : if ((buffer = GDKmalloc((lsi[0].len + 1) * sizeof(int))) == NULL) {
766 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
767 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
768 0 : goto exit;
769 : }
770 :
771 : /* join loop */
772 24 : for (BUN lstart = 0, rstart = 0; rstart < rci.ncand; rstart++) {
773 34 : for (n = lstart; n < lci.ncand; n++) {
774 : /* first and cheapest filter */
775 18 : if ((lsi[n].len) > k + rsi[rstart].len) {
776 0 : lstart++;
777 0 : continue; /* no possible matches yet for this r */
778 18 : } else if (rsi[rstart].len > k + lsi[n].len) {
779 : break; /* no more possible matches from this r */
780 : }
781 : /* filter by comparing alphabet bitmaps (imprecise but fast filter) */
782 16 : d = MAX(lsi[n].abm_popcount,
783 : rsi[rstart].abm_popcount) -
784 16 : popcount64(lsi[n].abm & rsi[rstart].abm);
785 16 : if (d > k)
786 4 : continue;
787 : /* final and most expensive test: Levenshtein distance */
788 12 : d = maxlevenshtein_extcol_stritem(&lsi[n], &rsi[rstart], buffer,
789 : (const unsigned int) k);
790 12 : if (d > k)
791 6 : continue;
792 : /* The match test succeeded */
793 6 : lsi[n].matches++;
794 6 : rsi[rstart].matches++;
795 6 : if (bunfastappTYPE(oid, r1t, &(lsi[n].o)) != GDK_SUCCEED) {
796 0 : reclaim_bats(2, r1t, r2t);
797 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
798 : OPERATION_FAILED "Failed bun append");
799 0 : goto exit;
800 : }
801 6 : if (bunfastappTYPE(oid, r2t, &(rsi[rstart].o)) != GDK_SUCCEED) {
802 0 : reclaim_bats(2, r1t, r2t);
803 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
804 : OPERATION_FAILED "Failed bun append");
805 0 : goto exit;
806 : }
807 : }
808 : }
809 :
810 29 : FINALIZE_BATS(r1t, r2t, lci, rci, lsi, rsi);
811 6 : *r1 = r1t;
812 6 : *r2 = r2t;
813 :
814 6 : exit:
815 12 : for (n = 0; lsi && n < lci.ncand; n++)
816 6 : GDKfree(lsi[n].cp_sequence);
817 24 : for (n = 0; rsi && n < rci.ncand; n++)
818 18 : GDKfree(rsi[n].cp_sequence);
819 6 : GDKfree(lsi);
820 6 : GDKfree(rsi);
821 6 : GDKfree(buffer);
822 6 : return msg;
823 : }
824 :
825 : static str
826 5 : TXTSIMmaxlevenshteinjoin(bat *r1, bat *r2, const bat *lid, const bat *rid,
827 : const bat *kid, const bat *slid, const bat *srid,
828 : const bit *nil_matches, const lng *estimate,
829 : const bit *anti)
830 : {
831 5 : (void) nil_matches;
832 5 : (void) estimate;
833 5 : (void) anti;
834 :
835 5 : BAT *bleft = NULL, *bright = NULL, *bk = NULL,
836 5 : *bcandleft = NULL, *bcandright = NULL, *r1t = NULL, *r2t = NULL;
837 5 : int k = 0;
838 5 : str msg = MAL_SUCCEED;
839 :
840 5 : if ((bleft = BATdescriptor(*lid)) == NULL ||
841 6 : (bright = BATdescriptor(*rid)) == NULL ||
842 6 : (bk = BATdescriptor(*kid)) == NULL) {
843 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
844 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
845 0 : goto exit;
846 : }
847 :
848 6 : if ((*slid != bat_nil && (bcandleft = BATdescriptor(*slid)) == NULL) ||
849 6 : (*srid != bat_nil && (bcandright = BATdescriptor(*srid)) == NULL)) {
850 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
851 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
852 0 : goto exit;
853 : }
854 :
855 6 : if (BATcount(bk) > 0) {
856 6 : BATiter ki = bat_iterator(bk);
857 6 : k = *(int *) BUNtloc(ki, 0);
858 6 : bat_iterator_end(&ki);
859 : }
860 :
861 6 : if ((msg = maxlevenshteinjoin(&r1t, &r2t, bleft, bright, bcandleft, bcandright, k)) != MAL_SUCCEED)
862 0 : goto exit;
863 :
864 6 : *r1 = r1t->batCacheid;
865 6 : *r2 = r2t->batCacheid;
866 6 : BBPkeepref(r1t);
867 6 : BBPkeepref(r2t);
868 :
869 6 : exit:
870 6 : reclaim_bats(5, bleft, bright, bcandleft, bcandright, bk);
871 6 : if (msg != MAL_SUCCEED)
872 0 : reclaim_bats(2, r1t, r2t);
873 6 : return msg;
874 : }
875 :
876 : static inline void
877 7 : jarowinkler_rangebounds(int *lb, int *ub, const str_item *a, const double lp,
878 : const double threshold)
879 : {
880 7 : *lb = (int) floor(3.0 * a->len * (threshold - lp) / (1.0 - lp) -
881 7 : (2.0 * a->len));
882 7 : *ub = (int) ceil(a->len / ((3.0 * (threshold - lp) / (1.0 - lp)) - 2.0));
883 7 : }
884 :
885 : /* version with given lp and m, and t = 0*/
886 : static inline double
887 19 : jarowinkler_lp_m_t0(const str_item *lsi, const str_item *rsi, double lp, int m)
888 : {
889 19 : double dw;
890 : /* Jaro similarity */
891 19 : dw = (((double) m / lsi->len) + ((double) m / rsi->len) + 1.0) / 3.0;
892 : /* Jaro-Winkler similarity */
893 19 : dw = dw + (lp * (1 - dw));
894 19 : return dw;
895 : }
896 :
897 : static str
898 9 : minjarowinklerjoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT *sl, BAT *sr,
899 : const dbl threshold)
900 : {
901 9 : BAT *r1t = NULL, *r2t = NULL;
902 9 : BUN n;
903 9 : struct canditer lci, rci;
904 9 : const char *lvals, *rvals, *lvars, *rvars;
905 9 : int lwidth, rwidth, lb = 0, ub = 0, m = -1, *x_flags = NULL, *y_flags = NULL;
906 9 : str_item *ssl = NULL, *ssr = NULL, shortest;
907 9 : str msg = MAL_SUCCEED;
908 9 : const bool sliding_window_allowed = threshold > (2.01 + JARO_WINKLER_PREFIX_LEN * JARO_WINKLER_SCALING_FACTOR) / 3.0;
909 9 : double s, lp = 0;
910 :
911 27 : assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
912 9 : assert(ATOMtype(l->ttype) == TYPE_str);
913 :
914 9 : canditer_init(&lci, l, sl);
915 9 : canditer_init(&rci, r, sr);
916 :
917 9 : if (lci.ncand == 0 || rci.ncand == 0)
918 0 : goto exit;
919 :
920 9 : lvals = (const char *) Tloc(l, 0);
921 9 : rvals = (const char *) Tloc(r, 0);
922 9 : assert(r->ttype);
923 9 : lvars = l->tvheap->base;
924 9 : rvars = r->tvheap->base;
925 9 : lwidth = l->twidth;
926 9 : rwidth = r->twidth;
927 :
928 9 : if ((r1t = COLnew(0, TYPE_oid, lci.ncand, TRANSIENT)) == NULL ||
929 9 : (r2t = COLnew(0, TYPE_oid, rci.ncand, TRANSIENT)) == NULL) {
930 0 : reclaim_bats(2, r1t, r2t);
931 0 : msg = createException(MAL, "txtsim.minjarowinklerjoin",
932 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
933 0 : goto exit;
934 : }
935 :
936 9 : r1t->tsorted = r1t->trevsorted = false;
937 9 : r2t->tsorted = r2t->trevsorted = false;
938 :
939 9 : if ((ssl = GDKmalloc(lci.ncand * sizeof(str_item))) == NULL ||
940 9 : (ssr = GDKmalloc(rci.ncand * sizeof(str_item))) == NULL) {
941 0 : reclaim_bats(2, r1t, r2t);
942 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
943 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
944 0 : goto exit;
945 : }
946 :
947 16 : PREP_BAT_STRITEM(l, lci, ssl);
948 32 : PREP_BAT_STRITEM(r, rci, ssr);
949 9 : qsort(ssl, lci.ncand, sizeof(str_item), str_item_lenrev_cmp);
950 7 : qsort(ssr, rci.ncand, sizeof(str_item), str_item_lenrev_cmp);
951 :
952 8 : if ((x_flags = GDKmalloc(ssl[0].len * sizeof(int))) == NULL ||
953 9 : (y_flags = GDKmalloc(ssr[0].len * sizeof(int))) == NULL) {
954 0 : msg = createException(MAL, "txtsim.minjarowinklerjoin",
955 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
956 0 : goto exit;
957 : }
958 : // lp used for filters. Use -1 for actual JW (forces to compute actual lp)
959 32 : lp = JARO_WINKLER_PREFIX_LEN * JARO_WINKLER_SCALING_FACTOR;
960 :
961 : /* join loop */
962 32 : for (BUN lstart = 0, rstart = 0; rstart < rci.ncand; rstart++) {
963 23 : if (sliding_window_allowed)
964 7 : jarowinkler_rangebounds(&lb, &ub, &ssr[rstart], lp, threshold);
965 44 : for (n = lstart; n < lci.ncand; n++) {
966 : /* Update sliding window */
967 : /* This is the first and cheapest filter */
968 25 : if (sliding_window_allowed) {
969 7 : if (ssl[n].len > ub) { /* no possible matches yet for this r */
970 1 : lstart++;
971 1 : continue;
972 : }
973 6 : if (ssl[n].len < lb) { /* no more possible matches from this r */
974 : break;
975 : }
976 : }
977 : /* filter by comparing alphabet bitmaps */
978 : /* find the best possible m: the length of the shorter string
979 : * minus the number of characters that surely cannot match */
980 19 : shortest = ssl[n].len < ssr[rstart].len ? ssl[n] : ssr[rstart];
981 19 : m = shortest.len - popcount64(shortest.abm -
982 19 : (ssl[n].abm & ssr[rstart].abm));
983 : /* equivalent to:
984 : m = shortest.len - popcount64(shortest.abm & (~(ssl[n].abm & ssr[rstart].abm))); */
985 19 : s = jarowinkler_lp_m_t0(&ssl[n], &ssr[rstart], lp, m);
986 19 : if (s < threshold) {
987 7 : continue;
988 : }
989 : /* final and most expensive test: Jaro-Winkler similarity */
990 12 : s = jarowinkler(&ssl[n], &ssr[rstart], -1, x_flags, y_flags);
991 13 : if (s < threshold) {
992 4 : continue;
993 : }
994 : /* The match test succeeded */
995 9 : ssl[n].matches++;
996 9 : ssr[rstart].matches++;
997 9 : if (bunfastappTYPE(oid, r1t, &(ssl[n].o)) != GDK_SUCCEED) {
998 0 : reclaim_bats(2, r1t, r2t);
999 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
1000 : OPERATION_FAILED "Failed bun append");
1001 0 : goto exit;
1002 : }
1003 9 : if (bunfastappTYPE(oid, r2t, &(ssr[rstart].o)) != GDK_SUCCEED) {
1004 0 : reclaim_bats(2, r1t, r2t);
1005 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
1006 : OPERATION_FAILED "Failed bun append");
1007 0 : goto exit;
1008 : }
1009 : }
1010 : }
1011 :
1012 42 : FINALIZE_BATS(r1t, r2t, lci, rci, ssl, ssr);
1013 9 : *r1 = r1t;
1014 9 : *r2 = r2t;
1015 :
1016 9 : exit:
1017 9 : if (ssl)
1018 18 : for (n = 0; n < lci.ncand; n++)
1019 9 : GDKfree(ssl[n].cp_sequence);
1020 9 : if (ssr)
1021 34 : for (n = 0; n < rci.ncand; n++)
1022 25 : GDKfree(ssr[n].cp_sequence);
1023 9 : GDKfree(x_flags);
1024 9 : GDKfree(y_flags);
1025 9 : GDKfree(ssl);
1026 9 : GDKfree(ssr);
1027 9 : return msg;
1028 : }
1029 :
1030 : static str
1031 9 : TXTSIMminjarowinklerjoin(bat *r1, bat *r2, const bat *lid, const bat *rid,
1032 : const bat *thresholdid, const bat *slid,
1033 : const bat *srid, const bit *nil_matches,
1034 : const lng *estimate, const bit *anti)
1035 : {
1036 9 : (void) nil_matches;
1037 9 : (void) estimate;
1038 9 : (void) anti;
1039 :
1040 9 : BAT *bleft = NULL, *bright = NULL,
1041 9 : *bcandleft = NULL, *bcandright = NULL, *bthreshold = NULL,
1042 9 : *r1t = NULL, *r2t = NULL;
1043 9 : dbl threshold = 1;
1044 9 : str msg = MAL_SUCCEED;
1045 :
1046 9 : if ((bleft = BATdescriptor(*lid)) == NULL ||
1047 9 : (bright = BATdescriptor(*rid)) == NULL ||
1048 9 : (bthreshold = BATdescriptor(*thresholdid)) == NULL) {
1049 0 : msg = createException(MAL, "txtsim.minjarowinklerjoin",
1050 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1051 0 : goto exit;
1052 : }
1053 :
1054 9 : if ((*slid != bat_nil && (bcandleft = BATdescriptor(*slid)) == NULL) ||
1055 9 : (*srid != bat_nil && (bcandright = BATdescriptor(*srid)) == NULL)) {
1056 0 : msg = createException(MAL, "txtsim.minjarowinklerjoin",
1057 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1058 0 : goto exit;
1059 : }
1060 :
1061 9 : if (BATcount(bthreshold) > 0) {
1062 9 : BATiter thresholdi = bat_iterator(bthreshold);
1063 9 : threshold = *(dbl *) BUNtail(thresholdi, 0);
1064 9 : bat_iterator_end(&thresholdi);
1065 : }
1066 :
1067 9 : if ((msg = minjarowinklerjoin(&r1t, &r2t, bleft, bright, bcandleft, bcandright, threshold)) != MAL_SUCCEED)
1068 0 : goto exit;
1069 :
1070 9 : *r1 = r1t->batCacheid;
1071 9 : *r2 = r2t->batCacheid;
1072 9 : BBPkeepref(r1t);
1073 9 : BBPkeepref(r2t);
1074 :
1075 9 : exit:
1076 9 : reclaim_bats(5, bleft, bright, bcandleft, bcandright, bthreshold);
1077 9 : if (msg != MAL_SUCCEED)
1078 0 : reclaim_bats(2, r1t, r2t);
1079 9 : return msg;
1080 : }
1081 :
1082 : #define SoundexLen 4 /* length of a soundex code */
1083 : #define SoundexKey "Z000" /* default key for soundex code */
1084 :
1085 : /* set letter values */
1086 : static const int Code[] = { 0, 1, 2, 3, 0, 1,
1087 : 2, 0, 0, 2, 2, 4,
1088 : 5, 5, 0, 1, 2, 6,
1089 : 2, 3, 0, 1, 0, 2, 0, 2
1090 : };
1091 :
1092 : #define RETURN_NIL_IF(b,t) \
1093 : if (b) { \
1094 : if (ATOMextern(t)) { \
1095 : *(ptr*) res = (ptr) ATOMnil(t); \
1096 : if ( *(ptr *) res == NULL) \
1097 : throw(MAL,"txtsim", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
1098 : } else { \
1099 : memcpy(res, ATOMnilptr(t), ATOMsize(t)); \
1100 : } \
1101 : return MAL_SUCCEED; \
1102 : }
1103 :
1104 : static inline char
1105 139 : SCode(unsigned char c)
1106 : {
1107 139 : if (c == 95)
1108 : return (2); /* german sz */
1109 139 : return (Code[toupper(c) - 'A']);
1110 : }
1111 :
1112 : static str
1113 66 : soundex_code(const char *Name, char *Key)
1114 : {
1115 66 : char LastLetter;
1116 66 : int Index;
1117 :
1118 298 : for (const char *p = Name; *p; p++)
1119 233 : if ((*p & 0x80) != 0)
1120 1 : throw(MAL, "soundex",
1121 : SQLSTATE(42000)
1122 : "Soundex function not available for non ASCII strings");
1123 :
1124 : /* set default key */
1125 65 : strcpy(Key, SoundexKey);
1126 :
1127 : /* keep first letter */
1128 65 : Key[0] = *Name;
1129 65 : if (!isupper((unsigned char) (Key[0])))
1130 23 : Key[0] = toupper(Key[0]);
1131 :
1132 65 : LastLetter = *Name;
1133 65 : if (!*Name)
1134 : return MAL_SUCCEED;
1135 55 : Name++;
1136 :
1137 : /* scan rest of string */
1138 208 : for (Index = 1; (Index <SoundexLen) &&*Name; Name++) {
1139 : /* use only letters */
1140 153 : if (isalpha((unsigned char) (*Name))) {
1141 : /* ignore duplicate successive chars */
1142 151 : if (LastLetter != *Name) {
1143 : /* new LastLetter */
1144 139 : LastLetter = *Name;
1145 :
1146 : /* ignore letters with code 0 */
1147 139 : if (SCode(*Name) != 0) {
1148 68 : Key[Index] = '0' + SCode(*Name);
1149 68 : Index ++;
1150 : }
1151 : }
1152 : }
1153 : }
1154 : return MAL_SUCCEED;
1155 : }
1156 :
1157 : static str
1158 68 : soundex(str *res, str *Name)
1159 : {
1160 68 : str msg = MAL_SUCCEED;
1161 :
1162 68 : GDKfree(*res);
1163 72 : RETURN_NIL_IF(strNil(*Name), TYPE_str);
1164 :
1165 64 : *res = (str) GDKmalloc(sizeof(char) * (SoundexLen + 1));
1166 66 : if (*res == NULL)
1167 0 : throw(MAL, "soundex", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1168 :
1169 : /* calculate Key for Name */
1170 66 : if ((msg = soundex_code(*Name, *res))) {
1171 1 : GDKfree(*res);
1172 1 : *res = NULL;
1173 1 : return msg;
1174 : }
1175 : return msg;
1176 : }
1177 :
1178 : static str
1179 29 : stringdiff(int *res, str *s1, str *s2)
1180 : {
1181 29 : str r = MAL_SUCCEED;
1182 29 : char *S1 = NULL, *S2 = NULL;
1183 :
1184 29 : r = soundex(&S1, s1);
1185 28 : if (r != MAL_SUCCEED)
1186 : return r;
1187 28 : r = soundex(&S2, s2);
1188 29 : if (r != MAL_SUCCEED) {
1189 0 : GDKfree(S1);
1190 0 : return r;
1191 : }
1192 29 : r = TXTSIMdameraulevenshtein1(res, &S1, &S2);
1193 30 : GDKfree(S1);
1194 30 : GDKfree(S2);
1195 30 : return r;
1196 : }
1197 :
1198 : /******************************
1199 : * QGRAMNORMALIZE
1200 : *
1201 : * This function 'normalizes' a string so valid q-grams can be made of it:
1202 : * All characters are transformed to uppercase, and all characters
1203 : * which are not letters or digits are stripped to a single space.
1204 : *
1205 : * qgramnormalize("Hallo, allemaal!").print(); --> "HALLO ALLEMAAL"
1206 : * qgramnormalize(" '' t ' est").print(); --> [ "T EST" ]
1207 : *
1208 : *****************************/
1209 : static str
1210 12 : qgram_normalize(str *res, str *Input)
1211 : {
1212 12 : char *input = *Input;
1213 12 : int i, j = 0;
1214 12 : char c, last = ' ';
1215 :
1216 12 : GDKfree(*res);
1217 12 : RETURN_NIL_IF(strNil(input), TYPE_str);
1218 12 : *res = (str) GDKmalloc(sizeof(char) * (strlen(input) + 1)); /* normalized strings are never longer than original */
1219 12 : if (*res == NULL)
1220 0 : throw(MAL, "txtsim.qgramnormalize", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1221 :
1222 69 : for (i = 0; input[i]; i++) {
1223 57 : c = toupper(input[i]);
1224 57 : if (!(('A' <= c && c <= 'Z') || isdigit((unsigned char) c)))
1225 57 : c = ' ';
1226 57 : if (c != ' ' || last != ' ') {
1227 49 : (*res)[j++] = c;
1228 : }
1229 57 : last = c;
1230 : }
1231 12 : (*res)[j] = 0;
1232 : /* strip final whitespace */
1233 15 : while (j > 0 && (*res)[--j] == ' ')
1234 3 : (*res)[j] = 0;
1235 :
1236 : return MAL_SUCCEED;
1237 : }
1238 :
1239 : static str
1240 0 : qgram_selfjoin(bat *res1, bat *res2, bat *qid, bat *bid, bat *pid, bat *lid,
1241 : flt *c, int *k)
1242 : {
1243 0 : BAT *qgram, *id, *pos, *len;
1244 0 : BUN n;
1245 0 : BUN i, j;
1246 0 : BAT *bn, *bn2;
1247 0 : oid *qbuf;
1248 0 : int *ibuf;
1249 0 : int *pbuf;
1250 0 : int *lbuf;
1251 0 : str msg = MAL_SUCCEED;
1252 :
1253 0 : qgram = BATdescriptor(*qid);
1254 0 : id = BATdescriptor(*bid);
1255 0 : pos = BATdescriptor(*pid);
1256 0 : len = BATdescriptor(*lid);
1257 0 : if (qgram == NULL || id == NULL || pos == NULL || len == NULL) {
1258 0 : BBPreclaim(qgram);
1259 0 : BBPreclaim(id);
1260 0 : BBPreclaim(pos);
1261 0 : BBPreclaim(len);
1262 0 : throw(MAL, "txtsim.qgramselfjoin",
1263 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1264 : }
1265 :
1266 0 : BATiter qgrami = bat_iterator(qgram);
1267 0 : BATiter idi = bat_iterator(id);
1268 0 : BATiter posi = bat_iterator(pos);
1269 0 : BATiter leni = bat_iterator(len);
1270 0 : if (qgrami.type != TYPE_oid)
1271 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1272 : SEMANTIC_TYPE_MISMATCH
1273 : ": tail of BAT qgram must be oid");
1274 0 : else if (idi.type != TYPE_int)
1275 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1276 : SEMANTIC_TYPE_MISMATCH
1277 : ": tail of BAT id must be int");
1278 0 : else if (posi.type != TYPE_int)
1279 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1280 : SEMANTIC_TYPE_MISMATCH
1281 : ": tail of BAT pos must be int");
1282 0 : else if (leni.type != TYPE_int)
1283 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1284 : SEMANTIC_TYPE_MISMATCH
1285 : ": tail of BAT len must be int");
1286 0 : if (msg) {
1287 0 : bat_iterator_end(&qgrami);
1288 0 : bat_iterator_end(&idi);
1289 0 : bat_iterator_end(&posi);
1290 0 : bat_iterator_end(&leni);
1291 0 : BBPunfix(qgram->batCacheid);
1292 0 : BBPunfix(id->batCacheid);
1293 0 : BBPunfix(pos->batCacheid);
1294 0 : BBPunfix(len->batCacheid);
1295 0 : return msg;
1296 : }
1297 :
1298 0 : n = BATcount(qgram);
1299 :
1300 : /* if (BATcount(qgram)>1 && !qgrami.sorted) throw(MAL, "txtsim.qgramselfjoin", SEMANTIC_TYPE_MISMATCH); */
1301 :
1302 0 : if (!ALIGNsynced(qgram, id))
1303 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1304 : SEMANTIC_TYPE_MISMATCH
1305 : ": qgram and id are not synced");
1306 :
1307 0 : else if (!ALIGNsynced(qgram, pos))
1308 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1309 : SEMANTIC_TYPE_MISMATCH
1310 : ": qgram and pos are not synced");
1311 0 : else if (!ALIGNsynced(qgram, len))
1312 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1313 : SEMANTIC_TYPE_MISMATCH
1314 : ": qgram and len are not synced");
1315 :
1316 0 : else if (qgrami.width != ATOMsize(qgrami.type))
1317 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1318 : SEMANTIC_TYPE_MISMATCH
1319 : ": qgram is not a true void bat");
1320 0 : else if (idi.width != ATOMsize(idi.type))
1321 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1322 : SEMANTIC_TYPE_MISMATCH
1323 : ": id is not a true void bat");
1324 :
1325 0 : else if (posi.width != ATOMsize(posi.type))
1326 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1327 : SEMANTIC_TYPE_MISMATCH
1328 : ": pos is not a true void bat");
1329 0 : else if (leni.width != ATOMsize(leni.type))
1330 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1331 : SEMANTIC_TYPE_MISMATCH
1332 : ": len is not a true void bat");
1333 0 : if (msg) {
1334 0 : bat_iterator_end(&qgrami);
1335 0 : bat_iterator_end(&idi);
1336 0 : bat_iterator_end(&posi);
1337 0 : bat_iterator_end(&leni);
1338 0 : BBPunfix(qgram->batCacheid);
1339 0 : BBPunfix(id->batCacheid);
1340 0 : BBPunfix(pos->batCacheid);
1341 0 : BBPunfix(len->batCacheid);
1342 0 : return msg;
1343 : }
1344 :
1345 0 : bn = COLnew(0, TYPE_int, n, TRANSIENT);
1346 0 : bn2 = COLnew(0, TYPE_int, n, TRANSIENT);
1347 0 : if (bn == NULL || bn2 == NULL) {
1348 0 : bat_iterator_end(&qgrami);
1349 0 : bat_iterator_end(&idi);
1350 0 : bat_iterator_end(&posi);
1351 0 : bat_iterator_end(&leni);
1352 0 : BBPreclaim(bn);
1353 0 : BBPreclaim(bn2);
1354 0 : BBPunfix(qgram->batCacheid);
1355 0 : BBPunfix(id->batCacheid);
1356 0 : BBPunfix(pos->batCacheid);
1357 0 : BBPunfix(len->batCacheid);
1358 0 : throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1359 : }
1360 :
1361 0 : qbuf = (oid *) qgrami.base;
1362 0 : ibuf = (int *) idi.base;
1363 0 : pbuf = (int *) posi.base;
1364 0 : lbuf = (int *) leni.base;
1365 0 : for (i = 0; i < n - 1; i++) {
1366 0 : for (j = i + 1;
1367 0 : (j < n && qbuf[j] == qbuf[i]
1368 0 : && pbuf[j] <= (pbuf[i] + (*k + *c * MIN(lbuf[i], lbuf[j]))));
1369 0 : j++) {
1370 0 : if (ibuf[i] != ibuf[j]
1371 0 : && abs(lbuf[i] - lbuf[j]) <= (*k + *c * MIN(lbuf[i], lbuf[j]))) {
1372 0 : if (BUNappend(bn, ibuf + i, false) != GDK_SUCCEED
1373 0 : || BUNappend(bn2, ibuf + j, false) != GDK_SUCCEED) {
1374 0 : bat_iterator_end(&qgrami);
1375 0 : bat_iterator_end(&idi);
1376 0 : bat_iterator_end(&posi);
1377 0 : bat_iterator_end(&leni);
1378 0 : BBPunfix(qgram->batCacheid);
1379 0 : BBPunfix(id->batCacheid);
1380 0 : BBPunfix(pos->batCacheid);
1381 0 : BBPunfix(len->batCacheid);
1382 0 : BBPreclaim(bn);
1383 0 : BBPreclaim(bn2);
1384 0 : throw(MAL, "txtsim.qgramselfjoin",
1385 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1386 : }
1387 : }
1388 : }
1389 : }
1390 0 : bat_iterator_end(&qgrami);
1391 0 : bat_iterator_end(&idi);
1392 0 : bat_iterator_end(&posi);
1393 0 : bat_iterator_end(&leni);
1394 :
1395 0 : BBPunfix(qgram->batCacheid);
1396 0 : BBPunfix(id->batCacheid);
1397 0 : BBPunfix(pos->batCacheid);
1398 0 : BBPunfix(len->batCacheid);
1399 :
1400 0 : *res1 = bn->batCacheid;
1401 0 : BBPkeepref(bn);
1402 0 : *res2 = bn2->batCacheid;
1403 0 : BBPkeepref(bn2);
1404 :
1405 0 : return MAL_SUCCEED;
1406 : }
1407 :
1408 : /* copy up to utf8len UTF-8 encoded characters from src to buf
1409 : * stop early if buf (size given by bufsize) is too small, or if src runs out
1410 : * return number of UTF-8 characters copied (excluding NUL)
1411 : * close with NUL if enough space */
1412 : static size_t
1413 26 : utf8strncpy(char *buf, size_t bufsize, const char *src, size_t utf8len)
1414 : {
1415 26 : size_t cnt = 0;
1416 :
1417 128 : while (utf8len != 0 && *src != 0 && bufsize != 0) {
1418 102 : bufsize--;
1419 102 : utf8len--;
1420 102 : cnt++;
1421 102 : if (((*buf++ = *src++) & 0x80) != 0) {
1422 40 : while ((*src & 0xC0) == 0x80 && bufsize != 0) {
1423 20 : *buf++ = *src++;
1424 20 : bufsize--;
1425 : }
1426 : }
1427 : }
1428 26 : if (bufsize != 0)
1429 26 : *buf = 0;
1430 26 : return cnt;
1431 : }
1432 :
1433 : static str
1434 2 : str_2_qgrams(bat *ret, str *val)
1435 : {
1436 2 : BAT *bn;
1437 2 : size_t i, len = strlen(*val) + 5;
1438 2 : str s = GDKmalloc(len);
1439 2 : char qgram[4 * 6 + 1]; /* 4 UTF-8 code points plus NULL byte */
1440 :
1441 2 : if (s == NULL)
1442 0 : throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1443 2 : strcpy(s, "##");
1444 2 : strcpy(s + 2, *val);
1445 2 : strcpy(s + len - 3, "$$");
1446 2 : bn = COLnew(0, TYPE_str, (BUN) strlen(*val), TRANSIENT);
1447 2 : if (bn == NULL) {
1448 0 : GDKfree(s);
1449 0 : throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1450 : }
1451 :
1452 : i = 0;
1453 26 : while (s[i]) {
1454 26 : if (utf8strncpy(qgram, sizeof(qgram), s + i, 4) < 4)
1455 : break;
1456 24 : if (BUNappend(bn, qgram, false) != GDK_SUCCEED) {
1457 0 : BBPreclaim(bn);
1458 0 : GDKfree(s);
1459 0 : throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1460 : }
1461 24 : if ((s[i++] & 0xC0) == 0xC0) {
1462 8 : while ((s[i] & 0xC0) == 0x80)
1463 4 : i++;
1464 : }
1465 : }
1466 2 : *ret = bn->batCacheid;
1467 2 : BBPkeepref(bn);
1468 2 : GDKfree(s);
1469 2 : return MAL_SUCCEED;
1470 : }
1471 :
1472 : #include "mel.h"
1473 : mel_func txtsim_init_funcs[] = {
1474 : pattern("txtsim", "dameraulevenshtein", TXTSIMdameraulevenshtein, false, "Calculates Damerau-Levenshtein distance between two strings, operation costs (ins/del = 1, replacement = 1, transposition = 2)", args(1,3,arg("",int),arg("x",str),arg("y",str))),
1475 : pattern("txtsim", "dameraulevenshtein", TXTSIMdameraulevenshtein, false, "Calculates Damerau-Levenshtein distance between two strings, variable operation costs (ins/del, replacement, transposition)", args(1,6,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
1476 : command("txtsim", "editdistance", TXTSIMdameraulevenshtein1, false, "Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and transpose = 2", args(1,3, arg("",int),arg("s",str),arg("t",str))),
1477 : command("txtsim", "editdistance2", TXTSIMdameraulevenshtein2, false, "Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and transpose = 1", args(1,3, arg("",int),arg("s",str),arg("t",str))),
1478 : pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "Calculates Levenshtein distance between two strings, operation costs (ins/del = 1, replacement = 1)", args(1,3,arg("",int),arg("s",str),arg("t",str))),
1479 : pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "Calculates Levenshtein distance between two strings, variable operation costs (ins/del, replacement)", args(1,5,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int))),
1480 : pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "(Backwards compatibility purposes) Calculates Damerau-Levenshtein distance between two strings, variable operation costs (ins/del, replacement, transposition)", args(1,6,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
1481 : pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false, "Levenshtein distance with basic costs but up to a MAX", args(1, 4, arg("",int), arg("l",str),arg("r",str),arg("k",int))),
1482 : pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false, "Levenshtein distance with variable costs but up to a MAX", args(1, 6, arg("",int), arg("l",str),arg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
1483 : pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false, "Same as maxlevenshtein but for BATS", args(1, 4, batarg("",bit), batarg("l",str),batarg("r",str),arg("k",int))),
1484 : pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false, "Same as maxlevenshtein but for BATS", args(1, 6, batarg("",bit), batarg("l",str),batarg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
1485 : command("txtsim", "maxlevenshteinjoin", TXTSIMmaxlevenshteinjoin, false, "", args(2,10, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("k",int),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
1486 : command("txtsim", "soundex", soundex, false, "Soundex function for phonetic matching", args(1,2, arg("",str),arg("name",str))),
1487 : command("txtsim", "stringdiff", stringdiff, false, "Calculate the soundexed editdistance", args(1,3, arg("",int),arg("s1",str),arg("s2",str))),
1488 : command("txtsim", "qgramnormalize", qgram_normalize, false, "'Normalizes' strings (eg. toUpper and replaces non-alphanumerics with one space", args(1,2, arg("",str),arg("input",str))),
1489 : command("txtsim", "qgramselfjoin", qgram_selfjoin, false, "QGram self-join on ordered(!) qgram tables and sub-ordered q-gram positions", args(2,8, batarg("",int),batarg("",int),batarg("qgram",oid),batarg("id",oid),batarg("pos",int),batarg("len",int),arg("c",flt),arg("k",int))),
1490 : command("txtsim", "str2qgrams", str_2_qgrams, false, "Break the string into 4-grams", args(1,2, batarg("",str),arg("s",str))),
1491 : command("txtsim", "jarowinkler", TXTSIMjarowinkler, false, "Calculate Jaro Winkler similarity", args(1,3, arg("",dbl),arg("x",str),arg("y",str))),
1492 : command("txtsim", "minjarowinkler", TXTSIMminjarowinkler, false, "", args(1, 4, arg("",bit), arg("l",str),arg("r",str),arg("threshold",dbl))),
1493 : command("txtsim", "minjarowinklerjoin", TXTSIMminjarowinklerjoin, false, "", args(2, 10, batarg("",oid),batarg("",oid), batarg("l",str),batarg("r",str),batarg("threshold",dbl),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
1494 : { .imp=NULL }
1495 : };
1496 : #include "mal_import.h"
1497 : #ifdef _MSC_VER
1498 : #undef read
1499 : #pragma section(".CRT$XCU",read)
1500 : #endif
1501 334 : LIB_STARTUP_FUNC(init_txtsim_mal)
1502 334 : { mal_module("txtsim", NULL, txtsim_init_funcs); }
|