Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "gdk.h"
15 : #include "str.h"
16 : #include "mal.h"
17 : #include "mal_exception.h"
18 : #include "mal_interpreter.h"
19 : #include <string.h>
20 : #include <limits.h>
21 :
22 : #define MIN3(X,Y,Z) (MIN(MIN((X),(Y)),(Z)))
23 : #define MAX3(X,Y,Z) (MAX(MAX((X),(Y)),(Z)))
24 : #define MIN4(W,X,Y,Z) (MIN(MIN(MIN(W,X),Y),Z))
25 :
26 : static void
27 15 : reclaim_bats(int nargs, ...)
28 : {
29 15 : va_list valist;
30 :
31 15 : va_start(valist, nargs);
32 90 : for (int i = 0; i < nargs; i++) {
33 75 : BAT *b = va_arg(valist, BAT *);
34 120 : BBPreclaim(b);
35 : }
36 15 : va_end(valist);
37 15 : }
38 :
39 : static inline int *
40 4178 : damerau_get_cellpointer(int *pOrigin, int col, int row, int nCols)
41 : {
42 4178 : return pOrigin + col + (row * (nCols + 1));
43 : }
44 :
45 : static inline int
46 2302 : damerau_getat(int *pOrigin, int col, int row, int nCols)
47 : {
48 2302 : int *pCell;
49 2302 : pCell = damerau_get_cellpointer(pOrigin, col, row, nCols);
50 2302 : return *pCell;
51 : }
52 :
53 : static inline void
54 1876 : damerau_putat(int *pOrigin, int col, int row, int nCols, int x)
55 : {
56 1876 : int *pCell;
57 1876 : pCell = damerau_get_cellpointer(pOrigin, col, row, nCols);
58 1876 : *pCell = x;
59 : }
60 :
61 : static str
62 83 : dameraulevenshtein(int *res, str *S, str *T, int insdel_cost, int replace_cost,
63 : int transpose_cost)
64 : {
65 83 : char *s = *S;
66 83 : char *t = *T;
67 83 : int *d; /* pointer to matrix */
68 83 : int n; /* length of s */
69 83 : int m; /* length of t */
70 83 : int i; /* iterates through s */
71 83 : int j; /* iterates through t */
72 83 : char s_i; /* ith character of s */
73 83 : char t_j; /* jth character of t */
74 83 : int cost; /* cost */
75 83 : int cell; /* contents of target cell */
76 83 : int above; /* contents of cell immediately above */
77 83 : int left; /* contents of cell immediately to left */
78 83 : int diag; /* contents of cell immediately above and to left */
79 83 : lng sz; /* number of cells in matrix */
80 83 : int diag2 = 0, cost2 = 0;
81 :
82 159 : if (strNil(*S) || strNil(*T)) {
83 10 : *res = int_nil;
84 10 : return MAL_SUCCEED;
85 : }
86 :
87 : /* Step 1 */
88 73 : n = (int) strlen(s); /* 64bit: assume strings are less than 2 GB */
89 73 : m = (int) strlen(t);
90 73 : if (n == 0) {
91 8 : *res = m;
92 8 : return MAL_SUCCEED;
93 : }
94 65 : if (m == 0) {
95 8 : *res = n;
96 8 : return MAL_SUCCEED;
97 : }
98 57 : sz = (((lng)n) + 1) * (((lng)m) + 1) * sizeof(int);
99 57 : if (sz > (LL_CONSTANT(1)<<28))
100 1 : throw(MAL, "dameraulevenshtein", SQLSTATE(HY013) MAL_MALLOC_FAIL);
101 56 : d = (int *) GDKmalloc((size_t)sz);
102 56 : if (d == NULL)
103 0 : throw(MAL, "dameraulevenshtein", SQLSTATE(HY013) MAL_MALLOC_FAIL);
104 :
105 : /* Step 2 */
106 346 : for (i = 0; i <= n; i++) {
107 290 : damerau_putat(d, i, 0, n, i);
108 : }
109 324 : for (j = 0; j <= m; j++) {
110 268 : damerau_putat(d, 0, j, n, j);
111 : }
112 : /* Step 3 */
113 290 : for (i = 1; i <= n; i++) {
114 234 : s_i = s[i - 1];
115 : /* Step 4 */
116 1552 : for (j = 1; j <= m; j++) {
117 1318 : t_j = t[j - 1];
118 : /* Step 5 */
119 1318 : if (s_i == t_j) {
120 : cost = 0;
121 : } else {
122 1002 : cost = replace_cost;
123 : }
124 : /* Step 6 */
125 1318 : above = damerau_getat(d, i - 1, j, n);
126 1318 : left = damerau_getat(d, i, j - 1, n);
127 1318 : diag = damerau_getat(d, i - 1, j - 1, n);
128 1318 : if (j >= 2 && i >= 2) {
129 : /* NEW: detect transpositions */
130 928 : diag2 = damerau_getat(d, i - 2, j - 2, n);
131 928 : if (s[i - 2] == t[j - 1] && s[i - 1] == t[j - 2]) {
132 : cost2 = transpose_cost;
133 : } else {
134 856 : cost2 = 2;
135 : }
136 928 : cell = MIN4(above + insdel_cost, left + insdel_cost,
137 : diag + cost, diag2 + cost2);
138 : } else {
139 390 : cell = MIN3(above + insdel_cost, left + insdel_cost,
140 : diag + cost);
141 : }
142 1318 : damerau_putat(d, i, j, n, cell);
143 : }
144 : }
145 : /* Step 7 */
146 56 : *res = damerau_getat(d, n, m, n);
147 56 : GDKfree(d);
148 56 : return MAL_SUCCEED;
149 : }
150 :
151 : static str
152 39 : TXTSIMdameraulevenshtein1(int *result, str *s, str *t)
153 : {
154 9 : return dameraulevenshtein(result, s, t, 1, 1, 2);
155 : }
156 :
157 : static str
158 9 : TXTSIMdameraulevenshtein2(int *result, str *s, str *t)
159 : {
160 9 : return dameraulevenshtein(result, s, t, 1, 1, 1);
161 : }
162 :
163 : static str
164 2 : TXTSIMdameraulevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
165 : InstrPtr pci)
166 : {
167 2 : (void) cntxt;
168 2 : (void) mb;
169 2 : int *res = getArgReference_int(stk, pci, 0);
170 2 : str *X = getArgReference_str(stk, pci, 1),
171 2 : *Y = getArgReference_str(stk, pci, 2);
172 2 : int insdel_cost, replace_cost, transpose_cost;
173 :
174 2 : assert(pci->argc == 3 || pci->argc == 6);
175 :
176 2 : if (pci->argc == 3) {
177 : insdel_cost = 1;
178 : replace_cost = 1;
179 : transpose_cost = 2;
180 : } else {
181 1 : insdel_cost = *getArgReference_int(stk, pci, 3);
182 1 : replace_cost = *getArgReference_int(stk, pci, 4);
183 1 : transpose_cost = *getArgReference_int(stk, pci, 5);
184 : }
185 :
186 2 : return dameraulevenshtein(res, X, Y, insdel_cost, replace_cost,
187 : transpose_cost);
188 : }
189 :
190 : static inline str
191 12 : levenshtein(int *res, const str *X, const str *Y, int insdel_cost,
192 : int replace_cost, int max)
193 : {
194 12 : str x = *X, y = *Y, x_iter = x;
195 12 : unsigned int xlen, ylen, i = 0, j = 0;
196 12 : unsigned int last_diagonal, old_diagonal;
197 12 : int cx, cy;
198 12 : unsigned int *column, min;
199 :
200 22 : if (strNil(*X) || strNil(*Y)) {
201 3 : *res = int_nil;
202 3 : return MAL_SUCCEED;
203 : }
204 :
205 9 : xlen = UTF8_strlen(x);
206 9 : ylen = UTF8_strlen(y);
207 :
208 9 : if (xlen == ylen && (strcmp(x, y) == 0))
209 : return MAL_SUCCEED;
210 :
211 8 : column = GDKmalloc((xlen + 1) * sizeof(unsigned int));
212 8 : if (column == NULL)
213 0 : throw(MAL, "levenshtein", MAL_MALLOC_FAIL);
214 :
215 59 : for (i = 1; i <= xlen; i++)
216 51 : column[i] = i;
217 :
218 57 : for (j = 1; j <= ylen; j++) {
219 49 : column[0] = j;
220 49 : min = INT_MAX;
221 49 : x_iter = x;
222 49 : UTF8_GETCHAR(cy, y);
223 306 : for (i = 1, last_diagonal = j - 1; i <= xlen; i++) {
224 257 : UTF8_GETCHAR(cx, x_iter);
225 257 : old_diagonal = column[i];
226 257 : column[i] = MIN3(column[i] + insdel_cost,
227 : column[i - 1] + insdel_cost,
228 : last_diagonal + (cx == cy ? 0 : replace_cost));
229 257 : last_diagonal = old_diagonal;
230 257 : if (last_diagonal < min)
231 : min = last_diagonal;
232 : }
233 49 : if (max != -1 && min > (unsigned int) max) {
234 0 : *res = INT_MAX;
235 0 : GDKfree(column);
236 0 : return MAL_SUCCEED;
237 : }
238 49 : (*x)++;
239 : }
240 :
241 8 : *res = column[xlen];
242 8 : GDKfree(column);
243 8 : return MAL_SUCCEED;
244 0 : illegal:
245 : /* UTF8_GETCHAR bail */
246 0 : GDKfree(column);
247 0 : throw(MAL, "txtsim.levenshtein", "Illegal unicode code point");
248 : }
249 :
250 : /* Levenshtein OP but with column externaly allocated */
251 : static inline int
252 0 : levenshtein2(const str X, const str Y, const size_t xlen, const size_t ylen,
253 : unsigned int *column, const int insdel_cost,
254 : const int replace_cost, const int max)
255 : {
256 0 : str x = X, y = Y;
257 0 : str x_iter = x;
258 0 : unsigned int i = 0, j = 0, min;;
259 0 : unsigned int last_diagonal, old_diagonal;
260 0 : int cx, cy;
261 :
262 0 : if (strNil(x) || strNil(y))
263 0 : return int_nil;
264 :
265 0 : if (xlen == ylen && (strcmp(x, y) == 0))
266 : return 0;
267 :
268 0 : for (i = 1; i <= xlen; i++)
269 0 : column[i] = i;
270 :
271 0 : for (j = 1; j <= ylen; j++) {
272 0 : column[0] = j;
273 0 : min = INT_MAX;
274 0 : x_iter = x;
275 0 : UTF8_GETCHAR(cy, y);
276 0 : for (i = 1, last_diagonal = j - 1; i <= xlen; i++) {
277 0 : UTF8_GETCHAR(cx, x_iter);
278 0 : old_diagonal = column[i];
279 0 : column[i] = MIN3(column[i] + insdel_cost,
280 : column[i - 1] + insdel_cost,
281 : last_diagonal + (cx == cy ? 0 : replace_cost));
282 0 : last_diagonal = old_diagonal;
283 0 : if (last_diagonal < min)
284 : min = last_diagonal;
285 : }
286 0 : if (max != -1 && min > (unsigned int) max)
287 : return INT_MAX;
288 0 : (*x)++;
289 : }
290 0 : return column[xlen];
291 : illegal:
292 : /* UTF8_GETCHAR bail */
293 : return INT_MAX;
294 : }
295 :
296 : static str
297 45 : TXTSIMlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
298 : {
299 45 : (void) cntxt;
300 45 : (void) mb;
301 45 : int *res = getArgReference_int(stk, pci, 0);
302 45 : str *X = getArgReference_str(stk, pci, 1),
303 45 : *Y = getArgReference_str(stk, pci, 2);
304 45 : int insdel_cost, replace_cost;
305 :
306 45 : if (pci->argc == 3) {
307 : insdel_cost = 1;
308 : replace_cost = 1;
309 34 : } else if (pci->argc == 5 || pci->argc == 6) {
310 34 : insdel_cost = *getArgReference_int(stk, pci, 3);
311 34 : replace_cost = *getArgReference_int(stk, pci, 4);
312 : /* Backwards compatibility purposes */
313 34 : if (pci->argc == 6) {
314 33 : int transposition_cost = *getArgReference_int(stk, pci, 5);
315 33 : return dameraulevenshtein(res, X, Y, insdel_cost, replace_cost,
316 : transposition_cost);
317 : }
318 : } else {
319 0 : throw(MAL, "txtsim.levenshtein", RUNTIME_SIGNATURE_MISSING);
320 : }
321 :
322 12 : return levenshtein(res, X, Y, insdel_cost, replace_cost, -1);;
323 : }
324 :
325 : static str
326 0 : TXTSIMmaxlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
327 : {
328 0 : (void) cntxt;
329 0 : (void) mb;
330 0 : int *res = getArgReference_int(stk, pci, 0);
331 0 : const str *X = getArgReference_str(stk, pci, 1),
332 0 : *Y = getArgReference_str(stk, pci, 2);
333 0 : const int *k = getArgReference_int(stk, pci, 3);
334 0 : int insdel_cost, replace_cost;
335 :
336 0 : if (pci->argc == 4) {
337 : insdel_cost = 1;
338 : replace_cost = 1;
339 0 : } else if (pci->argc == 6) {
340 0 : insdel_cost = *getArgReference_int(stk, pci, 4);
341 0 : replace_cost = *getArgReference_int(stk, pci, 5);
342 : } else {
343 0 : throw(MAL, "txtsim.maxlevenshtein", RUNTIME_SIGNATURE_MISSING);
344 : }
345 :
346 0 : return levenshtein(res, X, Y, insdel_cost, replace_cost, *k);
347 : }
348 :
349 : static str
350 0 : BATTXTSIMmaxlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
351 : {
352 0 : (void) cntxt;
353 0 : (void) mb;
354 0 : bat *res = getArgReference_bat(stk, pci, 0);
355 0 : bat *lid = getArgReference_bat(stk, pci, 1);
356 0 : bat *rid = getArgReference_bat(stk, pci, 2);
357 0 : int *k = getArgReference_int(stk, pci, 3);
358 0 : int insdel_cost = pci->argc == 6 ? *getArgReference_int(stk, pci, 4) : 1,
359 0 : replace_cost = pci->argc == 6 ? *getArgReference_int(stk, pci, 5) : 1;
360 0 : BAT *left = NULL, *right = NULL, *bn = NULL;
361 0 : BUN p, q;
362 0 : BATiter li, ri;
363 0 : unsigned int *buffer = NULL;
364 0 : str lv, rv, msg = MAL_SUCCEED;
365 0 : size_t llen = 0, rlen = 0, maxlen = 0;
366 0 : int d;
367 0 : bit v;
368 :
369 0 : if ((left = BATdescriptor(*lid)) == NULL) {
370 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
371 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
372 0 : goto exit;
373 : }
374 0 : if ((right = BATdescriptor(*rid)) == NULL) {
375 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
376 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
377 0 : goto exit;
378 : }
379 0 : if (BATcount(left) != BATcount(right)) {
380 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
381 : "Columns must be aligned");
382 0 : goto exit;
383 : }
384 0 : if ((bn = COLnew(0, TYPE_bit, BATcount(left), TRANSIENT)) == NULL) {
385 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
386 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
387 0 : goto exit;
388 : }
389 :
390 0 : li = bat_iterator(left);
391 0 : ri = bat_iterator(right);
392 0 : BATloop(left, p, q) {
393 0 : lv = (str) BUNtail(li, p);
394 0 : rv = (str) BUNtail(ri, p);
395 0 : llen = UTF8_strlen(lv);
396 0 : rlen = UTF8_strlen(rv);
397 0 : if (abs((int) llen - (int) rlen) > (int) *k)
398 0 : v = false;
399 : else {
400 0 : if (llen > maxlen) {
401 0 : maxlen = llen;
402 0 : unsigned int *tmp = GDKrealloc(buffer, (maxlen + 1) * sizeof(unsigned int));
403 0 : if (tmp == NULL) {
404 0 : bat_iterator_end(&li);
405 0 : bat_iterator_end(&ri);
406 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
407 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
408 0 : goto exit;
409 : }
410 : buffer = tmp;
411 : }
412 0 : d = levenshtein2(lv, rv, llen, rlen, buffer, insdel_cost,
413 : replace_cost, (int) *k);
414 0 : v = (bit) (d <= (int) *k);
415 : }
416 0 : if (BUNappend(bn, (const void *) &v, false) != GDK_SUCCEED) {
417 0 : bat_iterator_end(&li);
418 0 : bat_iterator_end(&ri);
419 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
420 : "BUNappend failed");
421 0 : goto exit;
422 : }
423 : }
424 0 : bat_iterator_end(&li);
425 0 : bat_iterator_end(&ri);
426 :
427 0 : *res = bn->batCacheid;
428 0 : BBPkeepref(bn);
429 0 : exit:
430 0 : GDKfree(buffer);
431 0 : BBPreclaim(left);
432 0 : BBPreclaim(right);
433 0 : if (msg != MAL_SUCCEED)
434 0 : BBPreclaim(bn);
435 0 : return msg;
436 : }
437 :
438 : #define JARO_WINKLER_SCALING_FACTOR 0.1
439 : #define JARO_WINKLER_PREFIX_LEN 4
440 :
441 : typedef struct {
442 : size_t matches; /* accumulator for number of matches for this item */
443 : BUN o; /* position in the BAT */
444 : str val; /* string value */
445 : int *cp_sequence; /* string as array of Unicode codepoints */
446 : int len; /* string length in characters (multi-byte characters count as 1) */
447 : int cp_seq_len; /* string length in bytes */
448 : uint64_t abm; /* 64bit alphabet bitmap */
449 : int abm_popcount; /* hamming weight of abm */
450 : } str_item;
451 :
452 : static inline int
453 95 : popcount64(uint64_t x)
454 : {
455 95 : x = (x & 0x5555555555555555ULL) + ((x >> 1) & 0x5555555555555555ULL);
456 95 : x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
457 95 : x = (x & 0x0F0F0F0F0F0F0F0FULL) + ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);
458 95 : return (x * 0x0101010101010101ULL) >> 56;
459 : }
460 :
461 : static int
462 45 : str_item_lenrev_cmp(const void *a, const void *b)
463 : {
464 45 : return ((int) ((str_item *) b)->len - (int) ((str_item *) a)->len);
465 : }
466 :
467 : static str
468 64 : str_2_codepointseq(str_item *s)
469 : {
470 64 : str p = s->val;
471 64 : int c;
472 :
473 64 : s->cp_sequence = GDKmalloc(s->len * sizeof(int));
474 64 : if (s->cp_sequence == NULL)
475 0 : throw(MAL, "str_2_byteseq", SQLSTATE(HY013) MAL_MALLOC_FAIL);
476 :
477 428 : for (int i = 0; i < s->len; i++) {
478 364 : UTF8_GETCHAR(c, p);
479 364 : if (c == 0)
480 : break;
481 364 : s->cp_sequence[i] = c;
482 : }
483 : return MAL_SUCCEED;
484 0 : illegal:
485 0 : throw(MAL, "str_2_byteseq", SQLSTATE(42000) "Illegal unicode code point");
486 : }
487 :
488 : static void
489 60 : str_alphabet_bitmap(str_item *s)
490 : {
491 60 : int i;
492 :
493 60 : s->abm = 0ULL;
494 :
495 400 : for (i = 0; i < s->len; i++)
496 340 : s->abm |= 1ULL << (s->cp_sequence[i] % 64);
497 :
498 60 : s->abm_popcount = popcount64(s->abm);
499 60 : }
500 :
501 : static inline double
502 13 : jarowinkler_lp(const str_item *a, const str_item *b)
503 : {
504 13 : unsigned int l;
505 :
506 : /* calculate common string prefix up to prefixlen chars */
507 13 : l = 0;
508 65 : for (int i = 0; i < MIN3(a->len, b->len, JARO_WINKLER_PREFIX_LEN); i++)
509 52 : l += (a->cp_sequence[i] == b->cp_sequence[i]);
510 :
511 13 : return (double) l *JARO_WINKLER_SCALING_FACTOR;
512 : }
513 :
514 : static inline double
515 15 : jarowinkler(const str_item *x, const str_item *y, double lp, int *x_flags,
516 : int *y_flags)
517 : {
518 15 : int xlen = x->len, ylen = y->len;
519 15 : int range = MAX(0, MAX(xlen, ylen) / 2 - 1);
520 15 : int *x1 = x->cp_sequence, *x2 = y->cp_sequence;
521 15 : int m = 0, t = 0;
522 15 : int i, j, l;
523 15 : double dw;
524 :
525 15 : if (!xlen || !ylen)
526 : return 0.0;
527 89 : for (i = 0; i < xlen; i++)
528 74 : x_flags[i] = 0;
529 102 : for (i = 0; i < ylen; i++)
530 87 : y_flags[i] = 0;
531 : /* matching chars */
532 102 : for (i = 0; i < ylen; i++) {
533 280 : for (j = MAX(i - range, 0), l = MIN(i + range + 1, xlen); j < l; j++) {
534 225 : if (x2[i] == x1[j] && !x_flags[j]) {
535 32 : x_flags[j] = 1;
536 32 : y_flags[i] = 1;
537 32 : m++;
538 32 : break;
539 : }
540 : }
541 : }
542 15 : if (!m)
543 : return 0.0;
544 : /* char transpositions */
545 : l = 0;
546 90 : for (i = 0; i < ylen; i++) {
547 77 : if (y_flags[i] == 1) {
548 48 : for (j = l; j < xlen; j++) {
549 48 : if (x_flags[j] == 1) {
550 32 : l = j + 1;
551 32 : break;
552 : }
553 : }
554 32 : if (x2[i] != x1[j])
555 4 : t++;
556 : }
557 : }
558 13 : t /= 2;
559 :
560 : /* jaro similarity */
561 13 : dw = (((double) m / xlen) + ((double) m / ylen) +
562 13 : ((double) (m - t) / m)) / 3.0;
563 : /* calculate common string prefix up to prefixlen chars */
564 13 : if (lp == -1)
565 13 : lp = jarowinkler_lp(x, y);
566 : /* jaro-winkler similarity */
567 13 : dw = dw + (lp * (1 - dw));
568 13 : return dw;
569 : }
570 :
571 : static str
572 2 : TXTSIMjarowinkler(dbl *res, str *x, str *y)
573 : {
574 2 : int *x_flags = NULL, *y_flags = NULL;
575 2 : str_item xi = { 0 }, yi = { 0 };
576 2 : str msg = MAL_SUCCEED;
577 :
578 2 : xi.val = *x;
579 2 : xi.len = UTF8_strlen(*x);
580 2 : if ((msg = str_2_codepointseq(&xi)) != MAL_SUCCEED)
581 0 : goto bailout;
582 :
583 2 : yi.val = *y;
584 2 : yi.len = UTF8_strlen(*y);
585 2 : if ((msg = str_2_codepointseq(&yi)) != MAL_SUCCEED)
586 0 : goto bailout;
587 :
588 2 : x_flags = GDKmalloc(xi.len * sizeof(int));
589 2 : y_flags = GDKmalloc(yi.len * sizeof(int));
590 :
591 2 : if (x_flags == NULL || y_flags == NULL)
592 0 : goto bailout;
593 :
594 2 : *res = jarowinkler(&xi, &yi, -1, x_flags, y_flags);
595 :
596 2 : bailout:
597 2 : GDKfree(x_flags);
598 2 : GDKfree(y_flags);
599 2 : GDKfree(xi.cp_sequence);
600 2 : GDKfree(yi.cp_sequence);
601 2 : return msg;
602 : }
603 :
604 : static str
605 0 : TXTSIMminjarowinkler(bit *res, str *x, str *y, const dbl *threshold)
606 : {
607 0 : str msg = MAL_SUCCEED;
608 0 : double s = 1;
609 :
610 0 : msg = TXTSIMjarowinkler(&s, x, y);
611 0 : if (msg != MAL_SUCCEED)
612 0 : throw(MAL, "txt.minjarowinkler", OPERATION_FAILED);
613 :
614 0 : *res = (s > *threshold);
615 0 : return MAL_SUCCEED;
616 : }
617 :
618 : #define VALUE(s, x) (s##vars + VarHeapVal(s##vals, (x), s##width))
619 : #define APPEND(b, o) (((oid *) b->theap->base)[b->batCount++] = (o))
620 :
621 : #define PREP_BAT_STRITEM(B, CI, SI) \
622 : do { \
623 : for (n = 0; n < CI.ncand; n++) { \
624 : SI[n].matches = 0; \
625 : SI[n].o = canditer_next(&CI); \
626 : SI[n].val = (str) VALUE(B, SI[n].o - B->hseqbase); \
627 : SI[n].cp_sequence = NULL; \
628 : SI[n].len = UTF8_strlen(SI[n].val); \
629 : SI[n].cp_seq_len = str_strlen(SI[n].val); \
630 : if ((msg = str_2_codepointseq(&SI[n])) != MAL_SUCCEED) \
631 : goto exit; \
632 : str_alphabet_bitmap(&SI[n]); \
633 : } \
634 : } while (false)
635 :
636 : #define FINALIZE_BATS(L, R, LCI, RCI, LSI, RSI) \
637 : do { \
638 : assert(BATcount(L) == BATcount(R)); \
639 : BATsetcount(L, BATcount(L)); \
640 : BATsetcount(R, BATcount(R)); \
641 : for (n = 0; n < LCI.ncand; n++) { \
642 : if (LSI[n].matches > 1) { \
643 : L->tkey = false; \
644 : break; \
645 : } \
646 : } \
647 : if (n == LCI.ncand) { \
648 : L->tkey = true; \
649 : } \
650 : for (n = 0; n < RCI.ncand; n++) { \
651 : if (RSI[n].matches > 1) { \
652 : R->tkey = false; \
653 : break; \
654 : } \
655 : } \
656 : if (n == RCI.ncand) { \
657 : R->tkey = true; \
658 : } \
659 : BATordered(L); \
660 : BATordered(R); \
661 : L->theap->dirty |= BATcount(L) > 0; \
662 : R->theap->dirty |= BATcount(R) > 0; \
663 : } while (false)
664 :
665 : static inline int
666 12 : maxlevenshtein_extcol_stritem(const str_item *si1, const str_item *si2,
667 : unsigned int *column, const int k)
668 : {
669 12 : unsigned int lastdiag, olddiag;
670 12 : int c1, c2, x, y;
671 12 : unsigned int min;
672 12 : int s1len = si1->len, s2len = si2->len;
673 12 : int *s1 = si1->cp_sequence, *s2 = si2->cp_sequence;
674 : /* first test if the strings are equal */
675 12 : if (s1len == s2len) {
676 1 : for (x = 0; x < s1len; x++)
677 1 : if (s1[x] != s2[x])
678 : break;
679 1 : if (x == s1len)
680 : return 0;
681 : }
682 68 : for (y = 1; y <= s1len; y++)
683 56 : column[y] = y;
684 78 : for (x = 1; x <= s2len; x++) {
685 69 : c2 = s2[x - 1];
686 69 : column[0] = x;
687 69 : min = INT_MAX;
688 395 : for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
689 326 : olddiag = column[y];
690 326 : c1 = s1[y - 1];
691 326 : column[y] = MIN3(column[y] + 1, column[y - 1] + 1, lastdiag + (c1 == c2 ? 0 : 1));
692 326 : lastdiag = olddiag;
693 326 : if (lastdiag < min)
694 : min = lastdiag;
695 : }
696 69 : if (min > (unsigned int) k)
697 : return INT_MAX;
698 : }
699 9 : return column[s1len];
700 : }
701 :
702 : static str
703 6 : maxlevenshteinjoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int k)
704 : {
705 6 : BAT *r1t = NULL, *r2t = NULL;
706 6 : BUN n;
707 6 : struct canditer lci, rci;
708 6 : const char *lvals, *rvals, *lvars, *rvars;;
709 6 : int lwidth, rwidth, d;
710 6 : str_item *lsi = NULL, *rsi = NULL;
711 6 : str msg = MAL_SUCCEED;
712 6 : unsigned int *buffer = NULL;
713 :
714 18 : assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
715 6 : assert(ATOMtype(l->ttype) == TYPE_str);
716 :
717 6 : canditer_init(&lci, l, sl);
718 6 : canditer_init(&rci, r, sr);
719 :
720 6 : if (lci.ncand == 0 || rci.ncand == 0)
721 0 : goto exit;
722 :
723 6 : lvals = (const char *) Tloc(l, 0);
724 6 : rvals = (const char *) Tloc(r, 0);
725 6 : lvars = l->tvheap->base;
726 6 : rvars = r->tvheap->base;
727 6 : lwidth = l->twidth;
728 6 : rwidth = r->twidth;
729 :
730 6 : if ((r1t = COLnew(0, TYPE_oid, lci.ncand, TRANSIENT)) == NULL ||
731 6 : (r2t = COLnew(0, TYPE_oid, rci.ncand, TRANSIENT)) == NULL) {
732 0 : reclaim_bats(2, r1t, r2t);
733 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
734 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
735 0 : goto exit;
736 : }
737 :
738 6 : r1t->tsorted = r1t->trevsorted = false;
739 6 : r2t->tsorted = r2t->trevsorted = false;
740 :
741 6 : if ((lsi = GDKmalloc(lci.ncand * sizeof(str_item))) == NULL ||
742 6 : (rsi = GDKmalloc(rci.ncand * sizeof(str_item))) == NULL) {
743 0 : reclaim_bats(2, r1t, r2t);
744 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
745 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
746 0 : goto exit;
747 : }
748 :
749 12 : PREP_BAT_STRITEM(l, lci, lsi);
750 24 : PREP_BAT_STRITEM(r, rci, rsi);
751 6 : qsort(lsi, lci.ncand, sizeof(str_item), str_item_lenrev_cmp);
752 6 : qsort(rsi, rci.ncand, sizeof(str_item), str_item_lenrev_cmp);
753 :
754 6 : if ((buffer = GDKmalloc((lsi[0].len + 1) * sizeof(int))) == NULL) {
755 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
756 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
757 0 : goto exit;
758 : }
759 :
760 : /* join loop */
761 24 : for (BUN lstart = 0, rstart = 0; rstart < rci.ncand; rstart++) {
762 34 : for (n = lstart; n < lci.ncand; n++) {
763 : /* first and cheapest filter */
764 18 : if ((lsi[n].len) > k + rsi[rstart].len) {
765 0 : lstart++;
766 0 : continue; /* no possible matches yet for this r */
767 18 : } else if (rsi[rstart].len > k + lsi[n].len) {
768 : break; /* no more possible matches from this r */
769 : }
770 : /* filter by comparing alphabet bitmaps (imprecise but fast filter) */
771 16 : d = MAX(lsi[n].abm_popcount,
772 : rsi[rstart].abm_popcount) -
773 16 : popcount64(lsi[n].abm & rsi[rstart].abm);
774 16 : if (d > k)
775 4 : continue;
776 : /* final and most expensive test: Levenshtein distance */
777 12 : d = maxlevenshtein_extcol_stritem(&lsi[n], &rsi[rstart], buffer,
778 : (const unsigned int) k);
779 12 : if (d > k)
780 6 : continue;
781 : /* The match test succeeded */
782 6 : lsi[n].matches++;
783 6 : rsi[rstart].matches++;
784 6 : if (bunfastappTYPE(oid, r1t, &(lsi[n].o)) != GDK_SUCCEED) {
785 0 : reclaim_bats(2, r1t, r2t);
786 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
787 : OPERATION_FAILED "Failed bun append");
788 0 : goto exit;
789 : }
790 6 : if (bunfastappTYPE(oid, r2t, &(rsi[rstart].o)) != GDK_SUCCEED) {
791 0 : reclaim_bats(2, r1t, r2t);
792 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
793 : OPERATION_FAILED "Failed bun append");
794 0 : goto exit;
795 : }
796 : }
797 : }
798 :
799 29 : FINALIZE_BATS(r1t, r2t, lci, rci, lsi, rsi);
800 6 : *r1 = r1t;
801 6 : *r2 = r2t;
802 :
803 6 : exit:
804 12 : for (n = 0; lsi && n < lci.ncand; n++)
805 6 : GDKfree(lsi[n].cp_sequence);
806 24 : for (n = 0; rsi && n < rci.ncand; n++)
807 18 : GDKfree(rsi[n].cp_sequence);
808 6 : GDKfree(lsi);
809 6 : GDKfree(rsi);
810 6 : GDKfree(buffer);
811 6 : return msg;
812 : }
813 :
814 : static str
815 6 : TXTSIMmaxlevenshteinjoin(bat *r1, bat *r2, const bat *lid, const bat *rid,
816 : const bat *kid, const bat *slid, const bat *srid,
817 : const bit *nil_matches, const lng *estimate,
818 : const bit *anti)
819 : {
820 6 : (void) nil_matches;
821 6 : (void) estimate;
822 6 : (void) anti;
823 :
824 6 : BAT *bleft = NULL, *bright = NULL, *bk = NULL,
825 6 : *bcandleft = NULL, *bcandright = NULL, *r1t = NULL, *r2t = NULL;
826 6 : int k = 0;
827 6 : str msg = MAL_SUCCEED;
828 :
829 6 : if ((bleft = BATdescriptor(*lid)) == NULL ||
830 6 : (bright = BATdescriptor(*rid)) == NULL ||
831 6 : (bk = BATdescriptor(*kid)) == NULL) {
832 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
833 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
834 0 : goto exit;
835 : }
836 :
837 6 : if ((*slid != bat_nil && (bcandleft = BATdescriptor(*slid)) == NULL) ||
838 6 : (*srid != bat_nil && (bcandright = BATdescriptor(*srid)) == NULL)) {
839 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
840 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
841 0 : goto exit;
842 : }
843 :
844 6 : if (BATcount(bk) > 0) {
845 6 : BATiter ki = bat_iterator(bk);
846 6 : k = *(int *) BUNtloc(ki, 0);
847 6 : bat_iterator_end(&ki);
848 : }
849 :
850 6 : if ((msg = maxlevenshteinjoin(&r1t, &r2t, bleft, bright, bcandleft, bcandright, k)) != MAL_SUCCEED)
851 0 : goto exit;
852 :
853 6 : *r1 = r1t->batCacheid;
854 6 : *r2 = r2t->batCacheid;
855 6 : BBPkeepref(r1t);
856 6 : BBPkeepref(r2t);
857 :
858 6 : exit:
859 6 : reclaim_bats(5, bleft, bright, bcandleft, bcandright, bk);
860 6 : if (msg != MAL_SUCCEED)
861 0 : reclaim_bats(2, r1t, r2t);
862 6 : return msg;
863 : }
864 :
865 : static inline void
866 9 : jarowinkler_rangebounds(int *lb, int *ub, const str_item *a, const double lp,
867 : const double threshold)
868 : {
869 9 : *lb = (int) floor(3.0 * a->len * (threshold - lp) / (1.0 - lp) -
870 9 : (2.0 * a->len));
871 9 : *ub = (int) ceil(a->len / ((3.0 * (threshold - lp) / (1.0 - lp)) - 2.0));
872 9 : }
873 :
874 : /* version with given lp and m, and t = 0*/
875 : static inline double
876 19 : jarowinkler_lp_m_t0(const str_item *lsi, const str_item *rsi, double lp, int m)
877 : {
878 19 : double dw;
879 : /* Jaro similarity */
880 19 : dw = (((double) m / lsi->len) + ((double) m / rsi->len) + 1.0) / 3.0;
881 : /* Jaro-Winkler similarity */
882 19 : dw = dw + (lp * (1 - dw));
883 19 : return dw;
884 : }
885 :
886 : static str
887 9 : minjarowinklerjoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT *sl, BAT *sr,
888 : const dbl threshold)
889 : {
890 9 : BAT *r1t = NULL, *r2t = NULL;
891 9 : BUN n;
892 9 : struct canditer lci, rci;
893 9 : const char *lvals, *rvals, *lvars, *rvars;
894 9 : int lwidth, rwidth, lb = 0, ub = 0, m = -1, *x_flags = NULL, *y_flags = NULL;
895 9 : str_item *ssl = NULL, *ssr = NULL, shortest;
896 9 : str msg = MAL_SUCCEED;
897 9 : const bool sliding_window_allowed = threshold > (2.01 + JARO_WINKLER_PREFIX_LEN * JARO_WINKLER_SCALING_FACTOR) / 3.0;
898 9 : double s, lp = 0;
899 :
900 27 : assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
901 9 : assert(ATOMtype(l->ttype) == TYPE_str);
902 :
903 9 : canditer_init(&lci, l, sl);
904 9 : canditer_init(&rci, r, sr);
905 :
906 9 : if (lci.ncand == 0 || rci.ncand == 0)
907 0 : goto exit;
908 :
909 9 : lvals = (const char *) Tloc(l, 0);
910 9 : rvals = (const char *) Tloc(r, 0);
911 9 : assert(r->ttype);
912 9 : lvars = l->tvheap->base;
913 9 : rvars = r->tvheap->base;
914 9 : lwidth = l->twidth;
915 9 : rwidth = r->twidth;
916 :
917 9 : if ((r1t = COLnew(0, TYPE_oid, lci.ncand, TRANSIENT)) == NULL ||
918 9 : (r2t = COLnew(0, TYPE_oid, rci.ncand, TRANSIENT)) == NULL) {
919 0 : reclaim_bats(2, r1t, r2t);
920 0 : msg = createException(MAL, "txtsim.minjarowinklerjoin",
921 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
922 0 : goto exit;
923 : }
924 :
925 9 : r1t->tsorted = r1t->trevsorted = false;
926 9 : r2t->tsorted = r2t->trevsorted = false;
927 :
928 9 : if ((ssl = GDKmalloc(lci.ncand * sizeof(str_item))) == NULL ||
929 9 : (ssr = GDKmalloc(rci.ncand * sizeof(str_item))) == NULL) {
930 0 : reclaim_bats(2, r1t, r2t);
931 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
932 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
933 0 : goto exit;
934 : }
935 :
936 18 : PREP_BAT_STRITEM(l, lci, ssl);
937 36 : PREP_BAT_STRITEM(r, rci, ssr);
938 9 : qsort(ssl, lci.ncand, sizeof(str_item), str_item_lenrev_cmp);
939 9 : qsort(ssr, rci.ncand, sizeof(str_item), str_item_lenrev_cmp);
940 :
941 9 : if ((x_flags = GDKmalloc(ssl[0].len * sizeof(int))) == NULL ||
942 9 : (y_flags = GDKmalloc(ssr[0].len * sizeof(int))) == NULL) {
943 0 : msg = createException(MAL, "txtsim.minjarowinklerjoin",
944 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
945 0 : goto exit;
946 : }
947 : // lp used for filters. Use -1 for actual JW (forces to compute actual lp)
948 36 : lp = JARO_WINKLER_PREFIX_LEN * JARO_WINKLER_SCALING_FACTOR;
949 :
950 : /* join loop */
951 36 : for (BUN lstart = 0, rstart = 0; rstart < rci.ncand; rstart++) {
952 27 : if (sliding_window_allowed)
953 9 : jarowinkler_rangebounds(&lb, &ub, &ssr[rstart], lp, threshold);
954 47 : for (n = lstart; n < lci.ncand; n++) {
955 : /* Update sliding window */
956 : /* This is the first and cheapest filter */
957 27 : if (sliding_window_allowed) {
958 9 : if (ssl[n].len > ub) { /* no possible matches yet for this r */
959 1 : lstart++;
960 1 : continue;
961 : }
962 8 : if (ssl[n].len < lb) { /* no more possible matches from this r */
963 : break;
964 : }
965 : }
966 : /* filter by comparing alphabet bitmaps */
967 : /* find the best possible m: the length of the shorter string
968 : * minus the number of characters that surely cannot match */
969 19 : shortest = ssl[n].len < ssr[rstart].len ? ssl[n] : ssr[rstart];
970 19 : m = shortest.len - popcount64(shortest.abm -
971 19 : (ssl[n].abm & ssr[rstart].abm));
972 : /* equivalent to:
973 : m = shortest.len - popcount64(shortest.abm & (~(ssl[n].abm & ssr[rstart].abm))); */
974 19 : s = jarowinkler_lp_m_t0(&ssl[n], &ssr[rstart], lp, m);
975 19 : if (s < threshold) {
976 6 : continue;
977 : }
978 : /* final and most expensive test: Jaro-Winkler similarity */
979 13 : s = jarowinkler(&ssl[n], &ssr[rstart], -1, x_flags, y_flags);
980 13 : if (s < threshold) {
981 4 : continue;
982 : }
983 : /* The match test succeeded */
984 9 : ssl[n].matches++;
985 9 : ssr[rstart].matches++;
986 9 : if (bunfastappTYPE(oid, r1t, &(ssl[n].o)) != GDK_SUCCEED) {
987 0 : reclaim_bats(2, r1t, r2t);
988 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
989 : OPERATION_FAILED "Failed bun append");
990 0 : goto exit;
991 : }
992 9 : if (bunfastappTYPE(oid, r2t, &(ssr[rstart].o)) != GDK_SUCCEED) {
993 0 : reclaim_bats(2, r1t, r2t);
994 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
995 : OPERATION_FAILED "Failed bun append");
996 0 : goto exit;
997 : }
998 : }
999 : }
1000 :
1001 42 : FINALIZE_BATS(r1t, r2t, lci, rci, ssl, ssr);
1002 9 : *r1 = r1t;
1003 9 : *r2 = r2t;
1004 :
1005 9 : exit:
1006 9 : if (ssl)
1007 18 : for (n = 0; n < lci.ncand; n++)
1008 9 : GDKfree(ssl[n].cp_sequence);
1009 9 : if (ssr)
1010 36 : for (n = 0; n < rci.ncand; n++)
1011 27 : GDKfree(ssr[n].cp_sequence);
1012 9 : GDKfree(x_flags);
1013 9 : GDKfree(y_flags);
1014 9 : GDKfree(ssl);
1015 9 : GDKfree(ssr);
1016 9 : return msg;
1017 : }
1018 :
1019 : static str
1020 9 : TXTSIMminjarowinklerjoin(bat *r1, bat *r2, const bat *lid, const bat *rid,
1021 : const bat *thresholdid, const bat *slid,
1022 : const bat *srid, const bit *nil_matches,
1023 : const lng *estimate, const bit *anti)
1024 : {
1025 9 : (void) nil_matches;
1026 9 : (void) estimate;
1027 9 : (void) anti;
1028 :
1029 9 : BAT *bleft = NULL, *bright = NULL,
1030 9 : *bcandleft = NULL, *bcandright = NULL, *bthreshold = NULL,
1031 9 : *r1t = NULL, *r2t = NULL;
1032 9 : dbl threshold = 1;
1033 9 : str msg = MAL_SUCCEED;
1034 :
1035 9 : if ((bleft = BATdescriptor(*lid)) == NULL ||
1036 9 : (bright = BATdescriptor(*rid)) == NULL ||
1037 9 : (bthreshold = BATdescriptor(*thresholdid)) == NULL) {
1038 0 : msg = createException(MAL, "txtsim.minjarowinklerjoin",
1039 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1040 0 : goto exit;
1041 : }
1042 :
1043 9 : if ((*slid != bat_nil && (bcandleft = BATdescriptor(*slid)) == NULL) ||
1044 9 : (*srid != bat_nil && (bcandright = BATdescriptor(*srid)) == NULL)) {
1045 0 : msg = createException(MAL, "txtsim.minjarowinklerjoin",
1046 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1047 0 : goto exit;
1048 : }
1049 :
1050 9 : if (BATcount(bthreshold) > 0) {
1051 9 : BATiter thresholdi = bat_iterator(bthreshold);
1052 9 : threshold = *(dbl *) BUNtail(thresholdi, 0);
1053 9 : bat_iterator_end(&thresholdi);
1054 : }
1055 :
1056 9 : if ((msg = minjarowinklerjoin(&r1t, &r2t, bleft, bright, bcandleft, bcandright, threshold)) != MAL_SUCCEED)
1057 0 : goto exit;
1058 :
1059 9 : *r1 = r1t->batCacheid;
1060 9 : *r2 = r2t->batCacheid;
1061 9 : BBPkeepref(r1t);
1062 9 : BBPkeepref(r2t);
1063 :
1064 9 : exit:
1065 9 : reclaim_bats(5, bleft, bright, bcandleft, bcandright, bthreshold);
1066 9 : if (msg != MAL_SUCCEED)
1067 0 : reclaim_bats(2, r1t, r2t);
1068 9 : return msg;
1069 : }
1070 :
1071 : #define SoundexLen 4 /* length of a soundex code */
1072 : #define SoundexKey "Z000" /* default key for soundex code */
1073 :
1074 : /* set letter values */
1075 : static const int Code[] = { 0, 1, 2, 3, 0, 1,
1076 : 2, 0, 0, 2, 2, 4,
1077 : 5, 5, 0, 1, 2, 6,
1078 : 2, 3, 0, 1, 0, 2, 0, 2
1079 : };
1080 :
1081 : #define RETURN_NIL_IF(b,t) \
1082 : if (b) { \
1083 : if (ATOMextern(t)) { \
1084 : *(ptr*) res = (ptr) ATOMnil(t); \
1085 : if ( *(ptr *) res == NULL) \
1086 : throw(MAL,"txtsim", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
1087 : } else { \
1088 : memcpy(res, ATOMnilptr(t), ATOMsize(t)); \
1089 : } \
1090 : return MAL_SUCCEED; \
1091 : }
1092 :
1093 : static inline char
1094 151 : SCode(unsigned char c)
1095 : {
1096 151 : if (c == 95)
1097 : return (2); /* german sz */
1098 151 : return (Code[toupper(c) - 'A']);
1099 : }
1100 :
1101 : static str
1102 67 : soundex_code(const char *Name, char *Key)
1103 : {
1104 67 : char LastLetter;
1105 67 : int Index;
1106 :
1107 303 : for (const char *p = Name; *p; p++)
1108 237 : if ((*p & 0x80) != 0)
1109 1 : throw(MAL, "soundex",
1110 : SQLSTATE(42000)
1111 : "Soundex function not available for non ASCII strings");
1112 :
1113 : /* set default key */
1114 66 : strcpy(Key, SoundexKey);
1115 :
1116 : /* keep first letter */
1117 66 : Key[0] = *Name;
1118 66 : if (!isupper((unsigned char) (Key[0])))
1119 23 : Key[0] = toupper(Key[0]);
1120 :
1121 66 : LastLetter = *Name;
1122 66 : if (!*Name)
1123 : return MAL_SUCCEED;
1124 56 : Name++;
1125 :
1126 : /* scan rest of string */
1127 222 : for (Index = 1; (Index <SoundexLen) &&*Name; Name++) {
1128 : /* use only letters */
1129 166 : if (isalpha((unsigned char) (*Name))) {
1130 : /* ignore duplicate successive chars */
1131 163 : if (LastLetter != *Name) {
1132 : /* new LastLetter */
1133 151 : LastLetter = *Name;
1134 :
1135 : /* ignore letters with code 0 */
1136 151 : if (SCode(*Name) != 0) {
1137 72 : Key[Index] = '0' + SCode(*Name);
1138 72 : Index ++;
1139 : }
1140 : }
1141 : }
1142 : }
1143 : return MAL_SUCCEED;
1144 : }
1145 :
1146 : static str
1147 71 : soundex(str *res, str *Name)
1148 : {
1149 71 : str msg = MAL_SUCCEED;
1150 :
1151 71 : GDKfree(*res);
1152 75 : RETURN_NIL_IF(strNil(*Name), TYPE_str);
1153 :
1154 67 : *res = (str) GDKmalloc(sizeof(char) * (SoundexLen + 1));
1155 67 : if (*res == NULL)
1156 0 : throw(MAL, "soundex", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1157 :
1158 : /* calculate Key for Name */
1159 67 : if ((msg = soundex_code(*Name, *res))) {
1160 1 : GDKfree(*res);
1161 1 : *res = NULL;
1162 1 : return msg;
1163 : }
1164 : return msg;
1165 : }
1166 :
1167 : static str
1168 30 : stringdiff(int *res, str *s1, str *s2)
1169 : {
1170 30 : str r = MAL_SUCCEED;
1171 30 : char *S1 = NULL, *S2 = NULL;
1172 :
1173 30 : r = soundex(&S1, s1);
1174 30 : if (r != MAL_SUCCEED)
1175 : return r;
1176 30 : r = soundex(&S2, s2);
1177 30 : if (r != MAL_SUCCEED) {
1178 0 : GDKfree(S1);
1179 0 : return r;
1180 : }
1181 30 : r = TXTSIMdameraulevenshtein1(res, &S1, &S2);
1182 30 : GDKfree(S1);
1183 30 : GDKfree(S2);
1184 30 : return r;
1185 : }
1186 :
1187 : /******************************
1188 : * QGRAMNORMALIZE
1189 : *
1190 : * This function 'normalizes' a string so valid q-grams can be made of it:
1191 : * All characters are transformed to uppercase, and all characters
1192 : * which are not letters or digits are stripped to a single space.
1193 : *
1194 : * qgramnormalize("Hallo, allemaal!").print(); --> "HALLO ALLEMAAL"
1195 : * qgramnormalize(" '' t ' est").print(); --> [ "T EST" ]
1196 : *
1197 : *****************************/
1198 : static str
1199 13 : qgram_normalize(str *res, str *Input)
1200 : {
1201 13 : char *input = *Input;
1202 13 : int i, j = 0;
1203 13 : char c, last = ' ';
1204 :
1205 13 : GDKfree(*res);
1206 13 : RETURN_NIL_IF(strNil(input), TYPE_str);
1207 13 : *res = (str) GDKmalloc(sizeof(char) * (strlen(input) + 1)); /* normalized strings are never longer than original */
1208 13 : if (*res == NULL)
1209 0 : throw(MAL, "txtsim.qgramnormalize", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1210 :
1211 100 : for (i = 0; input[i]; i++) {
1212 87 : c = toupper(input[i]);
1213 87 : if (!(('A' <= c && c <= 'Z') || isdigit((unsigned char) c)))
1214 87 : c = ' ';
1215 87 : if (c != ' ' || last != ' ') {
1216 78 : (*res)[j++] = c;
1217 : }
1218 87 : last = c;
1219 : }
1220 13 : (*res)[j] = 0;
1221 : /* strip final whitespace */
1222 16 : while (j > 0 && (*res)[--j] == ' ')
1223 3 : (*res)[j] = 0;
1224 :
1225 : return MAL_SUCCEED;
1226 : }
1227 :
1228 : static str
1229 0 : qgram_selfjoin(bat *res1, bat *res2, bat *qid, bat *bid, bat *pid, bat *lid,
1230 : flt *c, int *k)
1231 : {
1232 0 : BAT *qgram, *id, *pos, *len;
1233 0 : BUN n;
1234 0 : BUN i, j;
1235 0 : BAT *bn, *bn2;
1236 0 : oid *qbuf;
1237 0 : int *ibuf;
1238 0 : int *pbuf;
1239 0 : int *lbuf;
1240 0 : str msg = MAL_SUCCEED;
1241 :
1242 0 : qgram = BATdescriptor(*qid);
1243 0 : id = BATdescriptor(*bid);
1244 0 : pos = BATdescriptor(*pid);
1245 0 : len = BATdescriptor(*lid);
1246 0 : if (qgram == NULL || id == NULL || pos == NULL || len == NULL) {
1247 0 : BBPreclaim(qgram);
1248 0 : BBPreclaim(id);
1249 0 : BBPreclaim(pos);
1250 0 : BBPreclaim(len);
1251 0 : throw(MAL, "txtsim.qgramselfjoin",
1252 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1253 : }
1254 :
1255 0 : BATiter qgrami = bat_iterator(qgram);
1256 0 : BATiter idi = bat_iterator(id);
1257 0 : BATiter posi = bat_iterator(pos);
1258 0 : BATiter leni = bat_iterator(len);
1259 0 : if (qgrami.type != TYPE_oid)
1260 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1261 : SEMANTIC_TYPE_MISMATCH
1262 : ": tail of BAT qgram must be oid");
1263 0 : else if (idi.type != TYPE_int)
1264 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1265 : SEMANTIC_TYPE_MISMATCH
1266 : ": tail of BAT id must be int");
1267 0 : else if (posi.type != TYPE_int)
1268 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1269 : SEMANTIC_TYPE_MISMATCH
1270 : ": tail of BAT pos must be int");
1271 0 : else if (leni.type != TYPE_int)
1272 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1273 : SEMANTIC_TYPE_MISMATCH
1274 : ": tail of BAT len must be int");
1275 0 : if (msg) {
1276 0 : bat_iterator_end(&qgrami);
1277 0 : bat_iterator_end(&idi);
1278 0 : bat_iterator_end(&posi);
1279 0 : bat_iterator_end(&leni);
1280 0 : BBPunfix(qgram->batCacheid);
1281 0 : BBPunfix(id->batCacheid);
1282 0 : BBPunfix(pos->batCacheid);
1283 0 : BBPunfix(len->batCacheid);
1284 0 : return msg;
1285 : }
1286 :
1287 0 : n = BATcount(qgram);
1288 :
1289 : /* if (BATcount(qgram)>1 && !qgrami.sorted) throw(MAL, "txtsim.qgramselfjoin", SEMANTIC_TYPE_MISMATCH); */
1290 :
1291 0 : if (!ALIGNsynced(qgram, id))
1292 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1293 : SEMANTIC_TYPE_MISMATCH
1294 : ": qgram and id are not synced");
1295 :
1296 0 : else if (!ALIGNsynced(qgram, pos))
1297 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1298 : SEMANTIC_TYPE_MISMATCH
1299 : ": qgram and pos are not synced");
1300 0 : else if (!ALIGNsynced(qgram, len))
1301 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1302 : SEMANTIC_TYPE_MISMATCH
1303 : ": qgram and len are not synced");
1304 :
1305 0 : else if (qgrami.width != ATOMsize(qgrami.type))
1306 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1307 : SEMANTIC_TYPE_MISMATCH
1308 : ": qgram is not a true void bat");
1309 0 : else if (idi.width != ATOMsize(idi.type))
1310 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1311 : SEMANTIC_TYPE_MISMATCH
1312 : ": id is not a true void bat");
1313 :
1314 0 : else if (posi.width != ATOMsize(posi.type))
1315 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1316 : SEMANTIC_TYPE_MISMATCH
1317 : ": pos is not a true void bat");
1318 0 : else if (leni.width != ATOMsize(leni.type))
1319 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1320 : SEMANTIC_TYPE_MISMATCH
1321 : ": len is not a true void bat");
1322 0 : if (msg) {
1323 0 : bat_iterator_end(&qgrami);
1324 0 : bat_iterator_end(&idi);
1325 0 : bat_iterator_end(&posi);
1326 0 : bat_iterator_end(&leni);
1327 0 : BBPunfix(qgram->batCacheid);
1328 0 : BBPunfix(id->batCacheid);
1329 0 : BBPunfix(pos->batCacheid);
1330 0 : BBPunfix(len->batCacheid);
1331 0 : return msg;
1332 : }
1333 :
1334 0 : bn = COLnew(0, TYPE_int, n, TRANSIENT);
1335 0 : bn2 = COLnew(0, TYPE_int, n, TRANSIENT);
1336 0 : if (bn == NULL || bn2 == NULL) {
1337 0 : bat_iterator_end(&qgrami);
1338 0 : bat_iterator_end(&idi);
1339 0 : bat_iterator_end(&posi);
1340 0 : bat_iterator_end(&leni);
1341 0 : BBPreclaim(bn);
1342 0 : BBPreclaim(bn2);
1343 0 : BBPunfix(qgram->batCacheid);
1344 0 : BBPunfix(id->batCacheid);
1345 0 : BBPunfix(pos->batCacheid);
1346 0 : BBPunfix(len->batCacheid);
1347 0 : throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1348 : }
1349 :
1350 0 : qbuf = (oid *) qgrami.base;
1351 0 : ibuf = (int *) idi.base;
1352 0 : pbuf = (int *) posi.base;
1353 0 : lbuf = (int *) leni.base;
1354 0 : for (i = 0; i < n - 1; i++) {
1355 0 : for (j = i + 1;
1356 0 : (j < n && qbuf[j] == qbuf[i]
1357 0 : && pbuf[j] <= (pbuf[i] + (*k + *c * MIN(lbuf[i], lbuf[j]))));
1358 0 : j++) {
1359 0 : if (ibuf[i] != ibuf[j]
1360 0 : && abs(lbuf[i] - lbuf[j]) <= (*k + *c * MIN(lbuf[i], lbuf[j]))) {
1361 0 : if (BUNappend(bn, ibuf + i, false) != GDK_SUCCEED
1362 0 : || BUNappend(bn2, ibuf + j, false) != GDK_SUCCEED) {
1363 0 : bat_iterator_end(&qgrami);
1364 0 : bat_iterator_end(&idi);
1365 0 : bat_iterator_end(&posi);
1366 0 : bat_iterator_end(&leni);
1367 0 : BBPunfix(qgram->batCacheid);
1368 0 : BBPunfix(id->batCacheid);
1369 0 : BBPunfix(pos->batCacheid);
1370 0 : BBPunfix(len->batCacheid);
1371 0 : BBPreclaim(bn);
1372 0 : BBPreclaim(bn2);
1373 0 : throw(MAL, "txtsim.qgramselfjoin",
1374 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1375 : }
1376 : }
1377 : }
1378 : }
1379 0 : bat_iterator_end(&qgrami);
1380 0 : bat_iterator_end(&idi);
1381 0 : bat_iterator_end(&posi);
1382 0 : bat_iterator_end(&leni);
1383 :
1384 0 : BBPunfix(qgram->batCacheid);
1385 0 : BBPunfix(id->batCacheid);
1386 0 : BBPunfix(pos->batCacheid);
1387 0 : BBPunfix(len->batCacheid);
1388 :
1389 0 : *res1 = bn->batCacheid;
1390 0 : BBPkeepref(bn);
1391 0 : *res2 = bn2->batCacheid;
1392 0 : BBPkeepref(bn2);
1393 :
1394 0 : return MAL_SUCCEED;
1395 : }
1396 :
1397 : /* copy up to utf8len UTF-8 encoded characters from src to buf
1398 : * stop early if buf (size given by bufsize) is too small, or if src runs out
1399 : * return number of UTF-8 characters copied (excluding NUL)
1400 : * close with NUL if enough space */
1401 : static size_t
1402 26 : utf8strncpy(char *buf, size_t bufsize, const char *src, size_t utf8len)
1403 : {
1404 26 : size_t cnt = 0;
1405 :
1406 128 : while (utf8len != 0 && *src != 0 && bufsize != 0) {
1407 102 : bufsize--;
1408 102 : utf8len--;
1409 102 : cnt++;
1410 102 : if (((*buf++ = *src++) & 0x80) != 0) {
1411 40 : while ((*src & 0xC0) == 0x80 && bufsize != 0) {
1412 20 : *buf++ = *src++;
1413 20 : bufsize--;
1414 : }
1415 : }
1416 : }
1417 26 : if (bufsize != 0)
1418 26 : *buf = 0;
1419 26 : return cnt;
1420 : }
1421 :
1422 : static str
1423 2 : str_2_qgrams(bat *ret, str *val)
1424 : {
1425 2 : BAT *bn;
1426 2 : size_t i, len = strlen(*val) + 5;
1427 2 : str s = GDKmalloc(len);
1428 2 : char qgram[4 * 6 + 1]; /* 4 UTF-8 code points plus NULL byte */
1429 :
1430 2 : if (s == NULL)
1431 0 : throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1432 2 : strcpy(s, "##");
1433 2 : strcpy(s + 2, *val);
1434 2 : strcpy(s + len - 3, "$$");
1435 2 : bn = COLnew(0, TYPE_str, (BUN) strlen(*val), TRANSIENT);
1436 2 : if (bn == NULL) {
1437 0 : GDKfree(s);
1438 0 : throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1439 : }
1440 :
1441 : i = 0;
1442 26 : while (s[i]) {
1443 26 : if (utf8strncpy(qgram, sizeof(qgram), s + i, 4) < 4)
1444 : break;
1445 24 : if (BUNappend(bn, qgram, false) != GDK_SUCCEED) {
1446 0 : BBPreclaim(bn);
1447 0 : GDKfree(s);
1448 0 : throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1449 : }
1450 24 : if ((s[i++] & 0xC0) == 0xC0) {
1451 8 : while ((s[i] & 0xC0) == 0x80)
1452 4 : i++;
1453 : }
1454 : }
1455 2 : *ret = bn->batCacheid;
1456 2 : BBPkeepref(bn);
1457 2 : GDKfree(s);
1458 2 : return MAL_SUCCEED;
1459 : }
1460 :
1461 : #include "mel.h"
1462 : mel_func txtsim_init_funcs[] = {
1463 : pattern("txtsim", "dameraulevenshtein", TXTSIMdameraulevenshtein, false, "Calculates Damerau-Levenshtein distance between two strings, operation costs (ins/del = 1, replacement = 1, transposition = 2)", args(1,3,arg("",int),arg("x",str),arg("y",str))),
1464 : pattern("txtsim", "dameraulevenshtein", TXTSIMdameraulevenshtein, false, "Calculates Damerau-Levenshtein distance between two strings, variable operation costs (ins/del, replacement, transposition)", args(1,6,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
1465 : command("txtsim", "editdistance", TXTSIMdameraulevenshtein1, false, "Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and transpose = 2", args(1,3, arg("",int),arg("s",str),arg("t",str))),
1466 : command("txtsim", "editdistance2", TXTSIMdameraulevenshtein2, false, "Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and transpose = 1", args(1,3, arg("",int),arg("s",str),arg("t",str))),
1467 : pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "Calculates Levenshtein distance between two strings, operation costs (ins/del = 1, replacement = 1)", args(1,3,arg("",int),arg("s",str),arg("t",str))),
1468 : pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "Calculates Levenshtein distance between two strings, variable operation costs (ins/del, replacement)", args(1,5,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int))),
1469 : pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "(Backwards compatibility purposes) Calculates Damerau-Levenshtein distance between two strings, variable operation costs (ins/del, replacement, transposition)", args(1,6,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
1470 : pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false, "Levenshtein distance with basic costs but up to a MAX", args(1, 4, arg("",int), arg("l",str),arg("r",str),arg("k",int))),
1471 : pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false, "Levenshtein distance with variable costs but up to a MAX", args(1, 6, arg("",int), arg("l",str),arg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
1472 : pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false, "Same as maxlevenshtein but for BATS", args(1, 4, batarg("",bit), batarg("l",str),batarg("r",str),arg("k",int))),
1473 : pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false, "Same as maxlevenshtein but for BATS", args(1, 6, batarg("",bit), batarg("l",str),batarg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
1474 : command("txtsim", "maxlevenshteinjoin", TXTSIMmaxlevenshteinjoin, false, "", args(2,10, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("k",int),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
1475 : command("txtsim", "soundex", soundex, false, "Soundex function for phonetic matching", args(1,2, arg("",str),arg("name",str))),
1476 : command("txtsim", "stringdiff", stringdiff, false, "Calculate the soundexed editdistance", args(1,3, arg("",int),arg("s1",str),arg("s2",str))),
1477 : command("txtsim", "qgramnormalize", qgram_normalize, false, "'Normalizes' strings (eg. toUpper and replaces non-alphanumerics with one space", args(1,2, arg("",str),arg("input",str))),
1478 : command("txtsim", "qgramselfjoin", qgram_selfjoin, false, "QGram self-join on ordered(!) qgram tables and sub-ordered q-gram positions", args(2,8, batarg("",int),batarg("",int),batarg("qgram",oid),batarg("id",oid),batarg("pos",int),batarg("len",int),arg("c",flt),arg("k",int))),
1479 : command("txtsim", "str2qgrams", str_2_qgrams, false, "Break the string into 4-grams", args(1,2, batarg("",str),arg("s",str))),
1480 : command("txtsim", "jarowinkler", TXTSIMjarowinkler, false, "Calculate Jaro Winkler similarity", args(1,3, arg("",dbl),arg("x",str),arg("y",str))),
1481 : command("txtsim", "minjarowinkler", TXTSIMminjarowinkler, false, "", args(1, 4, arg("",bit), arg("l",str),arg("r",str),arg("threshold",dbl))),
1482 : command("txtsim", "minjarowinklerjoin", TXTSIMminjarowinklerjoin, false, "", args(2, 10, batarg("",oid),batarg("",oid), batarg("l",str),batarg("r",str),batarg("threshold",dbl),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
1483 : { .imp=NULL }
1484 : };
1485 : #include "mal_import.h"
1486 : #ifdef _MSC_VER
1487 : #undef read
1488 : #pragma section(".CRT$XCU",read)
1489 : #endif
1490 329 : LIB_STARTUP_FUNC(init_txtsim_mal)
1491 329 : { mal_module("txtsim", NULL, txtsim_init_funcs); }
|