Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : #include "monetdb_config.h"
14 : #include "gdk.h"
15 : #include "str.h"
16 : #include "mal.h"
17 : #include "mal_exception.h"
18 : #include "mal_interpreter.h"
19 : #include "mutf8.h"
20 : #include <string.h>
21 : #include <limits.h>
22 :
23 : #define MIN3(X,Y,Z) (MIN(MIN((X),(Y)),(Z)))
24 : #define MAX3(X,Y,Z) (MAX(MAX((X),(Y)),(Z)))
25 : #define MIN4(W,X,Y,Z) (MIN(MIN(MIN(W,X),Y),Z))
26 :
27 : static void
28 15 : reclaim_bats(int nargs, ...)
29 : {
30 15 : va_list valist;
31 :
32 15 : va_start(valist, nargs);
33 90 : for (int i = 0; i < nargs; i++) {
34 75 : BAT *b = va_arg(valist, BAT *);
35 120 : BBPreclaim(b);
36 : }
37 15 : va_end(valist);
38 15 : }
39 :
40 : static inline int *
41 4086 : damerau_get_cellpointer(int *pOrigin, int col, int row, int nCols)
42 : {
43 4086 : return pOrigin + col + (row * (nCols + 1));
44 : }
45 :
46 : static inline int
47 2252 : damerau_getat(int *pOrigin, int col, int row, int nCols)
48 : {
49 2252 : int *pCell;
50 2252 : pCell = damerau_get_cellpointer(pOrigin, col, row, nCols);
51 2252 : return *pCell;
52 : }
53 :
54 : static inline void
55 1834 : damerau_putat(int *pOrigin, int col, int row, int nCols, int x)
56 : {
57 1834 : int *pCell;
58 1834 : pCell = damerau_get_cellpointer(pOrigin, col, row, nCols);
59 1834 : *pCell = x;
60 : }
61 :
62 : static str
63 82 : dameraulevenshtein(int *res, const char *s, const char *t, int insdel_cost,
64 : int replace_cost, int transpose_cost)
65 : {
66 82 : int *d; /* pointer to matrix */
67 82 : int n; /* length of s */
68 82 : int m; /* length of t */
69 82 : int i; /* iterates through s */
70 82 : int j; /* iterates through t */
71 82 : char s_i; /* ith character of s */
72 82 : char t_j; /* jth character of t */
73 82 : int cost; /* cost */
74 82 : int cell; /* contents of target cell */
75 82 : int above; /* contents of cell immediately above */
76 82 : int left; /* contents of cell immediately to left */
77 82 : int diag; /* contents of cell immediately above and to left */
78 82 : lng sz; /* number of cells in matrix */
79 82 : int diag2 = 0, cost2 = 0;
80 :
81 157 : if (strNil(s) || strNil(t)) {
82 10 : *res = int_nil;
83 10 : return MAL_SUCCEED;
84 : }
85 :
86 : /* Step 1 */
87 72 : n = (int) strlen(s); /* 64bit: assume strings are less than 2 GB */
88 72 : m = (int) strlen(t);
89 72 : if (n == 0) {
90 8 : *res = m;
91 8 : return MAL_SUCCEED;
92 : }
93 64 : if (m == 0) {
94 8 : *res = n;
95 8 : return MAL_SUCCEED;
96 : }
97 56 : sz = (((lng)n) + 1) * (((lng)m) + 1) * sizeof(int);
98 56 : if (sz > (LL_CONSTANT(1)<<28))
99 1 : throw(MAL, "dameraulevenshtein", SQLSTATE(HY013) MAL_MALLOC_FAIL);
100 55 : d = (int *) GDKmalloc((size_t)sz);
101 56 : if (d == NULL)
102 0 : throw(MAL, "dameraulevenshtein", SQLSTATE(HY013) MAL_MALLOC_FAIL);
103 :
104 : /* Step 2 */
105 341 : for (i = 0; i <= n; i++) {
106 285 : damerau_putat(d, i, 0, n, i);
107 : }
108 319 : for (j = 0; j <= m; j++) {
109 263 : damerau_putat(d, 0, j, n, j);
110 : }
111 : /* Step 3 */
112 282 : for (i = 1; i <= n; i++) {
113 226 : s_i = s[i - 1];
114 : /* Step 4 */
115 1512 : for (j = 1; j <= m; j++) {
116 1286 : t_j = t[j - 1];
117 : /* Step 5 */
118 1286 : if (s_i == t_j) {
119 : cost = 0;
120 : } else {
121 974 : cost = replace_cost;
122 : }
123 : /* Step 6 */
124 1286 : above = damerau_getat(d, i - 1, j, n);
125 1286 : left = damerau_getat(d, i, j - 1, n);
126 1286 : diag = damerau_getat(d, i - 1, j - 1, n);
127 1286 : if (j >= 2 && i >= 2) {
128 : /* NEW: detect transpositions */
129 910 : diag2 = damerau_getat(d, i - 2, j - 2, n);
130 910 : if (s[i - 2] == t[j - 1] && s[i - 1] == t[j - 2]) {
131 : cost2 = transpose_cost;
132 : } else {
133 838 : cost2 = 2;
134 : }
135 910 : cell = MIN4(above + insdel_cost, left + insdel_cost,
136 : diag + cost, diag2 + cost2);
137 : } else {
138 376 : cell = MIN3(above + insdel_cost, left + insdel_cost,
139 : diag + cost);
140 : }
141 1286 : damerau_putat(d, i, j, n, cell);
142 : }
143 : }
144 : /* Step 7 */
145 56 : *res = damerau_getat(d, n, m, n);
146 56 : GDKfree(d);
147 56 : return MAL_SUCCEED;
148 : }
149 :
150 : static str
151 38 : TXTSIMdameraulevenshtein1(int *result, const char *const *s, const char *const *t)
152 : {
153 38 : return dameraulevenshtein(result, *s, *t, 1, 1, 2);
154 : }
155 :
156 : static str
157 9 : TXTSIMdameraulevenshtein2(int *result, const char *const *s, const char *const *t)
158 : {
159 9 : return dameraulevenshtein(result, *s, *t, 1, 1, 1);
160 : }
161 :
162 : static str
163 2 : TXTSIMdameraulevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
164 : InstrPtr pci)
165 : {
166 2 : (void) cntxt;
167 2 : (void) mb;
168 2 : int *res = getArgReference_int(stk, pci, 0);
169 2 : const char *x = *getArgReference_str(stk, pci, 1),
170 2 : *y = *getArgReference_str(stk, pci, 2);
171 2 : int insdel_cost, replace_cost, transpose_cost;
172 :
173 2 : assert(pci->argc == 3 || pci->argc == 6);
174 :
175 2 : if (pci->argc == 3) {
176 : insdel_cost = 1;
177 : replace_cost = 1;
178 : transpose_cost = 2;
179 : } else {
180 1 : insdel_cost = *getArgReference_int(stk, pci, 3);
181 1 : replace_cost = *getArgReference_int(stk, pci, 4);
182 1 : transpose_cost = *getArgReference_int(stk, pci, 5);
183 : }
184 :
185 2 : return dameraulevenshtein(res, x, y, insdel_cost, replace_cost,
186 : transpose_cost);
187 : }
188 :
189 : static inline str
190 23 : levenshtein(int *res, const char *x, const char *y, int insdel_cost,
191 : int replace_cost, int max)
192 : {
193 23 : unsigned int xlen, ylen, i = 0, j = 0;
194 23 : unsigned int last_diagonal, old_diagonal;
195 23 : uint32_t cx, cy;
196 23 : unsigned int *column, min;
197 :
198 44 : if (strNil(x) || strNil(y)) {
199 3 : *res = int_nil;
200 3 : return MAL_SUCCEED;
201 : }
202 :
203 20 : xlen = UTF8_strlen(x);
204 20 : ylen = UTF8_strlen(y);
205 :
206 20 : if (xlen == ylen && (strcmp(x, y) == 0)) {
207 6 : *res = 0;
208 6 : return MAL_SUCCEED;
209 : }
210 :
211 14 : column = GDKmalloc((xlen + 1) * sizeof(unsigned int));
212 14 : if (column == NULL)
213 0 : throw(MAL, "levenshtein", MAL_MALLOC_FAIL);
214 :
215 91 : for (i = 1; i <= xlen; i++)
216 77 : column[i] = i;
217 :
218 108 : for (j = 1; j <= ylen; j++) {
219 94 : column[0] = j;
220 94 : min = INT_MAX;
221 94 : const char *x_iter = x;
222 94 : y = nextchar(y, &cy);
223 94 : if (y == NULL)
224 0 : goto illegal;
225 518 : for (i = 1, last_diagonal = j - 1; i <= xlen; i++) {
226 424 : x_iter = nextchar(x_iter, &cx);
227 424 : if (x_iter == NULL)
228 0 : goto illegal;
229 424 : old_diagonal = column[i];
230 424 : column[i] = MIN3(column[i] + insdel_cost,
231 : column[i - 1] + insdel_cost,
232 : last_diagonal + (cx == cy ? 0 : replace_cost));
233 424 : last_diagonal = old_diagonal;
234 424 : if (last_diagonal < min)
235 : min = last_diagonal;
236 : }
237 94 : if (max != -1 && min > (unsigned int) max) {
238 0 : *res = INT_MAX;
239 0 : GDKfree(column);
240 0 : return MAL_SUCCEED;
241 : }
242 : }
243 :
244 14 : *res = column[xlen];
245 14 : GDKfree(column);
246 14 : return MAL_SUCCEED;
247 0 : illegal:
248 0 : GDKfree(column);
249 0 : throw(MAL, "txtsim.levenshtein", "Illegal unicode code point");
250 : }
251 :
252 : /* Levenshtein OP but with column externally allocated */
253 : static inline int
254 0 : levenshtein2(const char *x, const char *y, const size_t xlen, const size_t ylen,
255 : unsigned int *column, const int insdel_cost,
256 : const int replace_cost, const int max)
257 : {
258 0 : unsigned int i = 0, j = 0, min;;
259 0 : unsigned int last_diagonal, old_diagonal;
260 0 : uint32_t cx, cy;
261 :
262 0 : if (strNil(x) || strNil(y))
263 0 : return int_nil;
264 :
265 0 : if (xlen == ylen && (strcmp(x, y) == 0))
266 : return 0;
267 :
268 0 : for (i = 1; i <= xlen; i++)
269 0 : column[i] = i;
270 :
271 0 : for (j = 1; j <= ylen; j++) {
272 0 : column[0] = j;
273 0 : min = INT_MAX;
274 0 : const char *x_iter = x;
275 0 : y = nextchar(y, &cy);
276 0 : if (y == NULL)
277 0 : goto illegal;
278 0 : for (i = 1, last_diagonal = j - 1; i <= xlen; i++) {
279 0 : x_iter = nextchar(x_iter, &cx);
280 0 : if (x_iter == NULL)
281 0 : goto illegal;
282 0 : old_diagonal = column[i];
283 0 : column[i] = MIN3(column[i] + insdel_cost,
284 : column[i - 1] + insdel_cost,
285 : last_diagonal + (cx == cy ? 0 : replace_cost));
286 0 : last_diagonal = old_diagonal;
287 0 : if (last_diagonal < min)
288 : min = last_diagonal;
289 : }
290 0 : if (max != -1 && min > (unsigned int) max)
291 : return INT_MAX;
292 : }
293 0 : return column[xlen];
294 : illegal:
295 : return INT_MAX;
296 : }
297 :
298 : static str
299 56 : TXTSIMlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
300 : {
301 56 : (void) cntxt;
302 56 : (void) mb;
303 56 : int *res = getArgReference_int(stk, pci, 0);
304 56 : const char *x = *getArgReference_str(stk, pci, 1),
305 56 : *y = *getArgReference_str(stk, pci, 2);
306 56 : int insdel_cost, replace_cost;
307 :
308 56 : if (pci->argc == 3) {
309 : insdel_cost = 1;
310 : replace_cost = 1;
311 34 : } else if (pci->argc == 5 || pci->argc == 6) {
312 34 : insdel_cost = *getArgReference_int(stk, pci, 3);
313 34 : replace_cost = *getArgReference_int(stk, pci, 4);
314 : /* Backwards compatibility purposes */
315 34 : if (pci->argc == 6) {
316 33 : int transposition_cost = *getArgReference_int(stk, pci, 5);
317 33 : return dameraulevenshtein(res, x, y, insdel_cost, replace_cost,
318 : transposition_cost);
319 : }
320 : } else {
321 0 : throw(MAL, "txtsim.levenshtein", RUNTIME_SIGNATURE_MISSING);
322 : }
323 :
324 23 : return levenshtein(res, x, y, insdel_cost, replace_cost, -1);;
325 : }
326 :
327 : static str
328 0 : TXTSIMmaxlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
329 : {
330 0 : (void) cntxt;
331 0 : (void) mb;
332 0 : int *res = getArgReference_int(stk, pci, 0);
333 0 : const char *x = *getArgReference_str(stk, pci, 1),
334 0 : *y = *getArgReference_str(stk, pci, 2);
335 0 : const int *k = getArgReference_int(stk, pci, 3);
336 0 : int insdel_cost, replace_cost;
337 :
338 0 : if (pci->argc == 4) {
339 : insdel_cost = 1;
340 : replace_cost = 1;
341 0 : } else if (pci->argc == 6) {
342 0 : insdel_cost = *getArgReference_int(stk, pci, 4);
343 0 : replace_cost = *getArgReference_int(stk, pci, 5);
344 : } else {
345 0 : throw(MAL, "txtsim.maxlevenshtein", RUNTIME_SIGNATURE_MISSING);
346 : }
347 :
348 0 : return levenshtein(res, x, y, insdel_cost, replace_cost, *k);
349 : }
350 :
351 : static str
352 0 : BATTXTSIMmaxlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
353 : {
354 0 : (void) cntxt;
355 0 : (void) mb;
356 0 : bat *res = getArgReference_bat(stk, pci, 0);
357 0 : bat *lid = getArgReference_bat(stk, pci, 1);
358 0 : bat *rid = getArgReference_bat(stk, pci, 2);
359 0 : int *k = getArgReference_int(stk, pci, 3);
360 0 : int insdel_cost = pci->argc == 6 ? *getArgReference_int(stk, pci, 4) : 1,
361 0 : replace_cost = pci->argc == 6 ? *getArgReference_int(stk, pci, 5) : 1;
362 0 : BAT *left = NULL, *right = NULL, *bn = NULL;
363 0 : BUN p, q;
364 0 : BATiter li, ri;
365 0 : unsigned int *buffer = NULL;
366 0 : const char *lv, *rv;
367 0 : str msg = MAL_SUCCEED;
368 0 : size_t llen = 0, rlen = 0, maxlen = 0;
369 0 : int d;
370 0 : bit v;
371 :
372 0 : if ((left = BATdescriptor(*lid)) == NULL) {
373 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
374 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
375 0 : goto exit;
376 : }
377 0 : if ((right = BATdescriptor(*rid)) == NULL) {
378 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
379 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
380 0 : goto exit;
381 : }
382 0 : if (BATcount(left) != BATcount(right)) {
383 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
384 : "Columns must be aligned");
385 0 : goto exit;
386 : }
387 0 : if ((bn = COLnew(0, TYPE_bit, BATcount(left), TRANSIENT)) == NULL) {
388 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
389 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
390 0 : goto exit;
391 : }
392 :
393 0 : li = bat_iterator(left);
394 0 : ri = bat_iterator(right);
395 0 : BATloop(left, p, q) {
396 0 : lv = BUNtail(li, p);
397 0 : rv = BUNtail(ri, p);
398 0 : llen = UTF8_strlen(lv);
399 0 : rlen = UTF8_strlen(rv);
400 0 : if (abs((int) llen - (int) rlen) > (int) *k)
401 0 : v = false;
402 : else {
403 0 : if (llen > maxlen) {
404 0 : maxlen = llen;
405 0 : unsigned int *tmp = GDKrealloc(buffer, (maxlen + 1) * sizeof(unsigned int));
406 0 : if (tmp == NULL) {
407 0 : bat_iterator_end(&li);
408 0 : bat_iterator_end(&ri);
409 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
410 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
411 0 : goto exit;
412 : }
413 : buffer = tmp;
414 : }
415 0 : d = levenshtein2(lv, rv, llen, rlen, buffer, insdel_cost,
416 : replace_cost, (int) *k);
417 0 : v = (bit) (d <= (int) *k);
418 : }
419 0 : if (BUNappend(bn, (const void *) &v, false) != GDK_SUCCEED) {
420 0 : bat_iterator_end(&li);
421 0 : bat_iterator_end(&ri);
422 0 : msg = createException(MAL, "battxtsim.maxlevenshtein",
423 : "BUNappend failed");
424 0 : goto exit;
425 : }
426 : }
427 0 : bat_iterator_end(&li);
428 0 : bat_iterator_end(&ri);
429 :
430 0 : *res = bn->batCacheid;
431 0 : BBPkeepref(bn);
432 0 : exit:
433 0 : GDKfree(buffer);
434 0 : BBPreclaim(left);
435 0 : BBPreclaim(right);
436 0 : if (msg != MAL_SUCCEED)
437 0 : BBPreclaim(bn);
438 0 : return msg;
439 : }
440 :
441 : #define JARO_WINKLER_SCALING_FACTOR 0.1
442 : #define JARO_WINKLER_PREFIX_LEN 4
443 :
444 : typedef struct {
445 : size_t matches; /* accumulator for number of matches for this item */
446 : BUN o; /* position in the BAT */
447 : const char *val; /* string value */
448 : int *cp_sequence; /* string as array of Unicode codepoints */
449 : int len; /* string length in characters (multi-byte characters count as 1) */
450 : int cp_seq_len; /* string length in bytes */
451 : uint64_t abm; /* 64bit alphabet bitmap */
452 : int abm_popcount; /* hamming weight of abm */
453 : } str_item;
454 :
455 : static inline int
456 96 : popcount64(uint64_t x)
457 : {
458 : #ifdef __has_builtin
459 : #if __has_builtin(__builtin_popcountll)
460 96 : return (int) __builtin_popcountll(x);
461 : #define BUILTIN_USED
462 : #endif
463 : #endif
464 : #ifndef BUILTIN_USED
465 : #if defined(_MSC_VER)
466 : #if SIZEOF_OID == 4
467 : /* no __popcnt64 on 32 bit Windows */
468 : return (int) (__popcnt((uint32_t) x) + __popcnt((uint32_t) (x >> 32)));
469 : #else
470 : return (int) __popcnt64(x);
471 : #endif
472 : #else
473 : x = (x & UINT64_C(0x5555555555555555)) + ((x >> 1) & UINT64_C(0x5555555555555555));
474 : x = (x & UINT64_C(0x3333333333333333)) + ((x >> 2) & UINT64_C(0x3333333333333333));
475 : x = (x & UINT64_C(0x0F0F0F0F0F0F0F0F)) + ((x >> 4) & UINT64_C(0x0F0F0F0F0F0F0F0F));
476 : return (int) ((x * UINT64_C(0x0101010101010101)) >> 56);
477 : #endif
478 : #endif
479 : #undef BUILTIN_USED
480 : }
481 :
482 : static int
483 43 : str_item_lenrev_cmp(const void *a, const void *b)
484 : {
485 43 : return ((int) ((str_item *) b)->len - (int) ((str_item *) a)->len);
486 : }
487 :
488 : static str
489 64 : str_2_codepointseq(str_item *s)
490 : {
491 64 : const uint8_t *p = (const uint8_t *) s->val;
492 :
493 64 : if (s->len == 0) {
494 0 : s->cp_sequence = NULL;
495 0 : return MAL_SUCCEED;
496 : }
497 64 : s->cp_sequence = GDKmalloc(s->len * sizeof(int));
498 64 : if (s->cp_sequence == NULL)
499 0 : throw(MAL, "str_2_byteseq", SQLSTATE(HY013) MAL_MALLOC_FAIL);
500 :
501 64 : uint32_t state = 0;
502 64 : uint32_t codepoint;
503 64 : uint32_t *buf = (uint32_t *) s->cp_sequence;
504 428 : while (*p) {
505 364 : switch (decode(&state, &codepoint, *p++)) {
506 364 : case UTF8_ACCEPT:
507 364 : *buf++ = codepoint;
508 364 : break;
509 0 : case UTF8_REJECT:
510 0 : throw(MAL, "str_2_byteseq", SQLSTATE(42000) "Illegal unicode code point");
511 : default:
512 : break;
513 : }
514 : }
515 : return MAL_SUCCEED;
516 : }
517 :
518 : static void
519 60 : str_alphabet_bitmap(str_item *s)
520 : {
521 60 : int i;
522 :
523 60 : s->abm = UINT64_C(0);
524 :
525 400 : for (i = 0; i < s->len; i++)
526 340 : s->abm |= UINT64_C(1) << (s->cp_sequence[i] % 64);
527 :
528 60 : s->abm_popcount = popcount64(s->abm);
529 60 : }
530 :
531 : static inline double
532 13 : jarowinkler_lp(const str_item *a, const str_item *b)
533 : {
534 13 : unsigned int l;
535 :
536 : /* calculate common string prefix up to prefixlen chars */
537 13 : l = 0;
538 65 : for (int i = 0; i < MIN3(a->len, b->len, JARO_WINKLER_PREFIX_LEN); i++)
539 52 : l += (a->cp_sequence[i] == b->cp_sequence[i]);
540 :
541 13 : return (double) l *JARO_WINKLER_SCALING_FACTOR;
542 : }
543 :
544 : static inline double
545 15 : jarowinkler(const str_item *x, const str_item *y, double lp, int *x_flags,
546 : int *y_flags)
547 : {
548 15 : int xlen = x->len, ylen = y->len;
549 15 : int range = MAX(0, MAX(xlen, ylen) / 2 - 1);
550 15 : int *x1 = x->cp_sequence, *x2 = y->cp_sequence;
551 15 : int m = 0, t = 0;
552 15 : int i, j, l;
553 15 : double dw;
554 :
555 15 : if (!xlen || !ylen)
556 : return 0.0;
557 89 : for (i = 0; i < xlen; i++)
558 74 : x_flags[i] = 0;
559 102 : for (i = 0; i < ylen; i++)
560 87 : y_flags[i] = 0;
561 : /* matching chars */
562 102 : for (i = 0; i < ylen; i++) {
563 280 : for (j = MAX(i - range, 0), l = MIN(i + range + 1, xlen); j < l; j++) {
564 225 : if (x2[i] == x1[j] && !x_flags[j]) {
565 32 : x_flags[j] = 1;
566 32 : y_flags[i] = 1;
567 32 : m++;
568 32 : break;
569 : }
570 : }
571 : }
572 15 : if (!m)
573 : return 0.0;
574 : /* char transpositions */
575 : l = 0;
576 90 : for (i = 0; i < ylen; i++) {
577 77 : if (y_flags[i] == 1) {
578 48 : for (j = l; j < xlen; j++) {
579 48 : if (x_flags[j] == 1) {
580 32 : l = j + 1;
581 32 : break;
582 : }
583 : }
584 32 : if (x2[i] != x1[j])
585 4 : t++;
586 : }
587 : }
588 13 : t /= 2;
589 :
590 : /* jaro similarity */
591 13 : dw = (((double) m / xlen) + ((double) m / ylen) +
592 13 : ((double) (m - t) / m)) / 3.0;
593 : /* calculate common string prefix up to prefixlen chars */
594 13 : if (lp == -1)
595 13 : lp = jarowinkler_lp(x, y);
596 : /* jaro-winkler similarity */
597 13 : dw = dw + (lp * (1 - dw));
598 13 : return dw;
599 : }
600 :
601 : static str
602 3 : TXTSIMjarowinkler(dbl *res, const char *const *x, const char *const *y)
603 : {
604 3 : int *x_flags = NULL, *y_flags = NULL;
605 3 : str_item xi, yi;
606 3 : str msg = MAL_SUCCEED;
607 :
608 6 : if (strNil(*x) || strNil(*y)) {
609 1 : *res = dbl_nil;
610 1 : return MAL_SUCCEED;
611 : }
612 4 : xi = (str_item) {
613 : .val = *x,
614 2 : .len = UTF8_strlen(*x),
615 : };
616 4 : yi = (str_item) {
617 2 : .val = *y,
618 2 : .len = UTF8_strlen(*y),
619 : };
620 2 : if (xi.len == 0 || yi.len == 0) {
621 0 : *res = 0;
622 0 : return MAL_SUCCEED;
623 : }
624 :
625 2 : if ((msg = str_2_codepointseq(&xi)) != MAL_SUCCEED)
626 0 : goto bailout;
627 :
628 2 : if ((msg = str_2_codepointseq(&yi)) != MAL_SUCCEED)
629 0 : goto bailout;
630 :
631 2 : x_flags = GDKmalloc(xi.len * sizeof(int));
632 2 : y_flags = GDKmalloc(yi.len * sizeof(int));
633 :
634 2 : if (x_flags == NULL || y_flags == NULL)
635 0 : goto bailout;
636 :
637 2 : *res = jarowinkler(&xi, &yi, -1, x_flags, y_flags);
638 :
639 2 : bailout:
640 2 : GDKfree(x_flags);
641 2 : GDKfree(y_flags);
642 2 : GDKfree(xi.cp_sequence);
643 2 : GDKfree(yi.cp_sequence);
644 2 : return msg;
645 : }
646 :
647 : static str
648 0 : TXTSIMminjarowinkler(bit *res, const char *const *x, const char *const *y, const dbl *threshold)
649 : {
650 0 : str msg = MAL_SUCCEED;
651 0 : double s = 1;
652 :
653 0 : msg = TXTSIMjarowinkler(&s, x, y);
654 0 : if (msg != MAL_SUCCEED)
655 0 : throw(MAL, "txt.minjarowinkler", OPERATION_FAILED);
656 :
657 0 : *res = (s > *threshold);
658 0 : return MAL_SUCCEED;
659 : }
660 :
661 : #define VALUE(s, x) (s##vars + VarHeapVal(s##vals, (x), s##width))
662 : #define APPEND(b, o) (((oid *) b->theap->base)[b->batCount++] = (o))
663 :
664 : #define PREP_BAT_STRITEM(B, CI, SI) \
665 : do { \
666 : for (n = 0; n < CI.ncand; n++) { \
667 : SI[n].matches = 0; \
668 : SI[n].o = canditer_next(&CI); \
669 : SI[n].val = VALUE(B, SI[n].o - B->hseqbase); \
670 : SI[n].cp_sequence = NULL; \
671 : SI[n].len = UTF8_strlen(SI[n].val); \
672 : SI[n].cp_seq_len = str_strlen(SI[n].val); \
673 : if ((msg = str_2_codepointseq(&SI[n])) != MAL_SUCCEED) \
674 : goto exit; \
675 : str_alphabet_bitmap(&SI[n]); \
676 : } \
677 : } while (false)
678 :
679 : #define FINALIZE_BATS(L, R, LCI, RCI, LSI, RSI) \
680 : do { \
681 : assert(BATcount(L) == BATcount(R)); \
682 : BATsetcount(L, BATcount(L)); \
683 : BATsetcount(R, BATcount(R)); \
684 : for (n = 0; n < LCI.ncand; n++) { \
685 : if (LSI[n].matches > 1) { \
686 : L->tkey = false; \
687 : break; \
688 : } \
689 : } \
690 : if (n == LCI.ncand) { \
691 : L->tkey = true; \
692 : } \
693 : for (n = 0; n < RCI.ncand; n++) { \
694 : if (RSI[n].matches > 1) { \
695 : R->tkey = false; \
696 : break; \
697 : } \
698 : } \
699 : if (n == RCI.ncand) { \
700 : R->tkey = true; \
701 : } \
702 : BATordered(L); \
703 : BATordered(R); \
704 : L->theap->dirty |= BATcount(L) > 0; \
705 : R->theap->dirty |= BATcount(R) > 0; \
706 : } while (false)
707 :
708 : static inline int
709 12 : maxlevenshtein_extcol_stritem(const str_item *si1, const str_item *si2,
710 : unsigned int *column, const int k)
711 : {
712 12 : unsigned int lastdiag, olddiag;
713 12 : int c1, c2, x, y;
714 12 : unsigned int min;
715 12 : int s1len = si1->len, s2len = si2->len;
716 12 : int *s1 = si1->cp_sequence, *s2 = si2->cp_sequence;
717 : /* first test if the strings are equal */
718 12 : if (s1len == s2len) {
719 1 : for (x = 0; x < s1len; x++)
720 1 : if (s1[x] != s2[x])
721 : break;
722 1 : if (x == s1len)
723 : return 0;
724 : }
725 68 : for (y = 1; y <= s1len; y++)
726 56 : column[y] = y;
727 73 : for (x = 1; x <= s2len; x++) {
728 64 : c2 = s2[x - 1];
729 64 : column[0] = x;
730 64 : min = INT_MAX;
731 370 : for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
732 306 : olddiag = column[y];
733 306 : c1 = s1[y - 1];
734 306 : column[y] = MIN3(column[y] + 1, column[y - 1] + 1, lastdiag + (c1 == c2 ? 0 : 1));
735 306 : lastdiag = olddiag;
736 306 : if (lastdiag < min)
737 : min = lastdiag;
738 : }
739 64 : if (min > (unsigned int) k)
740 : return INT_MAX;
741 : }
742 9 : return column[s1len];
743 : }
744 :
745 : static str
746 6 : maxlevenshteinjoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int k)
747 : {
748 6 : BAT *r1t = NULL, *r2t = NULL;
749 6 : BUN n;
750 6 : struct canditer lci, rci;
751 6 : const char *lvals, *rvals, *lvars, *rvars;;
752 6 : int lwidth, rwidth, d;
753 6 : str_item *lsi = NULL, *rsi = NULL;
754 6 : str msg = MAL_SUCCEED;
755 6 : unsigned int *buffer = NULL;
756 :
757 18 : assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
758 6 : assert(ATOMtype(l->ttype) == TYPE_str);
759 :
760 6 : canditer_init(&lci, l, sl);
761 6 : canditer_init(&rci, r, sr);
762 :
763 6 : if (lci.ncand == 0 || rci.ncand == 0)
764 0 : goto exit;
765 :
766 6 : lvals = (const char *) Tloc(l, 0);
767 6 : rvals = (const char *) Tloc(r, 0);
768 6 : lvars = l->tvheap->base;
769 6 : rvars = r->tvheap->base;
770 6 : lwidth = l->twidth;
771 6 : rwidth = r->twidth;
772 :
773 6 : if ((r1t = COLnew(0, TYPE_oid, lci.ncand, TRANSIENT)) == NULL ||
774 6 : (r2t = COLnew(0, TYPE_oid, rci.ncand, TRANSIENT)) == NULL) {
775 0 : reclaim_bats(2, r1t, r2t);
776 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
777 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
778 0 : goto exit;
779 : }
780 :
781 6 : r1t->tsorted = r1t->trevsorted = false;
782 6 : r2t->tsorted = r2t->trevsorted = false;
783 :
784 6 : if ((lsi = GDKmalloc(lci.ncand * sizeof(str_item))) == NULL ||
785 6 : (rsi = GDKmalloc(rci.ncand * sizeof(str_item))) == NULL) {
786 0 : reclaim_bats(2, r1t, r2t);
787 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
788 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
789 0 : goto exit;
790 : }
791 :
792 12 : PREP_BAT_STRITEM(l, lci, lsi);
793 24 : PREP_BAT_STRITEM(r, rci, rsi);
794 6 : qsort(lsi, lci.ncand, sizeof(str_item), str_item_lenrev_cmp);
795 6 : qsort(rsi, rci.ncand, sizeof(str_item), str_item_lenrev_cmp);
796 :
797 5 : if ((buffer = GDKmalloc((lsi[0].len + 1) * sizeof(int))) == NULL) {
798 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
799 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
800 0 : goto exit;
801 : }
802 :
803 : /* join loop */
804 23 : for (BUN lstart = 0, rstart = 0; rstart < rci.ncand; rstart++) {
805 34 : for (n = lstart; n < lci.ncand; n++) {
806 : /* first and cheapest filter */
807 19 : if ((lsi[n].len) > k + rsi[rstart].len) {
808 0 : lstart++;
809 0 : continue; /* no possible matches yet for this r */
810 19 : } else if (rsi[rstart].len > k + lsi[n].len) {
811 : break; /* no more possible matches from this r */
812 : }
813 : /* filter by comparing alphabet bitmaps (imprecise but fast filter) */
814 17 : d = MAX(lsi[n].abm_popcount,
815 : rsi[rstart].abm_popcount) -
816 17 : popcount64(lsi[n].abm & rsi[rstart].abm);
817 17 : if (d > k)
818 4 : continue;
819 : /* final and most expensive test: Levenshtein distance */
820 13 : d = maxlevenshtein_extcol_stritem(&lsi[n], &rsi[rstart], buffer,
821 : (const unsigned int) k);
822 12 : if (d > k)
823 6 : continue;
824 : /* The match test succeeded */
825 6 : lsi[n].matches++;
826 6 : rsi[rstart].matches++;
827 6 : if (bunfastappOID(r1t, lsi[n].o) != GDK_SUCCEED) {
828 0 : reclaim_bats(2, r1t, r2t);
829 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
830 : OPERATION_FAILED "Failed bun append");
831 0 : goto exit;
832 : }
833 6 : if (bunfastappOID(r2t, rsi[rstart].o) != GDK_SUCCEED) {
834 0 : reclaim_bats(2, r1t, r2t);
835 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
836 : OPERATION_FAILED "Failed bun append");
837 0 : goto exit;
838 : }
839 : }
840 : }
841 :
842 24 : FINALIZE_BATS(r1t, r2t, lci, rci, lsi, rsi);
843 6 : *r1 = r1t;
844 6 : *r2 = r2t;
845 :
846 6 : exit:
847 12 : for (n = 0; lsi && n < lci.ncand; n++)
848 6 : GDKfree(lsi[n].cp_sequence);
849 24 : for (n = 0; rsi && n < rci.ncand; n++)
850 18 : GDKfree(rsi[n].cp_sequence);
851 6 : GDKfree(lsi);
852 6 : GDKfree(rsi);
853 6 : GDKfree(buffer);
854 6 : return msg;
855 : }
856 :
857 : static str
858 6 : TXTSIMmaxlevenshteinjoin(bat *r1, bat *r2, const bat *lid, const bat *rid,
859 : const bat *kid, const bat *slid, const bat *srid,
860 : const bit *nil_matches, const lng *estimate,
861 : const bit *anti)
862 : {
863 6 : (void) nil_matches;
864 6 : (void) estimate;
865 6 : (void) anti;
866 :
867 6 : BAT *bleft = NULL, *bright = NULL, *bk = NULL,
868 6 : *bcandleft = NULL, *bcandright = NULL, *r1t = NULL, *r2t = NULL;
869 6 : int k = 0;
870 6 : str msg = MAL_SUCCEED;
871 :
872 6 : if ((bleft = BATdescriptor(*lid)) == NULL ||
873 6 : (bright = BATdescriptor(*rid)) == NULL ||
874 6 : (bk = BATdescriptor(*kid)) == NULL) {
875 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
876 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
877 0 : goto exit;
878 : }
879 :
880 6 : if ((*slid != bat_nil && (bcandleft = BATdescriptor(*slid)) == NULL) ||
881 6 : (*srid != bat_nil && (bcandright = BATdescriptor(*srid)) == NULL)) {
882 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
883 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
884 0 : goto exit;
885 : }
886 :
887 6 : if (BATcount(bk) > 0) {
888 6 : BATiter ki = bat_iterator(bk);
889 6 : k = *(int *) BUNtloc(ki, 0);
890 6 : bat_iterator_end(&ki);
891 : }
892 :
893 6 : if ((msg = maxlevenshteinjoin(&r1t, &r2t, bleft, bright, bcandleft, bcandright, k)) != MAL_SUCCEED)
894 0 : goto exit;
895 :
896 6 : *r1 = r1t->batCacheid;
897 6 : *r2 = r2t->batCacheid;
898 6 : BBPkeepref(r1t);
899 6 : BBPkeepref(r2t);
900 :
901 6 : exit:
902 6 : reclaim_bats(5, bleft, bright, bcandleft, bcandright, bk);
903 6 : if (msg != MAL_SUCCEED)
904 0 : reclaim_bats(2, r1t, r2t);
905 6 : return msg;
906 : }
907 :
908 : static inline void
909 9 : jarowinkler_rangebounds(int *lb, int *ub, const str_item *a, const double lp,
910 : const double threshold)
911 : {
912 9 : *lb = (int) floor(3.0 * a->len * (threshold - lp) / (1.0 - lp) -
913 9 : (2.0 * a->len));
914 9 : *ub = (int) ceil(a->len / ((3.0 * (threshold - lp) / (1.0 - lp)) - 2.0));
915 9 : }
916 :
917 : /* version with given lp and m, and t = 0*/
918 : static inline double
919 19 : jarowinkler_lp_m_t0(const str_item *lsi, const str_item *rsi, double lp, int m)
920 : {
921 19 : double dw;
922 : /* Jaro similarity */
923 19 : dw = (((double) m / lsi->len) + ((double) m / rsi->len) + 1.0) / 3.0;
924 : /* Jaro-Winkler similarity */
925 19 : dw = dw + (lp * (1 - dw));
926 19 : return dw;
927 : }
928 :
929 : static str
930 9 : minjarowinklerjoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT *sl, BAT *sr,
931 : const dbl threshold)
932 : {
933 9 : BAT *r1t = NULL, *r2t = NULL;
934 9 : BUN n;
935 9 : struct canditer lci, rci;
936 9 : const char *lvals, *rvals, *lvars, *rvars;
937 9 : int lwidth, rwidth, lb = 0, ub = 0, m = -1, *x_flags = NULL, *y_flags = NULL;
938 9 : str_item *ssl = NULL, *ssr = NULL, shortest;
939 9 : str msg = MAL_SUCCEED;
940 9 : const bool sliding_window_allowed = threshold > (2.01 + JARO_WINKLER_PREFIX_LEN * JARO_WINKLER_SCALING_FACTOR) / 3.0;
941 9 : double s, lp = 0;
942 :
943 27 : assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
944 9 : assert(ATOMtype(l->ttype) == TYPE_str);
945 :
946 9 : canditer_init(&lci, l, sl);
947 9 : canditer_init(&rci, r, sr);
948 :
949 9 : if (lci.ncand == 0 || rci.ncand == 0)
950 0 : goto exit;
951 :
952 9 : lvals = (const char *) Tloc(l, 0);
953 9 : rvals = (const char *) Tloc(r, 0);
954 9 : assert(r->ttype);
955 9 : lvars = l->tvheap->base;
956 9 : rvars = r->tvheap->base;
957 9 : lwidth = l->twidth;
958 9 : rwidth = r->twidth;
959 :
960 9 : if ((r1t = COLnew(0, TYPE_oid, lci.ncand, TRANSIENT)) == NULL ||
961 9 : (r2t = COLnew(0, TYPE_oid, rci.ncand, TRANSIENT)) == NULL) {
962 0 : reclaim_bats(2, r1t, r2t);
963 0 : msg = createException(MAL, "txtsim.minjarowinklerjoin",
964 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
965 0 : goto exit;
966 : }
967 :
968 9 : r1t->tsorted = r1t->trevsorted = false;
969 9 : r2t->tsorted = r2t->trevsorted = false;
970 :
971 9 : if ((ssl = GDKmalloc(lci.ncand * sizeof(str_item))) == NULL ||
972 9 : (ssr = GDKmalloc(rci.ncand * sizeof(str_item))) == NULL) {
973 0 : reclaim_bats(2, r1t, r2t);
974 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
975 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
976 0 : goto exit;
977 : }
978 :
979 18 : PREP_BAT_STRITEM(l, lci, ssl);
980 36 : PREP_BAT_STRITEM(r, rci, ssr);
981 9 : qsort(ssl, lci.ncand, sizeof(str_item), str_item_lenrev_cmp);
982 9 : qsort(ssr, rci.ncand, sizeof(str_item), str_item_lenrev_cmp);
983 :
984 9 : if ((x_flags = GDKmalloc(ssl[0].len * sizeof(int))) == NULL ||
985 9 : (y_flags = GDKmalloc(ssr[0].len * sizeof(int))) == NULL) {
986 0 : msg = createException(MAL, "txtsim.minjarowinklerjoin",
987 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
988 0 : goto exit;
989 : }
990 : // lp used for filters. Use -1 for actual JW (forces to compute actual lp)
991 36 : lp = JARO_WINKLER_PREFIX_LEN * JARO_WINKLER_SCALING_FACTOR;
992 :
993 : /* join loop */
994 36 : for (BUN lstart = 0, rstart = 0; rstart < rci.ncand; rstart++) {
995 27 : if (sliding_window_allowed)
996 9 : jarowinkler_rangebounds(&lb, &ub, &ssr[rstart], lp, threshold);
997 47 : for (n = lstart; n < lci.ncand; n++) {
998 : /* Update sliding window */
999 : /* This is the first and cheapest filter */
1000 27 : if (sliding_window_allowed) {
1001 9 : if (ssl[n].len > ub) { /* no possible matches yet for this r */
1002 1 : lstart++;
1003 1 : continue;
1004 : }
1005 8 : if (ssl[n].len < lb) { /* no more possible matches from this r */
1006 : break;
1007 : }
1008 : }
1009 : /* filter by comparing alphabet bitmaps */
1010 : /* find the best possible m: the length of the shorter string
1011 : * minus the number of characters that surely cannot match */
1012 19 : shortest = ssl[n].len < ssr[rstart].len ? ssl[n] : ssr[rstart];
1013 19 : m = shortest.len - popcount64(shortest.abm -
1014 19 : (ssl[n].abm & ssr[rstart].abm));
1015 : /* equivalent to:
1016 : m = shortest.len - popcount64(shortest.abm & (~(ssl[n].abm & ssr[rstart].abm))); */
1017 19 : s = jarowinkler_lp_m_t0(&ssl[n], &ssr[rstart], lp, m);
1018 19 : if (s < threshold) {
1019 6 : continue;
1020 : }
1021 : /* final and most expensive test: Jaro-Winkler similarity */
1022 13 : s = jarowinkler(&ssl[n], &ssr[rstart], -1, x_flags, y_flags);
1023 13 : if (s < threshold) {
1024 4 : continue;
1025 : }
1026 : /* The match test succeeded */
1027 9 : ssl[n].matches++;
1028 9 : ssr[rstart].matches++;
1029 9 : if (bunfastappOID(r1t, ssl[n].o) != GDK_SUCCEED) {
1030 0 : reclaim_bats(2, r1t, r2t);
1031 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
1032 : OPERATION_FAILED "Failed bun append");
1033 0 : goto exit;
1034 : }
1035 9 : if (bunfastappOID(r2t, ssr[rstart].o) != GDK_SUCCEED) {
1036 0 : reclaim_bats(2, r1t, r2t);
1037 0 : msg = createException(MAL, "txtsim.maxlevenshteinjoin",
1038 : OPERATION_FAILED "Failed bun append");
1039 0 : goto exit;
1040 : }
1041 : }
1042 : }
1043 :
1044 42 : FINALIZE_BATS(r1t, r2t, lci, rci, ssl, ssr);
1045 9 : *r1 = r1t;
1046 9 : *r2 = r2t;
1047 :
1048 9 : exit:
1049 9 : if (ssl)
1050 18 : for (n = 0; n < lci.ncand; n++)
1051 9 : GDKfree(ssl[n].cp_sequence);
1052 9 : if (ssr)
1053 36 : for (n = 0; n < rci.ncand; n++)
1054 27 : GDKfree(ssr[n].cp_sequence);
1055 9 : GDKfree(x_flags);
1056 9 : GDKfree(y_flags);
1057 9 : GDKfree(ssl);
1058 9 : GDKfree(ssr);
1059 9 : return msg;
1060 : }
1061 :
1062 : static str
1063 9 : TXTSIMminjarowinklerjoin(bat *r1, bat *r2, const bat *lid, const bat *rid,
1064 : const bat *thresholdid, const bat *slid,
1065 : const bat *srid, const bit *nil_matches,
1066 : const lng *estimate, const bit *anti)
1067 : {
1068 9 : (void) nil_matches;
1069 9 : (void) estimate;
1070 9 : (void) anti;
1071 :
1072 9 : BAT *bleft = NULL, *bright = NULL,
1073 9 : *bcandleft = NULL, *bcandright = NULL, *bthreshold = NULL,
1074 9 : *r1t = NULL, *r2t = NULL;
1075 9 : dbl threshold = 1;
1076 9 : str msg = MAL_SUCCEED;
1077 :
1078 9 : if ((bleft = BATdescriptor(*lid)) == NULL ||
1079 9 : (bright = BATdescriptor(*rid)) == NULL ||
1080 9 : (bthreshold = BATdescriptor(*thresholdid)) == NULL) {
1081 0 : msg = createException(MAL, "txtsim.minjarowinklerjoin",
1082 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1083 0 : goto exit;
1084 : }
1085 :
1086 9 : if ((*slid != bat_nil && (bcandleft = BATdescriptor(*slid)) == NULL) ||
1087 9 : (*srid != bat_nil && (bcandright = BATdescriptor(*srid)) == NULL)) {
1088 0 : msg = createException(MAL, "txtsim.minjarowinklerjoin",
1089 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1090 0 : goto exit;
1091 : }
1092 :
1093 9 : if (BATcount(bthreshold) > 0) {
1094 9 : BATiter thresholdi = bat_iterator(bthreshold);
1095 9 : threshold = *(dbl *) BUNtail(thresholdi, 0);
1096 9 : bat_iterator_end(&thresholdi);
1097 : }
1098 :
1099 9 : if ((msg = minjarowinklerjoin(&r1t, &r2t, bleft, bright, bcandleft, bcandright, threshold)) != MAL_SUCCEED)
1100 0 : goto exit;
1101 :
1102 9 : *r1 = r1t->batCacheid;
1103 9 : *r2 = r2t->batCacheid;
1104 9 : BBPkeepref(r1t);
1105 9 : BBPkeepref(r2t);
1106 :
1107 9 : exit:
1108 9 : reclaim_bats(5, bleft, bright, bcandleft, bcandright, bthreshold);
1109 9 : if (msg != MAL_SUCCEED)
1110 0 : reclaim_bats(2, r1t, r2t);
1111 9 : return msg;
1112 : }
1113 :
1114 : #define SoundexLen 4 /* length of a soundex code */
1115 : #define SoundexKey "Z000" /* default key for soundex code */
1116 :
1117 : /* set letter values */
1118 : static const int Code[] = { 0, 1, 2, 3, 0, 1,
1119 : 2, 0, 0, 2, 2, 4,
1120 : 5, 5, 0, 1, 2, 6,
1121 : 2, 3, 0, 1, 0, 2, 0, 2
1122 : };
1123 :
1124 : #define RETURN_NIL_IF(b,t) \
1125 : if (b) { \
1126 : if (ATOMextern(t)) { \
1127 : *(ptr*) res = (ptr) ATOMnil(t); \
1128 : if ( *(ptr *) res == NULL) \
1129 : throw(MAL,"txtsim", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
1130 : } else { \
1131 : memcpy(res, ATOMnilptr(t), ATOMsize(t)); \
1132 : } \
1133 : return MAL_SUCCEED; \
1134 : }
1135 :
1136 : static inline char
1137 155 : SCode(unsigned char c)
1138 : {
1139 155 : if (c == 95)
1140 : return (2); /* german sz */
1141 155 : return (Code[toupper(c) - 'A']);
1142 : }
1143 :
1144 : static str
1145 65 : soundex_code(const char *Name, char *Key)
1146 : {
1147 65 : char LastLetter;
1148 65 : int Index;
1149 :
1150 301 : for (const char *p = Name; *p; p++)
1151 237 : if ((*p & 0x80) != 0)
1152 1 : throw(MAL, "soundex",
1153 : SQLSTATE(42000)
1154 : "Soundex function not available for non ASCII strings");
1155 :
1156 : /* set default key */
1157 64 : strcpy(Key, SoundexKey);
1158 :
1159 : /* keep first letter */
1160 64 : Key[0] = *Name;
1161 64 : if (!isupper((unsigned char) (Key[0])))
1162 22 : Key[0] = toupper(Key[0]);
1163 :
1164 64 : LastLetter = *Name;
1165 64 : if (!*Name)
1166 : return MAL_SUCCEED;
1167 55 : Name++;
1168 :
1169 : /* scan rest of string */
1170 227 : for (Index = 1; (Index <SoundexLen) &&*Name; Name++) {
1171 : /* use only letters */
1172 172 : if (isalpha((unsigned char) (*Name))) {
1173 : /* ignore duplicate successive chars */
1174 169 : if (LastLetter != *Name) {
1175 : /* new LastLetter */
1176 155 : LastLetter = *Name;
1177 :
1178 : /* ignore letters with code 0 */
1179 155 : if (SCode(*Name) != 0) {
1180 74 : Key[Index] = '0' + SCode(*Name);
1181 74 : Index ++;
1182 : }
1183 : }
1184 : }
1185 : }
1186 : return MAL_SUCCEED;
1187 : }
1188 :
1189 : static str
1190 70 : soundex(str *res, const char *const *Name)
1191 : {
1192 70 : str msg = MAL_SUCCEED;
1193 :
1194 70 : GDKfree(*res);
1195 74 : RETURN_NIL_IF(strNil(*Name), TYPE_str);
1196 :
1197 66 : *res = (str) GDKmalloc(sizeof(char) * (SoundexLen + 1));
1198 65 : if (*res == NULL)
1199 0 : throw(MAL, "soundex", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1200 :
1201 : /* calculate Key for Name */
1202 65 : if ((msg = soundex_code(*Name, *res))) {
1203 1 : GDKfree(*res);
1204 1 : *res = NULL;
1205 1 : return msg;
1206 : }
1207 : return msg;
1208 : }
1209 :
1210 : static str
1211 29 : stringdiff(int *res, const char *const *s1, const char *const *s2)
1212 : {
1213 29 : str r = MAL_SUCCEED;
1214 29 : char *S1 = NULL, *S2 = NULL;
1215 :
1216 29 : r = soundex(&S1, s1);
1217 30 : if (r != MAL_SUCCEED)
1218 : return r;
1219 30 : r = soundex(&S2, s2);
1220 30 : if (r != MAL_SUCCEED) {
1221 0 : GDKfree(S1);
1222 0 : return r;
1223 : }
1224 30 : r = TXTSIMdameraulevenshtein1(res, &(const char *){S1}, &(const char *){S2});
1225 30 : GDKfree(S1);
1226 30 : GDKfree(S2);
1227 30 : return r;
1228 : }
1229 :
1230 : /******************************
1231 : * QGRAMNORMALIZE
1232 : *
1233 : * This function 'normalizes' a string so valid q-grams can be made of it:
1234 : * All characters are transformed to uppercase, and all characters
1235 : * which are not letters or digits are stripped to a single space.
1236 : *
1237 : * qgramnormalize("Hallo, allemaal!").print(); --> "HALLO ALLEMAAL"
1238 : * qgramnormalize(" '' t ' est").print(); --> [ "T EST" ]
1239 : *
1240 : *****************************/
1241 : static str
1242 13 : qgram_normalize(str *res, const char *const *Input)
1243 : {
1244 13 : const char *input = *Input;
1245 13 : int i, j = 0;
1246 13 : char c, last = ' ';
1247 :
1248 13 : GDKfree(*res);
1249 13 : RETURN_NIL_IF(strNil(input), TYPE_str);
1250 13 : *res = (str) GDKmalloc(sizeof(char) * (strlen(input) + 1)); /* normalized strings are never longer than original */
1251 13 : if (*res == NULL)
1252 0 : throw(MAL, "txtsim.qgramnormalize", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1253 :
1254 100 : for (i = 0; input[i]; i++) {
1255 87 : c = toupper(input[i]);
1256 87 : if (!(('A' <= c && c <= 'Z') || isdigit((unsigned char) c)))
1257 87 : c = ' ';
1258 87 : if (c != ' ' || last != ' ') {
1259 78 : (*res)[j++] = c;
1260 : }
1261 87 : last = c;
1262 : }
1263 13 : (*res)[j] = 0;
1264 : /* strip final whitespace */
1265 16 : while (j > 0 && (*res)[--j] == ' ')
1266 3 : (*res)[j] = 0;
1267 :
1268 : return MAL_SUCCEED;
1269 : }
1270 :
1271 : static str
1272 0 : qgram_selfjoin(bat *res1, bat *res2, bat *qid, bat *bid, bat *pid, bat *lid,
1273 : flt *c, int *k)
1274 : {
1275 0 : BAT *qgram, *id, *pos, *len;
1276 0 : BUN n;
1277 0 : BUN i, j;
1278 0 : BAT *bn, *bn2;
1279 0 : oid *qbuf;
1280 0 : int *ibuf;
1281 0 : int *pbuf;
1282 0 : int *lbuf;
1283 0 : str msg = MAL_SUCCEED;
1284 :
1285 0 : qgram = BATdescriptor(*qid);
1286 0 : id = BATdescriptor(*bid);
1287 0 : pos = BATdescriptor(*pid);
1288 0 : len = BATdescriptor(*lid);
1289 0 : if (qgram == NULL || id == NULL || pos == NULL || len == NULL) {
1290 0 : BBPreclaim(qgram);
1291 0 : BBPreclaim(id);
1292 0 : BBPreclaim(pos);
1293 0 : BBPreclaim(len);
1294 0 : throw(MAL, "txtsim.qgramselfjoin",
1295 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1296 : }
1297 :
1298 0 : BATiter qgrami = bat_iterator(qgram);
1299 0 : BATiter idi = bat_iterator(id);
1300 0 : BATiter posi = bat_iterator(pos);
1301 0 : BATiter leni = bat_iterator(len);
1302 0 : if (qgrami.type != TYPE_oid)
1303 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1304 : SEMANTIC_TYPE_MISMATCH
1305 : ": tail of BAT qgram must be oid");
1306 0 : else if (idi.type != TYPE_int)
1307 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1308 : SEMANTIC_TYPE_MISMATCH
1309 : ": tail of BAT id must be int");
1310 0 : else if (posi.type != TYPE_int)
1311 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1312 : SEMANTIC_TYPE_MISMATCH
1313 : ": tail of BAT pos must be int");
1314 0 : else if (leni.type != TYPE_int)
1315 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1316 : SEMANTIC_TYPE_MISMATCH
1317 : ": tail of BAT len must be int");
1318 0 : if (msg) {
1319 0 : bat_iterator_end(&qgrami);
1320 0 : bat_iterator_end(&idi);
1321 0 : bat_iterator_end(&posi);
1322 0 : bat_iterator_end(&leni);
1323 0 : BBPunfix(qgram->batCacheid);
1324 0 : BBPunfix(id->batCacheid);
1325 0 : BBPunfix(pos->batCacheid);
1326 0 : BBPunfix(len->batCacheid);
1327 0 : return msg;
1328 : }
1329 :
1330 0 : n = BATcount(qgram);
1331 :
1332 : /* if (BATcount(qgram)>1 && !qgrami.sorted) throw(MAL, "txtsim.qgramselfjoin", SEMANTIC_TYPE_MISMATCH); */
1333 :
1334 0 : if (!ALIGNsynced(qgram, id))
1335 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1336 : SEMANTIC_TYPE_MISMATCH
1337 : ": qgram and id are not synced");
1338 :
1339 0 : else if (!ALIGNsynced(qgram, pos))
1340 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1341 : SEMANTIC_TYPE_MISMATCH
1342 : ": qgram and pos are not synced");
1343 0 : else if (!ALIGNsynced(qgram, len))
1344 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1345 : SEMANTIC_TYPE_MISMATCH
1346 : ": qgram and len are not synced");
1347 :
1348 0 : else if (qgrami.width != ATOMsize(qgrami.type))
1349 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1350 : SEMANTIC_TYPE_MISMATCH
1351 : ": qgram is not a true void bat");
1352 0 : else if (idi.width != ATOMsize(idi.type))
1353 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1354 : SEMANTIC_TYPE_MISMATCH
1355 : ": id is not a true void bat");
1356 :
1357 0 : else if (posi.width != ATOMsize(posi.type))
1358 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1359 : SEMANTIC_TYPE_MISMATCH
1360 : ": pos is not a true void bat");
1361 0 : else if (leni.width != ATOMsize(leni.type))
1362 0 : msg = createException(MAL, "txtsim.qgramselfjoin",
1363 : SEMANTIC_TYPE_MISMATCH
1364 : ": len is not a true void bat");
1365 0 : if (msg) {
1366 0 : bat_iterator_end(&qgrami);
1367 0 : bat_iterator_end(&idi);
1368 0 : bat_iterator_end(&posi);
1369 0 : bat_iterator_end(&leni);
1370 0 : BBPunfix(qgram->batCacheid);
1371 0 : BBPunfix(id->batCacheid);
1372 0 : BBPunfix(pos->batCacheid);
1373 0 : BBPunfix(len->batCacheid);
1374 0 : return msg;
1375 : }
1376 :
1377 0 : bn = COLnew(0, TYPE_int, n, TRANSIENT);
1378 0 : bn2 = COLnew(0, TYPE_int, n, TRANSIENT);
1379 0 : if (bn == NULL || bn2 == NULL) {
1380 0 : bat_iterator_end(&qgrami);
1381 0 : bat_iterator_end(&idi);
1382 0 : bat_iterator_end(&posi);
1383 0 : bat_iterator_end(&leni);
1384 0 : BBPreclaim(bn);
1385 0 : BBPreclaim(bn2);
1386 0 : BBPunfix(qgram->batCacheid);
1387 0 : BBPunfix(id->batCacheid);
1388 0 : BBPunfix(pos->batCacheid);
1389 0 : BBPunfix(len->batCacheid);
1390 0 : throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1391 : }
1392 :
1393 0 : qbuf = (oid *) qgrami.base;
1394 0 : ibuf = (int *) idi.base;
1395 0 : pbuf = (int *) posi.base;
1396 0 : lbuf = (int *) leni.base;
1397 0 : for (i = 0; i < n - 1; i++) {
1398 0 : for (j = i + 1;
1399 0 : (j < n && qbuf[j] == qbuf[i]
1400 0 : && pbuf[j] <= (pbuf[i] + (*k + *c * MIN(lbuf[i], lbuf[j]))));
1401 0 : j++) {
1402 0 : if (ibuf[i] != ibuf[j]
1403 0 : && abs(lbuf[i] - lbuf[j]) <= (*k + *c * MIN(lbuf[i], lbuf[j]))) {
1404 0 : if (BUNappend(bn, ibuf + i, false) != GDK_SUCCEED
1405 0 : || BUNappend(bn2, ibuf + j, false) != GDK_SUCCEED) {
1406 0 : bat_iterator_end(&qgrami);
1407 0 : bat_iterator_end(&idi);
1408 0 : bat_iterator_end(&posi);
1409 0 : bat_iterator_end(&leni);
1410 0 : BBPunfix(qgram->batCacheid);
1411 0 : BBPunfix(id->batCacheid);
1412 0 : BBPunfix(pos->batCacheid);
1413 0 : BBPunfix(len->batCacheid);
1414 0 : BBPreclaim(bn);
1415 0 : BBPreclaim(bn2);
1416 0 : throw(MAL, "txtsim.qgramselfjoin",
1417 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1418 : }
1419 : }
1420 : }
1421 : }
1422 0 : bat_iterator_end(&qgrami);
1423 0 : bat_iterator_end(&idi);
1424 0 : bat_iterator_end(&posi);
1425 0 : bat_iterator_end(&leni);
1426 :
1427 0 : BBPunfix(qgram->batCacheid);
1428 0 : BBPunfix(id->batCacheid);
1429 0 : BBPunfix(pos->batCacheid);
1430 0 : BBPunfix(len->batCacheid);
1431 :
1432 0 : *res1 = bn->batCacheid;
1433 0 : BBPkeepref(bn);
1434 0 : *res2 = bn2->batCacheid;
1435 0 : BBPkeepref(bn2);
1436 :
1437 0 : return MAL_SUCCEED;
1438 : }
1439 :
1440 : /* copy up to utf8len UTF-8 encoded characters from src to buf
1441 : * stop early if buf (size given by bufsize) is too small, or if src runs out
1442 : * return number of UTF-8 characters copied (excluding NUL)
1443 : * close with NUL if enough space */
1444 : static size_t
1445 26 : utf8strncpy(char *buf, size_t bufsize, const char *src, size_t utf8len)
1446 : {
1447 26 : size_t cnt = 0;
1448 :
1449 128 : while (utf8len != 0 && *src != 0 && bufsize != 0) {
1450 102 : bufsize--;
1451 102 : utf8len--;
1452 102 : cnt++;
1453 102 : if (((*buf++ = *src++) & 0x80) != 0) {
1454 40 : while ((*src & 0xC0) == 0x80 && bufsize != 0) {
1455 20 : *buf++ = *src++;
1456 20 : bufsize--;
1457 : }
1458 : }
1459 : }
1460 26 : if (bufsize != 0)
1461 26 : *buf = 0;
1462 26 : return cnt;
1463 : }
1464 :
1465 : static str
1466 2 : str_2_qgrams(bat *ret, const char *const *val)
1467 : {
1468 2 : BAT *bn;
1469 2 : size_t i, len = strlen(*val) + 5;
1470 2 : str s = GDKmalloc(len);
1471 2 : char qgram[4 * 6 + 1]; /* 4 UTF-8 code points plus NULL byte */
1472 :
1473 2 : if (s == NULL)
1474 0 : throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1475 2 : strcpy(s, "##");
1476 2 : strcpy(s + 2, *val);
1477 2 : strcpy(s + len - 3, "$$");
1478 2 : bn = COLnew(0, TYPE_str, (BUN) strlen(*val), TRANSIENT);
1479 2 : if (bn == NULL) {
1480 0 : GDKfree(s);
1481 0 : throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1482 : }
1483 :
1484 : i = 0;
1485 26 : while (s[i]) {
1486 26 : if (utf8strncpy(qgram, sizeof(qgram), s + i, 4) < 4)
1487 : break;
1488 24 : if (BUNappend(bn, qgram, false) != GDK_SUCCEED) {
1489 0 : BBPreclaim(bn);
1490 0 : GDKfree(s);
1491 0 : throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1492 : }
1493 24 : if ((s[i++] & 0xC0) == 0xC0) {
1494 8 : while ((s[i] & 0xC0) == 0x80)
1495 4 : i++;
1496 : }
1497 : }
1498 2 : *ret = bn->batCacheid;
1499 2 : BBPkeepref(bn);
1500 2 : GDKfree(s);
1501 2 : return MAL_SUCCEED;
1502 : }
1503 :
1504 : #include "mel.h"
1505 : mel_func txtsim_init_funcs[] = {
1506 : pattern("txtsim", "dameraulevenshtein", TXTSIMdameraulevenshtein, false, "Calculates Damerau-Levenshtein distance between two strings, operation costs (ins/del = 1, replacement = 1, transposition = 2)", args(1,3,arg("",int),arg("x",str),arg("y",str))),
1507 : pattern("txtsim", "dameraulevenshtein", TXTSIMdameraulevenshtein, false, "Calculates Damerau-Levenshtein distance between two strings, variable operation costs (ins/del, replacement, transposition)", args(1,6,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
1508 : command("txtsim", "editdistance", TXTSIMdameraulevenshtein1, false, "Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and transpose = 2", args(1,3, arg("",int),arg("s",str),arg("t",str))),
1509 : command("txtsim", "editdistance2", TXTSIMdameraulevenshtein2, false, "Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and transpose = 1", args(1,3, arg("",int),arg("s",str),arg("t",str))),
1510 : pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "Calculates Levenshtein distance between two strings, operation costs (ins/del = 1, replacement = 1)", args(1,3,arg("",int),arg("s",str),arg("t",str))),
1511 : pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "Calculates Levenshtein distance between two strings, variable operation costs (ins/del, replacement)", args(1,5,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int))),
1512 : pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "(Backwards compatibility purposes) Calculates Damerau-Levenshtein distance between two strings, variable operation costs (ins/del, replacement, transposition)", args(1,6,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
1513 : pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false, "Levenshtein distance with basic costs but up to a MAX", args(1, 4, arg("",int), arg("l",str),arg("r",str),arg("k",int))),
1514 : pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false, "Levenshtein distance with variable costs but up to a MAX", args(1, 6, arg("",int), arg("l",str),arg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
1515 : pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false, "Same as maxlevenshtein but for BATS", args(1, 4, batarg("",bit), batarg("l",str),batarg("r",str),arg("k",int))),
1516 : pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false, "Same as maxlevenshtein but for BATS", args(1, 6, batarg("",bit), batarg("l",str),batarg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
1517 : command("txtsim", "maxlevenshteinjoin", TXTSIMmaxlevenshteinjoin, false, "", args(2,10, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("k",int),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
1518 : command("txtsim", "soundex", soundex, false, "Soundex function for phonetic matching", args(1,2, arg("",str),arg("name",str))),
1519 : command("txtsim", "stringdiff", stringdiff, false, "Calculate the soundexed editdistance", args(1,3, arg("",int),arg("s1",str),arg("s2",str))),
1520 : command("txtsim", "qgramnormalize", qgram_normalize, false, "'Normalizes' strings (eg. toUpper and replaces non-alphanumerics with one space", args(1,2, arg("",str),arg("input",str))),
1521 : command("txtsim", "qgramselfjoin", qgram_selfjoin, false, "QGram self-join on ordered(!) qgram tables and sub-ordered q-gram positions", args(2,8, batarg("",int),batarg("",int),batarg("qgram",oid),batarg("id",oid),batarg("pos",int),batarg("len",int),arg("c",flt),arg("k",int))),
1522 : command("txtsim", "str2qgrams", str_2_qgrams, false, "Break the string into 4-grams", args(1,2, batarg("",str),arg("s",str))),
1523 : command("txtsim", "jarowinkler", TXTSIMjarowinkler, false, "Calculate Jaro Winkler similarity", args(1,3, arg("",dbl),arg("x",str),arg("y",str))),
1524 : command("txtsim", "minjarowinkler", TXTSIMminjarowinkler, false, "", args(1, 4, arg("",bit), arg("l",str),arg("r",str),arg("threshold",dbl))),
1525 : command("txtsim", "minjarowinklerjoin", TXTSIMminjarowinklerjoin, false, "", args(2, 10, batarg("",oid),batarg("",oid), batarg("l",str),batarg("r",str),batarg("threshold",dbl),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
1526 : { .imp=NULL }
1527 : };
1528 : #include "mal_import.h"
1529 : #ifdef _MSC_VER
1530 : #undef read
1531 : #pragma section(".CRT$XCU",read)
1532 : #endif
1533 345 : LIB_STARTUP_FUNC(init_txtsim_mal)
1534 345 : { mal_module("txtsim", NULL, txtsim_init_funcs); }
|