LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - txtsim.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 543 849 64.0 %
Date: 2024-12-20 21:24:02 Functions: 27 32 84.4 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : #include "monetdb_config.h"
      14             : #include "gdk.h"
      15             : #include "str.h"
      16             : #include "mal.h"
      17             : #include "mal_exception.h"
      18             : #include "mal_interpreter.h"
      19             : #include "mutf8.h"
      20             : #include <string.h>
      21             : #include <limits.h>
      22             : 
      23             : #define MIN3(X,Y,Z)   (MIN(MIN((X),(Y)),(Z)))
      24             : #define MAX3(X,Y,Z)   (MAX(MAX((X),(Y)),(Z)))
      25             : #define MIN4(W,X,Y,Z) (MIN(MIN(MIN(W,X),Y),Z))
      26             : 
      27             : static void
      28          15 : reclaim_bats(int nargs, ...)
      29             : {
      30          15 :         va_list valist;
      31             : 
      32          15 :         va_start(valist, nargs);
      33          90 :         for (int i = 0; i < nargs; i++) {
      34          75 :                 BAT *b = va_arg(valist, BAT *);
      35         120 :                 BBPreclaim(b);
      36             :         }
      37          15 :         va_end(valist);
      38          15 : }
      39             : 
      40             : static inline int *
      41        4086 : damerau_get_cellpointer(int *pOrigin, int col, int row, int nCols)
      42             : {
      43        4086 :         return pOrigin + col + (row * (nCols + 1));
      44             : }
      45             : 
      46             : static inline int
      47        2252 : damerau_getat(int *pOrigin, int col, int row, int nCols)
      48             : {
      49        2252 :         int *pCell;
      50        2252 :         pCell = damerau_get_cellpointer(pOrigin, col, row, nCols);
      51        2252 :         return *pCell;
      52             : }
      53             : 
      54             : static inline void
      55        1834 : damerau_putat(int *pOrigin, int col, int row, int nCols, int x)
      56             : {
      57        1834 :         int *pCell;
      58        1834 :         pCell = damerau_get_cellpointer(pOrigin, col, row, nCols);
      59        1834 :         *pCell = x;
      60             : }
      61             : 
      62             : static str
      63          82 : dameraulevenshtein(int *res, const char *s, const char *t, int insdel_cost,
      64             :                                    int replace_cost, int transpose_cost)
      65             : {
      66          82 :         int *d;                                         /* pointer to matrix */
      67          82 :         int n;                                          /* length of s */
      68          82 :         int m;                                          /* length of t */
      69          82 :         int i;                                          /* iterates through s */
      70          82 :         int j;                                          /* iterates through t */
      71          82 :         char s_i;                                       /* ith character of s */
      72          82 :         char t_j;                                       /* jth character of t */
      73          82 :         int cost;                                       /* cost */
      74          82 :         int cell;                                       /* contents of target cell */
      75          82 :         int above;                                      /* contents of cell immediately above */
      76          82 :         int left;                                       /* contents of cell immediately to left */
      77          82 :         int diag;                                       /* contents of cell immediately above and to left */
      78          82 :         lng sz;                                 /* number of cells in matrix */
      79          82 :         int diag2 = 0, cost2 = 0;
      80             : 
      81         157 :         if (strNil(s) || strNil(t)) {
      82          10 :                 *res = int_nil;
      83          10 :                 return MAL_SUCCEED;
      84             :         }
      85             : 
      86             :         /* Step 1 */
      87          72 :         n = (int) strlen(s);            /* 64bit: assume strings are less than 2 GB */
      88          72 :         m = (int) strlen(t);
      89          72 :         if (n == 0) {
      90           8 :                 *res = m;
      91           8 :                 return MAL_SUCCEED;
      92             :         }
      93          64 :         if (m == 0) {
      94           8 :                 *res = n;
      95           8 :                 return MAL_SUCCEED;
      96             :         }
      97          56 :         sz = (((lng)n) + 1) * (((lng)m) + 1) * sizeof(int);
      98          56 :         if (sz > (LL_CONSTANT(1)<<28))
      99           1 :                 throw(MAL, "dameraulevenshtein", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     100          55 :         d = (int *) GDKmalloc((size_t)sz);
     101          56 :         if (d == NULL)
     102           0 :                 throw(MAL, "dameraulevenshtein", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     103             : 
     104             :         /* Step 2 */
     105         341 :         for (i = 0; i <= n; i++) {
     106         285 :                 damerau_putat(d, i, 0, n, i);
     107             :         }
     108         319 :         for (j = 0; j <= m; j++) {
     109         263 :                 damerau_putat(d, 0, j, n, j);
     110             :         }
     111             :         /* Step 3 */
     112         282 :         for (i = 1; i <= n; i++) {
     113         226 :                 s_i = s[i - 1];
     114             :                 /* Step 4 */
     115        1512 :                 for (j = 1; j <= m; j++) {
     116        1286 :                         t_j = t[j - 1];
     117             :                         /* Step 5 */
     118        1286 :                         if (s_i == t_j) {
     119             :                                 cost = 0;
     120             :                         } else {
     121         974 :                                 cost = replace_cost;
     122             :                         }
     123             :                         /* Step 6 */
     124        1286 :                         above = damerau_getat(d, i - 1, j, n);
     125        1286 :                         left = damerau_getat(d, i, j - 1, n);
     126        1286 :                         diag = damerau_getat(d, i - 1, j - 1, n);
     127        1286 :                         if (j >= 2 && i >= 2) {
     128             :                                 /* NEW: detect transpositions */
     129         910 :                                 diag2 = damerau_getat(d, i - 2, j - 2, n);
     130         910 :                                 if (s[i - 2] == t[j - 1] && s[i - 1] == t[j - 2]) {
     131             :                                         cost2 = transpose_cost;
     132             :                                 } else {
     133         838 :                                         cost2 = 2;
     134             :                                 }
     135         910 :                                 cell = MIN4(above + insdel_cost, left + insdel_cost,
     136             :                                                         diag + cost, diag2 + cost2);
     137             :                         } else {
     138         376 :                                 cell = MIN3(above + insdel_cost, left + insdel_cost,
     139             :                                                         diag + cost);
     140             :                         }
     141        1286 :                         damerau_putat(d, i, j, n, cell);
     142             :                 }
     143             :         }
     144             :         /* Step 7 */
     145          56 :         *res = damerau_getat(d, n, m, n);
     146          56 :         GDKfree(d);
     147          56 :         return MAL_SUCCEED;
     148             : }
     149             : 
     150             : static str
     151          38 : TXTSIMdameraulevenshtein1(int *result, const char *const *s, const char *const *t)
     152             : {
     153          38 :         return dameraulevenshtein(result, *s, *t, 1, 1, 2);
     154             : }
     155             : 
     156             : static str
     157           9 : TXTSIMdameraulevenshtein2(int *result, const char *const *s, const char *const *t)
     158             : {
     159           9 :         return dameraulevenshtein(result, *s, *t, 1, 1, 1);
     160             : }
     161             : 
     162             : static str
     163           2 : TXTSIMdameraulevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
     164             :                                                  InstrPtr pci)
     165             : {
     166           2 :         (void) cntxt;
     167           2 :         (void) mb;
     168           2 :         int *res = getArgReference_int(stk, pci, 0);
     169           2 :         const char *x = *getArgReference_str(stk, pci, 1),
     170           2 :                 *y = *getArgReference_str(stk, pci, 2);
     171           2 :         int insdel_cost, replace_cost, transpose_cost;
     172             : 
     173           2 :         assert(pci->argc == 3 || pci->argc == 6);
     174             : 
     175           2 :         if (pci->argc == 3) {
     176             :                 insdel_cost = 1;
     177             :                 replace_cost = 1;
     178             :                 transpose_cost = 2;
     179             :         } else {
     180           1 :                 insdel_cost = *getArgReference_int(stk, pci, 3);
     181           1 :                 replace_cost = *getArgReference_int(stk, pci, 4);
     182           1 :                 transpose_cost = *getArgReference_int(stk, pci, 5);
     183             :         }
     184             : 
     185           2 :         return dameraulevenshtein(res, x, y, insdel_cost, replace_cost,
     186             :                                                           transpose_cost);
     187             : }
     188             : 
     189             : static inline str
     190          23 : levenshtein(int *res, const char *x, const char *y, int insdel_cost,
     191             :                         int replace_cost, int max)
     192             : {
     193          23 :         unsigned int xlen, ylen, i = 0, j = 0;
     194          23 :         unsigned int last_diagonal, old_diagonal;
     195          23 :         uint32_t cx, cy;
     196          23 :         unsigned int *column, min;
     197             : 
     198          44 :         if (strNil(x) || strNil(y)) {
     199           3 :                 *res = int_nil;
     200           3 :                 return MAL_SUCCEED;
     201             :         }
     202             : 
     203          20 :         xlen = UTF8_strlen(x);
     204          20 :         ylen = UTF8_strlen(y);
     205             : 
     206          20 :         if (xlen == ylen && (strcmp(x, y) == 0)) {
     207           6 :                 *res = 0;
     208           6 :                 return MAL_SUCCEED;
     209             :         }
     210             : 
     211          14 :         column = GDKmalloc((xlen + 1) * sizeof(unsigned int));
     212          14 :         if (column == NULL)
     213           0 :                 throw(MAL, "levenshtein", MAL_MALLOC_FAIL);
     214             : 
     215          91 :         for (i = 1; i <= xlen; i++)
     216          77 :                 column[i] = i;
     217             : 
     218         108 :         for (j = 1; j <= ylen; j++) {
     219          94 :                 column[0] = j;
     220          94 :                 min = INT_MAX;
     221          94 :                 const char *x_iter = x;
     222          94 :                 y = nextchar(y, &cy);
     223          94 :                 if (y == NULL)
     224           0 :                         goto illegal;
     225         518 :                 for (i = 1, last_diagonal = j - 1; i <= xlen; i++) {
     226         424 :                         x_iter = nextchar(x_iter, &cx);
     227         424 :                         if (x_iter == NULL)
     228           0 :                                 goto illegal;
     229         424 :                         old_diagonal = column[i];
     230         424 :                         column[i] = MIN3(column[i] + insdel_cost,
     231             :                                                          column[i - 1] + insdel_cost,
     232             :                                                          last_diagonal + (cx == cy ? 0 : replace_cost));
     233         424 :                         last_diagonal = old_diagonal;
     234         424 :                         if (last_diagonal < min)
     235             :                                 min = last_diagonal;
     236             :                 }
     237          94 :                 if (max != -1 && min > (unsigned int) max) {
     238           0 :                         *res = INT_MAX;
     239           0 :                         GDKfree(column);
     240           0 :                         return MAL_SUCCEED;
     241             :                 }
     242             :         }
     243             : 
     244          14 :         *res = column[xlen];
     245          14 :         GDKfree(column);
     246          14 :         return MAL_SUCCEED;
     247           0 :   illegal:
     248           0 :         GDKfree(column);
     249           0 :         throw(MAL, "txtsim.levenshtein", "Illegal unicode code point");
     250             : }
     251             : 
     252             : /* Levenshtein OP but with column externally allocated */
     253             : static inline int
     254           0 : levenshtein2(const char *x, const char *y, const size_t xlen, const size_t ylen,
     255             :                          unsigned int *column, const int insdel_cost,
     256             :                          const int replace_cost, const int max)
     257             : {
     258           0 :         unsigned int i = 0, j = 0, min;;
     259           0 :         unsigned int last_diagonal, old_diagonal;
     260           0 :         uint32_t cx, cy;
     261             : 
     262           0 :         if (strNil(x) || strNil(y))
     263           0 :                 return int_nil;
     264             : 
     265           0 :         if (xlen == ylen && (strcmp(x, y) == 0))
     266             :                 return 0;
     267             : 
     268           0 :         for (i = 1; i <= xlen; i++)
     269           0 :                 column[i] = i;
     270             : 
     271           0 :         for (j = 1; j <= ylen; j++) {
     272           0 :                 column[0] = j;
     273           0 :                 min = INT_MAX;
     274           0 :                 const char *x_iter = x;
     275           0 :                 y = nextchar(y, &cy);
     276           0 :                 if (y == NULL)
     277           0 :                         goto illegal;
     278           0 :                 for (i = 1, last_diagonal = j - 1; i <= xlen; i++) {
     279           0 :                         x_iter = nextchar(x_iter, &cx);
     280           0 :                         if (x_iter == NULL)
     281           0 :                                 goto illegal;
     282           0 :                         old_diagonal = column[i];
     283           0 :                         column[i] = MIN3(column[i] + insdel_cost,
     284             :                                                          column[i - 1] + insdel_cost,
     285             :                                                          last_diagonal + (cx == cy ? 0 : replace_cost));
     286           0 :                         last_diagonal = old_diagonal;
     287           0 :                         if (last_diagonal < min)
     288             :                                 min = last_diagonal;
     289             :                 }
     290           0 :                 if (max != -1 && min > (unsigned int) max)
     291             :                         return INT_MAX;
     292             :         }
     293           0 :         return column[xlen];
     294             :   illegal:
     295             :         return INT_MAX;
     296             : }
     297             : 
     298             : static str
     299          56 : TXTSIMlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
     300             : {
     301          56 :         (void) cntxt;
     302          56 :         (void) mb;
     303          56 :         int *res = getArgReference_int(stk, pci, 0);
     304          56 :         const char *x = *getArgReference_str(stk, pci, 1),
     305          56 :                 *y = *getArgReference_str(stk, pci, 2);
     306          56 :         int insdel_cost, replace_cost;
     307             : 
     308          56 :         if (pci->argc == 3) {
     309             :                 insdel_cost = 1;
     310             :                 replace_cost = 1;
     311          34 :         } else if (pci->argc == 5 || pci->argc == 6) {
     312          34 :                 insdel_cost = *getArgReference_int(stk, pci, 3);
     313          34 :                 replace_cost = *getArgReference_int(stk, pci, 4);
     314             :                 /* Backwards compatibility purposes */
     315          34 :                 if (pci->argc == 6) {
     316          33 :                         int transposition_cost = *getArgReference_int(stk, pci, 5);
     317          33 :                         return dameraulevenshtein(res, x, y, insdel_cost, replace_cost,
     318             :                                                                           transposition_cost);
     319             :                 }
     320             :         } else {
     321           0 :                 throw(MAL, "txtsim.levenshtein", RUNTIME_SIGNATURE_MISSING);
     322             :         }
     323             : 
     324          23 :         return levenshtein(res, x, y, insdel_cost, replace_cost, -1);;
     325             : }
     326             : 
     327             : static str
     328           0 : TXTSIMmaxlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
     329             : {
     330           0 :         (void) cntxt;
     331           0 :         (void) mb;
     332           0 :         int *res = getArgReference_int(stk, pci, 0);
     333           0 :         const char *x = *getArgReference_str(stk, pci, 1),
     334           0 :                 *y = *getArgReference_str(stk, pci, 2);
     335           0 :         const int *k = getArgReference_int(stk, pci, 3);
     336           0 :         int insdel_cost, replace_cost;
     337             : 
     338           0 :         if (pci->argc == 4) {
     339             :                 insdel_cost = 1;
     340             :                 replace_cost = 1;
     341           0 :         } else if (pci->argc == 6) {
     342           0 :                 insdel_cost = *getArgReference_int(stk, pci, 4);
     343           0 :                 replace_cost = *getArgReference_int(stk, pci, 5);
     344             :         } else {
     345           0 :                 throw(MAL, "txtsim.maxlevenshtein", RUNTIME_SIGNATURE_MISSING);
     346             :         }
     347             : 
     348           0 :         return levenshtein(res, x, y, insdel_cost, replace_cost, *k);
     349             : }
     350             : 
     351             : static str
     352           0 : BATTXTSIMmaxlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
     353             : {
     354           0 :         (void) cntxt;
     355           0 :         (void) mb;
     356           0 :         bat *res = getArgReference_bat(stk, pci, 0);
     357           0 :         bat *lid = getArgReference_bat(stk, pci, 1);
     358           0 :         bat *rid = getArgReference_bat(stk, pci, 2);
     359           0 :         int *k = getArgReference_int(stk, pci, 3);
     360           0 :         int insdel_cost = pci->argc == 6 ? *getArgReference_int(stk, pci, 4) : 1,
     361           0 :                         replace_cost = pci->argc == 6 ? *getArgReference_int(stk, pci, 5) : 1;
     362           0 :         BAT *left = NULL, *right = NULL, *bn = NULL;
     363           0 :         BUN p, q;
     364           0 :         BATiter li, ri;
     365           0 :         unsigned int *buffer = NULL;
     366           0 :         const char *lv, *rv;
     367           0 :         str msg = MAL_SUCCEED;
     368           0 :         size_t llen = 0, rlen = 0, maxlen = 0;
     369           0 :         int d;
     370           0 :         bit v;
     371             : 
     372           0 :         if ((left = BATdescriptor(*lid)) == NULL) {
     373           0 :                 msg = createException(MAL, "battxtsim.maxlevenshtein",
     374             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     375           0 :                 goto exit;
     376             :         }
     377           0 :         if ((right = BATdescriptor(*rid)) == NULL) {
     378           0 :                 msg = createException(MAL, "battxtsim.maxlevenshtein",
     379             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     380           0 :                 goto exit;
     381             :         }
     382           0 :         if (BATcount(left) != BATcount(right)) {
     383           0 :                 msg = createException(MAL, "battxtsim.maxlevenshtein",
     384             :                                                           "Columns must be aligned");
     385           0 :                 goto exit;
     386             :         }
     387           0 :         if ((bn = COLnew(0, TYPE_bit, BATcount(left), TRANSIENT)) == NULL) {
     388           0 :                 msg = createException(MAL, "battxtsim.maxlevenshtein",
     389             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     390           0 :                 goto exit;
     391             :         }
     392             : 
     393           0 :         li = bat_iterator(left);
     394           0 :         ri = bat_iterator(right);
     395           0 :         BATloop(left, p, q) {
     396           0 :                 lv = BUNtail(li, p);
     397           0 :                 rv = BUNtail(ri, p);
     398           0 :                 llen = UTF8_strlen(lv);
     399           0 :                 rlen = UTF8_strlen(rv);
     400           0 :                 if (abs((int) llen - (int) rlen) > (int) *k)
     401           0 :                         v = false;
     402             :                 else {
     403           0 :                         if (llen > maxlen) {
     404           0 :                                 maxlen = llen;
     405           0 :                                 unsigned int *tmp = GDKrealloc(buffer, (maxlen + 1) * sizeof(unsigned int));
     406           0 :                                 if (tmp == NULL) {
     407           0 :                                         bat_iterator_end(&li);
     408           0 :                                         bat_iterator_end(&ri);
     409           0 :                                         msg = createException(MAL, "battxtsim.maxlevenshtein",
     410             :                                                                                   SQLSTATE(HY013) MAL_MALLOC_FAIL);
     411           0 :                                         goto exit;
     412             :                                 }
     413             :                                 buffer = tmp;
     414             :                         }
     415           0 :                         d = levenshtein2(lv, rv, llen, rlen, buffer, insdel_cost,
     416             :                                                          replace_cost, (int) *k);
     417           0 :                         v = (bit) (d <= (int) *k);
     418             :                 }
     419           0 :                 if (BUNappend(bn, (const void *) &v, false) != GDK_SUCCEED) {
     420           0 :                         bat_iterator_end(&li);
     421           0 :                         bat_iterator_end(&ri);
     422           0 :                         msg = createException(MAL, "battxtsim.maxlevenshtein",
     423             :                                                                   "BUNappend failed");
     424           0 :                         goto exit;
     425             :                 }
     426             :         }
     427           0 :         bat_iterator_end(&li);
     428           0 :         bat_iterator_end(&ri);
     429             : 
     430           0 :         *res = bn->batCacheid;
     431           0 :         BBPkeepref(bn);
     432           0 :   exit:
     433           0 :         GDKfree(buffer);
     434           0 :         BBPreclaim(left);
     435           0 :         BBPreclaim(right);
     436           0 :         if (msg != MAL_SUCCEED)
     437           0 :                 BBPreclaim(bn);
     438           0 :         return msg;
     439             : }
     440             : 
     441             : #define JARO_WINKLER_SCALING_FACTOR 0.1
     442             : #define JARO_WINKLER_PREFIX_LEN 4
     443             : 
     444             : typedef struct {
     445             :         size_t matches;                         /* accumulator for number of matches for this item */
     446             :         BUN o;                                          /* position in the BAT */
     447             :         const char *val;                        /* string value */
     448             :         int *cp_sequence;                       /* string as array of Unicode codepoints */
     449             :         int len;                                        /* string length in characters (multi-byte characters count as 1) */
     450             :         int cp_seq_len;                         /* string length in bytes */
     451             :         uint64_t abm;                           /* 64bit alphabet bitmap */
     452             :         int abm_popcount;                       /* hamming weight of abm */
     453             : } str_item;
     454             : 
     455             : static inline int
     456          96 : popcount64(uint64_t x)
     457             : {
     458             : #ifdef __has_builtin
     459             : #if __has_builtin(__builtin_popcountll)
     460          96 :         return (int) __builtin_popcountll(x);
     461             : #define BUILTIN_USED
     462             : #endif
     463             : #endif
     464             : #ifndef BUILTIN_USED
     465             : #if defined(_MSC_VER)
     466             : #if SIZEOF_OID == 4
     467             :         /* no __popcnt64 on 32 bit Windows */
     468             :         return (int) (__popcnt((uint32_t) x) + __popcnt((uint32_t) (x >> 32)));
     469             : #else
     470             :         return (int) __popcnt64(x);
     471             : #endif
     472             : #else
     473             :         x = (x & UINT64_C(0x5555555555555555)) + ((x >> 1) & UINT64_C(0x5555555555555555));
     474             :         x = (x & UINT64_C(0x3333333333333333)) + ((x >> 2) & UINT64_C(0x3333333333333333));
     475             :         x = (x & UINT64_C(0x0F0F0F0F0F0F0F0F)) + ((x >> 4) & UINT64_C(0x0F0F0F0F0F0F0F0F));
     476             :         return (int) ((x * UINT64_C(0x0101010101010101)) >> 56);
     477             : #endif
     478             : #endif
     479             : #undef BUILTIN_USED
     480             : }
     481             : 
     482             : static int
     483          43 : str_item_lenrev_cmp(const void *a, const void *b)
     484             : {
     485          43 :         return ((int) ((str_item *) b)->len - (int) ((str_item *) a)->len);
     486             : }
     487             : 
     488             : static str
     489          64 : str_2_codepointseq(str_item *s)
     490             : {
     491          64 :         const uint8_t *p = (const uint8_t *) s->val;
     492             : 
     493          64 :         if (s->len == 0) {
     494           0 :                 s->cp_sequence = NULL;
     495           0 :                 return MAL_SUCCEED;
     496             :         }
     497          64 :         s->cp_sequence = GDKmalloc(s->len * sizeof(int));
     498          64 :         if (s->cp_sequence == NULL)
     499           0 :                 throw(MAL, "str_2_byteseq", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     500             : 
     501          64 :         uint32_t state = 0;
     502          64 :         uint32_t codepoint;
     503          64 :         uint32_t *buf = (uint32_t *) s->cp_sequence;
     504         428 :         while (*p) {
     505         364 :                 switch (decode(&state, &codepoint, *p++)) {
     506         364 :                 case UTF8_ACCEPT:
     507         364 :                         *buf++ = codepoint;
     508         364 :                         break;
     509           0 :                 case UTF8_REJECT:
     510           0 :                         throw(MAL, "str_2_byteseq", SQLSTATE(42000) "Illegal unicode code point");
     511             :                 default:
     512             :                         break;
     513             :                 }
     514             :         }
     515             :         return MAL_SUCCEED;
     516             : }
     517             : 
     518             : static void
     519          60 : str_alphabet_bitmap(str_item *s)
     520             : {
     521          60 :         int i;
     522             : 
     523          60 :         s->abm = UINT64_C(0);
     524             : 
     525         400 :         for (i = 0; i < s->len; i++)
     526         340 :                 s->abm |= UINT64_C(1) << (s->cp_sequence[i] % 64);
     527             : 
     528          60 :         s->abm_popcount = popcount64(s->abm);
     529          60 : }
     530             : 
     531             : static inline double
     532          13 : jarowinkler_lp(const str_item *a, const str_item *b)
     533             : {
     534          13 :         unsigned int l;
     535             : 
     536             :         /* calculate common string prefix up to prefixlen chars */
     537          13 :         l = 0;
     538          65 :         for (int i = 0; i < MIN3(a->len, b->len, JARO_WINKLER_PREFIX_LEN); i++)
     539          52 :                 l += (a->cp_sequence[i] == b->cp_sequence[i]);
     540             : 
     541          13 :         return (double) l *JARO_WINKLER_SCALING_FACTOR;
     542             : }
     543             : 
     544             : static inline double
     545          15 : jarowinkler(const str_item *x, const str_item *y, double lp, int *x_flags,
     546             :                         int *y_flags)
     547             : {
     548          15 :         int xlen = x->len, ylen = y->len;
     549          15 :         int range = MAX(0, MAX(xlen, ylen) / 2 - 1);
     550          15 :         int *x1 = x->cp_sequence, *x2 = y->cp_sequence;
     551          15 :         int m = 0, t = 0;
     552          15 :         int i, j, l;
     553          15 :         double dw;
     554             : 
     555          15 :         if (!xlen || !ylen)
     556             :                 return 0.0;
     557          89 :         for (i = 0; i < xlen; i++)
     558          74 :                 x_flags[i] = 0;
     559         102 :         for (i = 0; i < ylen; i++)
     560          87 :                 y_flags[i] = 0;
     561             :         /* matching chars */
     562         102 :         for (i = 0; i < ylen; i++) {
     563         280 :                 for (j = MAX(i - range, 0), l = MIN(i + range + 1, xlen); j < l; j++) {
     564         225 :                         if (x2[i] == x1[j] && !x_flags[j]) {
     565          32 :                                 x_flags[j] = 1;
     566          32 :                                 y_flags[i] = 1;
     567          32 :                                 m++;
     568          32 :                                 break;
     569             :                         }
     570             :                 }
     571             :         }
     572          15 :         if (!m)
     573             :                 return 0.0;
     574             :         /* char transpositions */
     575             :         l = 0;
     576          90 :         for (i = 0; i < ylen; i++) {
     577          77 :                 if (y_flags[i] == 1) {
     578          48 :                         for (j = l; j < xlen; j++) {
     579          48 :                                 if (x_flags[j] == 1) {
     580          32 :                                         l = j + 1;
     581          32 :                                         break;
     582             :                                 }
     583             :                         }
     584          32 :                         if (x2[i] != x1[j])
     585           4 :                                 t++;
     586             :                 }
     587             :         }
     588          13 :         t /= 2;
     589             : 
     590             :         /* jaro similarity */
     591          13 :         dw = (((double) m / xlen) + ((double) m / ylen) +
     592          13 :                   ((double) (m - t) / m)) / 3.0;
     593             :         /* calculate common string prefix up to prefixlen chars */
     594          13 :         if (lp == -1)
     595          13 :                 lp = jarowinkler_lp(x, y);
     596             :         /* jaro-winkler similarity */
     597          13 :         dw = dw + (lp * (1 - dw));
     598          13 :         return dw;
     599             : }
     600             : 
     601             : static str
     602           3 : TXTSIMjarowinkler(dbl *res, const char *const *x, const char *const *y)
     603             : {
     604           3 :         int *x_flags = NULL, *y_flags = NULL;
     605           3 :         str_item xi, yi;
     606           3 :         str msg = MAL_SUCCEED;
     607             : 
     608           6 :         if (strNil(*x) || strNil(*y)) {
     609           1 :                 *res = dbl_nil;
     610           1 :                 return MAL_SUCCEED;
     611             :         }
     612           4 :         xi = (str_item) {
     613             :                 .val = *x,
     614           2 :                 .len = UTF8_strlen(*x),
     615             :         };
     616           4 :         yi = (str_item) {
     617           2 :                 .val = *y,
     618           2 :                 .len = UTF8_strlen(*y),
     619             :         };
     620           2 :         if (xi.len == 0 || yi.len == 0) {
     621           0 :                 *res = 0;
     622           0 :                 return MAL_SUCCEED;
     623             :         }
     624             : 
     625           2 :         if ((msg = str_2_codepointseq(&xi)) != MAL_SUCCEED)
     626           0 :                 goto bailout;
     627             : 
     628           2 :         if ((msg = str_2_codepointseq(&yi)) != MAL_SUCCEED)
     629           0 :                 goto bailout;
     630             : 
     631           2 :         x_flags = GDKmalloc(xi.len * sizeof(int));
     632           2 :         y_flags = GDKmalloc(yi.len * sizeof(int));
     633             : 
     634           2 :         if (x_flags == NULL || y_flags == NULL)
     635           0 :                 goto bailout;
     636             : 
     637           2 :         *res = jarowinkler(&xi, &yi, -1, x_flags, y_flags);
     638             : 
     639           2 :   bailout:
     640           2 :         GDKfree(x_flags);
     641           2 :         GDKfree(y_flags);
     642           2 :         GDKfree(xi.cp_sequence);
     643           2 :         GDKfree(yi.cp_sequence);
     644           2 :         return msg;
     645             : }
     646             : 
     647             : static str
     648           0 : TXTSIMminjarowinkler(bit *res, const char *const *x, const char *const *y, const dbl *threshold)
     649             : {
     650           0 :         str msg = MAL_SUCCEED;
     651           0 :         double s = 1;
     652             : 
     653           0 :         msg = TXTSIMjarowinkler(&s, x, y);
     654           0 :         if (msg != MAL_SUCCEED)
     655           0 :                 throw(MAL, "txt.minjarowinkler", OPERATION_FAILED);
     656             : 
     657           0 :         *res = (s > *threshold);
     658           0 :         return MAL_SUCCEED;
     659             : }
     660             : 
     661             : #define VALUE(s, x) (s##vars + VarHeapVal(s##vals, (x), s##width))
     662             : #define APPEND(b, o) (((oid *) b->theap->base)[b->batCount++] = (o))
     663             : 
     664             : #define PREP_BAT_STRITEM(B, CI, SI)                                                                             \
     665             :                 do {                                                                                                                    \
     666             :                         for (n = 0; n < CI.ncand; n++) {                                                     \
     667             :                                 SI[n].matches = 0;                                                                              \
     668             :                                 SI[n].o = canditer_next(&CI);                                                       \
     669             :                                 SI[n].val = VALUE(B, SI[n].o - B->hseqbase);                 \
     670             :                                 SI[n].cp_sequence = NULL;                                                               \
     671             :                                 SI[n].len = UTF8_strlen(SI[n].val);                                             \
     672             :                                 SI[n].cp_seq_len = str_strlen(SI[n].val);                               \
     673             :                                 if ((msg = str_2_codepointseq(&SI[n])) != MAL_SUCCEED)      \
     674             :                                         goto exit;                                                                                      \
     675             :                                 str_alphabet_bitmap(&SI[n]);                                                        \
     676             :                         }                                                                                                                       \
     677             :                 } while (false)
     678             : 
     679             : #define FINALIZE_BATS(L, R, LCI, RCI, LSI, RSI) \
     680             :                 do {                                                                    \
     681             :                         assert(BATcount(L) == BATcount(R));     \
     682             :                         BATsetcount(L, BATcount(L));            \
     683             :                         BATsetcount(R, BATcount(R));            \
     684             :                         for (n = 0; n < LCI.ncand; n++) {    \
     685             :                                 if (LSI[n].matches > 1) {            \
     686             :                                         L->tkey = false;                     \
     687             :                                         break;                                          \
     688             :                                 }                                                               \
     689             :                         }                                                                       \
     690             :                         if (n == LCI.ncand) {                           \
     691             :                                 L->tkey = true;                                      \
     692             :                         }                                                                       \
     693             :                         for (n = 0; n < RCI.ncand; n++) {    \
     694             :                                 if (RSI[n].matches > 1) {            \
     695             :                                         R->tkey = false;                     \
     696             :                                         break;                                          \
     697             :                                 }                                                               \
     698             :                         }                                                                       \
     699             :                         if (n == RCI.ncand) {                           \
     700             :                                 R->tkey = true;                                      \
     701             :                         }                                                                       \
     702             :                         BATordered(L);                                          \
     703             :                         BATordered(R);                                          \
     704             :                         L->theap->dirty |= BATcount(L) > 0;    \
     705             :                         R->theap->dirty |= BATcount(R) > 0;    \
     706             :                 } while (false)
     707             : 
     708             : static inline int
     709          12 : maxlevenshtein_extcol_stritem(const str_item *si1, const str_item *si2,
     710             :                                                           unsigned int *column, const int k)
     711             : {
     712          12 :         unsigned int lastdiag, olddiag;
     713          12 :         int c1, c2, x, y;
     714          12 :         unsigned int min;
     715          12 :         int s1len = si1->len, s2len = si2->len;
     716          12 :         int *s1 = si1->cp_sequence, *s2 = si2->cp_sequence;
     717             :         /* first test if the strings are equal */
     718          12 :         if (s1len == s2len) {
     719           1 :                 for (x = 0; x < s1len; x++)
     720           1 :                         if (s1[x] != s2[x])
     721             :                                 break;
     722           1 :                 if (x == s1len)
     723             :                         return 0;
     724             :         }
     725          68 :         for (y = 1; y <= s1len; y++)
     726          56 :                 column[y] = y;
     727          73 :         for (x = 1; x <= s2len; x++) {
     728          64 :                 c2 = s2[x - 1];
     729          64 :                 column[0] = x;
     730          64 :                 min = INT_MAX;
     731         370 :                 for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
     732         306 :                         olddiag = column[y];
     733         306 :                         c1 = s1[y - 1];
     734         306 :                         column[y] = MIN3(column[y] + 1, column[y - 1] + 1, lastdiag + (c1 == c2 ? 0 : 1));
     735         306 :                         lastdiag = olddiag;
     736         306 :                         if (lastdiag < min)
     737             :                                 min = lastdiag;
     738             :                 }
     739          64 :                 if (min > (unsigned int) k)
     740             :                         return INT_MAX;
     741             :         }
     742           9 :         return column[s1len];
     743             : }
     744             : 
     745             : static str
     746           6 : maxlevenshteinjoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int k)
     747             : {
     748           6 :         BAT *r1t = NULL, *r2t = NULL;
     749           6 :         BUN n;
     750           6 :         struct canditer lci, rci;
     751           6 :         const char *lvals, *rvals, *lvars, *rvars;;
     752           6 :         int lwidth, rwidth, d;
     753           6 :         str_item *lsi = NULL, *rsi = NULL;
     754           6 :         str msg = MAL_SUCCEED;
     755           6 :         unsigned int *buffer = NULL;
     756             : 
     757          18 :         assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
     758           6 :         assert(ATOMtype(l->ttype) == TYPE_str);
     759             : 
     760           6 :         canditer_init(&lci, l, sl);
     761           6 :         canditer_init(&rci, r, sr);
     762             : 
     763           6 :         if (lci.ncand == 0 || rci.ncand == 0)
     764           0 :                 goto exit;
     765             : 
     766           6 :         lvals = (const char *) Tloc(l, 0);
     767           6 :         rvals = (const char *) Tloc(r, 0);
     768           6 :         lvars = l->tvheap->base;
     769           6 :         rvars = r->tvheap->base;
     770           6 :         lwidth = l->twidth;
     771           6 :         rwidth = r->twidth;
     772             : 
     773           6 :         if ((r1t = COLnew(0, TYPE_oid, lci.ncand, TRANSIENT)) == NULL ||
     774           6 :                 (r2t = COLnew(0, TYPE_oid, rci.ncand, TRANSIENT)) == NULL) {
     775           0 :                 reclaim_bats(2, r1t, r2t);
     776           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     777             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     778           0 :                 goto exit;
     779             :         }
     780             : 
     781           6 :         r1t->tsorted = r1t->trevsorted = false;
     782           6 :         r2t->tsorted = r2t->trevsorted = false;
     783             : 
     784           6 :         if ((lsi = GDKmalloc(lci.ncand * sizeof(str_item))) == NULL ||
     785           6 :                 (rsi = GDKmalloc(rci.ncand * sizeof(str_item))) == NULL) {
     786           0 :                 reclaim_bats(2, r1t, r2t);
     787           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     788             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     789           0 :                 goto exit;
     790             :         }
     791             : 
     792          12 :         PREP_BAT_STRITEM(l, lci, lsi);
     793          24 :         PREP_BAT_STRITEM(r, rci, rsi);
     794           6 :         qsort(lsi, lci.ncand, sizeof(str_item), str_item_lenrev_cmp);
     795           6 :         qsort(rsi, rci.ncand, sizeof(str_item), str_item_lenrev_cmp);
     796             : 
     797           5 :         if ((buffer = GDKmalloc((lsi[0].len + 1) * sizeof(int))) == NULL) {
     798           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     799             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     800           0 :                 goto exit;
     801             :         }
     802             : 
     803             :         /* join loop */
     804          23 :         for (BUN lstart = 0, rstart = 0; rstart < rci.ncand; rstart++) {
     805          34 :                 for (n = lstart; n < lci.ncand; n++) {
     806             :                         /* first and cheapest filter */
     807          19 :                         if ((lsi[n].len) > k + rsi[rstart].len) {
     808           0 :                                 lstart++;
     809           0 :                                 continue;               /* no possible matches yet for this r */
     810          19 :                         } else if (rsi[rstart].len > k + lsi[n].len) {
     811             :                                 break;                  /* no more possible matches from this r */
     812             :                         }
     813             :                         /* filter by comparing alphabet bitmaps (imprecise but fast filter) */
     814          17 :                         d = MAX(lsi[n].abm_popcount,
     815             :                                         rsi[rstart].abm_popcount) -
     816          17 :                                         popcount64(lsi[n].abm & rsi[rstart].abm);
     817          17 :                         if (d > k)
     818           4 :                                 continue;
     819             :                         /* final and most expensive test: Levenshtein distance */
     820          13 :                         d = maxlevenshtein_extcol_stritem(&lsi[n], &rsi[rstart], buffer,
     821             :                                                                                           (const unsigned int) k);
     822          12 :                         if (d > k)
     823           6 :                                 continue;
     824             :                         /* The match test succeeded */
     825           6 :                         lsi[n].matches++;
     826           6 :                         rsi[rstart].matches++;
     827           6 :                         if (bunfastappOID(r1t, lsi[n].o) != GDK_SUCCEED) {
     828           0 :                                 reclaim_bats(2, r1t, r2t);
     829           0 :                                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     830             :                                                                           OPERATION_FAILED "Failed bun append");
     831           0 :                                 goto exit;
     832             :                         }
     833           6 :                         if (bunfastappOID(r2t, rsi[rstart].o) != GDK_SUCCEED) {
     834           0 :                                 reclaim_bats(2, r1t, r2t);
     835           0 :                                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     836             :                                                                           OPERATION_FAILED "Failed bun append");
     837           0 :                                 goto exit;
     838             :                         }
     839             :                 }
     840             :         }
     841             : 
     842          24 :         FINALIZE_BATS(r1t, r2t, lci, rci, lsi, rsi);
     843           6 :         *r1 = r1t;
     844           6 :         *r2 = r2t;
     845             : 
     846           6 :   exit:
     847          12 :         for (n = 0; lsi && n < lci.ncand; n++)
     848           6 :                 GDKfree(lsi[n].cp_sequence);
     849          24 :         for (n = 0; rsi && n < rci.ncand; n++)
     850          18 :                 GDKfree(rsi[n].cp_sequence);
     851           6 :         GDKfree(lsi);
     852           6 :         GDKfree(rsi);
     853           6 :         GDKfree(buffer);
     854           6 :         return msg;
     855             : }
     856             : 
     857             : static str
     858           6 : TXTSIMmaxlevenshteinjoin(bat *r1, bat *r2, const bat *lid, const bat *rid,
     859             :                                                  const bat *kid, const bat *slid, const bat *srid,
     860             :                                                  const bit *nil_matches, const lng *estimate,
     861             :                                                  const bit *anti)
     862             : {
     863           6 :         (void) nil_matches;
     864           6 :         (void) estimate;
     865           6 :         (void) anti;
     866             : 
     867           6 :         BAT *bleft = NULL, *bright = NULL, *bk = NULL,
     868           6 :                         *bcandleft = NULL, *bcandright = NULL, *r1t = NULL, *r2t = NULL;
     869           6 :         int k = 0;
     870           6 :         str msg = MAL_SUCCEED;
     871             : 
     872           6 :         if ((bleft = BATdescriptor(*lid)) == NULL ||
     873           6 :                 (bright = BATdescriptor(*rid)) == NULL ||
     874           6 :                 (bk = BATdescriptor(*kid)) == NULL) {
     875           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     876             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     877           0 :                 goto exit;
     878             :         }
     879             : 
     880           6 :         if ((*slid != bat_nil && (bcandleft = BATdescriptor(*slid)) == NULL) ||
     881           6 :                 (*srid != bat_nil && (bcandright = BATdescriptor(*srid)) == NULL)) {
     882           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     883             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     884           0 :                 goto exit;
     885             :         }
     886             : 
     887           6 :         if (BATcount(bk) > 0) {
     888           6 :                 BATiter ki = bat_iterator(bk);
     889           6 :                 k = *(int *) BUNtloc(ki, 0);
     890           6 :                 bat_iterator_end(&ki);
     891             :         }
     892             : 
     893           6 :         if ((msg = maxlevenshteinjoin(&r1t, &r2t, bleft, bright, bcandleft, bcandright, k)) != MAL_SUCCEED)
     894           0 :                 goto exit;
     895             : 
     896           6 :         *r1 = r1t->batCacheid;
     897           6 :         *r2 = r2t->batCacheid;
     898           6 :         BBPkeepref(r1t);
     899           6 :         BBPkeepref(r2t);
     900             : 
     901           6 :   exit:
     902           6 :         reclaim_bats(5, bleft, bright, bcandleft, bcandright, bk);
     903           6 :         if (msg != MAL_SUCCEED)
     904           0 :                 reclaim_bats(2, r1t, r2t);
     905           6 :         return msg;
     906             : }
     907             : 
     908             : static inline void
     909           9 : jarowinkler_rangebounds(int *lb, int *ub, const str_item *a, const double lp,
     910             :                                                 const double threshold)
     911             : {
     912           9 :         *lb = (int) floor(3.0 * a->len * (threshold - lp) / (1.0 - lp) -
     913           9 :                                           (2.0 * a->len));
     914           9 :         *ub = (int) ceil(a->len / ((3.0 * (threshold - lp) / (1.0 - lp)) - 2.0));
     915           9 : }
     916             : 
     917             : /* version with given lp and m, and t = 0*/
     918             : static inline double
     919          19 : jarowinkler_lp_m_t0(const str_item *lsi, const str_item *rsi, double lp, int m)
     920             : {
     921          19 :         double dw;
     922             :         /* Jaro similarity */
     923          19 :         dw = (((double) m / lsi->len) + ((double) m / rsi->len) + 1.0) / 3.0;
     924             :         /* Jaro-Winkler similarity */
     925          19 :         dw = dw + (lp * (1 - dw));
     926          19 :         return dw;
     927             : }
     928             : 
     929             : static str
     930           9 : minjarowinklerjoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT *sl, BAT *sr,
     931             :                                    const dbl threshold)
     932             : {
     933           9 :         BAT *r1t = NULL, *r2t = NULL;
     934           9 :         BUN n;
     935           9 :         struct canditer lci, rci;
     936           9 :         const char *lvals, *rvals, *lvars, *rvars;
     937           9 :         int lwidth, rwidth, lb = 0, ub = 0, m = -1, *x_flags = NULL, *y_flags = NULL;
     938           9 :         str_item *ssl = NULL, *ssr = NULL, shortest;
     939           9 :         str msg = MAL_SUCCEED;
     940           9 :         const bool sliding_window_allowed = threshold > (2.01 + JARO_WINKLER_PREFIX_LEN * JARO_WINKLER_SCALING_FACTOR) / 3.0;
     941           9 :         double s, lp = 0;
     942             : 
     943          27 :         assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
     944           9 :         assert(ATOMtype(l->ttype) == TYPE_str);
     945             : 
     946           9 :         canditer_init(&lci, l, sl);
     947           9 :         canditer_init(&rci, r, sr);
     948             : 
     949           9 :         if (lci.ncand == 0 || rci.ncand == 0)
     950           0 :                 goto exit;
     951             : 
     952           9 :         lvals = (const char *) Tloc(l, 0);
     953           9 :         rvals = (const char *) Tloc(r, 0);
     954           9 :         assert(r->ttype);
     955           9 :         lvars = l->tvheap->base;
     956           9 :         rvars = r->tvheap->base;
     957           9 :         lwidth = l->twidth;
     958           9 :         rwidth = r->twidth;
     959             : 
     960           9 :         if ((r1t = COLnew(0, TYPE_oid, lci.ncand, TRANSIENT)) == NULL ||
     961           9 :                 (r2t = COLnew(0, TYPE_oid, rci.ncand, TRANSIENT)) == NULL) {
     962           0 :                 reclaim_bats(2, r1t, r2t);
     963           0 :                 msg = createException(MAL, "txtsim.minjarowinklerjoin",
     964             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     965           0 :                 goto exit;
     966             :         }
     967             : 
     968           9 :         r1t->tsorted = r1t->trevsorted = false;
     969           9 :         r2t->tsorted = r2t->trevsorted = false;
     970             : 
     971           9 :         if ((ssl = GDKmalloc(lci.ncand * sizeof(str_item))) == NULL ||
     972           9 :                 (ssr = GDKmalloc(rci.ncand * sizeof(str_item))) == NULL) {
     973           0 :                 reclaim_bats(2, r1t, r2t);
     974           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     975             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     976           0 :                 goto exit;
     977             :         }
     978             : 
     979          18 :         PREP_BAT_STRITEM(l, lci, ssl);
     980          36 :         PREP_BAT_STRITEM(r, rci, ssr);
     981           9 :         qsort(ssl, lci.ncand, sizeof(str_item), str_item_lenrev_cmp);
     982           9 :         qsort(ssr, rci.ncand, sizeof(str_item), str_item_lenrev_cmp);
     983             : 
     984           9 :         if ((x_flags = GDKmalloc(ssl[0].len * sizeof(int))) == NULL ||
     985           9 :                 (y_flags = GDKmalloc(ssr[0].len * sizeof(int))) == NULL) {
     986           0 :                 msg = createException(MAL, "txtsim.minjarowinklerjoin",
     987             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     988           0 :                 goto exit;
     989             :         }
     990             :         // lp used for filters. Use -1 for actual JW (forces to compute actual lp)
     991          36 :         lp = JARO_WINKLER_PREFIX_LEN * JARO_WINKLER_SCALING_FACTOR;
     992             : 
     993             :         /* join loop */
     994          36 :         for (BUN lstart = 0, rstart = 0; rstart < rci.ncand; rstart++) {
     995          27 :                 if (sliding_window_allowed)
     996           9 :                         jarowinkler_rangebounds(&lb, &ub, &ssr[rstart], lp, threshold);
     997          47 :                 for (n = lstart; n < lci.ncand; n++) {
     998             :                         /* Update sliding window */
     999             :                         /* This is the first and cheapest filter */
    1000          27 :                         if (sliding_window_allowed) {
    1001           9 :                                 if (ssl[n].len > ub) {       /* no possible matches yet for this r */
    1002           1 :                                         lstart++;
    1003           1 :                                         continue;
    1004             :                                 }
    1005           8 :                                 if (ssl[n].len < lb) {       /* no more possible matches from this r */
    1006             :                                         break;
    1007             :                                 }
    1008             :                         }
    1009             :                         /* filter by comparing alphabet bitmaps */
    1010             :                         /* find the best possible m: the length of the shorter string
    1011             :                          * minus the number of characters that surely cannot match */
    1012          19 :                         shortest = ssl[n].len < ssr[rstart].len ? ssl[n] : ssr[rstart];
    1013          19 :                         m = shortest.len - popcount64(shortest.abm -
    1014          19 :                                                                                   (ssl[n].abm & ssr[rstart].abm));
    1015             :                         /* equivalent to:
    1016             :                            m = shortest.len - popcount64(shortest.abm & (~(ssl[n].abm & ssr[rstart].abm))); */
    1017          19 :                         s = jarowinkler_lp_m_t0(&ssl[n], &ssr[rstart], lp, m);
    1018          19 :                         if (s < threshold) {
    1019           6 :                                 continue;
    1020             :                         }
    1021             :                         /* final and most expensive test: Jaro-Winkler similarity */
    1022          13 :                         s = jarowinkler(&ssl[n], &ssr[rstart], -1, x_flags, y_flags);
    1023          13 :                         if (s < threshold) {
    1024           4 :                                 continue;
    1025             :                         }
    1026             :                         /* The match test succeeded */
    1027           9 :                         ssl[n].matches++;
    1028           9 :                         ssr[rstart].matches++;
    1029           9 :                         if (bunfastappOID(r1t, ssl[n].o) != GDK_SUCCEED) {
    1030           0 :                                 reclaim_bats(2, r1t, r2t);
    1031           0 :                                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
    1032             :                                                                           OPERATION_FAILED "Failed bun append");
    1033           0 :                                 goto exit;
    1034             :                         }
    1035           9 :                         if (bunfastappOID(r2t, ssr[rstart].o) != GDK_SUCCEED) {
    1036           0 :                                 reclaim_bats(2, r1t, r2t);
    1037           0 :                                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
    1038             :                                                                           OPERATION_FAILED "Failed bun append");
    1039           0 :                                 goto exit;
    1040             :                         }
    1041             :                 }
    1042             :         }
    1043             : 
    1044          42 :         FINALIZE_BATS(r1t, r2t, lci, rci, ssl, ssr);
    1045           9 :         *r1 = r1t;
    1046           9 :         *r2 = r2t;
    1047             : 
    1048           9 :   exit:
    1049           9 :         if (ssl)
    1050          18 :                 for (n = 0; n < lci.ncand; n++)
    1051           9 :                         GDKfree(ssl[n].cp_sequence);
    1052           9 :         if (ssr)
    1053          36 :                 for (n = 0; n < rci.ncand; n++)
    1054          27 :                         GDKfree(ssr[n].cp_sequence);
    1055           9 :         GDKfree(x_flags);
    1056           9 :         GDKfree(y_flags);
    1057           9 :         GDKfree(ssl);
    1058           9 :         GDKfree(ssr);
    1059           9 :         return msg;
    1060             : }
    1061             : 
    1062             : static str
    1063           9 : TXTSIMminjarowinklerjoin(bat *r1, bat *r2, const bat *lid, const bat *rid,
    1064             :                                                  const bat *thresholdid, const bat *slid,
    1065             :                                                  const bat *srid, const bit *nil_matches,
    1066             :                                                  const lng *estimate, const bit *anti)
    1067             : {
    1068           9 :         (void) nil_matches;
    1069           9 :         (void) estimate;
    1070           9 :         (void) anti;
    1071             : 
    1072           9 :         BAT *bleft = NULL, *bright = NULL,
    1073           9 :                         *bcandleft = NULL, *bcandright = NULL, *bthreshold = NULL,
    1074           9 :                         *r1t = NULL, *r2t = NULL;
    1075           9 :         dbl threshold = 1;
    1076           9 :         str msg = MAL_SUCCEED;
    1077             : 
    1078           9 :         if ((bleft = BATdescriptor(*lid)) == NULL ||
    1079           9 :                 (bright = BATdescriptor(*rid)) == NULL ||
    1080           9 :                 (bthreshold = BATdescriptor(*thresholdid)) == NULL) {
    1081           0 :                 msg = createException(MAL, "txtsim.minjarowinklerjoin",
    1082             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1083           0 :                 goto exit;
    1084             :         }
    1085             : 
    1086           9 :         if ((*slid != bat_nil && (bcandleft = BATdescriptor(*slid)) == NULL) ||
    1087           9 :                 (*srid != bat_nil && (bcandright = BATdescriptor(*srid)) == NULL)) {
    1088           0 :                 msg = createException(MAL, "txtsim.minjarowinklerjoin",
    1089             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1090           0 :                 goto exit;
    1091             :         }
    1092             : 
    1093           9 :         if (BATcount(bthreshold) > 0) {
    1094           9 :                 BATiter thresholdi = bat_iterator(bthreshold);
    1095           9 :                 threshold = *(dbl *) BUNtail(thresholdi, 0);
    1096           9 :                 bat_iterator_end(&thresholdi);
    1097             :         }
    1098             : 
    1099           9 :         if ((msg = minjarowinklerjoin(&r1t, &r2t, bleft, bright, bcandleft, bcandright, threshold)) != MAL_SUCCEED)
    1100           0 :                 goto exit;
    1101             : 
    1102           9 :         *r1 = r1t->batCacheid;
    1103           9 :         *r2 = r2t->batCacheid;
    1104           9 :         BBPkeepref(r1t);
    1105           9 :         BBPkeepref(r2t);
    1106             : 
    1107           9 :   exit:
    1108           9 :         reclaim_bats(5, bleft, bright, bcandleft, bcandright, bthreshold);
    1109           9 :         if (msg != MAL_SUCCEED)
    1110           0 :                 reclaim_bats(2, r1t, r2t);
    1111           9 :         return msg;
    1112             : }
    1113             : 
    1114             : #define SoundexLen 4                    /* length of a soundex code */
    1115             : #define SoundexKey "Z000"             /* default key for soundex code */
    1116             : 
    1117             : /* set letter values */
    1118             : static const int Code[] = { 0, 1, 2, 3, 0, 1,
    1119             :         2, 0, 0, 2, 2, 4,
    1120             :         5, 5, 0, 1, 2, 6,
    1121             :         2, 3, 0, 1, 0, 2, 0, 2
    1122             : };
    1123             : 
    1124             : #define RETURN_NIL_IF(b,t)                                                                                              \
    1125             :         if (b) {                                                                                                                        \
    1126             :                 if (ATOMextern(t)) {                                                                                    \
    1127             :                         *(ptr*) res = (ptr) ATOMnil(t);                                                 \
    1128             :                         if ( *(ptr *) res == NULL)                                                                      \
    1129             :                                 throw(MAL,"txtsim", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
    1130             :                 } else {                                                                                                                \
    1131             :                         memcpy(res, ATOMnilptr(t), ATOMsize(t));                                        \
    1132             :                 }                                                                                                                               \
    1133             :                 return MAL_SUCCEED;                                                                                     \
    1134             :         }
    1135             : 
    1136             : static inline char
    1137         155 : SCode(unsigned char c)
    1138             : {
    1139         155 :         if (c == 95)
    1140             :                 return (2);                             /* german sz */
    1141         155 :         return (Code[toupper(c) - 'A']);
    1142             : }
    1143             : 
    1144             : static str
    1145          65 : soundex_code(const char *Name, char *Key)
    1146             : {
    1147          65 :         char LastLetter;
    1148          65 :         int Index;
    1149             : 
    1150         301 :         for (const char *p = Name; *p; p++)
    1151         237 :                 if ((*p & 0x80) != 0)
    1152           1 :                         throw(MAL, "soundex",
    1153             :                                   SQLSTATE(42000)
    1154             :                                   "Soundex function not available for non ASCII strings");
    1155             : 
    1156             :         /* set default key */
    1157          64 :         strcpy(Key, SoundexKey);
    1158             : 
    1159             :         /* keep first letter */
    1160          64 :         Key[0] = *Name;
    1161          64 :         if (!isupper((unsigned char) (Key[0])))
    1162          22 :                 Key[0] = toupper(Key[0]);
    1163             : 
    1164          64 :         LastLetter = *Name;
    1165          64 :         if (!*Name)
    1166             :                 return MAL_SUCCEED;
    1167          55 :         Name++;
    1168             : 
    1169             :         /* scan rest of string */
    1170         227 :         for (Index = 1; (Index <SoundexLen) &&*Name; Name++) {
    1171             :                 /* use only letters */
    1172         172 :                 if (isalpha((unsigned char) (*Name))) {
    1173             :                         /* ignore duplicate successive chars */
    1174         169 :                         if (LastLetter != *Name) {
    1175             :                                 /* new LastLetter */
    1176         155 :                                 LastLetter = *Name;
    1177             : 
    1178             :                                 /* ignore letters with code 0 */
    1179         155 :                                 if (SCode(*Name) != 0) {
    1180          74 :                                         Key[Index] = '0' + SCode(*Name);
    1181          74 :                                         Index ++;
    1182             :                                 }
    1183             :                         }
    1184             :                 }
    1185             :         }
    1186             :         return MAL_SUCCEED;
    1187             : }
    1188             : 
    1189             : static str
    1190          70 : soundex(str *res, const char *const *Name)
    1191             : {
    1192          70 :         str msg = MAL_SUCCEED;
    1193             : 
    1194          70 :         GDKfree(*res);
    1195          74 :         RETURN_NIL_IF(strNil(*Name), TYPE_str);
    1196             : 
    1197          66 :         *res = (str) GDKmalloc(sizeof(char) * (SoundexLen + 1));
    1198          65 :         if (*res == NULL)
    1199           0 :                 throw(MAL, "soundex", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1200             : 
    1201             :         /* calculate Key for Name */
    1202          65 :         if ((msg = soundex_code(*Name, *res))) {
    1203           1 :                 GDKfree(*res);
    1204           1 :                 *res = NULL;
    1205           1 :                 return msg;
    1206             :         }
    1207             :         return msg;
    1208             : }
    1209             : 
    1210             : static str
    1211          29 : stringdiff(int *res, const char *const *s1, const char *const *s2)
    1212             : {
    1213          29 :         str r = MAL_SUCCEED;
    1214          29 :         char *S1 = NULL, *S2 = NULL;
    1215             : 
    1216          29 :         r = soundex(&S1, s1);
    1217          30 :         if (r != MAL_SUCCEED)
    1218             :                 return r;
    1219          30 :         r = soundex(&S2, s2);
    1220          30 :         if (r != MAL_SUCCEED) {
    1221           0 :                 GDKfree(S1);
    1222           0 :                 return r;
    1223             :         }
    1224          30 :         r = TXTSIMdameraulevenshtein1(res, &(const char *){S1}, &(const char *){S2});
    1225          30 :         GDKfree(S1);
    1226          30 :         GDKfree(S2);
    1227          30 :         return r;
    1228             : }
    1229             : 
    1230             : /******************************
    1231             :  * QGRAMNORMALIZE
    1232             :  *
    1233             :  * This function 'normalizes' a string so valid q-grams can  be made of it:
    1234             :  * All characters are transformed to uppercase, and all characters
    1235             :  * which are not letters or digits are stripped to a single space.
    1236             :  *
    1237             :  * qgramnormalize("Hallo, allemaal!").print(); --> "HALLO ALLEMAAL"
    1238             :  * qgramnormalize(" '' t ' est").print(); --> [ "T EST" ]
    1239             :  *
    1240             :  *****************************/
    1241             : static str
    1242          13 : qgram_normalize(str *res, const char *const *Input)
    1243             : {
    1244          13 :         const char *input = *Input;
    1245          13 :         int i, j = 0;
    1246          13 :         char c, last = ' ';
    1247             : 
    1248          13 :         GDKfree(*res);
    1249          13 :         RETURN_NIL_IF(strNil(input), TYPE_str);
    1250          13 :         *res = (str) GDKmalloc(sizeof(char) * (strlen(input) + 1));     /* normalized strings are never longer than original */
    1251          13 :         if (*res == NULL)
    1252           0 :                 throw(MAL, "txtsim.qgramnormalize", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1253             : 
    1254         100 :         for (i = 0; input[i]; i++) {
    1255          87 :                 c = toupper(input[i]);
    1256          87 :                 if (!(('A' <= c && c <= 'Z') || isdigit((unsigned char) c)))
    1257          87 :                         c = ' ';
    1258          87 :                 if (c != ' ' || last != ' ') {
    1259          78 :                         (*res)[j++] = c;
    1260             :                 }
    1261          87 :                 last = c;
    1262             :         }
    1263          13 :         (*res)[j] = 0;
    1264             :         /* strip final whitespace */
    1265          16 :         while (j > 0 && (*res)[--j] == ' ')
    1266           3 :                 (*res)[j] = 0;
    1267             : 
    1268             :         return MAL_SUCCEED;
    1269             : }
    1270             : 
    1271             : static str
    1272           0 : qgram_selfjoin(bat *res1, bat *res2, bat *qid, bat *bid, bat *pid, bat *lid,
    1273             :                            flt *c, int *k)
    1274             : {
    1275           0 :         BAT *qgram, *id, *pos, *len;
    1276           0 :         BUN n;
    1277           0 :         BUN i, j;
    1278           0 :         BAT *bn, *bn2;
    1279           0 :         oid *qbuf;
    1280           0 :         int *ibuf;
    1281           0 :         int *pbuf;
    1282           0 :         int *lbuf;
    1283           0 :         str msg = MAL_SUCCEED;
    1284             : 
    1285           0 :         qgram = BATdescriptor(*qid);
    1286           0 :         id = BATdescriptor(*bid);
    1287           0 :         pos = BATdescriptor(*pid);
    1288           0 :         len = BATdescriptor(*lid);
    1289           0 :         if (qgram == NULL || id == NULL || pos == NULL || len == NULL) {
    1290           0 :                 BBPreclaim(qgram);
    1291           0 :                 BBPreclaim(id);
    1292           0 :                 BBPreclaim(pos);
    1293           0 :                 BBPreclaim(len);
    1294           0 :                 throw(MAL, "txtsim.qgramselfjoin",
    1295             :                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1296             :         }
    1297             : 
    1298           0 :         BATiter qgrami = bat_iterator(qgram);
    1299           0 :         BATiter idi = bat_iterator(id);
    1300           0 :         BATiter posi = bat_iterator(pos);
    1301           0 :         BATiter leni = bat_iterator(len);
    1302           0 :         if (qgrami.type != TYPE_oid)
    1303           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1304             :                                                           SEMANTIC_TYPE_MISMATCH
    1305             :                                                           ": tail of BAT qgram must be oid");
    1306           0 :         else if (idi.type != TYPE_int)
    1307           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1308             :                                                           SEMANTIC_TYPE_MISMATCH
    1309             :                                                           ": tail of BAT id must be int");
    1310           0 :         else if (posi.type != TYPE_int)
    1311           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1312             :                                                           SEMANTIC_TYPE_MISMATCH
    1313             :                                                           ": tail of BAT pos must be int");
    1314           0 :         else if (leni.type != TYPE_int)
    1315           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1316             :                                                           SEMANTIC_TYPE_MISMATCH
    1317             :                                                           ": tail of BAT len must be int");
    1318           0 :         if (msg) {
    1319           0 :                 bat_iterator_end(&qgrami);
    1320           0 :                 bat_iterator_end(&idi);
    1321           0 :                 bat_iterator_end(&posi);
    1322           0 :                 bat_iterator_end(&leni);
    1323           0 :                 BBPunfix(qgram->batCacheid);
    1324           0 :                 BBPunfix(id->batCacheid);
    1325           0 :                 BBPunfix(pos->batCacheid);
    1326           0 :                 BBPunfix(len->batCacheid);
    1327           0 :                 return msg;
    1328             :         }
    1329             : 
    1330           0 :         n = BATcount(qgram);
    1331             : 
    1332             :         /* if (BATcount(qgram)>1 && !qgrami.sorted) throw(MAL, "txtsim.qgramselfjoin", SEMANTIC_TYPE_MISMATCH); */
    1333             : 
    1334           0 :         if (!ALIGNsynced(qgram, id))
    1335           0 :                  msg = createException(MAL, "txtsim.qgramselfjoin",
    1336             :                                                            SEMANTIC_TYPE_MISMATCH
    1337             :                                                            ": qgram and id are not synced");
    1338             : 
    1339           0 :         else if (!ALIGNsynced(qgram, pos))
    1340           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1341             :                                                           SEMANTIC_TYPE_MISMATCH
    1342             :                                                           ": qgram and pos are not synced");
    1343           0 :         else if (!ALIGNsynced(qgram, len))
    1344           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1345             :                                                           SEMANTIC_TYPE_MISMATCH
    1346             :                                                           ": qgram and len are not synced");
    1347             : 
    1348           0 :         else if (qgrami.width != ATOMsize(qgrami.type))
    1349           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1350             :                                                           SEMANTIC_TYPE_MISMATCH
    1351             :                                                           ": qgram is not a true void bat");
    1352           0 :         else if (idi.width != ATOMsize(idi.type))
    1353           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1354             :                                                           SEMANTIC_TYPE_MISMATCH
    1355             :                                                           ": id is not a true void bat");
    1356             : 
    1357           0 :         else if (posi.width != ATOMsize(posi.type))
    1358           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1359             :                                                           SEMANTIC_TYPE_MISMATCH
    1360             :                                                           ": pos is not a true void bat");
    1361           0 :         else if (leni.width != ATOMsize(leni.type))
    1362           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1363             :                                                           SEMANTIC_TYPE_MISMATCH
    1364             :                                                           ": len is not a true void bat");
    1365           0 :         if (msg) {
    1366           0 :                 bat_iterator_end(&qgrami);
    1367           0 :                 bat_iterator_end(&idi);
    1368           0 :                 bat_iterator_end(&posi);
    1369           0 :                 bat_iterator_end(&leni);
    1370           0 :                 BBPunfix(qgram->batCacheid);
    1371           0 :                 BBPunfix(id->batCacheid);
    1372           0 :                 BBPunfix(pos->batCacheid);
    1373           0 :                 BBPunfix(len->batCacheid);
    1374           0 :                 return msg;
    1375             :         }
    1376             : 
    1377           0 :         bn = COLnew(0, TYPE_int, n, TRANSIENT);
    1378           0 :         bn2 = COLnew(0, TYPE_int, n, TRANSIENT);
    1379           0 :         if (bn == NULL || bn2 == NULL) {
    1380           0 :                 bat_iterator_end(&qgrami);
    1381           0 :                 bat_iterator_end(&idi);
    1382           0 :                 bat_iterator_end(&posi);
    1383           0 :                 bat_iterator_end(&leni);
    1384           0 :                 BBPreclaim(bn);
    1385           0 :                 BBPreclaim(bn2);
    1386           0 :                 BBPunfix(qgram->batCacheid);
    1387           0 :                 BBPunfix(id->batCacheid);
    1388           0 :                 BBPunfix(pos->batCacheid);
    1389           0 :                 BBPunfix(len->batCacheid);
    1390           0 :                 throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1391             :         }
    1392             : 
    1393           0 :         qbuf = (oid *) qgrami.base;
    1394           0 :         ibuf = (int *) idi.base;
    1395           0 :         pbuf = (int *) posi.base;
    1396           0 :         lbuf = (int *) leni.base;
    1397           0 :         for (i = 0; i < n - 1; i++) {
    1398           0 :                 for (j = i + 1;
    1399           0 :                          (j < n && qbuf[j] == qbuf[i]
    1400           0 :                           && pbuf[j] <= (pbuf[i] + (*k + *c * MIN(lbuf[i], lbuf[j]))));
    1401           0 :                          j++) {
    1402           0 :                         if (ibuf[i] != ibuf[j]
    1403           0 :                                 && abs(lbuf[i] - lbuf[j]) <= (*k + *c * MIN(lbuf[i], lbuf[j]))) {
    1404           0 :                                 if (BUNappend(bn, ibuf + i, false) != GDK_SUCCEED
    1405           0 :                                         || BUNappend(bn2, ibuf + j, false) != GDK_SUCCEED) {
    1406           0 :                                         bat_iterator_end(&qgrami);
    1407           0 :                                         bat_iterator_end(&idi);
    1408           0 :                                         bat_iterator_end(&posi);
    1409           0 :                                         bat_iterator_end(&leni);
    1410           0 :                                         BBPunfix(qgram->batCacheid);
    1411           0 :                                         BBPunfix(id->batCacheid);
    1412           0 :                                         BBPunfix(pos->batCacheid);
    1413           0 :                                         BBPunfix(len->batCacheid);
    1414           0 :                                         BBPreclaim(bn);
    1415           0 :                                         BBPreclaim(bn2);
    1416           0 :                                         throw(MAL, "txtsim.qgramselfjoin",
    1417             :                                                   SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1418             :                                 }
    1419             :                         }
    1420             :                 }
    1421             :         }
    1422           0 :         bat_iterator_end(&qgrami);
    1423           0 :         bat_iterator_end(&idi);
    1424           0 :         bat_iterator_end(&posi);
    1425           0 :         bat_iterator_end(&leni);
    1426             : 
    1427           0 :         BBPunfix(qgram->batCacheid);
    1428           0 :         BBPunfix(id->batCacheid);
    1429           0 :         BBPunfix(pos->batCacheid);
    1430           0 :         BBPunfix(len->batCacheid);
    1431             : 
    1432           0 :         *res1 = bn->batCacheid;
    1433           0 :         BBPkeepref(bn);
    1434           0 :         *res2 = bn2->batCacheid;
    1435           0 :         BBPkeepref(bn2);
    1436             : 
    1437           0 :         return MAL_SUCCEED;
    1438             : }
    1439             : 
    1440             : /* copy up to utf8len UTF-8 encoded characters from src to buf
    1441             :  * stop early if buf (size given by bufsize) is too small, or if src runs out
    1442             :  * return number of UTF-8 characters copied (excluding NUL)
    1443             :  * close with NUL if enough space */
    1444             : static size_t
    1445          26 : utf8strncpy(char *buf, size_t bufsize, const char *src, size_t utf8len)
    1446             : {
    1447          26 :         size_t cnt = 0;
    1448             : 
    1449         128 :         while (utf8len != 0 && *src != 0 && bufsize != 0) {
    1450         102 :                 bufsize--;
    1451         102 :                 utf8len--;
    1452         102 :                 cnt++;
    1453         102 :                 if (((*buf++ = *src++) & 0x80) != 0) {
    1454          40 :                         while ((*src & 0xC0) == 0x80 && bufsize != 0) {
    1455          20 :                                 *buf++ = *src++;
    1456          20 :                                 bufsize--;
    1457             :                         }
    1458             :                 }
    1459             :         }
    1460          26 :         if (bufsize != 0)
    1461          26 :                 *buf = 0;
    1462          26 :         return cnt;
    1463             : }
    1464             : 
    1465             : static str
    1466           2 : str_2_qgrams(bat *ret, const char *const *val)
    1467             : {
    1468           2 :         BAT *bn;
    1469           2 :         size_t i, len = strlen(*val) + 5;
    1470           2 :         str s = GDKmalloc(len);
    1471           2 :         char qgram[4 * 6 + 1];          /* 4 UTF-8 code points plus NULL byte */
    1472             : 
    1473           2 :         if (s == NULL)
    1474           0 :                 throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1475           2 :         strcpy(s, "##");
    1476           2 :         strcpy(s + 2, *val);
    1477           2 :         strcpy(s + len - 3, "$$");
    1478           2 :         bn = COLnew(0, TYPE_str, (BUN) strlen(*val), TRANSIENT);
    1479           2 :         if (bn == NULL) {
    1480           0 :                 GDKfree(s);
    1481           0 :                 throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1482             :         }
    1483             : 
    1484             :         i = 0;
    1485          26 :         while (s[i]) {
    1486          26 :                 if (utf8strncpy(qgram, sizeof(qgram), s + i, 4) < 4)
    1487             :                         break;
    1488          24 :                 if (BUNappend(bn, qgram, false) != GDK_SUCCEED) {
    1489           0 :                         BBPreclaim(bn);
    1490           0 :                         GDKfree(s);
    1491           0 :                         throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1492             :                 }
    1493          24 :                 if ((s[i++] & 0xC0) == 0xC0) {
    1494           8 :                         while ((s[i] & 0xC0) == 0x80)
    1495           4 :                                 i++;
    1496             :                 }
    1497             :         }
    1498           2 :         *ret = bn->batCacheid;
    1499           2 :         BBPkeepref(bn);
    1500           2 :         GDKfree(s);
    1501           2 :         return MAL_SUCCEED;
    1502             : }
    1503             : 
    1504             : #include "mel.h"
    1505             : mel_func txtsim_init_funcs[] = {
    1506             :         pattern("txtsim", "dameraulevenshtein", TXTSIMdameraulevenshtein, false, "Calculates Damerau-Levenshtein distance between two strings, operation costs (ins/del = 1, replacement = 1, transposition = 2)", args(1,3,arg("",int),arg("x",str),arg("y",str))),
    1507             :         pattern("txtsim", "dameraulevenshtein", TXTSIMdameraulevenshtein, false, "Calculates Damerau-Levenshtein distance between two strings, variable operation costs (ins/del, replacement, transposition)", args(1,6,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
    1508             :         command("txtsim", "editdistance", TXTSIMdameraulevenshtein1, false, "Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and transpose = 2", args(1,3, arg("",int),arg("s",str),arg("t",str))),
    1509             :         command("txtsim", "editdistance2", TXTSIMdameraulevenshtein2, false, "Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and transpose = 1", args(1,3, arg("",int),arg("s",str),arg("t",str))),
    1510             :         pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "Calculates Levenshtein distance between two strings, operation costs (ins/del = 1, replacement = 1)", args(1,3,arg("",int),arg("s",str),arg("t",str))),
    1511             :         pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "Calculates Levenshtein distance between two strings, variable operation costs (ins/del, replacement)", args(1,5,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int))),
    1512             :         pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "(Backwards compatibility purposes) Calculates Damerau-Levenshtein distance between two strings, variable operation costs (ins/del, replacement, transposition)", args(1,6,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
    1513             :         pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false, "Levenshtein distance with basic costs but up to a MAX", args(1, 4, arg("",int), arg("l",str),arg("r",str),arg("k",int))),
    1514             :         pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false, "Levenshtein distance with variable costs but up to a MAX", args(1, 6, arg("",int), arg("l",str),arg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
    1515             :         pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false, "Same as maxlevenshtein but for BATS", args(1, 4, batarg("",bit), batarg("l",str),batarg("r",str),arg("k",int))),
    1516             :         pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false, "Same as maxlevenshtein but for BATS", args(1, 6, batarg("",bit), batarg("l",str),batarg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
    1517             :         command("txtsim", "maxlevenshteinjoin", TXTSIMmaxlevenshteinjoin, false, "", args(2,10, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("k",int),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
    1518             :         command("txtsim", "soundex", soundex, false, "Soundex function for phonetic matching", args(1,2, arg("",str),arg("name",str))),
    1519             :         command("txtsim", "stringdiff", stringdiff, false, "Calculate the soundexed editdistance", args(1,3, arg("",int),arg("s1",str),arg("s2",str))),
    1520             :         command("txtsim", "qgramnormalize", qgram_normalize, false, "'Normalizes' strings (eg. toUpper and replaces non-alphanumerics with one space", args(1,2, arg("",str),arg("input",str))),
    1521             :         command("txtsim", "qgramselfjoin", qgram_selfjoin, false, "QGram self-join on ordered(!) qgram tables and sub-ordered q-gram positions", args(2,8, batarg("",int),batarg("",int),batarg("qgram",oid),batarg("id",oid),batarg("pos",int),batarg("len",int),arg("c",flt),arg("k",int))),
    1522             :         command("txtsim", "str2qgrams", str_2_qgrams, false, "Break the string into 4-grams", args(1,2, batarg("",str),arg("s",str))),
    1523             :         command("txtsim", "jarowinkler", TXTSIMjarowinkler, false, "Calculate Jaro Winkler similarity", args(1,3, arg("",dbl),arg("x",str),arg("y",str))),
    1524             :         command("txtsim", "minjarowinkler", TXTSIMminjarowinkler, false, "", args(1, 4, arg("",bit), arg("l",str),arg("r",str),arg("threshold",dbl))),
    1525             :         command("txtsim", "minjarowinklerjoin", TXTSIMminjarowinklerjoin, false, "", args(2, 10, batarg("",oid),batarg("",oid), batarg("l",str),batarg("r",str),batarg("threshold",dbl),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
    1526             :         { .imp=NULL }
    1527             : };
    1528             : #include "mal_import.h"
    1529             : #ifdef _MSC_VER
    1530             : #undef read
    1531             : #pragma section(".CRT$XCU",read)
    1532             : #endif
    1533         345 : LIB_STARTUP_FUNC(init_txtsim_mal)
    1534         345 : { mal_module("txtsim", NULL, txtsim_init_funcs); }

Generated by: LCOV version 1.14